diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,18200 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.999297541394882, + "eval_steps": 400, + "global_step": 5604, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.002676032781401572, + "grad_norm": 1.8281873727150824, + "learning_rate": 8.9126559714795e-09, + "logits/chosen": -0.06036572530865669, + "logits/rejected": 0.15200476348400116, + "logps/chosen": -1.7157948017120361, + "logps/rejected": -1.889754056930542, + "loss": 0.1875, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.7157948017120361, + "rewards/margins": 0.17395934462547302, + "rewards/rejected": -1.889754056930542, + "sft_loss": 1.4684072732925415, + "step": 5 + }, + { + "epoch": 0.005352065562803144, + "grad_norm": 1.3261214622264328, + "learning_rate": 1.7825311942959e-08, + "logits/chosen": 0.015721673145890236, + "logits/rejected": 0.14082524180412292, + "logps/chosen": -1.803401231765747, + "logps/rejected": -1.8462854623794556, + "loss": 0.1915, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -1.803401231765747, + "rewards/margins": 0.042884208261966705, + "rewards/rejected": -1.8462854623794556, + "sft_loss": 1.5086901187896729, + "step": 10 + }, + { + "epoch": 0.008028098344204716, + "grad_norm": 1.6869277012824744, + "learning_rate": 2.67379679144385e-08, + "logits/chosen": -0.0386616587638855, + "logits/rejected": 0.061269234865903854, + "logps/chosen": -1.6346614360809326, + "logps/rejected": -1.7652347087860107, + "loss": 0.209, + "rewards/accuracies": 0.4937500059604645, + "rewards/chosen": -1.6346614360809326, + "rewards/margins": 0.130573108792305, + "rewards/rejected": -1.7652347087860107, + "sft_loss": 1.5001634359359741, + "step": 15 + }, + { + "epoch": 0.010704131125606288, + "grad_norm": 2.184541215763592, + "learning_rate": 3.5650623885918e-08, + "logits/chosen": -0.044674623757600784, + "logits/rejected": 0.04231274500489235, + "logps/chosen": -1.7240028381347656, + "logps/rejected": -1.8060672283172607, + "loss": 0.2066, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": -1.7240028381347656, + "rewards/margins": 0.0820644274353981, + "rewards/rejected": -1.8060672283172607, + "sft_loss": 1.5000752210617065, + "step": 20 + }, + { + "epoch": 0.013380163907007862, + "grad_norm": 2.2885509048614607, + "learning_rate": 4.45632798573975e-08, + "logits/chosen": -0.061998527497053146, + "logits/rejected": 0.023830315098166466, + "logps/chosen": -1.869637131690979, + "logps/rejected": -1.7783664464950562, + "loss": 0.2291, + "rewards/accuracies": 0.375, + "rewards/chosen": -1.869637131690979, + "rewards/margins": -0.09127076715230942, + "rewards/rejected": -1.7783664464950562, + "sft_loss": 1.545493245124817, + "step": 25 + }, + { + "epoch": 0.016056196688409432, + "grad_norm": 1.6851104456657149, + "learning_rate": 5.3475935828877e-08, + "logits/chosen": -0.0781058818101883, + "logits/rejected": 0.018343383446335793, + "logps/chosen": -1.9102665185928345, + "logps/rejected": -1.833251714706421, + "loss": 0.1924, + "rewards/accuracies": 0.4375, + "rewards/chosen": -1.9102665185928345, + "rewards/margins": -0.07701461762189865, + "rewards/rejected": -1.833251714706421, + "sft_loss": 1.6474205255508423, + "step": 30 + }, + { + "epoch": 0.018732229469811006, + "grad_norm": 2.1116446404871723, + "learning_rate": 6.23885918003565e-08, + "logits/chosen": -0.06157956272363663, + "logits/rejected": 0.10128965228796005, + "logps/chosen": -1.847612738609314, + "logps/rejected": -1.9992172718048096, + "loss": 0.2024, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.847612738609314, + "rewards/margins": 0.1516043245792389, + "rewards/rejected": -1.9992172718048096, + "sft_loss": 1.5620036125183105, + "step": 35 + }, + { + "epoch": 0.021408262251212576, + "grad_norm": 1.606280602217098, + "learning_rate": 7.1301247771836e-08, + "logits/chosen": 0.05405878275632858, + "logits/rejected": 0.2346314936876297, + "logps/chosen": -1.8864805698394775, + "logps/rejected": -1.7479203939437866, + "loss": 0.2067, + "rewards/accuracies": 0.45625001192092896, + "rewards/chosen": -1.8864805698394775, + "rewards/margins": -0.13856001198291779, + "rewards/rejected": -1.7479203939437866, + "sft_loss": 1.5202641487121582, + "step": 40 + }, + { + "epoch": 0.02408429503261415, + "grad_norm": 2.0142313565096983, + "learning_rate": 8.021390374331551e-08, + "logits/chosen": 0.020334195345640182, + "logits/rejected": 0.22058598697185516, + "logps/chosen": -1.8413622379302979, + "logps/rejected": -1.8763911724090576, + "loss": 0.1998, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": -1.8413622379302979, + "rewards/margins": 0.03502892702817917, + "rewards/rejected": -1.8763911724090576, + "sft_loss": 1.5375398397445679, + "step": 45 + }, + { + "epoch": 0.026760327814015723, + "grad_norm": 2.4004776884800254, + "learning_rate": 8.9126559714795e-08, + "logits/chosen": -0.048713017255067825, + "logits/rejected": 0.1055837869644165, + "logps/chosen": -1.9055683612823486, + "logps/rejected": -1.7839202880859375, + "loss": 0.2037, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -1.9055683612823486, + "rewards/margins": -0.12164795398712158, + "rewards/rejected": -1.7839202880859375, + "sft_loss": 1.585984468460083, + "step": 50 + }, + { + "epoch": 0.029436360595417294, + "grad_norm": 1.9065344756371017, + "learning_rate": 9.80392156862745e-08, + "logits/chosen": -0.1268903762102127, + "logits/rejected": 0.09180887043476105, + "logps/chosen": -1.8452562093734741, + "logps/rejected": -1.8789564371109009, + "loss": 0.1967, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -1.8452562093734741, + "rewards/margins": 0.03370007127523422, + "rewards/rejected": -1.8789564371109009, + "sft_loss": 1.5881474018096924, + "step": 55 + }, + { + "epoch": 0.032112393376818864, + "grad_norm": 1.9726561828800409, + "learning_rate": 1.06951871657754e-07, + "logits/chosen": -0.09329613298177719, + "logits/rejected": 0.1027790755033493, + "logps/chosen": -1.8032081127166748, + "logps/rejected": -1.9089362621307373, + "loss": 0.1881, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -1.8032081127166748, + "rewards/margins": 0.10572835057973862, + "rewards/rejected": -1.9089362621307373, + "sft_loss": 1.5484545230865479, + "step": 60 + }, + { + "epoch": 0.03478842615822044, + "grad_norm": 1.4158996749388524, + "learning_rate": 1.158645276292335e-07, + "logits/chosen": -0.026859384030103683, + "logits/rejected": 0.12450633198022842, + "logps/chosen": -1.6532108783721924, + "logps/rejected": -1.7873064279556274, + "loss": 0.1998, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -1.6532108783721924, + "rewards/margins": 0.13409557938575745, + "rewards/rejected": -1.7873064279556274, + "sft_loss": 1.4820306301116943, + "step": 65 + }, + { + "epoch": 0.03746445893962201, + "grad_norm": 2.7550451891140657, + "learning_rate": 1.24777183600713e-07, + "logits/chosen": -0.07876388728618622, + "logits/rejected": 0.07389497011899948, + "logps/chosen": -1.78818678855896, + "logps/rejected": -1.8362791538238525, + "loss": 0.2083, + "rewards/accuracies": 0.41874998807907104, + "rewards/chosen": -1.78818678855896, + "rewards/margins": 0.04809259623289108, + "rewards/rejected": -1.8362791538238525, + "sft_loss": 1.6427921056747437, + "step": 70 + }, + { + "epoch": 0.04014049172102358, + "grad_norm": 1.6257388034299363, + "learning_rate": 1.3368983957219251e-07, + "logits/chosen": -0.05901254341006279, + "logits/rejected": 0.12799496948719025, + "logps/chosen": -1.81658935546875, + "logps/rejected": -2.0846848487854004, + "loss": 0.1872, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -1.81658935546875, + "rewards/margins": 0.26809555292129517, + "rewards/rejected": -2.0846848487854004, + "sft_loss": 1.582075595855713, + "step": 75 + }, + { + "epoch": 0.04281652450242515, + "grad_norm": 1.6945858724148233, + "learning_rate": 1.42602495543672e-07, + "logits/chosen": 0.014201399870216846, + "logits/rejected": 0.12179889529943466, + "logps/chosen": -1.7666809558868408, + "logps/rejected": -1.7983417510986328, + "loss": 0.2031, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -1.7666809558868408, + "rewards/margins": 0.03166085481643677, + "rewards/rejected": -1.7983417510986328, + "sft_loss": 1.5483434200286865, + "step": 80 + }, + { + "epoch": 0.04549255728382673, + "grad_norm": 1.4872099012995739, + "learning_rate": 1.5151515151515152e-07, + "logits/chosen": -0.14098253846168518, + "logits/rejected": 0.11290383338928223, + "logps/chosen": -1.849591612815857, + "logps/rejected": -2.038990020751953, + "loss": 0.1918, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -1.849591612815857, + "rewards/margins": 0.18939858675003052, + "rewards/rejected": -2.038990020751953, + "sft_loss": 1.5175174474716187, + "step": 85 + }, + { + "epoch": 0.0481685900652283, + "grad_norm": 1.6072015338976098, + "learning_rate": 1.6042780748663102e-07, + "logits/chosen": 0.07844671607017517, + "logits/rejected": 0.04320339113473892, + "logps/chosen": -1.8240934610366821, + "logps/rejected": -1.8337669372558594, + "loss": 0.2062, + "rewards/accuracies": 0.46875, + "rewards/chosen": -1.8240934610366821, + "rewards/margins": 0.009673514403402805, + "rewards/rejected": -1.8337669372558594, + "sft_loss": 1.4806925058364868, + "step": 90 + }, + { + "epoch": 0.05084462284662987, + "grad_norm": 1.1704935247873707, + "learning_rate": 1.693404634581105e-07, + "logits/chosen": -0.07325179874897003, + "logits/rejected": 0.08035645633935928, + "logps/chosen": -1.9085460901260376, + "logps/rejected": -1.9914019107818604, + "loss": 0.194, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.9085460901260376, + "rewards/margins": 0.08285579085350037, + "rewards/rejected": -1.9914019107818604, + "sft_loss": 1.562260627746582, + "step": 95 + }, + { + "epoch": 0.05352065562803145, + "grad_norm": 1.485734092812147, + "learning_rate": 1.7825311942959e-07, + "logits/chosen": -0.051618821918964386, + "logits/rejected": 0.014512482099235058, + "logps/chosen": -1.7798315286636353, + "logps/rejected": -1.8910728693008423, + "loss": 0.1933, + "rewards/accuracies": 0.5062500238418579, + "rewards/chosen": -1.7798315286636353, + "rewards/margins": 0.11124144494533539, + "rewards/rejected": -1.8910728693008423, + "sft_loss": 1.5264718532562256, + "step": 100 + }, + { + "epoch": 0.05619668840943302, + "grad_norm": 1.2246905118137938, + "learning_rate": 1.8716577540106952e-07, + "logits/chosen": 0.06890567392110825, + "logits/rejected": 0.09699970483779907, + "logps/chosen": -1.75466787815094, + "logps/rejected": -1.9174325466156006, + "loss": 0.1861, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.75466787815094, + "rewards/margins": 0.16276490688323975, + "rewards/rejected": -1.9174325466156006, + "sft_loss": 1.4836585521697998, + "step": 105 + }, + { + "epoch": 0.05887272119083459, + "grad_norm": 1.3983137398048286, + "learning_rate": 1.96078431372549e-07, + "logits/chosen": 0.014802386984229088, + "logits/rejected": 0.11312691867351532, + "logps/chosen": -1.8402084112167358, + "logps/rejected": -1.9002002477645874, + "loss": 0.2021, + "rewards/accuracies": 0.5062500238418579, + "rewards/chosen": -1.8402084112167358, + "rewards/margins": 0.059991706162691116, + "rewards/rejected": -1.9002002477645874, + "sft_loss": 1.541327714920044, + "step": 110 + }, + { + "epoch": 0.06154875397223616, + "grad_norm": 1.542976037363866, + "learning_rate": 2.049910873440285e-07, + "logits/chosen": 0.044870324432849884, + "logits/rejected": 0.259618878364563, + "logps/chosen": -1.8326823711395264, + "logps/rejected": -2.162672281265259, + "loss": 0.1796, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.8326823711395264, + "rewards/margins": 0.32998955249786377, + "rewards/rejected": -2.162672281265259, + "sft_loss": 1.6590197086334229, + "step": 115 + }, + { + "epoch": 0.06422478675363773, + "grad_norm": 1.121394950130821, + "learning_rate": 2.13903743315508e-07, + "logits/chosen": -0.05395837500691414, + "logits/rejected": 0.12947091460227966, + "logps/chosen": -1.975813627243042, + "logps/rejected": -2.1206367015838623, + "loss": 0.1781, + "rewards/accuracies": 0.4937500059604645, + "rewards/chosen": -1.975813627243042, + "rewards/margins": 0.14482299983501434, + "rewards/rejected": -2.1206367015838623, + "sft_loss": 1.6822599172592163, + "step": 120 + }, + { + "epoch": 0.0669008195350393, + "grad_norm": 1.738390891126455, + "learning_rate": 2.2281639928698751e-07, + "logits/chosen": -0.05913634970784187, + "logits/rejected": 0.077244333922863, + "logps/chosen": -1.8747756481170654, + "logps/rejected": -1.791603446006775, + "loss": 0.211, + "rewards/accuracies": 0.46875, + "rewards/chosen": -1.8747756481170654, + "rewards/margins": -0.0831720232963562, + "rewards/rejected": -1.791603446006775, + "sft_loss": 1.5983225107192993, + "step": 125 + }, + { + "epoch": 0.06957685231644088, + "grad_norm": 1.2172011134571397, + "learning_rate": 2.31729055258467e-07, + "logits/chosen": 0.07068384438753128, + "logits/rejected": 0.21463127434253693, + "logps/chosen": -1.9429035186767578, + "logps/rejected": -2.06502103805542, + "loss": 0.1788, + "rewards/accuracies": 0.46875, + "rewards/chosen": -1.9429035186767578, + "rewards/margins": 0.12211757898330688, + "rewards/rejected": -2.06502103805542, + "sft_loss": 1.6812289953231812, + "step": 130 + }, + { + "epoch": 0.07225288509784245, + "grad_norm": 1.4439255764123966, + "learning_rate": 2.406417112299465e-07, + "logits/chosen": -0.017868880182504654, + "logits/rejected": 0.10886181890964508, + "logps/chosen": -2.0224013328552246, + "logps/rejected": -1.997601866722107, + "loss": 0.191, + "rewards/accuracies": 0.5062500238418579, + "rewards/chosen": -2.0224013328552246, + "rewards/margins": -0.02479952946305275, + "rewards/rejected": -1.997601866722107, + "sft_loss": 1.6454761028289795, + "step": 135 + }, + { + "epoch": 0.07492891787924402, + "grad_norm": 1.9215015654493501, + "learning_rate": 2.49554367201426e-07, + "logits/chosen": -0.02103680931031704, + "logits/rejected": 0.1541782170534134, + "logps/chosen": -1.9884147644042969, + "logps/rejected": -2.2401504516601562, + "loss": 0.1721, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -1.9884147644042969, + "rewards/margins": 0.25173547863960266, + "rewards/rejected": -2.2401504516601562, + "sft_loss": 1.6845868825912476, + "step": 140 + }, + { + "epoch": 0.0776049506606456, + "grad_norm": 1.2117294815525885, + "learning_rate": 2.5846702317290554e-07, + "logits/chosen": -0.007556894328445196, + "logits/rejected": 0.1537446826696396, + "logps/chosen": -1.9553453922271729, + "logps/rejected": -2.1021084785461426, + "loss": 0.1804, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -1.9553453922271729, + "rewards/margins": 0.14676345884799957, + "rewards/rejected": -2.1021084785461426, + "sft_loss": 1.6186004877090454, + "step": 145 + }, + { + "epoch": 0.08028098344204716, + "grad_norm": 1.4324321514957685, + "learning_rate": 2.6737967914438503e-07, + "logits/chosen": -0.04060187563300133, + "logits/rejected": 0.13013625144958496, + "logps/chosen": -1.9446437358856201, + "logps/rejected": -1.9469516277313232, + "loss": 0.1945, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -1.9446437358856201, + "rewards/margins": 0.002307900693267584, + "rewards/rejected": -1.9469516277313232, + "sft_loss": 1.4851183891296387, + "step": 150 + }, + { + "epoch": 0.08295701622344874, + "grad_norm": 1.410377603703526, + "learning_rate": 2.762923351158645e-07, + "logits/chosen": -0.052264392375946045, + "logits/rejected": -0.0019641772378236055, + "logps/chosen": -2.0606913566589355, + "logps/rejected": -2.109433650970459, + "loss": 0.1869, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": -2.0606913566589355, + "rewards/margins": 0.048742227256298065, + "rewards/rejected": -2.109433650970459, + "sft_loss": 1.6540358066558838, + "step": 155 + }, + { + "epoch": 0.0856330490048503, + "grad_norm": 1.4434488798396297, + "learning_rate": 2.85204991087344e-07, + "logits/chosen": -0.1261916607618332, + "logits/rejected": 0.024200741201639175, + "logps/chosen": -2.2658863067626953, + "logps/rejected": -2.241018533706665, + "loss": 0.1891, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": -2.2658863067626953, + "rewards/margins": -0.024867888540029526, + "rewards/rejected": -2.241018533706665, + "sft_loss": 1.7461020946502686, + "step": 160 + }, + { + "epoch": 0.08830908178625188, + "grad_norm": 1.339027685977903, + "learning_rate": 2.941176470588235e-07, + "logits/chosen": -0.02971971035003662, + "logits/rejected": 0.15595880150794983, + "logps/chosen": -2.002155065536499, + "logps/rejected": -2.307286024093628, + "loss": 0.1813, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -2.002155065536499, + "rewards/margins": 0.3051307797431946, + "rewards/rejected": -2.307286024093628, + "sft_loss": 1.5998609066009521, + "step": 165 + }, + { + "epoch": 0.09098511456765346, + "grad_norm": 1.4548486638159581, + "learning_rate": 3.0303030303030305e-07, + "logits/chosen": -0.05048539489507675, + "logits/rejected": 0.00913523230701685, + "logps/chosen": -2.278296947479248, + "logps/rejected": -2.2565479278564453, + "loss": 0.1819, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": -2.278296947479248, + "rewards/margins": -0.021749010309576988, + "rewards/rejected": -2.2565479278564453, + "sft_loss": 1.7058801651000977, + "step": 170 + }, + { + "epoch": 0.09366114734905502, + "grad_norm": 1.3337472174712208, + "learning_rate": 3.1194295900178254e-07, + "logits/chosen": 0.07039393484592438, + "logits/rejected": 0.07238060235977173, + "logps/chosen": -2.1607205867767334, + "logps/rejected": -2.224153995513916, + "loss": 0.1957, + "rewards/accuracies": 0.4625000059604645, + "rewards/chosen": -2.1607205867767334, + "rewards/margins": 0.06343330442905426, + "rewards/rejected": -2.224153995513916, + "sft_loss": 1.7343257665634155, + "step": 175 + }, + { + "epoch": 0.0963371801304566, + "grad_norm": 1.0106930560920084, + "learning_rate": 3.2085561497326203e-07, + "logits/chosen": 0.08963946998119354, + "logits/rejected": 0.09271235764026642, + "logps/chosen": -2.2524635791778564, + "logps/rejected": -2.2211055755615234, + "loss": 0.1764, + "rewards/accuracies": 0.5062500238418579, + "rewards/chosen": -2.2524635791778564, + "rewards/margins": -0.03135796636343002, + "rewards/rejected": -2.2211055755615234, + "sft_loss": 1.715368628501892, + "step": 180 + }, + { + "epoch": 0.09901321291185818, + "grad_norm": 1.3427419510653615, + "learning_rate": 3.297682709447415e-07, + "logits/chosen": -0.09551338851451874, + "logits/rejected": 0.0052245319820940495, + "logps/chosen": -2.188047170639038, + "logps/rejected": -2.277782917022705, + "loss": 0.1871, + "rewards/accuracies": 0.5, + "rewards/chosen": -2.188047170639038, + "rewards/margins": 0.08973531424999237, + "rewards/rejected": -2.277782917022705, + "sft_loss": 1.6850707530975342, + "step": 185 + }, + { + "epoch": 0.10168924569325974, + "grad_norm": 1.301877363775769, + "learning_rate": 3.38680926916221e-07, + "logits/chosen": -0.01349000446498394, + "logits/rejected": 0.12422996759414673, + "logps/chosen": -2.699043035507202, + "logps/rejected": -2.5563666820526123, + "loss": 0.1669, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -2.699043035507202, + "rewards/margins": -0.14267598092556, + "rewards/rejected": -2.5563666820526123, + "sft_loss": 1.9406452178955078, + "step": 190 + }, + { + "epoch": 0.10436527847466132, + "grad_norm": 1.086372047902067, + "learning_rate": 3.475935828877005e-07, + "logits/chosen": 0.08391423523426056, + "logits/rejected": 0.25889503955841064, + "logps/chosen": -2.082772731781006, + "logps/rejected": -2.147904872894287, + "loss": 0.1832, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -2.082772731781006, + "rewards/margins": 0.06513235718011856, + "rewards/rejected": -2.147904872894287, + "sft_loss": 1.587127685546875, + "step": 195 + }, + { + "epoch": 0.1070413112560629, + "grad_norm": 1.0784548518531638, + "learning_rate": 3.5650623885918e-07, + "logits/chosen": -0.045540787279605865, + "logits/rejected": 0.10638017952442169, + "logps/chosen": -2.497420310974121, + "logps/rejected": -2.2494847774505615, + "loss": 0.1807, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -2.497420310974121, + "rewards/margins": -0.24793526530265808, + "rewards/rejected": -2.2494847774505615, + "sft_loss": 1.8374172449111938, + "step": 200 + }, + { + "epoch": 0.10971734403746446, + "grad_norm": 1.2841841345947111, + "learning_rate": 3.654188948306595e-07, + "logits/chosen": -0.010509648360311985, + "logits/rejected": 0.15566954016685486, + "logps/chosen": -2.733661413192749, + "logps/rejected": -2.442354202270508, + "loss": 0.167, + "rewards/accuracies": 0.5, + "rewards/chosen": -2.733661413192749, + "rewards/margins": -0.29130715131759644, + "rewards/rejected": -2.442354202270508, + "sft_loss": 1.7773916721343994, + "step": 205 + }, + { + "epoch": 0.11239337681886603, + "grad_norm": 1.4421977705539422, + "learning_rate": 3.7433155080213904e-07, + "logits/chosen": -0.11572986841201782, + "logits/rejected": 0.0990462675690651, + "logps/chosen": -2.5400633811950684, + "logps/rejected": -2.898448944091797, + "loss": 0.1456, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -2.5400633811950684, + "rewards/margins": 0.3583856523036957, + "rewards/rejected": -2.898448944091797, + "sft_loss": 1.8469970226287842, + "step": 210 + }, + { + "epoch": 0.1150694096002676, + "grad_norm": 1.506528371919276, + "learning_rate": 3.8324420677361853e-07, + "logits/chosen": -0.11365026235580444, + "logits/rejected": 0.1664947271347046, + "logps/chosen": -2.3687429428100586, + "logps/rejected": -2.4806742668151855, + "loss": 0.1593, + "rewards/accuracies": 0.5625, + "rewards/chosen": -2.3687429428100586, + "rewards/margins": 0.11193136870861053, + "rewards/rejected": -2.4806742668151855, + "sft_loss": 1.8512741327285767, + "step": 215 + }, + { + "epoch": 0.11774544238166917, + "grad_norm": 1.4541076330685845, + "learning_rate": 3.92156862745098e-07, + "logits/chosen": 0.11326134204864502, + "logits/rejected": 0.23379270732402802, + "logps/chosen": -2.7988970279693604, + "logps/rejected": -3.2029125690460205, + "loss": 0.1425, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -2.7988970279693604, + "rewards/margins": 0.40401554107666016, + "rewards/rejected": -3.2029125690460205, + "sft_loss": 2.0269691944122314, + "step": 220 + }, + { + "epoch": 0.12042147516307075, + "grad_norm": 1.4193672051877066, + "learning_rate": 4.010695187165775e-07, + "logits/chosen": -0.07774774730205536, + "logits/rejected": 0.11152136325836182, + "logps/chosen": -2.676217555999756, + "logps/rejected": -2.757519483566284, + "loss": 0.1467, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -2.676217555999756, + "rewards/margins": 0.08130187541246414, + "rewards/rejected": -2.757519483566284, + "sft_loss": 1.8078447580337524, + "step": 225 + }, + { + "epoch": 0.12309750794447231, + "grad_norm": 1.4655988363625254, + "learning_rate": 4.09982174688057e-07, + "logits/chosen": 0.04009638726711273, + "logits/rejected": 0.13166067004203796, + "logps/chosen": -3.2687041759490967, + "logps/rejected": -3.2889716625213623, + "loss": 0.1471, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -3.2687041759490967, + "rewards/margins": 0.020267415791749954, + "rewards/rejected": -3.2889716625213623, + "sft_loss": 1.96890127658844, + "step": 230 + }, + { + "epoch": 0.1257735407258739, + "grad_norm": 1.278825876101477, + "learning_rate": 4.188948306595365e-07, + "logits/chosen": 0.03988439589738846, + "logits/rejected": 0.2017788141965866, + "logps/chosen": -3.112320899963379, + "logps/rejected": -3.4062983989715576, + "loss": 0.1284, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -3.112320899963379, + "rewards/margins": 0.29397743940353394, + "rewards/rejected": -3.4062983989715576, + "sft_loss": 1.9640719890594482, + "step": 235 + }, + { + "epoch": 0.12844957350727546, + "grad_norm": 1.063516787032469, + "learning_rate": 4.27807486631016e-07, + "logits/chosen": 0.04343884810805321, + "logits/rejected": 0.190804585814476, + "logps/chosen": -3.3919873237609863, + "logps/rejected": -3.4484734535217285, + "loss": 0.1373, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -3.3919873237609863, + "rewards/margins": 0.05648590251803398, + "rewards/rejected": -3.4484734535217285, + "sft_loss": 2.2613186836242676, + "step": 240 + }, + { + "epoch": 0.13112560628867703, + "grad_norm": 1.246833107441079, + "learning_rate": 4.3672014260249554e-07, + "logits/chosen": 0.08618511259555817, + "logits/rejected": 0.22703281044960022, + "logps/chosen": -3.113327980041504, + "logps/rejected": -3.773731231689453, + "loss": 0.1335, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -3.113327980041504, + "rewards/margins": 0.6604034900665283, + "rewards/rejected": -3.773731231689453, + "sft_loss": 2.246939182281494, + "step": 245 + }, + { + "epoch": 0.1338016390700786, + "grad_norm": 1.550906778055372, + "learning_rate": 4.4563279857397503e-07, + "logits/chosen": 0.008564489893615246, + "logits/rejected": 0.19193480908870697, + "logps/chosen": -4.5200276374816895, + "logps/rejected": -4.392758369445801, + "loss": 0.1367, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -4.5200276374816895, + "rewards/margins": -0.12726902961730957, + "rewards/rejected": -4.392758369445801, + "sft_loss": 2.322331190109253, + "step": 250 + }, + { + "epoch": 0.1364776718514802, + "grad_norm": 1.0017344608881895, + "learning_rate": 4.545454545454545e-07, + "logits/chosen": 0.027799557894468307, + "logits/rejected": 0.1999567598104477, + "logps/chosen": -3.2353675365448, + "logps/rejected": -3.8548312187194824, + "loss": 0.1278, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -3.2353675365448, + "rewards/margins": 0.6194636821746826, + "rewards/rejected": -3.8548312187194824, + "sft_loss": 2.0332932472229004, + "step": 255 + }, + { + "epoch": 0.13915370463288176, + "grad_norm": 1.1722811528835204, + "learning_rate": 4.63458110516934e-07, + "logits/chosen": -0.13097763061523438, + "logits/rejected": -0.003967789001762867, + "logps/chosen": -4.378108024597168, + "logps/rejected": -3.8029494285583496, + "loss": 0.1114, + "rewards/accuracies": 0.5062500238418579, + "rewards/chosen": -4.378108024597168, + "rewards/margins": -0.5751584768295288, + "rewards/rejected": -3.8029494285583496, + "sft_loss": 2.4523839950561523, + "step": 260 + }, + { + "epoch": 0.1418297374142833, + "grad_norm": 0.8598376845470599, + "learning_rate": 4.723707664884135e-07, + "logits/chosen": 0.04610385373234749, + "logits/rejected": 0.14736303687095642, + "logps/chosen": -5.376759052276611, + "logps/rejected": -4.316792964935303, + "loss": 0.1299, + "rewards/accuracies": 0.4937500059604645, + "rewards/chosen": -5.376759052276611, + "rewards/margins": -1.0599651336669922, + "rewards/rejected": -4.316792964935303, + "sft_loss": 3.3849892616271973, + "step": 265 + }, + { + "epoch": 0.1445057701956849, + "grad_norm": 0.659100840868797, + "learning_rate": 4.81283422459893e-07, + "logits/chosen": 0.05620427802205086, + "logits/rejected": 0.22956471145153046, + "logps/chosen": -4.417783737182617, + "logps/rejected": -4.7367353439331055, + "loss": 0.1148, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -4.417783737182617, + "rewards/margins": 0.3189517557621002, + "rewards/rejected": -4.7367353439331055, + "sft_loss": 2.5751936435699463, + "step": 270 + }, + { + "epoch": 0.14718180297708647, + "grad_norm": 0.944667145643773, + "learning_rate": 4.901960784313725e-07, + "logits/chosen": 0.1821393221616745, + "logits/rejected": 0.29745563864707947, + "logps/chosen": -5.001216888427734, + "logps/rejected": -5.648111820220947, + "loss": 0.1278, + "rewards/accuracies": 0.59375, + "rewards/chosen": -5.001216888427734, + "rewards/margins": 0.646894633769989, + "rewards/rejected": -5.648111820220947, + "sft_loss": 3.029219150543213, + "step": 275 + }, + { + "epoch": 0.14985783575848804, + "grad_norm": 0.62958069078015, + "learning_rate": 4.99108734402852e-07, + "logits/chosen": 0.06474583595991135, + "logits/rejected": 0.26421093940734863, + "logps/chosen": -5.785727500915527, + "logps/rejected": -5.513718128204346, + "loss": 0.111, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -5.785727500915527, + "rewards/margins": -0.2720094919204712, + "rewards/rejected": -5.513718128204346, + "sft_loss": 3.7501449584960938, + "step": 280 + }, + { + "epoch": 0.15253386853988962, + "grad_norm": 0.7707223965415297, + "learning_rate": 5.080213903743315e-07, + "logits/chosen": 0.07362186163663864, + "logits/rejected": 0.24137923121452332, + "logps/chosen": -5.315040111541748, + "logps/rejected": -5.234798431396484, + "loss": 0.1171, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": -5.315040111541748, + "rewards/margins": -0.08024124801158905, + "rewards/rejected": -5.234798431396484, + "sft_loss": 2.7596487998962402, + "step": 285 + }, + { + "epoch": 0.1552099013212912, + "grad_norm": 0.43821944195019613, + "learning_rate": 5.169340463458111e-07, + "logits/chosen": 0.06858251988887787, + "logits/rejected": 0.4367496967315674, + "logps/chosen": -4.310262203216553, + "logps/rejected": -5.49105167388916, + "loss": 0.0853, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -4.310262203216553, + "rewards/margins": 1.1807892322540283, + "rewards/rejected": -5.49105167388916, + "sft_loss": 2.8444995880126953, + "step": 290 + }, + { + "epoch": 0.15788593410269275, + "grad_norm": 0.534101197488112, + "learning_rate": 5.258467023172905e-07, + "logits/chosen": 0.11962026357650757, + "logits/rejected": 0.191305473446846, + "logps/chosen": -5.814216613769531, + "logps/rejected": -5.191082954406738, + "loss": 0.1136, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -5.814216613769531, + "rewards/margins": -0.6231337785720825, + "rewards/rejected": -5.191082954406738, + "sft_loss": 3.5382983684539795, + "step": 295 + }, + { + "epoch": 0.16056196688409433, + "grad_norm": 0.7825104689335549, + "learning_rate": 5.347593582887701e-07, + "logits/chosen": 0.09059463441371918, + "logits/rejected": 0.3025878965854645, + "logps/chosen": -6.152596950531006, + "logps/rejected": -6.278104782104492, + "loss": 0.1085, + "rewards/accuracies": 0.59375, + "rewards/chosen": -6.152596950531006, + "rewards/margins": 0.12550795078277588, + "rewards/rejected": -6.278104782104492, + "sft_loss": 3.1514294147491455, + "step": 300 + }, + { + "epoch": 0.1632379996654959, + "grad_norm": 0.7203056918661761, + "learning_rate": 5.436720142602496e-07, + "logits/chosen": 0.15295490622520447, + "logits/rejected": 0.23979385197162628, + "logps/chosen": -5.263061046600342, + "logps/rejected": -5.25927734375, + "loss": 0.1103, + "rewards/accuracies": 0.48124998807907104, + "rewards/chosen": -5.263061046600342, + "rewards/margins": -0.003783750580623746, + "rewards/rejected": -5.25927734375, + "sft_loss": 3.4355788230895996, + "step": 305 + }, + { + "epoch": 0.16591403244689748, + "grad_norm": 0.48118015251670226, + "learning_rate": 5.52584670231729e-07, + "logits/chosen": -0.04459068924188614, + "logits/rejected": 0.07622343301773071, + "logps/chosen": -6.208052158355713, + "logps/rejected": -6.269270420074463, + "loss": 0.099, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -6.208052158355713, + "rewards/margins": 0.061218809336423874, + "rewards/rejected": -6.269270420074463, + "sft_loss": 3.9879536628723145, + "step": 310 + }, + { + "epoch": 0.16859006522829906, + "grad_norm": 0.5130287780473973, + "learning_rate": 5.614973262032086e-07, + "logits/chosen": 0.1535053700208664, + "logits/rejected": 0.35032719373703003, + "logps/chosen": -5.845208644866943, + "logps/rejected": -5.75873327255249, + "loss": 0.0997, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -5.845208644866943, + "rewards/margins": -0.08647508919239044, + "rewards/rejected": -5.75873327255249, + "sft_loss": 3.6303811073303223, + "step": 315 + }, + { + "epoch": 0.1712660980097006, + "grad_norm": 0.894923831994968, + "learning_rate": 5.70409982174688e-07, + "logits/chosen": 0.07708640396595001, + "logits/rejected": 0.23186373710632324, + "logps/chosen": -5.41533088684082, + "logps/rejected": -5.087583065032959, + "loss": 0.1109, + "rewards/accuracies": 0.53125, + "rewards/chosen": -5.41533088684082, + "rewards/margins": -0.32774776220321655, + "rewards/rejected": -5.087583065032959, + "sft_loss": 3.4817776679992676, + "step": 320 + }, + { + "epoch": 0.17394213079110218, + "grad_norm": 0.6602942214082753, + "learning_rate": 5.793226381461676e-07, + "logits/chosen": -0.001248963177204132, + "logits/rejected": 0.1617216020822525, + "logps/chosen": -6.740938663482666, + "logps/rejected": -7.048771858215332, + "loss": 0.0954, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -6.740938663482666, + "rewards/margins": 0.30783215165138245, + "rewards/rejected": -7.048771858215332, + "sft_loss": 3.575293779373169, + "step": 325 + }, + { + "epoch": 0.17661816357250376, + "grad_norm": 2.0172656785700736, + "learning_rate": 5.88235294117647e-07, + "logits/chosen": 0.08358623087406158, + "logits/rejected": 0.2782168984413147, + "logps/chosen": -4.833988666534424, + "logps/rejected": -6.85193395614624, + "loss": 0.0975, + "rewards/accuracies": 0.59375, + "rewards/chosen": -4.833988666534424, + "rewards/margins": 2.0179455280303955, + "rewards/rejected": -6.85193395614624, + "sft_loss": 3.4718756675720215, + "step": 330 + }, + { + "epoch": 0.17929419635390534, + "grad_norm": 1.559804678106283, + "learning_rate": 5.971479500891266e-07, + "logits/chosen": 0.021460741758346558, + "logits/rejected": 0.18631306290626526, + "logps/chosen": -7.669114589691162, + "logps/rejected": -7.59591007232666, + "loss": 0.0806, + "rewards/accuracies": 0.53125, + "rewards/chosen": -7.669114589691162, + "rewards/margins": -0.07320408523082733, + "rewards/rejected": -7.59591007232666, + "sft_loss": 4.107626438140869, + "step": 335 + }, + { + "epoch": 0.18197022913530692, + "grad_norm": 3.600551249598769, + "learning_rate": 6.060606060606061e-07, + "logits/chosen": 0.0006690695881843567, + "logits/rejected": 0.20934703946113586, + "logps/chosen": -7.419314384460449, + "logps/rejected": -7.942813873291016, + "loss": 0.0665, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -7.419314384460449, + "rewards/margins": 0.5234988331794739, + "rewards/rejected": -7.942813873291016, + "sft_loss": 5.120023250579834, + "step": 340 + }, + { + "epoch": 0.1846462619167085, + "grad_norm": 3.051133199140238, + "learning_rate": 6.149732620320855e-07, + "logits/chosen": 0.11144141852855682, + "logits/rejected": 0.18817485868930817, + "logps/chosen": -7.210999965667725, + "logps/rejected": -7.412866115570068, + "loss": 0.0692, + "rewards/accuracies": 0.5, + "rewards/chosen": -7.210999965667725, + "rewards/margins": 0.20186543464660645, + "rewards/rejected": -7.412866115570068, + "sft_loss": 5.065989971160889, + "step": 345 + }, + { + "epoch": 0.18732229469811004, + "grad_norm": 2.8496280707460535, + "learning_rate": 6.238859180035651e-07, + "logits/chosen": 0.15946859121322632, + "logits/rejected": 0.3110240697860718, + "logps/chosen": -7.066911220550537, + "logps/rejected": -6.517140865325928, + "loss": 0.0691, + "rewards/accuracies": 0.4937500059604645, + "rewards/chosen": -7.066911220550537, + "rewards/margins": -0.5497702956199646, + "rewards/rejected": -6.517140865325928, + "sft_loss": 4.872412204742432, + "step": 350 + }, + { + "epoch": 0.18999832747951162, + "grad_norm": 1.301936365963939, + "learning_rate": 6.327985739750445e-07, + "logits/chosen": 0.07622610032558441, + "logits/rejected": 0.35027334094047546, + "logps/chosen": -6.583775997161865, + "logps/rejected": -6.518033027648926, + "loss": 0.0641, + "rewards/accuracies": 0.4937500059604645, + "rewards/chosen": -6.583775997161865, + "rewards/margins": -0.06574312597513199, + "rewards/rejected": -6.518033027648926, + "sft_loss": 5.07278299331665, + "step": 355 + }, + { + "epoch": 0.1926743602609132, + "grad_norm": 2.4767014699621797, + "learning_rate": 6.417112299465241e-07, + "logits/chosen": 0.1512000560760498, + "logits/rejected": 0.24605056643486023, + "logps/chosen": -5.883225440979004, + "logps/rejected": -6.043740272521973, + "loss": 0.0636, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -5.883225440979004, + "rewards/margins": 0.1605152040719986, + "rewards/rejected": -6.043740272521973, + "sft_loss": 4.487223148345947, + "step": 360 + }, + { + "epoch": 0.19535039304231477, + "grad_norm": 2.5470960757144923, + "learning_rate": 6.506238859180035e-07, + "logits/chosen": 0.1374529004096985, + "logits/rejected": 0.23795314133167267, + "logps/chosen": -5.908981800079346, + "logps/rejected": -5.590696811676025, + "loss": 0.0649, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -5.908981800079346, + "rewards/margins": -0.3182848393917084, + "rewards/rejected": -5.590696811676025, + "sft_loss": 4.756215572357178, + "step": 365 + }, + { + "epoch": 0.19802642582371635, + "grad_norm": 1.213454043776727, + "learning_rate": 6.59536541889483e-07, + "logits/chosen": 0.21467037498950958, + "logits/rejected": 0.3665415644645691, + "logps/chosen": -5.693985462188721, + "logps/rejected": -5.656804084777832, + "loss": 0.07, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -5.693985462188721, + "rewards/margins": -0.037180982530117035, + "rewards/rejected": -5.656804084777832, + "sft_loss": 4.857564926147461, + "step": 370 + }, + { + "epoch": 0.2007024586051179, + "grad_norm": 1.4920484603725634, + "learning_rate": 6.684491978609626e-07, + "logits/chosen": 0.01740451343357563, + "logits/rejected": 0.19893547892570496, + "logps/chosen": -5.846138954162598, + "logps/rejected": -6.645530700683594, + "loss": 0.0605, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -5.846138954162598, + "rewards/margins": 0.7993919253349304, + "rewards/rejected": -6.645530700683594, + "sft_loss": 4.869112968444824, + "step": 375 + }, + { + "epoch": 0.20337849138651948, + "grad_norm": 5.257896059031984, + "learning_rate": 6.77361853832442e-07, + "logits/chosen": -0.005385799799114466, + "logits/rejected": 0.09482140839099884, + "logps/chosen": -5.7430901527404785, + "logps/rejected": -6.084932804107666, + "loss": 0.0593, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -5.7430901527404785, + "rewards/margins": 0.3418427109718323, + "rewards/rejected": -6.084932804107666, + "sft_loss": 4.753641128540039, + "step": 380 + }, + { + "epoch": 0.20605452416792105, + "grad_norm": 1.2399413549639557, + "learning_rate": 6.862745098039216e-07, + "logits/chosen": -0.023536410182714462, + "logits/rejected": 0.0942329689860344, + "logps/chosen": -4.997963905334473, + "logps/rejected": -5.332228183746338, + "loss": 0.0568, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -4.997963905334473, + "rewards/margins": 0.33426347374916077, + "rewards/rejected": -5.332228183746338, + "sft_loss": 4.493335723876953, + "step": 385 + }, + { + "epoch": 0.20873055694932263, + "grad_norm": 1.1537665725561919, + "learning_rate": 6.95187165775401e-07, + "logits/chosen": 0.06892560422420502, + "logits/rejected": 0.3140993118286133, + "logps/chosen": -5.147521018981934, + "logps/rejected": -5.322199821472168, + "loss": 0.061, + "rewards/accuracies": 0.4937500059604645, + "rewards/chosen": -5.147521018981934, + "rewards/margins": 0.17467857897281647, + "rewards/rejected": -5.322199821472168, + "sft_loss": 4.530567646026611, + "step": 390 + }, + { + "epoch": 0.2114065897307242, + "grad_norm": 1.9743142113650651, + "learning_rate": 7.040998217468806e-07, + "logits/chosen": -0.0006369724869728088, + "logits/rejected": 0.22242622077465057, + "logps/chosen": -5.135289192199707, + "logps/rejected": -5.416306972503662, + "loss": 0.061, + "rewards/accuracies": 0.59375, + "rewards/chosen": -5.135289192199707, + "rewards/margins": 0.2810174524784088, + "rewards/rejected": -5.416306972503662, + "sft_loss": 4.482254981994629, + "step": 395 + }, + { + "epoch": 0.2140826225121258, + "grad_norm": 12.527675546846213, + "learning_rate": 7.1301247771836e-07, + "logits/chosen": 0.0736534595489502, + "logits/rejected": 0.21511869132518768, + "logps/chosen": -5.2505717277526855, + "logps/rejected": -5.617761135101318, + "loss": 0.0563, + "rewards/accuracies": 0.5625, + "rewards/chosen": -5.2505717277526855, + "rewards/margins": 0.3671889901161194, + "rewards/rejected": -5.617761135101318, + "sft_loss": 4.406624794006348, + "step": 400 + }, + { + "epoch": 0.2140826225121258, + "eval_logits/chosen": 0.5372059345245361, + "eval_logits/rejected": 0.6512095928192139, + "eval_logps/chosen": -5.745402812957764, + "eval_logps/rejected": -6.024581432342529, + "eval_loss": 0.05729706957936287, + "eval_rewards/accuracies": 0.5445103645324707, + "eval_rewards/chosen": -5.745402812957764, + "eval_rewards/margins": 0.27917909622192383, + "eval_rewards/rejected": -6.024581432342529, + "eval_runtime": 44.2063, + "eval_samples_per_second": 30.426, + "eval_sft_loss": 4.8351593017578125, + "eval_steps_per_second": 7.623, + "step": 400 + }, + { + "epoch": 0.21675865529352734, + "grad_norm": 3.482450373185867, + "learning_rate": 7.219251336898395e-07, + "logits/chosen": 0.12481657415628433, + "logits/rejected": 0.24873094260692596, + "logps/chosen": -5.8155927658081055, + "logps/rejected": -6.231393814086914, + "loss": 0.0616, + "rewards/accuracies": 0.4937500059604645, + "rewards/chosen": -5.8155927658081055, + "rewards/margins": 0.41580113768577576, + "rewards/rejected": -6.231393814086914, + "sft_loss": 5.256585121154785, + "step": 405 + }, + { + "epoch": 0.2194346880749289, + "grad_norm": 1.7016472746253801, + "learning_rate": 7.30837789661319e-07, + "logits/chosen": 0.06780445575714111, + "logits/rejected": 0.23064598441123962, + "logps/chosen": -5.035314559936523, + "logps/rejected": -5.458802700042725, + "loss": 0.0577, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -5.035314559936523, + "rewards/margins": 0.4234878420829773, + "rewards/rejected": -5.458802700042725, + "sft_loss": 4.297842502593994, + "step": 410 + }, + { + "epoch": 0.2221107208563305, + "grad_norm": 2.6585148249522117, + "learning_rate": 7.397504456327985e-07, + "logits/chosen": -0.00957435928285122, + "logits/rejected": 0.0749988928437233, + "logps/chosen": -5.311757564544678, + "logps/rejected": -5.30417537689209, + "loss": 0.0591, + "rewards/accuracies": 0.4937500059604645, + "rewards/chosen": -5.311757564544678, + "rewards/margins": -0.007582643534988165, + "rewards/rejected": -5.30417537689209, + "sft_loss": 4.704192161560059, + "step": 415 + }, + { + "epoch": 0.22478675363773207, + "grad_norm": 0.9301731458001584, + "learning_rate": 7.486631016042781e-07, + "logits/chosen": -0.13617399334907532, + "logits/rejected": 0.18578466773033142, + "logps/chosen": -4.937038898468018, + "logps/rejected": -5.141440391540527, + "loss": 0.0566, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -4.937038898468018, + "rewards/margins": 0.2044021338224411, + "rewards/rejected": -5.141440391540527, + "sft_loss": 4.429440021514893, + "step": 420 + }, + { + "epoch": 0.22746278641913364, + "grad_norm": 3.770695836704577, + "learning_rate": 7.575757575757575e-07, + "logits/chosen": -0.023957863450050354, + "logits/rejected": 0.19839075207710266, + "logps/chosen": -5.234984397888184, + "logps/rejected": -5.377315521240234, + "loss": 0.056, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -5.234984397888184, + "rewards/margins": 0.14233139157295227, + "rewards/rejected": -5.377315521240234, + "sft_loss": 4.4770612716674805, + "step": 425 + }, + { + "epoch": 0.2301388192005352, + "grad_norm": 3.156316119175797, + "learning_rate": 7.664884135472371e-07, + "logits/chosen": -0.19132229685783386, + "logits/rejected": 0.07280747592449188, + "logps/chosen": -5.19472599029541, + "logps/rejected": -5.726254940032959, + "loss": 0.0559, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -5.19472599029541, + "rewards/margins": 0.5315293073654175, + "rewards/rejected": -5.726254940032959, + "sft_loss": 4.581809043884277, + "step": 430 + }, + { + "epoch": 0.23281485198193677, + "grad_norm": 2.4555124360440823, + "learning_rate": 7.754010695187165e-07, + "logits/chosen": -0.2125493586063385, + "logits/rejected": -0.1108977347612381, + "logps/chosen": -5.088704586029053, + "logps/rejected": -5.045218467712402, + "loss": 0.056, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -5.088704586029053, + "rewards/margins": -0.043486569076776505, + "rewards/rejected": -5.045218467712402, + "sft_loss": 4.416959285736084, + "step": 435 + }, + { + "epoch": 0.23549088476333835, + "grad_norm": 2.1633149663163147, + "learning_rate": 7.84313725490196e-07, + "logits/chosen": -0.12068028748035431, + "logits/rejected": 0.02915067970752716, + "logps/chosen": -5.012939929962158, + "logps/rejected": -5.2914557456970215, + "loss": 0.0575, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -5.012939929962158, + "rewards/margins": 0.2785159945487976, + "rewards/rejected": -5.2914557456970215, + "sft_loss": 4.793717384338379, + "step": 440 + }, + { + "epoch": 0.23816691754473993, + "grad_norm": 2.234715768390998, + "learning_rate": 7.932263814616755e-07, + "logits/chosen": -0.013025635853409767, + "logits/rejected": 0.12180577218532562, + "logps/chosen": -4.833644866943359, + "logps/rejected": -5.177438259124756, + "loss": 0.0553, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -4.833644866943359, + "rewards/margins": 0.34379321336746216, + "rewards/rejected": -5.177438259124756, + "sft_loss": 4.411333084106445, + "step": 445 + }, + { + "epoch": 0.2408429503261415, + "grad_norm": 1.531338508012952, + "learning_rate": 8.02139037433155e-07, + "logits/chosen": 0.06976038217544556, + "logits/rejected": 0.22492310404777527, + "logps/chosen": -4.981315612792969, + "logps/rejected": -5.218815326690674, + "loss": 0.0552, + "rewards/accuracies": 0.5625, + "rewards/chosen": -4.981315612792969, + "rewards/margins": 0.2375001460313797, + "rewards/rejected": -5.218815326690674, + "sft_loss": 4.548120021820068, + "step": 450 + }, + { + "epoch": 0.24351898310754308, + "grad_norm": 1.4188376332791823, + "learning_rate": 8.110516934046346e-07, + "logits/chosen": 0.060088079422712326, + "logits/rejected": 0.1696506291627884, + "logps/chosen": -5.040231704711914, + "logps/rejected": -5.448383331298828, + "loss": 0.0552, + "rewards/accuracies": 0.59375, + "rewards/chosen": -5.040231704711914, + "rewards/margins": 0.4081522524356842, + "rewards/rejected": -5.448383331298828, + "sft_loss": 4.615390300750732, + "step": 455 + }, + { + "epoch": 0.24619501588894463, + "grad_norm": 0.8274857650787919, + "learning_rate": 8.19964349376114e-07, + "logits/chosen": -0.16978515684604645, + "logits/rejected": 0.03128058463335037, + "logps/chosen": -4.783569812774658, + "logps/rejected": -4.997244358062744, + "loss": 0.0578, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -4.783569812774658, + "rewards/margins": 0.2136746346950531, + "rewards/rejected": -4.997244358062744, + "sft_loss": 4.518199443817139, + "step": 460 + }, + { + "epoch": 0.2488710486703462, + "grad_norm": 1.1140822228366611, + "learning_rate": 8.288770053475936e-07, + "logits/chosen": -0.03868691250681877, + "logits/rejected": 0.047852348536252975, + "logps/chosen": -4.818562030792236, + "logps/rejected": -5.142088413238525, + "loss": 0.0569, + "rewards/accuracies": 0.5625, + "rewards/chosen": -4.818562030792236, + "rewards/margins": 0.3235262334346771, + "rewards/rejected": -5.142088413238525, + "sft_loss": 4.523900508880615, + "step": 465 + }, + { + "epoch": 0.2515470814517478, + "grad_norm": 1.0831999599710989, + "learning_rate": 8.37789661319073e-07, + "logits/chosen": -0.041154105216264725, + "logits/rejected": -0.16716539859771729, + "logps/chosen": -5.088252067565918, + "logps/rejected": -5.1696577072143555, + "loss": 0.0583, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -5.088252067565918, + "rewards/margins": 0.08140526711940765, + "rewards/rejected": -5.1696577072143555, + "sft_loss": 4.869572639465332, + "step": 470 + }, + { + "epoch": 0.25422311423314936, + "grad_norm": 1.0609505577965346, + "learning_rate": 8.467023172905525e-07, + "logits/chosen": -0.41082948446273804, + "logits/rejected": -0.15013936161994934, + "logps/chosen": -4.697094917297363, + "logps/rejected": -5.290956974029541, + "loss": 0.0541, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -4.697094917297363, + "rewards/margins": 0.5938615202903748, + "rewards/rejected": -5.290956974029541, + "sft_loss": 4.516491889953613, + "step": 475 + }, + { + "epoch": 0.2568991470145509, + "grad_norm": 1.0628021961229914, + "learning_rate": 8.55614973262032e-07, + "logits/chosen": -0.310746967792511, + "logits/rejected": -0.06656353920698166, + "logps/chosen": -4.562114715576172, + "logps/rejected": -4.903138160705566, + "loss": 0.0551, + "rewards/accuracies": 0.625, + "rewards/chosen": -4.562114715576172, + "rewards/margins": 0.3410232663154602, + "rewards/rejected": -4.903138160705566, + "sft_loss": 4.2797112464904785, + "step": 480 + }, + { + "epoch": 0.2595751797959525, + "grad_norm": 1.6375184623561623, + "learning_rate": 8.645276292335115e-07, + "logits/chosen": -0.07968901097774506, + "logits/rejected": 0.021457534283399582, + "logps/chosen": -4.943936824798584, + "logps/rejected": -5.136715888977051, + "loss": 0.0556, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -4.943936824798584, + "rewards/margins": 0.192779541015625, + "rewards/rejected": -5.136715888977051, + "sft_loss": 4.519631385803223, + "step": 485 + }, + { + "epoch": 0.26225121257735406, + "grad_norm": 2.9319822414945302, + "learning_rate": 8.734402852049911e-07, + "logits/chosen": 0.015240554697811604, + "logits/rejected": 0.0742521733045578, + "logps/chosen": -4.907310485839844, + "logps/rejected": -5.221567153930664, + "loss": 0.0555, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -4.907310485839844, + "rewards/margins": 0.31425637006759644, + "rewards/rejected": -5.221567153930664, + "sft_loss": 4.523907661437988, + "step": 490 + }, + { + "epoch": 0.26492724535875567, + "grad_norm": 1.0031833756740196, + "learning_rate": 8.823529411764705e-07, + "logits/chosen": -0.14841726422309875, + "logits/rejected": -0.13759180903434753, + "logps/chosen": -4.806763648986816, + "logps/rejected": -4.975509166717529, + "loss": 0.057, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -4.806763648986816, + "rewards/margins": 0.1687450259923935, + "rewards/rejected": -4.975509166717529, + "sft_loss": 4.490440845489502, + "step": 495 + }, + { + "epoch": 0.2676032781401572, + "grad_norm": 0.7041566026153324, + "learning_rate": 8.912655971479501e-07, + "logits/chosen": -0.28130513429641724, + "logits/rejected": -0.15879981219768524, + "logps/chosen": -4.831118583679199, + "logps/rejected": -5.1757378578186035, + "loss": 0.0545, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -4.831118583679199, + "rewards/margins": 0.34461960196495056, + "rewards/rejected": -5.1757378578186035, + "sft_loss": 4.525076389312744, + "step": 500 + }, + { + "epoch": 0.27027931092155877, + "grad_norm": 0.6823847318552707, + "learning_rate": 9.001782531194295e-07, + "logits/chosen": -0.36276552081108093, + "logits/rejected": -0.19994986057281494, + "logps/chosen": -4.660523891448975, + "logps/rejected": -4.733887195587158, + "loss": 0.0567, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -4.660523891448975, + "rewards/margins": 0.07336314022541046, + "rewards/rejected": -4.733887195587158, + "sft_loss": 4.3608832359313965, + "step": 505 + }, + { + "epoch": 0.2729553437029604, + "grad_norm": 1.1486157742033887, + "learning_rate": 9.09090909090909e-07, + "logits/chosen": -0.1318567544221878, + "logits/rejected": -0.0561264343559742, + "logps/chosen": -4.911768913269043, + "logps/rejected": -5.159637451171875, + "loss": 0.0556, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -4.911768913269043, + "rewards/margins": 0.24786880612373352, + "rewards/rejected": -5.159637451171875, + "sft_loss": 4.5656256675720215, + "step": 510 + }, + { + "epoch": 0.2756313764843619, + "grad_norm": 1.0872297295675921, + "learning_rate": 9.180035650623885e-07, + "logits/chosen": -0.1332496702671051, + "logits/rejected": 0.011695345863699913, + "logps/chosen": -4.879103183746338, + "logps/rejected": -5.090020179748535, + "loss": 0.0553, + "rewards/accuracies": 0.59375, + "rewards/chosen": -4.879103183746338, + "rewards/margins": 0.21091759204864502, + "rewards/rejected": -5.090020179748535, + "sft_loss": 4.485103607177734, + "step": 515 + }, + { + "epoch": 0.27830740926576353, + "grad_norm": 0.5462026811962317, + "learning_rate": 9.26916221033868e-07, + "logits/chosen": -0.168059304356575, + "logits/rejected": 0.027810195460915565, + "logps/chosen": -4.999017238616943, + "logps/rejected": -5.254492282867432, + "loss": 0.056, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -4.999017238616943, + "rewards/margins": 0.255475252866745, + "rewards/rejected": -5.254492282867432, + "sft_loss": 4.662232398986816, + "step": 520 + }, + { + "epoch": 0.2809834420471651, + "grad_norm": 0.8810552591658657, + "learning_rate": 9.358288770053476e-07, + "logits/chosen": -0.09420142322778702, + "logits/rejected": 0.04737439751625061, + "logps/chosen": -4.771590709686279, + "logps/rejected": -5.034438133239746, + "loss": 0.0554, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -4.771590709686279, + "rewards/margins": 0.26284775137901306, + "rewards/rejected": -5.034438133239746, + "sft_loss": 4.294004917144775, + "step": 525 + }, + { + "epoch": 0.2836594748285666, + "grad_norm": 0.6477536515618159, + "learning_rate": 9.44741532976827e-07, + "logits/chosen": -0.17829468846321106, + "logits/rejected": -0.11911626160144806, + "logps/chosen": -4.77829122543335, + "logps/rejected": -5.030237197875977, + "loss": 0.0557, + "rewards/accuracies": 0.59375, + "rewards/chosen": -4.77829122543335, + "rewards/margins": 0.251945436000824, + "rewards/rejected": -5.030237197875977, + "sft_loss": 4.557742118835449, + "step": 530 + }, + { + "epoch": 0.28633550760996823, + "grad_norm": 0.7898205513417741, + "learning_rate": 9.536541889483066e-07, + "logits/chosen": -0.3551715910434723, + "logits/rejected": 0.03357185795903206, + "logps/chosen": -4.911800384521484, + "logps/rejected": -5.16738224029541, + "loss": 0.0551, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -4.911800384521484, + "rewards/margins": 0.25558188557624817, + "rewards/rejected": -5.16738224029541, + "sft_loss": 4.680028438568115, + "step": 535 + }, + { + "epoch": 0.2890115403913698, + "grad_norm": 0.670936398200854, + "learning_rate": 9.62566844919786e-07, + "logits/chosen": -0.2937626242637634, + "logits/rejected": -0.15557271242141724, + "logps/chosen": -4.6293230056762695, + "logps/rejected": -4.820884704589844, + "loss": 0.057, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -4.6293230056762695, + "rewards/margins": 0.1915610134601593, + "rewards/rejected": -4.820884704589844, + "sft_loss": 4.376486778259277, + "step": 540 + }, + { + "epoch": 0.2916875731727714, + "grad_norm": 0.6738876951312873, + "learning_rate": 9.714795008912655e-07, + "logits/chosen": -0.3255738317966461, + "logits/rejected": -0.04851512238383293, + "logps/chosen": -5.104310035705566, + "logps/rejected": -5.27868127822876, + "loss": 0.055, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -5.104310035705566, + "rewards/margins": 0.1743713766336441, + "rewards/rejected": -5.27868127822876, + "sft_loss": 4.744593620300293, + "step": 545 + }, + { + "epoch": 0.29436360595417294, + "grad_norm": 2.1219096999806935, + "learning_rate": 9.80392156862745e-07, + "logits/chosen": -0.24361948668956757, + "logits/rejected": -0.17571952939033508, + "logps/chosen": -4.638392448425293, + "logps/rejected": -4.891458988189697, + "loss": 0.0551, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -4.638392448425293, + "rewards/margins": 0.25306588411331177, + "rewards/rejected": -4.891458988189697, + "sft_loss": 4.369989395141602, + "step": 550 + }, + { + "epoch": 0.2970396387355745, + "grad_norm": 0.8293066005958115, + "learning_rate": 9.893048128342244e-07, + "logits/chosen": -0.2626475989818573, + "logits/rejected": -0.07505345344543457, + "logps/chosen": -4.890946865081787, + "logps/rejected": -5.142005443572998, + "loss": 0.0558, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -4.890946865081787, + "rewards/margins": 0.251058965921402, + "rewards/rejected": -5.142005443572998, + "sft_loss": 4.533967971801758, + "step": 555 + }, + { + "epoch": 0.2997156715169761, + "grad_norm": 0.6161840677717771, + "learning_rate": 9.98217468805704e-07, + "logits/chosen": -0.17927469313144684, + "logits/rejected": -0.12405016273260117, + "logps/chosen": -4.814286231994629, + "logps/rejected": -5.052570343017578, + "loss": 0.0553, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -4.814286231994629, + "rewards/margins": 0.2382839024066925, + "rewards/rejected": -5.052570343017578, + "sft_loss": 4.54550313949585, + "step": 560 + }, + { + "epoch": 0.30239170429837764, + "grad_norm": 1.1352561464004647, + "learning_rate": 9.999984476788462e-07, + "logits/chosen": -0.26553431153297424, + "logits/rejected": -0.14662417769432068, + "logps/chosen": -4.51534366607666, + "logps/rejected": -4.847154140472412, + "loss": 0.0553, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -4.51534366607666, + "rewards/margins": 0.3318101763725281, + "rewards/rejected": -4.847154140472412, + "sft_loss": 4.323355197906494, + "step": 565 + }, + { + "epoch": 0.30506773707977924, + "grad_norm": 0.8650373367494478, + "learning_rate": 9.999921413906797e-07, + "logits/chosen": -0.2807005047798157, + "logits/rejected": 0.030847817659378052, + "logps/chosen": -4.749510765075684, + "logps/rejected": -5.021355628967285, + "loss": 0.0544, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -4.749510765075684, + "rewards/margins": 0.2718445062637329, + "rewards/rejected": -5.021355628967285, + "sft_loss": 4.40215539932251, + "step": 570 + }, + { + "epoch": 0.3077437698611808, + "grad_norm": 0.825097287674202, + "learning_rate": 9.999809841765644e-07, + "logits/chosen": -0.2592643201351166, + "logits/rejected": -0.25759127736091614, + "logps/chosen": -4.920784950256348, + "logps/rejected": -5.223763465881348, + "loss": 0.0561, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -4.920784950256348, + "rewards/margins": 0.302978515625, + "rewards/rejected": -5.223763465881348, + "sft_loss": 4.647104263305664, + "step": 575 + }, + { + "epoch": 0.3104198026425824, + "grad_norm": 0.6235577350217273, + "learning_rate": 9.999649761447477e-07, + "logits/chosen": -0.3262614607810974, + "logits/rejected": -0.0489344522356987, + "logps/chosen": -4.673770904541016, + "logps/rejected": -5.098756790161133, + "loss": 0.0551, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -4.673770904541016, + "rewards/margins": 0.42498579621315, + "rewards/rejected": -5.098756790161133, + "sft_loss": 4.476217746734619, + "step": 580 + }, + { + "epoch": 0.31309583542398395, + "grad_norm": 0.740239920335634, + "learning_rate": 9.999441174505398e-07, + "logits/chosen": -0.3703567683696747, + "logits/rejected": -0.2420303076505661, + "logps/chosen": -4.8338303565979, + "logps/rejected": -5.058407783508301, + "loss": 0.0545, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -4.8338303565979, + "rewards/margins": 0.22457735240459442, + "rewards/rejected": -5.058407783508301, + "sft_loss": 4.397377014160156, + "step": 585 + }, + { + "epoch": 0.3157718682053855, + "grad_norm": 0.7683188347707338, + "learning_rate": 9.999184082963116e-07, + "logits/chosen": -0.3597511649131775, + "logits/rejected": -0.2020716369152069, + "logps/chosen": -4.76621675491333, + "logps/rejected": -4.9511847496032715, + "loss": 0.0545, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -4.76621675491333, + "rewards/margins": 0.1849685162305832, + "rewards/rejected": -4.9511847496032715, + "sft_loss": 4.394301891326904, + "step": 590 + }, + { + "epoch": 0.3184479009867871, + "grad_norm": 0.6797907024001023, + "learning_rate": 9.998878489314937e-07, + "logits/chosen": -0.24955956637859344, + "logits/rejected": -0.025243768468499184, + "logps/chosen": -4.7654948234558105, + "logps/rejected": -5.149549961090088, + "loss": 0.055, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -4.7654948234558105, + "rewards/margins": 0.384054571390152, + "rewards/rejected": -5.149549961090088, + "sft_loss": 4.515573024749756, + "step": 595 + }, + { + "epoch": 0.32112393376818865, + "grad_norm": 1.6465081989407226, + "learning_rate": 9.99852439652573e-07, + "logits/chosen": -0.2952226996421814, + "logits/rejected": -0.09677503257989883, + "logps/chosen": -4.662662506103516, + "logps/rejected": -4.912552833557129, + "loss": 0.0548, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -4.662662506103516, + "rewards/margins": 0.2498904913663864, + "rewards/rejected": -4.912552833557129, + "sft_loss": 4.407889366149902, + "step": 600 + }, + { + "epoch": 0.32379996654959026, + "grad_norm": 1.2335372988895321, + "learning_rate": 9.998121808030904e-07, + "logits/chosen": -0.2944491505622864, + "logits/rejected": -0.18662339448928833, + "logps/chosen": -4.888004302978516, + "logps/rejected": -5.157434940338135, + "loss": 0.0557, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -4.888004302978516, + "rewards/margins": 0.2694306969642639, + "rewards/rejected": -5.157434940338135, + "sft_loss": 4.625906467437744, + "step": 605 + }, + { + "epoch": 0.3264759993309918, + "grad_norm": 1.3244626800898112, + "learning_rate": 9.997670727736379e-07, + "logits/chosen": -0.22974035143852234, + "logits/rejected": 0.04643644392490387, + "logps/chosen": -4.5863938331604, + "logps/rejected": -4.8641676902771, + "loss": 0.0552, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -4.5863938331604, + "rewards/margins": 0.27777382731437683, + "rewards/rejected": -4.8641676902771, + "sft_loss": 4.253309726715088, + "step": 610 + }, + { + "epoch": 0.32915203211239336, + "grad_norm": 0.5535449490055506, + "learning_rate": 9.99717116001853e-07, + "logits/chosen": -0.1554524004459381, + "logits/rejected": -0.05251539871096611, + "logps/chosen": -4.844357490539551, + "logps/rejected": -5.277754783630371, + "loss": 0.0542, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -4.844357490539551, + "rewards/margins": 0.4333969056606293, + "rewards/rejected": -5.277754783630371, + "sft_loss": 4.583644866943359, + "step": 615 + }, + { + "epoch": 0.33182806489379496, + "grad_norm": 1.856104039503372, + "learning_rate": 9.996623109724173e-07, + "logits/chosen": -0.08841142803430557, + "logits/rejected": 0.03570820018649101, + "logps/chosen": -4.618399620056152, + "logps/rejected": -4.836398601531982, + "loss": 0.0545, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -4.618399620056152, + "rewards/margins": 0.21799834072589874, + "rewards/rejected": -4.836398601531982, + "sft_loss": 4.201367378234863, + "step": 620 + }, + { + "epoch": 0.3345040976751965, + "grad_norm": 1.072425302622145, + "learning_rate": 9.996026582170488e-07, + "logits/chosen": -0.10054608434438705, + "logits/rejected": 0.11284986883401871, + "logps/chosen": -4.950314521789551, + "logps/rejected": -5.514838218688965, + "loss": 0.0538, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -4.950314521789551, + "rewards/margins": 0.5645238161087036, + "rewards/rejected": -5.514838218688965, + "sft_loss": 4.627020835876465, + "step": 625 + }, + { + "epoch": 0.3371801304565981, + "grad_norm": 0.5453150161564525, + "learning_rate": 9.995381583144996e-07, + "logits/chosen": -0.2218894064426422, + "logits/rejected": -0.04821163788437843, + "logps/chosen": -4.688741207122803, + "logps/rejected": -5.064606189727783, + "loss": 0.054, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -4.688741207122803, + "rewards/margins": 0.3758644163608551, + "rewards/rejected": -5.064606189727783, + "sft_loss": 4.479228973388672, + "step": 630 + }, + { + "epoch": 0.33985616323799966, + "grad_norm": 1.591931090937354, + "learning_rate": 9.994688118905471e-07, + "logits/chosen": -0.3113294839859009, + "logits/rejected": 0.02164360322058201, + "logps/chosen": -4.350485801696777, + "logps/rejected": -4.76249361038208, + "loss": 0.0541, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -4.350485801696777, + "rewards/margins": 0.41200733184814453, + "rewards/rejected": -4.76249361038208, + "sft_loss": 4.126279354095459, + "step": 635 + }, + { + "epoch": 0.3425321960194012, + "grad_norm": 0.7619272038790944, + "learning_rate": 9.993946196179912e-07, + "logits/chosen": -0.26695743203163147, + "logits/rejected": 0.017921606078743935, + "logps/chosen": -4.865670204162598, + "logps/rejected": -5.221670627593994, + "loss": 0.0548, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -4.865670204162598, + "rewards/margins": 0.3560001254081726, + "rewards/rejected": -5.221670627593994, + "sft_loss": 4.485020637512207, + "step": 640 + }, + { + "epoch": 0.3452082288008028, + "grad_norm": 1.1133191585439717, + "learning_rate": 9.993155822166455e-07, + "logits/chosen": -0.23877230286598206, + "logits/rejected": -0.1729026436805725, + "logps/chosen": -4.773536682128906, + "logps/rejected": -5.113900661468506, + "loss": 0.0545, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -4.773536682128906, + "rewards/margins": 0.3403640389442444, + "rewards/rejected": -5.113900661468506, + "sft_loss": 4.5196027755737305, + "step": 645 + }, + { + "epoch": 0.34788426158220437, + "grad_norm": 0.4838798037479677, + "learning_rate": 9.992317004533313e-07, + "logits/chosen": -0.180585116147995, + "logits/rejected": -0.07650501281023026, + "logps/chosen": -4.630809783935547, + "logps/rejected": -5.039240837097168, + "loss": 0.0547, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -4.630809783935547, + "rewards/margins": 0.4084309935569763, + "rewards/rejected": -5.039240837097168, + "sft_loss": 4.399133682250977, + "step": 650 + }, + { + "epoch": 0.350560294363606, + "grad_norm": 0.7799457617182248, + "learning_rate": 9.991429751418696e-07, + "logits/chosen": -0.17215296626091003, + "logits/rejected": -0.14876854419708252, + "logps/chosen": -4.6841607093811035, + "logps/rejected": -5.022492408752441, + "loss": 0.055, + "rewards/accuracies": 0.59375, + "rewards/chosen": -4.6841607093811035, + "rewards/margins": 0.33833178877830505, + "rewards/rejected": -5.022492408752441, + "sft_loss": 4.436097621917725, + "step": 655 + }, + { + "epoch": 0.3532363271450075, + "grad_norm": 0.7840566134412431, + "learning_rate": 9.99049407143074e-07, + "logits/chosen": -0.31777292490005493, + "logits/rejected": -0.12179327011108398, + "logps/chosen": -4.860751628875732, + "logps/rejected": -4.970133304595947, + "loss": 0.0563, + "rewards/accuracies": 0.59375, + "rewards/chosen": -4.860751628875732, + "rewards/margins": 0.10938136279582977, + "rewards/rejected": -4.970133304595947, + "sft_loss": 4.530073642730713, + "step": 660 + }, + { + "epoch": 0.35591235992640907, + "grad_norm": 0.9380159419430601, + "learning_rate": 9.989509973647416e-07, + "logits/chosen": -0.3308666944503784, + "logits/rejected": -0.13465926051139832, + "logps/chosen": -4.901317596435547, + "logps/rejected": -5.171170234680176, + "loss": 0.0547, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -4.901317596435547, + "rewards/margins": 0.2698523998260498, + "rewards/rejected": -5.171170234680176, + "sft_loss": 4.707381725311279, + "step": 665 + }, + { + "epoch": 0.3585883927078107, + "grad_norm": 1.5810380912043656, + "learning_rate": 9.988477467616445e-07, + "logits/chosen": -0.39763838052749634, + "logits/rejected": -0.135453462600708, + "logps/chosen": -4.548798561096191, + "logps/rejected": -4.874077320098877, + "loss": 0.055, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -4.548798561096191, + "rewards/margins": 0.32527926564216614, + "rewards/rejected": -4.874077320098877, + "sft_loss": 4.289718151092529, + "step": 670 + }, + { + "epoch": 0.3612644254892122, + "grad_norm": 0.7655930328404614, + "learning_rate": 9.987396563355205e-07, + "logits/chosen": -0.384846031665802, + "logits/rejected": -0.2601172924041748, + "logps/chosen": -4.761540412902832, + "logps/rejected": -5.05265998840332, + "loss": 0.0553, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -4.761540412902832, + "rewards/margins": 0.29111921787261963, + "rewards/rejected": -5.05265998840332, + "sft_loss": 4.518373012542725, + "step": 675 + }, + { + "epoch": 0.36394045827061383, + "grad_norm": 0.935266037635124, + "learning_rate": 9.986267271350631e-07, + "logits/chosen": -0.34337860345840454, + "logits/rejected": -0.14144375920295715, + "logps/chosen": -4.823574542999268, + "logps/rejected": -4.984274864196777, + "loss": 0.0562, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -4.823574542999268, + "rewards/margins": 0.16069956123828888, + "rewards/rejected": -4.984274864196777, + "sft_loss": 4.562578201293945, + "step": 680 + }, + { + "epoch": 0.3666164910520154, + "grad_norm": 0.5615696193518576, + "learning_rate": 9.985089602559123e-07, + "logits/chosen": -0.37192243337631226, + "logits/rejected": -0.1106470599770546, + "logps/chosen": -4.682257652282715, + "logps/rejected": -5.030256271362305, + "loss": 0.0535, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -4.682257652282715, + "rewards/margins": 0.347998708486557, + "rewards/rejected": -5.030256271362305, + "sft_loss": 4.349669933319092, + "step": 685 + }, + { + "epoch": 0.369292523833417, + "grad_norm": 1.4194745294621536, + "learning_rate": 9.983863568406428e-07, + "logits/chosen": -0.2427678108215332, + "logits/rejected": -0.1963764727115631, + "logps/chosen": -4.648349761962891, + "logps/rejected": -4.934103488922119, + "loss": 0.0552, + "rewards/accuracies": 0.5625, + "rewards/chosen": -4.648349761962891, + "rewards/margins": 0.2857532203197479, + "rewards/rejected": -4.934103488922119, + "sft_loss": 4.321009635925293, + "step": 690 + }, + { + "epoch": 0.37196855661481854, + "grad_norm": 0.8866737162217125, + "learning_rate": 9.982589180787532e-07, + "logits/chosen": -0.24945712089538574, + "logits/rejected": -0.12678943574428558, + "logps/chosen": -4.920681476593018, + "logps/rejected": -5.2684712409973145, + "loss": 0.0548, + "rewards/accuracies": 0.59375, + "rewards/chosen": -4.920681476593018, + "rewards/margins": 0.3477899730205536, + "rewards/rejected": -5.2684712409973145, + "sft_loss": 4.634685516357422, + "step": 695 + }, + { + "epoch": 0.3746445893962201, + "grad_norm": 0.8355538491486008, + "learning_rate": 9.981266452066553e-07, + "logits/chosen": -0.42114168405532837, + "logits/rejected": -0.21116065979003906, + "logps/chosen": -4.6978044509887695, + "logps/rejected": -5.096659183502197, + "loss": 0.0538, + "rewards/accuracies": 0.625, + "rewards/chosen": -4.6978044509887695, + "rewards/margins": 0.39885538816452026, + "rewards/rejected": -5.096659183502197, + "sft_loss": 4.44577169418335, + "step": 700 + }, + { + "epoch": 0.3773206221776217, + "grad_norm": 1.2264999824699427, + "learning_rate": 9.979895395076608e-07, + "logits/chosen": -0.30539703369140625, + "logits/rejected": -0.014449876733124256, + "logps/chosen": -4.504247188568115, + "logps/rejected": -4.868443489074707, + "loss": 0.0542, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -4.504247188568115, + "rewards/margins": 0.3641965389251709, + "rewards/rejected": -4.868443489074707, + "sft_loss": 4.2012739181518555, + "step": 705 + }, + { + "epoch": 0.37999665495902324, + "grad_norm": 0.8131909730707197, + "learning_rate": 9.9784760231197e-07, + "logits/chosen": -0.19957628846168518, + "logits/rejected": -0.03655420243740082, + "logps/chosen": -4.724609375, + "logps/rejected": -5.090517997741699, + "loss": 0.0546, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -4.724609375, + "rewards/margins": 0.36590883135795593, + "rewards/rejected": -5.090517997741699, + "sft_loss": 4.364399433135986, + "step": 710 + }, + { + "epoch": 0.38267268774042484, + "grad_norm": 0.4792104571469811, + "learning_rate": 9.97700834996658e-07, + "logits/chosen": -0.282558798789978, + "logits/rejected": -0.0352824404835701, + "logps/chosen": -4.992825508117676, + "logps/rejected": -5.230543613433838, + "loss": 0.0549, + "rewards/accuracies": 0.5625, + "rewards/chosen": -4.992825508117676, + "rewards/margins": 0.23771806061267853, + "rewards/rejected": -5.230543613433838, + "sft_loss": 4.564342021942139, + "step": 715 + }, + { + "epoch": 0.3853487205218264, + "grad_norm": 0.8052355596151808, + "learning_rate": 9.97549238985662e-07, + "logits/chosen": -0.20845279097557068, + "logits/rejected": 0.08761949837207794, + "logps/chosen": -4.708680152893066, + "logps/rejected": -5.044828414916992, + "loss": 0.0537, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -4.708680152893066, + "rewards/margins": 0.3361477851867676, + "rewards/rejected": -5.044828414916992, + "sft_loss": 4.384383678436279, + "step": 720 + }, + { + "epoch": 0.38802475330322794, + "grad_norm": 1.0124337160016392, + "learning_rate": 9.973928157497674e-07, + "logits/chosen": -0.3460480570793152, + "logits/rejected": -0.10860247910022736, + "logps/chosen": -4.608756065368652, + "logps/rejected": -4.939072132110596, + "loss": 0.054, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -4.608756065368652, + "rewards/margins": 0.33031561970710754, + "rewards/rejected": -4.939072132110596, + "sft_loss": 4.346649169921875, + "step": 725 + }, + { + "epoch": 0.39070078608462955, + "grad_norm": 1.285300410149873, + "learning_rate": 9.972315668065927e-07, + "logits/chosen": -0.33846408128738403, + "logits/rejected": -0.14779387414455414, + "logps/chosen": -4.764631748199463, + "logps/rejected": -5.01378059387207, + "loss": 0.054, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -4.764631748199463, + "rewards/margins": 0.24914869666099548, + "rewards/rejected": -5.01378059387207, + "sft_loss": 4.407692909240723, + "step": 730 + }, + { + "epoch": 0.3933768188660311, + "grad_norm": 0.440695821147554, + "learning_rate": 9.97065493720576e-07, + "logits/chosen": -0.3593650460243225, + "logits/rejected": -0.1944659948348999, + "logps/chosen": -4.49599552154541, + "logps/rejected": -4.8249711990356445, + "loss": 0.0543, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -4.49599552154541, + "rewards/margins": 0.3289756178855896, + "rewards/rejected": -4.8249711990356445, + "sft_loss": 4.26392936706543, + "step": 735 + }, + { + "epoch": 0.3960528516474327, + "grad_norm": 1.0055567756075858, + "learning_rate": 9.968945981029594e-07, + "logits/chosen": -0.32162588834762573, + "logits/rejected": -0.07832889258861542, + "logps/chosen": -4.858660697937012, + "logps/rejected": -5.215462684631348, + "loss": 0.0528, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -4.858660697937012, + "rewards/margins": 0.3568021357059479, + "rewards/rejected": -5.215462684631348, + "sft_loss": 4.455192565917969, + "step": 740 + }, + { + "epoch": 0.39872888442883425, + "grad_norm": 1.1135407344664665, + "learning_rate": 9.967188816117726e-07, + "logits/chosen": -0.23036710917949677, + "logits/rejected": -0.05882059410214424, + "logps/chosen": -4.685460567474365, + "logps/rejected": -5.1761980056762695, + "loss": 0.0552, + "rewards/accuracies": 0.625, + "rewards/chosen": -4.685460567474365, + "rewards/margins": 0.4907374382019043, + "rewards/rejected": -5.1761980056762695, + "sft_loss": 4.325514316558838, + "step": 745 + }, + { + "epoch": 0.4014049172102358, + "grad_norm": 0.49216293494483937, + "learning_rate": 9.965383459518179e-07, + "logits/chosen": -0.24461698532104492, + "logits/rejected": 0.05077634006738663, + "logps/chosen": -4.6956305503845215, + "logps/rejected": -5.1042890548706055, + "loss": 0.054, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -4.6956305503845215, + "rewards/margins": 0.40865880250930786, + "rewards/rejected": -5.1042890548706055, + "sft_loss": 4.272480010986328, + "step": 750 + }, + { + "epoch": 0.4040809499916374, + "grad_norm": 0.7538062835686686, + "learning_rate": 9.963529928746533e-07, + "logits/chosen": -0.13682588934898376, + "logits/rejected": 0.09025086462497711, + "logps/chosen": -4.984869003295898, + "logps/rejected": -5.451809406280518, + "loss": 0.0538, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -4.984869003295898, + "rewards/margins": 0.46694087982177734, + "rewards/rejected": -5.451809406280518, + "sft_loss": 4.631129264831543, + "step": 755 + }, + { + "epoch": 0.40675698277303896, + "grad_norm": 0.8804951698404386, + "learning_rate": 9.961628241785746e-07, + "logits/chosen": -0.23067674040794373, + "logits/rejected": -0.09110057353973389, + "logps/chosen": -4.5784502029418945, + "logps/rejected": -4.966479778289795, + "loss": 0.0545, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -4.5784502029418945, + "rewards/margins": 0.3880303204059601, + "rewards/rejected": -4.966479778289795, + "sft_loss": 4.248000144958496, + "step": 760 + }, + { + "epoch": 0.40943301555444056, + "grad_norm": 0.9443455201928435, + "learning_rate": 9.959678417085998e-07, + "logits/chosen": -0.13407650589942932, + "logits/rejected": -0.029496919363737106, + "logps/chosen": -4.822766304016113, + "logps/rejected": -5.181981086730957, + "loss": 0.0547, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -4.822766304016113, + "rewards/margins": 0.35921525955200195, + "rewards/rejected": -5.181981086730957, + "sft_loss": 4.538924217224121, + "step": 765 + }, + { + "epoch": 0.4121090483358421, + "grad_norm": 0.615353021347065, + "learning_rate": 9.957680473564493e-07, + "logits/chosen": -0.10198304802179337, + "logits/rejected": 0.09545397013425827, + "logps/chosen": -4.82249641418457, + "logps/rejected": -5.266402244567871, + "loss": 0.0536, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -4.82249641418457, + "rewards/margins": 0.44390565156936646, + "rewards/rejected": -5.266402244567871, + "sft_loss": 4.410745143890381, + "step": 770 + }, + { + "epoch": 0.41478508111724366, + "grad_norm": 0.5160785518479237, + "learning_rate": 9.95563443060529e-07, + "logits/chosen": -0.21874277293682098, + "logits/rejected": 0.046989113092422485, + "logps/chosen": -4.701724052429199, + "logps/rejected": -5.032949447631836, + "loss": 0.0543, + "rewards/accuracies": 0.59375, + "rewards/chosen": -4.701724052429199, + "rewards/margins": 0.33122485876083374, + "rewards/rejected": -5.032949447631836, + "sft_loss": 4.272125244140625, + "step": 775 + }, + { + "epoch": 0.41746111389864526, + "grad_norm": 0.5850128791360416, + "learning_rate": 9.95354030805911e-07, + "logits/chosen": -0.38717955350875854, + "logits/rejected": -0.12629468739032745, + "logps/chosen": -4.6548614501953125, + "logps/rejected": -4.998035907745361, + "loss": 0.0537, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -4.6548614501953125, + "rewards/margins": 0.34317439794540405, + "rewards/rejected": -4.998035907745361, + "sft_loss": 4.339907646179199, + "step": 780 + }, + { + "epoch": 0.4201371466800468, + "grad_norm": 0.8364339689134905, + "learning_rate": 9.951398126243133e-07, + "logits/chosen": -0.2304932177066803, + "logits/rejected": -0.08832928538322449, + "logps/chosen": -4.8688859939575195, + "logps/rejected": -5.244830131530762, + "loss": 0.054, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -4.8688859939575195, + "rewards/margins": 0.37594443559646606, + "rewards/rejected": -5.244830131530762, + "sft_loss": 4.49772310256958, + "step": 785 + }, + { + "epoch": 0.4228131794614484, + "grad_norm": 0.5878980222924488, + "learning_rate": 9.94920790594082e-07, + "logits/chosen": -0.2880062460899353, + "logits/rejected": -0.13873964548110962, + "logps/chosen": -4.635272026062012, + "logps/rejected": -5.012479782104492, + "loss": 0.0529, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -4.635272026062012, + "rewards/margins": 0.37720683217048645, + "rewards/rejected": -5.012479782104492, + "sft_loss": 4.257678985595703, + "step": 790 + }, + { + "epoch": 0.42548921224284997, + "grad_norm": 0.44959914295393133, + "learning_rate": 9.946969668401696e-07, + "logits/chosen": -0.32320109009742737, + "logits/rejected": -0.042316682636737823, + "logps/chosen": -4.49575138092041, + "logps/rejected": -5.073681354522705, + "loss": 0.0531, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -4.49575138092041, + "rewards/margins": 0.5779297947883606, + "rewards/rejected": -5.073681354522705, + "sft_loss": 4.208576202392578, + "step": 795 + }, + { + "epoch": 0.4281652450242516, + "grad_norm": 0.7144637825313324, + "learning_rate": 9.944683435341155e-07, + "logits/chosen": -0.17591574788093567, + "logits/rejected": -0.07191745191812515, + "logps/chosen": -4.874433517456055, + "logps/rejected": -5.258727073669434, + "loss": 0.0533, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -4.874433517456055, + "rewards/margins": 0.38429397344589233, + "rewards/rejected": -5.258727073669434, + "sft_loss": 4.46193265914917, + "step": 800 + }, + { + "epoch": 0.4281652450242516, + "eval_logits/chosen": 0.164407417178154, + "eval_logits/rejected": 0.2938738465309143, + "eval_logps/chosen": -4.695379734039307, + "eval_logps/rejected": -5.077708721160889, + "eval_loss": 0.052394524216651917, + "eval_rewards/accuracies": 0.6157270073890686, + "eval_rewards/chosen": -4.695379734039307, + "eval_rewards/margins": 0.3823291063308716, + "eval_rewards/rejected": -5.077708721160889, + "eval_runtime": 43.3284, + "eval_samples_per_second": 31.042, + "eval_sft_loss": 4.234010696411133, + "eval_steps_per_second": 7.778, + "step": 800 + }, + { + "epoch": 0.4308412778056531, + "grad_norm": 0.5270049225438456, + "learning_rate": 9.942349228940236e-07, + "logits/chosen": -0.3387565016746521, + "logits/rejected": -0.06523511558771133, + "logps/chosen": -4.50264835357666, + "logps/rejected": -5.163118362426758, + "loss": 0.0521, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -4.50264835357666, + "rewards/margins": 0.6604706048965454, + "rewards/rejected": -5.163118362426758, + "sft_loss": 4.194108009338379, + "step": 805 + }, + { + "epoch": 0.43351731058705467, + "grad_norm": 0.523313066003409, + "learning_rate": 9.939967071845424e-07, + "logits/chosen": -0.2286236733198166, + "logits/rejected": -0.14836743474006653, + "logps/chosen": -4.752197742462158, + "logps/rejected": -5.03873348236084, + "loss": 0.0558, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -4.752197742462158, + "rewards/margins": 0.28653571009635925, + "rewards/rejected": -5.03873348236084, + "sft_loss": 4.4536638259887695, + "step": 810 + }, + { + "epoch": 0.4361933433684563, + "grad_norm": 0.7802337043968409, + "learning_rate": 9.937536987168413e-07, + "logits/chosen": -0.20195576548576355, + "logits/rejected": -0.011835225857794285, + "logps/chosen": -4.779173851013184, + "logps/rejected": -5.223197937011719, + "loss": 0.053, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -4.779173851013184, + "rewards/margins": 0.4440239369869232, + "rewards/rejected": -5.223197937011719, + "sft_loss": 4.434736728668213, + "step": 815 + }, + { + "epoch": 0.4388693761498578, + "grad_norm": 0.8536421132761614, + "learning_rate": 9.935058998485896e-07, + "logits/chosen": -0.14910456538200378, + "logits/rejected": -0.1316080391407013, + "logps/chosen": -4.600480556488037, + "logps/rejected": -5.0173115730285645, + "loss": 0.0537, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -4.600480556488037, + "rewards/margins": 0.4168310761451721, + "rewards/rejected": -5.0173115730285645, + "sft_loss": 4.223259925842285, + "step": 820 + }, + { + "epoch": 0.44154540893125943, + "grad_norm": 0.9865375408237489, + "learning_rate": 9.932533129839333e-07, + "logits/chosen": -0.3155757784843445, + "logits/rejected": -0.13682445883750916, + "logps/chosen": -4.623326301574707, + "logps/rejected": -4.900997161865234, + "loss": 0.0547, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -4.623326301574707, + "rewards/margins": 0.27767083048820496, + "rewards/rejected": -4.900997161865234, + "sft_loss": 4.3923540115356445, + "step": 825 + }, + { + "epoch": 0.444221441712661, + "grad_norm": 0.7418103771522546, + "learning_rate": 9.929959405734711e-07, + "logits/chosen": -0.18393446505069733, + "logits/rejected": 0.002429759595543146, + "logps/chosen": -4.682926177978516, + "logps/rejected": -5.0224385261535645, + "loss": 0.0543, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -4.682926177978516, + "rewards/margins": 0.33951207995414734, + "rewards/rejected": -5.0224385261535645, + "sft_loss": 4.493451118469238, + "step": 830 + }, + { + "epoch": 0.44689747449406253, + "grad_norm": 0.6110152314789478, + "learning_rate": 9.927337851142314e-07, + "logits/chosen": -0.24163761734962463, + "logits/rejected": -0.09300851076841354, + "logps/chosen": -4.854322910308838, + "logps/rejected": -5.115272045135498, + "loss": 0.0547, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -4.854322910308838, + "rewards/margins": 0.2609490752220154, + "rewards/rejected": -5.115272045135498, + "sft_loss": 4.595727920532227, + "step": 835 + }, + { + "epoch": 0.44957350727546413, + "grad_norm": 0.7110654327268647, + "learning_rate": 9.924668491496474e-07, + "logits/chosen": -0.332312673330307, + "logits/rejected": -0.02082272246479988, + "logps/chosen": -4.5603837966918945, + "logps/rejected": -4.936267852783203, + "loss": 0.0542, + "rewards/accuracies": 0.59375, + "rewards/chosen": -4.5603837966918945, + "rewards/margins": 0.3758838176727295, + "rewards/rejected": -4.936267852783203, + "sft_loss": 4.269760608673096, + "step": 840 + }, + { + "epoch": 0.4522495400568657, + "grad_norm": 0.6116882986091782, + "learning_rate": 9.92195135269533e-07, + "logits/chosen": -0.26116663217544556, + "logits/rejected": -0.2063770592212677, + "logps/chosen": -4.58666467666626, + "logps/rejected": -4.9023871421813965, + "loss": 0.0536, + "rewards/accuracies": 0.625, + "rewards/chosen": -4.58666467666626, + "rewards/margins": 0.3157220482826233, + "rewards/rejected": -4.9023871421813965, + "sft_loss": 4.305259704589844, + "step": 845 + }, + { + "epoch": 0.4549255728382673, + "grad_norm": 0.5190734803088864, + "learning_rate": 9.919186461100574e-07, + "logits/chosen": -0.3095361590385437, + "logits/rejected": -0.1564372330904007, + "logps/chosen": -4.769049167633057, + "logps/rejected": -5.169442653656006, + "loss": 0.0535, + "rewards/accuracies": 0.65625, + "rewards/chosen": -4.769049167633057, + "rewards/margins": 0.40039342641830444, + "rewards/rejected": -5.169442653656006, + "sft_loss": 4.430898189544678, + "step": 850 + }, + { + "epoch": 0.45760160561966884, + "grad_norm": 0.6090834044108917, + "learning_rate": 9.9163738435372e-07, + "logits/chosen": -0.2508835196495056, + "logits/rejected": -0.04892207309603691, + "logps/chosen": -4.640468597412109, + "logps/rejected": -5.314169883728027, + "loss": 0.0544, + "rewards/accuracies": 0.59375, + "rewards/chosen": -4.640468597412109, + "rewards/margins": 0.6737015843391418, + "rewards/rejected": -5.314169883728027, + "sft_loss": 4.312269687652588, + "step": 855 + }, + { + "epoch": 0.4602776384010704, + "grad_norm": 0.6570091722216572, + "learning_rate": 9.913513527293234e-07, + "logits/chosen": -0.3884831666946411, + "logits/rejected": -0.15569785237312317, + "logps/chosen": -4.644750118255615, + "logps/rejected": -5.068819999694824, + "loss": 0.0532, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -4.644750118255615, + "rewards/margins": 0.4240697920322418, + "rewards/rejected": -5.068819999694824, + "sft_loss": 4.279450416564941, + "step": 860 + }, + { + "epoch": 0.462953671182472, + "grad_norm": 0.8599736694295398, + "learning_rate": 9.910605540119474e-07, + "logits/chosen": -0.3107752501964569, + "logits/rejected": -0.13118943572044373, + "logps/chosen": -4.980652332305908, + "logps/rejected": -5.300137519836426, + "loss": 0.0549, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -4.980652332305908, + "rewards/margins": 0.31948500871658325, + "rewards/rejected": -5.300137519836426, + "sft_loss": 4.596789360046387, + "step": 865 + }, + { + "epoch": 0.46562970396387354, + "grad_norm": 0.5817887629532634, + "learning_rate": 9.907649910229227e-07, + "logits/chosen": -0.47135478258132935, + "logits/rejected": -0.10487018525600433, + "logps/chosen": -4.572172164916992, + "logps/rejected": -4.95493221282959, + "loss": 0.0537, + "rewards/accuracies": 0.65625, + "rewards/chosen": -4.572172164916992, + "rewards/margins": 0.38275963068008423, + "rewards/rejected": -4.95493221282959, + "sft_loss": 4.365973949432373, + "step": 870 + }, + { + "epoch": 0.46830573674527515, + "grad_norm": 0.8546776312375493, + "learning_rate": 9.90464666629803e-07, + "logits/chosen": -0.305539071559906, + "logits/rejected": -0.18519474565982819, + "logps/chosen": -4.459600925445557, + "logps/rejected": -4.81002140045166, + "loss": 0.0551, + "rewards/accuracies": 0.5625, + "rewards/chosen": -4.459600925445557, + "rewards/margins": 0.3504212200641632, + "rewards/rejected": -4.81002140045166, + "sft_loss": 4.2054009437561035, + "step": 875 + }, + { + "epoch": 0.4709817695266767, + "grad_norm": 0.9477596077907503, + "learning_rate": 9.901595837463363e-07, + "logits/chosen": -0.3723621070384979, + "logits/rejected": -0.12102153152227402, + "logps/chosen": -4.758284091949463, + "logps/rejected": -5.178713798522949, + "loss": 0.0539, + "rewards/accuracies": 0.625, + "rewards/chosen": -4.758284091949463, + "rewards/margins": 0.42043009400367737, + "rewards/rejected": -5.178713798522949, + "sft_loss": 4.508724212646484, + "step": 880 + }, + { + "epoch": 0.47365780230807825, + "grad_norm": 0.5092167105949217, + "learning_rate": 9.898497453324384e-07, + "logits/chosen": -0.3669915199279785, + "logits/rejected": -0.2551936209201813, + "logps/chosen": -4.825407981872559, + "logps/rejected": -5.157772064208984, + "loss": 0.0533, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -4.825407981872559, + "rewards/margins": 0.33236438035964966, + "rewards/rejected": -5.157772064208984, + "sft_loss": 4.503109931945801, + "step": 885 + }, + { + "epoch": 0.47633383508947985, + "grad_norm": 0.9726713592254466, + "learning_rate": 9.895351543941628e-07, + "logits/chosen": -0.39609891176223755, + "logits/rejected": -0.21996548771858215, + "logps/chosen": -4.433319091796875, + "logps/rejected": -4.735190391540527, + "loss": 0.0546, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -4.433319091796875, + "rewards/margins": 0.301870733499527, + "rewards/rejected": -4.735190391540527, + "sft_loss": 4.102834224700928, + "step": 890 + }, + { + "epoch": 0.4790098678708814, + "grad_norm": 0.7422075006162999, + "learning_rate": 9.892158139836724e-07, + "logits/chosen": -0.27023711800575256, + "logits/rejected": -0.12920354306697845, + "logps/chosen": -4.826740741729736, + "logps/rejected": -4.9720458984375, + "loss": 0.0548, + "rewards/accuracies": 0.5625, + "rewards/chosen": -4.826740741729736, + "rewards/margins": 0.1453053057193756, + "rewards/rejected": -4.9720458984375, + "sft_loss": 4.488923072814941, + "step": 895 + }, + { + "epoch": 0.481685900652283, + "grad_norm": 0.4899561851874869, + "learning_rate": 9.88891727199209e-07, + "logits/chosen": -0.33869457244873047, + "logits/rejected": -0.20442715287208557, + "logps/chosen": -4.991418361663818, + "logps/rejected": -5.21295690536499, + "loss": 0.0537, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -4.991418361663818, + "rewards/margins": 0.22153854370117188, + "rewards/rejected": -5.21295690536499, + "sft_loss": 4.614162445068359, + "step": 900 + }, + { + "epoch": 0.48436193343368455, + "grad_norm": 1.4087474281540486, + "learning_rate": 9.885628971850641e-07, + "logits/chosen": -0.3120538592338562, + "logits/rejected": -0.055802445858716965, + "logps/chosen": -4.461789608001709, + "logps/rejected": -4.915839195251465, + "loss": 0.0537, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -4.461789608001709, + "rewards/margins": 0.4540492594242096, + "rewards/rejected": -4.915839195251465, + "sft_loss": 4.131707191467285, + "step": 905 + }, + { + "epoch": 0.48703796621508616, + "grad_norm": 0.3428163959894264, + "learning_rate": 9.882293271315481e-07, + "logits/chosen": -0.30345502495765686, + "logits/rejected": -0.16580362617969513, + "logps/chosen": -4.670796871185303, + "logps/rejected": -5.047994613647461, + "loss": 0.0539, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -4.670796871185303, + "rewards/margins": 0.3771972060203552, + "rewards/rejected": -5.047994613647461, + "sft_loss": 4.354706764221191, + "step": 910 + }, + { + "epoch": 0.4897139989964877, + "grad_norm": 0.3759873331174023, + "learning_rate": 9.878910202749589e-07, + "logits/chosen": -0.35124725103378296, + "logits/rejected": -0.07594305276870728, + "logps/chosen": -4.675352573394775, + "logps/rejected": -5.124632835388184, + "loss": 0.0527, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -4.675352573394775, + "rewards/margins": 0.44928035140037537, + "rewards/rejected": -5.124632835388184, + "sft_loss": 4.424468040466309, + "step": 915 + }, + { + "epoch": 0.49239003177788926, + "grad_norm": 0.697669128896546, + "learning_rate": 9.875479798975512e-07, + "logits/chosen": -0.1969275176525116, + "logits/rejected": 0.06542123854160309, + "logps/chosen": -4.537474632263184, + "logps/rejected": -5.131258964538574, + "loss": 0.0527, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -4.537474632263184, + "rewards/margins": 0.5937844514846802, + "rewards/rejected": -5.131258964538574, + "sft_loss": 4.197497367858887, + "step": 920 + }, + { + "epoch": 0.49506606455929086, + "grad_norm": 0.5767388364824774, + "learning_rate": 9.87200209327504e-07, + "logits/chosen": -0.21197989583015442, + "logits/rejected": 0.06789512932300568, + "logps/chosen": -4.644294261932373, + "logps/rejected": -4.911935806274414, + "loss": 0.054, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -4.644294261932373, + "rewards/margins": 0.2676416039466858, + "rewards/rejected": -4.911935806274414, + "sft_loss": 4.177750110626221, + "step": 925 + }, + { + "epoch": 0.4977420973406924, + "grad_norm": 1.0344418790294128, + "learning_rate": 9.868477119388894e-07, + "logits/chosen": -0.21944479644298553, + "logits/rejected": -0.11954133212566376, + "logps/chosen": -4.733977794647217, + "logps/rejected": -5.374486446380615, + "loss": 0.053, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -4.733977794647217, + "rewards/margins": 0.6405088305473328, + "rewards/rejected": -5.374486446380615, + "sft_loss": 4.4577178955078125, + "step": 930 + }, + { + "epoch": 0.500418130122094, + "grad_norm": 0.8658733433242235, + "learning_rate": 9.864904911516383e-07, + "logits/chosen": -0.18332402408123016, + "logits/rejected": -0.07167172431945801, + "logps/chosen": -4.696061611175537, + "logps/rejected": -5.125707149505615, + "loss": 0.0535, + "rewards/accuracies": 0.625, + "rewards/chosen": -4.696061611175537, + "rewards/margins": 0.42964568734169006, + "rewards/rejected": -5.125707149505615, + "sft_loss": 4.375224590301514, + "step": 935 + }, + { + "epoch": 0.5030941629034956, + "grad_norm": 0.8231450884283922, + "learning_rate": 9.861285504315084e-07, + "logits/chosen": -0.1758008599281311, + "logits/rejected": -0.06344632804393768, + "logps/chosen": -4.6443257331848145, + "logps/rejected": -4.973420143127441, + "loss": 0.0552, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -4.6443257331848145, + "rewards/margins": 0.329093873500824, + "rewards/rejected": -4.973420143127441, + "sft_loss": 4.3561811447143555, + "step": 940 + }, + { + "epoch": 0.5057701956848971, + "grad_norm": 0.46591650849175953, + "learning_rate": 9.857618932900502e-07, + "logits/chosen": -0.3961028456687927, + "logits/rejected": -0.15232224762439728, + "logps/chosen": -4.621685028076172, + "logps/rejected": -5.0412163734436035, + "loss": 0.0539, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -4.621685028076172, + "rewards/margins": 0.4195311665534973, + "rewards/rejected": -5.0412163734436035, + "sft_loss": 4.4617919921875, + "step": 945 + }, + { + "epoch": 0.5084462284662987, + "grad_norm": 1.0101262696651687, + "learning_rate": 9.853905232845727e-07, + "logits/chosen": -0.3851791024208069, + "logits/rejected": -0.15365764498710632, + "logps/chosen": -4.548623085021973, + "logps/rejected": -4.8963422775268555, + "loss": 0.0542, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -4.548623085021973, + "rewards/margins": 0.34772005677223206, + "rewards/rejected": -4.8963422775268555, + "sft_loss": 4.327893257141113, + "step": 950 + }, + { + "epoch": 0.5111222612477003, + "grad_norm": 0.5381428324831616, + "learning_rate": 9.850144440181095e-07, + "logits/chosen": -0.2905058264732361, + "logits/rejected": -0.006855395622551441, + "logps/chosen": -4.724823951721191, + "logps/rejected": -5.080996036529541, + "loss": 0.0541, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -4.724823951721191, + "rewards/margins": 0.35617202520370483, + "rewards/rejected": -5.080996036529541, + "sft_loss": 4.46238899230957, + "step": 955 + }, + { + "epoch": 0.5137982940291018, + "grad_norm": 0.5495692093454033, + "learning_rate": 9.846336591393832e-07, + "logits/chosen": -0.1810792237520218, + "logits/rejected": 0.015948548913002014, + "logps/chosen": -4.573351860046387, + "logps/rejected": -4.975152492523193, + "loss": 0.054, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -4.573351860046387, + "rewards/margins": 0.4018007218837738, + "rewards/rejected": -4.975152492523193, + "sft_loss": 4.238281726837158, + "step": 960 + }, + { + "epoch": 0.5164743268105034, + "grad_norm": 0.8432393389554336, + "learning_rate": 9.842481723427704e-07, + "logits/chosen": -0.16392990946769714, + "logits/rejected": -0.1051705926656723, + "logps/chosen": -4.922740936279297, + "logps/rejected": -5.354665279388428, + "loss": 0.0547, + "rewards/accuracies": 0.59375, + "rewards/chosen": -4.922740936279297, + "rewards/margins": 0.4319241940975189, + "rewards/rejected": -5.354665279388428, + "sft_loss": 4.64121150970459, + "step": 965 + }, + { + "epoch": 0.519150359591905, + "grad_norm": 0.5154826838857, + "learning_rate": 9.838579873682658e-07, + "logits/chosen": -0.21759569644927979, + "logits/rejected": -0.22146296501159668, + "logps/chosen": -4.731788158416748, + "logps/rejected": -5.0571770668029785, + "loss": 0.0547, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -4.731788158416748, + "rewards/margins": 0.3253888487815857, + "rewards/rejected": -5.0571770668029785, + "sft_loss": 4.384237289428711, + "step": 970 + }, + { + "epoch": 0.5218263923733065, + "grad_norm": 0.9101370133418469, + "learning_rate": 9.834631080014457e-07, + "logits/chosen": -0.3886938691139221, + "logits/rejected": -0.09766797721385956, + "logps/chosen": -4.5959296226501465, + "logps/rejected": -5.024158477783203, + "loss": 0.0527, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -4.5959296226501465, + "rewards/margins": 0.4282284677028656, + "rewards/rejected": -5.024158477783203, + "sft_loss": 4.394379138946533, + "step": 975 + }, + { + "epoch": 0.5245024251547081, + "grad_norm": 0.41205584641739607, + "learning_rate": 9.830635380734312e-07, + "logits/chosen": -0.4098678231239319, + "logits/rejected": -0.16409628093242645, + "logps/chosen": -4.53586483001709, + "logps/rejected": -4.860126972198486, + "loss": 0.0535, + "rewards/accuracies": 0.625, + "rewards/chosen": -4.53586483001709, + "rewards/margins": 0.32426196336746216, + "rewards/rejected": -4.860126972198486, + "sft_loss": 4.254485130310059, + "step": 980 + }, + { + "epoch": 0.5271784579361097, + "grad_norm": 0.5228974262769843, + "learning_rate": 9.826592814608517e-07, + "logits/chosen": -0.29943370819091797, + "logits/rejected": -0.012423193082213402, + "logps/chosen": -4.564153671264648, + "logps/rejected": -4.881725311279297, + "loss": 0.0536, + "rewards/accuracies": 0.5625, + "rewards/chosen": -4.564153671264648, + "rewards/margins": 0.31757181882858276, + "rewards/rejected": -4.881725311279297, + "sft_loss": 4.236629962921143, + "step": 985 + }, + { + "epoch": 0.5298544907175113, + "grad_norm": 0.48911677587093744, + "learning_rate": 9.822503420858067e-07, + "logits/chosen": -0.16598041355609894, + "logits/rejected": -0.18363508582115173, + "logps/chosen": -4.710743427276611, + "logps/rejected": -4.974171161651611, + "loss": 0.0537, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -4.710743427276611, + "rewards/margins": 0.26342788338661194, + "rewards/rejected": -4.974171161651611, + "sft_loss": 4.399572849273682, + "step": 990 + }, + { + "epoch": 0.5325305234989128, + "grad_norm": 0.8522763924193989, + "learning_rate": 9.818367239158277e-07, + "logits/chosen": -0.20104625821113586, + "logits/rejected": -0.13274021446704865, + "logps/chosen": -4.837375640869141, + "logps/rejected": -5.168421745300293, + "loss": 0.0531, + "rewards/accuracies": 0.59375, + "rewards/chosen": -4.837375640869141, + "rewards/margins": 0.33104628324508667, + "rewards/rejected": -5.168421745300293, + "sft_loss": 4.40709924697876, + "step": 995 + }, + { + "epoch": 0.5352065562803144, + "grad_norm": 0.9809109204437388, + "learning_rate": 9.8141843096384e-07, + "logits/chosen": -0.24949832260608673, + "logits/rejected": -0.013175847008824348, + "logps/chosen": -4.6659345626831055, + "logps/rejected": -5.258612632751465, + "loss": 0.0535, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -4.6659345626831055, + "rewards/margins": 0.592678427696228, + "rewards/rejected": -5.258612632751465, + "sft_loss": 4.3432440757751465, + "step": 1000 + }, + { + "epoch": 0.537882589061716, + "grad_norm": 0.6177294989073903, + "learning_rate": 9.809954672881237e-07, + "logits/chosen": -0.2501566410064697, + "logits/rejected": -0.04616551846265793, + "logps/chosen": -4.930415153503418, + "logps/rejected": -5.398449897766113, + "loss": 0.0538, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -4.930415153503418, + "rewards/margins": 0.4680354595184326, + "rewards/rejected": -5.398449897766113, + "sft_loss": 4.56772518157959, + "step": 1005 + }, + { + "epoch": 0.5405586218431175, + "grad_norm": 0.7477611616553518, + "learning_rate": 9.80567836992274e-07, + "logits/chosen": -0.3545466959476471, + "logits/rejected": -0.10054433345794678, + "logps/chosen": -4.5024824142456055, + "logps/rejected": -4.952744960784912, + "loss": 0.0546, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -4.5024824142456055, + "rewards/margins": 0.4502628445625305, + "rewards/rejected": -4.952744960784912, + "sft_loss": 4.22027063369751, + "step": 1010 + }, + { + "epoch": 0.5432346546245191, + "grad_norm": 0.6168398978776214, + "learning_rate": 9.801355442251625e-07, + "logits/chosen": -0.4304068982601166, + "logits/rejected": -0.18468213081359863, + "logps/chosen": -4.629603385925293, + "logps/rejected": -4.93204402923584, + "loss": 0.0542, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -4.629603385925293, + "rewards/margins": 0.30244094133377075, + "rewards/rejected": -4.93204402923584, + "sft_loss": 4.375184059143066, + "step": 1015 + }, + { + "epoch": 0.5459106874059207, + "grad_norm": 0.5481544408539222, + "learning_rate": 9.796985931808949e-07, + "logits/chosen": -0.41533294320106506, + "logits/rejected": -0.15929146111011505, + "logps/chosen": -4.695633888244629, + "logps/rejected": -5.094050407409668, + "loss": 0.0527, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -4.695633888244629, + "rewards/margins": 0.398416668176651, + "rewards/rejected": -5.094050407409668, + "sft_loss": 4.4184675216674805, + "step": 1020 + }, + { + "epoch": 0.5485867201873222, + "grad_norm": 0.4981215224708011, + "learning_rate": 9.792569880987724e-07, + "logits/chosen": -0.35322266817092896, + "logits/rejected": -0.1806357502937317, + "logps/chosen": -4.483834743499756, + "logps/rejected": -4.888542652130127, + "loss": 0.0532, + "rewards/accuracies": 0.6875, + "rewards/chosen": -4.483834743499756, + "rewards/margins": 0.40470829606056213, + "rewards/rejected": -4.888542652130127, + "sft_loss": 4.222219944000244, + "step": 1025 + }, + { + "epoch": 0.5512627529687238, + "grad_norm": 0.6564438791162569, + "learning_rate": 9.788107332632493e-07, + "logits/chosen": -0.25478994846343994, + "logits/rejected": -0.17686933279037476, + "logps/chosen": -4.613704204559326, + "logps/rejected": -4.8227033615112305, + "loss": 0.0562, + "rewards/accuracies": 0.59375, + "rewards/chosen": -4.613704204559326, + "rewards/margins": 0.2089988738298416, + "rewards/rejected": -4.8227033615112305, + "sft_loss": 4.313691139221191, + "step": 1030 + }, + { + "epoch": 0.5539387857501255, + "grad_norm": 0.7392853368692655, + "learning_rate": 9.783598330038924e-07, + "logits/chosen": -0.44997015595436096, + "logits/rejected": -0.27288126945495605, + "logps/chosen": -4.7664642333984375, + "logps/rejected": -5.0896687507629395, + "loss": 0.0537, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -4.7664642333984375, + "rewards/margins": 0.3232039511203766, + "rewards/rejected": -5.0896687507629395, + "sft_loss": 4.511096000671387, + "step": 1035 + }, + { + "epoch": 0.5566148185315271, + "grad_norm": 0.6876145426311351, + "learning_rate": 9.779042916953376e-07, + "logits/chosen": -0.3371017575263977, + "logits/rejected": -0.06301428377628326, + "logps/chosen": -4.491923809051514, + "logps/rejected": -5.174212455749512, + "loss": 0.0523, + "rewards/accuracies": 0.71875, + "rewards/chosen": -4.491923809051514, + "rewards/margins": 0.6822883486747742, + "rewards/rejected": -5.174212455749512, + "sft_loss": 4.314373970031738, + "step": 1040 + }, + { + "epoch": 0.5592908513129285, + "grad_norm": 0.7139936287057881, + "learning_rate": 9.774441137572487e-07, + "logits/chosen": -0.3784538507461548, + "logits/rejected": -0.17158463597297668, + "logps/chosen": -4.396938323974609, + "logps/rejected": -4.944519996643066, + "loss": 0.0519, + "rewards/accuracies": 0.6875, + "rewards/chosen": -4.396938323974609, + "rewards/margins": 0.5475821495056152, + "rewards/rejected": -4.944519996643066, + "sft_loss": 4.0358452796936035, + "step": 1045 + }, + { + "epoch": 0.5619668840943302, + "grad_norm": 0.5892634440060744, + "learning_rate": 9.76979303654274e-07, + "logits/chosen": -0.3060999810695648, + "logits/rejected": -0.1631624698638916, + "logps/chosen": -4.6079511642456055, + "logps/rejected": -5.2234787940979, + "loss": 0.0527, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -4.6079511642456055, + "rewards/margins": 0.6155272126197815, + "rewards/rejected": -5.2234787940979, + "sft_loss": 4.3594512939453125, + "step": 1050 + }, + { + "epoch": 0.5646429168757318, + "grad_norm": 0.929026425003863, + "learning_rate": 9.765098658960035e-07, + "logits/chosen": -0.18277141451835632, + "logits/rejected": -0.1487278938293457, + "logps/chosen": -4.66930627822876, + "logps/rejected": -5.10184907913208, + "loss": 0.0534, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -4.66930627822876, + "rewards/margins": 0.4325428009033203, + "rewards/rejected": -5.10184907913208, + "sft_loss": 4.38519287109375, + "step": 1055 + }, + { + "epoch": 0.5673189496571333, + "grad_norm": 1.1507620636323597, + "learning_rate": 9.76035805036924e-07, + "logits/chosen": -0.12372313439846039, + "logits/rejected": 0.09936396777629852, + "logps/chosen": -4.571523189544678, + "logps/rejected": -5.108660697937012, + "loss": 0.0524, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -4.571523189544678, + "rewards/margins": 0.5371370911598206, + "rewards/rejected": -5.108660697937012, + "sft_loss": 4.278738021850586, + "step": 1060 + }, + { + "epoch": 0.5699949824385349, + "grad_norm": 1.1625783677852743, + "learning_rate": 9.755571256763764e-07, + "logits/chosen": -0.12434210628271103, + "logits/rejected": 0.043297264724969864, + "logps/chosen": -4.410771369934082, + "logps/rejected": -4.947856426239014, + "loss": 0.053, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -4.410771369934082, + "rewards/margins": 0.5370848178863525, + "rewards/rejected": -4.947856426239014, + "sft_loss": 4.148685932159424, + "step": 1065 + }, + { + "epoch": 0.5726710152199365, + "grad_norm": 0.7569548293124136, + "learning_rate": 9.750738324585097e-07, + "logits/chosen": -0.26663917303085327, + "logits/rejected": 0.0644756332039833, + "logps/chosen": -4.429556846618652, + "logps/rejected": -4.855227470397949, + "loss": 0.0542, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -4.429556846618652, + "rewards/margins": 0.42567119002342224, + "rewards/rejected": -4.855227470397949, + "sft_loss": 4.217803001403809, + "step": 1070 + }, + { + "epoch": 0.5753470480013381, + "grad_norm": 0.5098138301352244, + "learning_rate": 9.74585930072237e-07, + "logits/chosen": -0.1906522810459137, + "logits/rejected": -0.0011430894955992699, + "logps/chosen": -4.641746997833252, + "logps/rejected": -5.193634986877441, + "loss": 0.0528, + "rewards/accuracies": 0.65625, + "rewards/chosen": -4.641746997833252, + "rewards/margins": 0.5518878698348999, + "rewards/rejected": -5.193634986877441, + "sft_loss": 4.367932319641113, + "step": 1075 + }, + { + "epoch": 0.5780230807827396, + "grad_norm": 0.5456950404150688, + "learning_rate": 9.740934232511892e-07, + "logits/chosen": -0.3177880346775055, + "logits/rejected": -0.1907317340373993, + "logps/chosen": -4.793037414550781, + "logps/rejected": -5.058445930480957, + "loss": 0.0543, + "rewards/accuracies": 0.625, + "rewards/chosen": -4.793037414550781, + "rewards/margins": 0.265408456325531, + "rewards/rejected": -5.058445930480957, + "sft_loss": 4.428118705749512, + "step": 1080 + }, + { + "epoch": 0.5806991135641412, + "grad_norm": 0.6987818198293332, + "learning_rate": 9.735963167736698e-07, + "logits/chosen": -0.23779654502868652, + "logits/rejected": -0.053666941821575165, + "logps/chosen": -4.703165531158447, + "logps/rejected": -5.080471992492676, + "loss": 0.0532, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -4.703165531158447, + "rewards/margins": 0.3773062825202942, + "rewards/rejected": -5.080471992492676, + "sft_loss": 4.4036335945129395, + "step": 1085 + }, + { + "epoch": 0.5833751463455428, + "grad_norm": 0.4862054708330395, + "learning_rate": 9.730946154626078e-07, + "logits/chosen": -0.2143319547176361, + "logits/rejected": -0.09488032013177872, + "logps/chosen": -4.576449394226074, + "logps/rejected": -4.933585166931152, + "loss": 0.0529, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -4.576449394226074, + "rewards/margins": 0.35713550448417664, + "rewards/rejected": -4.933585166931152, + "sft_loss": 4.224600791931152, + "step": 1090 + }, + { + "epoch": 0.5860511791269443, + "grad_norm": 0.5843251817725653, + "learning_rate": 9.725883241855117e-07, + "logits/chosen": -0.4183814525604248, + "logits/rejected": -0.20347313582897186, + "logps/chosen": -4.669787406921387, + "logps/rejected": -5.200324058532715, + "loss": 0.0533, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -4.669787406921387, + "rewards/margins": 0.5305365324020386, + "rewards/rejected": -5.200324058532715, + "sft_loss": 4.445235252380371, + "step": 1095 + }, + { + "epoch": 0.5887272119083459, + "grad_norm": 0.6456408033834538, + "learning_rate": 9.720774478544218e-07, + "logits/chosen": -0.2532975971698761, + "logits/rejected": -0.04155152291059494, + "logps/chosen": -4.547806262969971, + "logps/rejected": -5.053961753845215, + "loss": 0.0527, + "rewards/accuracies": 0.65625, + "rewards/chosen": -4.547806262969971, + "rewards/margins": 0.5061560869216919, + "rewards/rejected": -5.053961753845215, + "sft_loss": 4.219943046569824, + "step": 1100 + }, + { + "epoch": 0.5914032446897475, + "grad_norm": 0.5237674186864618, + "learning_rate": 9.715619914258624e-07, + "logits/chosen": -0.36289113759994507, + "logits/rejected": -0.2682749032974243, + "logps/chosen": -4.5317912101745605, + "logps/rejected": -4.91931676864624, + "loss": 0.0527, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -4.5317912101745605, + "rewards/margins": 0.3875252604484558, + "rewards/rejected": -4.91931676864624, + "sft_loss": 4.134699821472168, + "step": 1105 + }, + { + "epoch": 0.594079277471149, + "grad_norm": 0.7270955294325449, + "learning_rate": 9.710419599007937e-07, + "logits/chosen": -0.30084487795829773, + "logits/rejected": -0.07306574285030365, + "logps/chosen": -4.786416530609131, + "logps/rejected": -5.113640785217285, + "loss": 0.0541, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -4.786416530609131, + "rewards/margins": 0.3272242546081543, + "rewards/rejected": -5.113640785217285, + "sft_loss": 4.5564351081848145, + "step": 1110 + }, + { + "epoch": 0.5967553102525506, + "grad_norm": 0.7225983756880933, + "learning_rate": 9.705173583245643e-07, + "logits/chosen": -0.32694125175476074, + "logits/rejected": -0.05704299733042717, + "logps/chosen": -4.673354625701904, + "logps/rejected": -5.159552574157715, + "loss": 0.0529, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -4.673354625701904, + "rewards/margins": 0.48619788885116577, + "rewards/rejected": -5.159552574157715, + "sft_loss": 4.341565132141113, + "step": 1115 + }, + { + "epoch": 0.5994313430339522, + "grad_norm": 0.6272602306537511, + "learning_rate": 9.699881917868609e-07, + "logits/chosen": -0.40429896116256714, + "logits/rejected": -0.23206374049186707, + "logps/chosen": -4.349539756774902, + "logps/rejected": -4.834110260009766, + "loss": 0.0525, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -4.349539756774902, + "rewards/margins": 0.4845706820487976, + "rewards/rejected": -4.834110260009766, + "sft_loss": 4.087153434753418, + "step": 1120 + }, + { + "epoch": 0.6021073758153538, + "grad_norm": 0.664900836717412, + "learning_rate": 9.694544654216594e-07, + "logits/chosen": -0.3435072600841522, + "logits/rejected": -0.05373241752386093, + "logps/chosen": -4.66226863861084, + "logps/rejected": -5.135153293609619, + "loss": 0.0531, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -4.66226863861084, + "rewards/margins": 0.47288474440574646, + "rewards/rejected": -5.135153293609619, + "sft_loss": 4.391148090362549, + "step": 1125 + }, + { + "epoch": 0.6047834085967553, + "grad_norm": 0.46821226895754736, + "learning_rate": 9.689161844071755e-07, + "logits/chosen": -0.1869077980518341, + "logits/rejected": -0.06527513265609741, + "logps/chosen": -4.599356651306152, + "logps/rejected": -4.932245254516602, + "loss": 0.0535, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -4.599356651306152, + "rewards/margins": 0.3328891396522522, + "rewards/rejected": -4.932245254516602, + "sft_loss": 4.303515434265137, + "step": 1130 + }, + { + "epoch": 0.6074594413781569, + "grad_norm": 0.681156914428372, + "learning_rate": 9.683733539658138e-07, + "logits/chosen": -0.35399603843688965, + "logits/rejected": -0.0633891299366951, + "logps/chosen": -4.58389949798584, + "logps/rejected": -4.945822715759277, + "loss": 0.0532, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -4.58389949798584, + "rewards/margins": 0.3619235157966614, + "rewards/rejected": -4.945822715759277, + "sft_loss": 4.220614433288574, + "step": 1135 + }, + { + "epoch": 0.6101354741595585, + "grad_norm": 0.48492189227810406, + "learning_rate": 9.678259793641178e-07, + "logits/chosen": -0.32695597410202026, + "logits/rejected": -0.29391515254974365, + "logps/chosen": -4.7623701095581055, + "logps/rejected": -5.030303001403809, + "loss": 0.054, + "rewards/accuracies": 0.59375, + "rewards/chosen": -4.7623701095581055, + "rewards/margins": 0.26793205738067627, + "rewards/rejected": -5.030303001403809, + "sft_loss": 4.4485955238342285, + "step": 1140 + }, + { + "epoch": 0.61281150694096, + "grad_norm": 0.6311037618756278, + "learning_rate": 9.672740659127183e-07, + "logits/chosen": -0.44052833318710327, + "logits/rejected": -0.262589693069458, + "logps/chosen": -4.548510551452637, + "logps/rejected": -5.029641151428223, + "loss": 0.0524, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -4.548510551452637, + "rewards/margins": 0.4811309278011322, + "rewards/rejected": -5.029641151428223, + "sft_loss": 4.205142498016357, + "step": 1145 + }, + { + "epoch": 0.6154875397223616, + "grad_norm": 0.5356140327074448, + "learning_rate": 9.667176189662818e-07, + "logits/chosen": -0.3407883048057556, + "logits/rejected": -0.17592433094978333, + "logps/chosen": -4.670349597930908, + "logps/rejected": -5.063414573669434, + "loss": 0.0528, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -4.670349597930908, + "rewards/margins": 0.3930647373199463, + "rewards/rejected": -5.063414573669434, + "sft_loss": 4.288704872131348, + "step": 1150 + }, + { + "epoch": 0.6181635725037632, + "grad_norm": 0.5224161429228669, + "learning_rate": 9.661566439234592e-07, + "logits/chosen": -0.2652224898338318, + "logits/rejected": -0.16097518801689148, + "logps/chosen": -4.668039798736572, + "logps/rejected": -5.022774696350098, + "loss": 0.0541, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -4.668039798736572, + "rewards/margins": 0.3547355532646179, + "rewards/rejected": -5.022774696350098, + "sft_loss": 4.322123050689697, + "step": 1155 + }, + { + "epoch": 0.6208396052851648, + "grad_norm": 0.5931763351107936, + "learning_rate": 9.655911462268327e-07, + "logits/chosen": -0.26682934165000916, + "logits/rejected": -0.12901510298252106, + "logps/chosen": -4.589568138122559, + "logps/rejected": -5.130476951599121, + "loss": 0.0519, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -4.589568138122559, + "rewards/margins": 0.5409084558486938, + "rewards/rejected": -5.130476951599121, + "sft_loss": 4.2350568771362305, + "step": 1160 + }, + { + "epoch": 0.6235156380665663, + "grad_norm": 0.4542520418662769, + "learning_rate": 9.650211313628636e-07, + "logits/chosen": -0.3942444920539856, + "logits/rejected": -0.2551219165325165, + "logps/chosen": -4.672030448913574, + "logps/rejected": -4.914698123931885, + "loss": 0.0556, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -4.672030448913574, + "rewards/margins": 0.24266783893108368, + "rewards/rejected": -4.914698123931885, + "sft_loss": 4.431342124938965, + "step": 1165 + }, + { + "epoch": 0.6261916708479679, + "grad_norm": 0.5438517815194526, + "learning_rate": 9.644466048618386e-07, + "logits/chosen": -0.4611254632472992, + "logits/rejected": -0.2544347643852234, + "logps/chosen": -4.990399360656738, + "logps/rejected": -5.296509742736816, + "loss": 0.0535, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -4.990399360656738, + "rewards/margins": 0.30610987544059753, + "rewards/rejected": -5.296509742736816, + "sft_loss": 4.532609462738037, + "step": 1170 + }, + { + "epoch": 0.6288677036293695, + "grad_norm": 0.6798316854469919, + "learning_rate": 9.63867572297816e-07, + "logits/chosen": -0.4359145164489746, + "logits/rejected": -0.18338151276111603, + "logps/chosen": -4.589885234832764, + "logps/rejected": -5.0170488357543945, + "loss": 0.0534, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -4.589885234832764, + "rewards/margins": 0.4271632730960846, + "rewards/rejected": -5.0170488357543945, + "sft_loss": 4.323793411254883, + "step": 1175 + }, + { + "epoch": 0.631543736410771, + "grad_norm": 0.4468539269210184, + "learning_rate": 9.632840392885727e-07, + "logits/chosen": -0.429384708404541, + "logits/rejected": -0.1899581104516983, + "logps/chosen": -4.401920795440674, + "logps/rejected": -4.892082214355469, + "loss": 0.0535, + "rewards/accuracies": 0.6875, + "rewards/chosen": -4.401920795440674, + "rewards/margins": 0.49016109108924866, + "rewards/rejected": -4.892082214355469, + "sft_loss": 4.202319145202637, + "step": 1180 + }, + { + "epoch": 0.6342197691921726, + "grad_norm": 0.6567316541688504, + "learning_rate": 9.626960114955483e-07, + "logits/chosen": -0.3670702874660492, + "logits/rejected": -0.15794377028942108, + "logps/chosen": -4.501652240753174, + "logps/rejected": -5.101126670837402, + "loss": 0.0528, + "rewards/accuracies": 0.65625, + "rewards/chosen": -4.501652240753174, + "rewards/margins": 0.5994741916656494, + "rewards/rejected": -5.101126670837402, + "sft_loss": 4.22614049911499, + "step": 1185 + }, + { + "epoch": 0.6368958019735742, + "grad_norm": 0.8362449587781202, + "learning_rate": 9.621034946237909e-07, + "logits/chosen": -0.4296230375766754, + "logits/rejected": -0.22723379731178284, + "logps/chosen": -4.814973831176758, + "logps/rejected": -5.287397861480713, + "loss": 0.0536, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -4.814973831176758, + "rewards/margins": 0.47242408990859985, + "rewards/rejected": -5.287397861480713, + "sft_loss": 4.558465957641602, + "step": 1190 + }, + { + "epoch": 0.6395718347549757, + "grad_norm": 0.7260413821003713, + "learning_rate": 9.615064944219021e-07, + "logits/chosen": -0.2747848927974701, + "logits/rejected": -0.09800789505243301, + "logps/chosen": -4.492433071136475, + "logps/rejected": -4.882441520690918, + "loss": 0.0525, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -4.492433071136475, + "rewards/margins": 0.3900087773799896, + "rewards/rejected": -4.882441520690918, + "sft_loss": 4.168577671051025, + "step": 1195 + }, + { + "epoch": 0.6422478675363773, + "grad_norm": 1.0721944049489407, + "learning_rate": 9.609050166819803e-07, + "logits/chosen": -0.31729406118392944, + "logits/rejected": -0.25745829939842224, + "logps/chosen": -4.472096920013428, + "logps/rejected": -4.868285655975342, + "loss": 0.0533, + "rewards/accuracies": 0.65625, + "rewards/chosen": -4.472096920013428, + "rewards/margins": 0.396188348531723, + "rewards/rejected": -4.868285655975342, + "sft_loss": 4.1427130699157715, + "step": 1200 + }, + { + "epoch": 0.6422478675363773, + "eval_logits/chosen": 0.2683974504470825, + "eval_logits/rejected": 0.40141844749450684, + "eval_logps/chosen": -4.519753456115723, + "eval_logps/rejected": -5.018647193908691, + "eval_loss": 0.05182144418358803, + "eval_rewards/accuracies": 0.6483679413795471, + "eval_rewards/chosen": -4.519753456115723, + "eval_rewards/margins": 0.4988936185836792, + "eval_rewards/rejected": -5.018647193908691, + "eval_runtime": 42.9586, + "eval_samples_per_second": 31.309, + "eval_sft_loss": 4.150440692901611, + "eval_steps_per_second": 7.845, + "step": 1200 + }, + { + "epoch": 0.6449239003177789, + "grad_norm": 0.7156317001015575, + "learning_rate": 9.602990672395653e-07, + "logits/chosen": -0.3744416832923889, + "logits/rejected": -0.13187824189662933, + "logps/chosen": -4.595070838928223, + "logps/rejected": -5.1346435546875, + "loss": 0.0521, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -4.595070838928223, + "rewards/margins": 0.5395724177360535, + "rewards/rejected": -5.1346435546875, + "sft_loss": 4.348048210144043, + "step": 1205 + }, + { + "epoch": 0.6475999330991805, + "grad_norm": 0.6367294977520913, + "learning_rate": 9.59688651973581e-07, + "logits/chosen": -0.3582982122898102, + "logits/rejected": -0.048285432159900665, + "logps/chosen": -4.640942573547363, + "logps/rejected": -5.121342182159424, + "loss": 0.0539, + "rewards/accuracies": 0.65625, + "rewards/chosen": -4.640942573547363, + "rewards/margins": 0.4803994297981262, + "rewards/rejected": -5.121342182159424, + "sft_loss": 4.395642280578613, + "step": 1210 + }, + { + "epoch": 0.650275965880582, + "grad_norm": 1.4858558236187764, + "learning_rate": 9.590737768062792e-07, + "logits/chosen": -0.41571682691574097, + "logits/rejected": -0.20789651572704315, + "logps/chosen": -4.591765880584717, + "logps/rejected": -4.994922637939453, + "loss": 0.0535, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -4.591765880584717, + "rewards/margins": 0.403156578540802, + "rewards/rejected": -4.994922637939453, + "sft_loss": 4.2717695236206055, + "step": 1215 + }, + { + "epoch": 0.6529519986619836, + "grad_norm": 0.5323772990644925, + "learning_rate": 9.584544477031816e-07, + "logits/chosen": -0.18281269073486328, + "logits/rejected": -0.012447918765246868, + "logps/chosen": -4.610562324523926, + "logps/rejected": -4.983885765075684, + "loss": 0.0533, + "rewards/accuracies": 0.65625, + "rewards/chosen": -4.610562324523926, + "rewards/margins": 0.37332338094711304, + "rewards/rejected": -4.983885765075684, + "sft_loss": 4.315565586090088, + "step": 1220 + }, + { + "epoch": 0.6556280314433852, + "grad_norm": 0.5854916339421308, + "learning_rate": 9.578306706730215e-07, + "logits/chosen": -0.44287237524986267, + "logits/rejected": -0.1460832804441452, + "logps/chosen": -4.5870513916015625, + "logps/rejected": -4.914314270019531, + "loss": 0.0532, + "rewards/accuracies": 0.625, + "rewards/chosen": -4.5870513916015625, + "rewards/margins": 0.32726341485977173, + "rewards/rejected": -4.914314270019531, + "sft_loss": 4.275424003601074, + "step": 1225 + }, + { + "epoch": 0.6583040642247867, + "grad_norm": 0.5795088883824004, + "learning_rate": 9.572024517676865e-07, + "logits/chosen": -0.4347182810306549, + "logits/rejected": -0.28513267636299133, + "logps/chosen": -4.611976623535156, + "logps/rejected": -4.973719120025635, + "loss": 0.0544, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -4.611976623535156, + "rewards/margins": 0.36174243688583374, + "rewards/rejected": -4.973719120025635, + "sft_loss": 4.437606334686279, + "step": 1230 + }, + { + "epoch": 0.6609800970061883, + "grad_norm": 0.34028010927917013, + "learning_rate": 9.565697970821593e-07, + "logits/chosen": -0.4847342371940613, + "logits/rejected": -0.21808162331581116, + "logps/chosen": -4.651471138000488, + "logps/rejected": -5.037489891052246, + "loss": 0.053, + "rewards/accuracies": 0.65625, + "rewards/chosen": -4.651471138000488, + "rewards/margins": 0.3860177993774414, + "rewards/rejected": -5.037489891052246, + "sft_loss": 4.363861083984375, + "step": 1235 + }, + { + "epoch": 0.6636561297875899, + "grad_norm": 0.3618440750506579, + "learning_rate": 9.559327127544585e-07, + "logits/chosen": -0.5013711452484131, + "logits/rejected": -0.34211140871047974, + "logps/chosen": -4.682461261749268, + "logps/rejected": -4.998291492462158, + "loss": 0.0531, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -4.682461261749268, + "rewards/margins": 0.31583017110824585, + "rewards/rejected": -4.998291492462158, + "sft_loss": 4.3636016845703125, + "step": 1240 + }, + { + "epoch": 0.6663321625689914, + "grad_norm": 0.518393886973729, + "learning_rate": 9.552912049655789e-07, + "logits/chosen": -0.41425085067749023, + "logits/rejected": -0.17347685992717743, + "logps/chosen": -4.481367588043213, + "logps/rejected": -4.9345831871032715, + "loss": 0.0533, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -4.481367588043213, + "rewards/margins": 0.4532155394554138, + "rewards/rejected": -4.9345831871032715, + "sft_loss": 4.235280513763428, + "step": 1245 + }, + { + "epoch": 0.669008195350393, + "grad_norm": 0.4137314181635811, + "learning_rate": 9.546452799394315e-07, + "logits/chosen": -0.45675116777420044, + "logits/rejected": -0.15419267117977142, + "logps/chosen": -4.616604804992676, + "logps/rejected": -4.993886947631836, + "loss": 0.0535, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -4.616604804992676, + "rewards/margins": 0.3772817552089691, + "rewards/rejected": -4.993886947631836, + "sft_loss": 4.367188930511475, + "step": 1250 + }, + { + "epoch": 0.6716842281317946, + "grad_norm": 0.5601409436047087, + "learning_rate": 9.539949439427846e-07, + "logits/chosen": -0.3995305895805359, + "logits/rejected": -0.24616310000419617, + "logps/chosen": -4.687084674835205, + "logps/rejected": -5.09484338760376, + "loss": 0.0542, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -4.687084674835205, + "rewards/margins": 0.407759428024292, + "rewards/rejected": -5.09484338760376, + "sft_loss": 4.395379066467285, + "step": 1255 + }, + { + "epoch": 0.6743602609131962, + "grad_norm": 0.8031738031976163, + "learning_rate": 9.533402032852002e-07, + "logits/chosen": -0.4419690668582916, + "logits/rejected": -0.23541542887687683, + "logps/chosen": -4.723730564117432, + "logps/rejected": -5.2569379806518555, + "loss": 0.0529, + "rewards/accuracies": 0.65625, + "rewards/chosen": -4.723730564117432, + "rewards/margins": 0.5332074165344238, + "rewards/rejected": -5.2569379806518555, + "sft_loss": 4.406943321228027, + "step": 1260 + }, + { + "epoch": 0.6770362936945977, + "grad_norm": 0.7204423519976543, + "learning_rate": 9.526810643189754e-07, + "logits/chosen": -0.32574447989463806, + "logits/rejected": -0.040873341262340546, + "logps/chosen": -4.396466255187988, + "logps/rejected": -4.99883508682251, + "loss": 0.0528, + "rewards/accuracies": 0.65625, + "rewards/chosen": -4.396466255187988, + "rewards/margins": 0.6023694276809692, + "rewards/rejected": -4.99883508682251, + "sft_loss": 4.184441566467285, + "step": 1265 + }, + { + "epoch": 0.6797123264759993, + "grad_norm": 0.4751320652431749, + "learning_rate": 9.52017533439079e-07, + "logits/chosen": -0.43547144532203674, + "logits/rejected": -0.3618202805519104, + "logps/chosen": -4.632751941680908, + "logps/rejected": -4.960574150085449, + "loss": 0.055, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -4.632751941680908, + "rewards/margins": 0.3278222680091858, + "rewards/rejected": -4.960574150085449, + "sft_loss": 4.420085906982422, + "step": 1270 + }, + { + "epoch": 0.6823883592574009, + "grad_norm": 0.5521181566167117, + "learning_rate": 9.513496170830909e-07, + "logits/chosen": -0.49553728103637695, + "logits/rejected": -0.3846838176250458, + "logps/chosen": -4.746697425842285, + "logps/rejected": -5.085881233215332, + "loss": 0.0535, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -4.746697425842285, + "rewards/margins": 0.33918410539627075, + "rewards/rejected": -5.085881233215332, + "sft_loss": 4.425386905670166, + "step": 1275 + }, + { + "epoch": 0.6850643920388024, + "grad_norm": 0.6424572601366976, + "learning_rate": 9.506773217311382e-07, + "logits/chosen": -0.35927778482437134, + "logits/rejected": -0.1263047158718109, + "logps/chosen": -4.6220550537109375, + "logps/rejected": -4.97434663772583, + "loss": 0.0527, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -4.6220550537109375, + "rewards/margins": 0.3522917628288269, + "rewards/rejected": -4.97434663772583, + "sft_loss": 4.350966453552246, + "step": 1280 + }, + { + "epoch": 0.687740424820204, + "grad_norm": 0.40220128996335763, + "learning_rate": 9.500006539058334e-07, + "logits/chosen": -0.37630099058151245, + "logits/rejected": -0.10924334824085236, + "logps/chosen": -4.406384468078613, + "logps/rejected": -4.704051494598389, + "loss": 0.0531, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -4.406384468078613, + "rewards/margins": 0.2976674437522888, + "rewards/rejected": -4.704051494598389, + "sft_loss": 4.099337100982666, + "step": 1285 + }, + { + "epoch": 0.6904164576016056, + "grad_norm": 0.3608110540736878, + "learning_rate": 9.493196201722109e-07, + "logits/chosen": -0.4187045097351074, + "logits/rejected": -0.20366141200065613, + "logps/chosen": -4.68743371963501, + "logps/rejected": -5.046011447906494, + "loss": 0.0536, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -4.68743371963501, + "rewards/margins": 0.358577698469162, + "rewards/rejected": -5.046011447906494, + "sft_loss": 4.357827186584473, + "step": 1290 + }, + { + "epoch": 0.6930924903830072, + "grad_norm": 0.5472169705894456, + "learning_rate": 9.486342271376628e-07, + "logits/chosen": -0.32792991399765015, + "logits/rejected": -0.3828414976596832, + "logps/chosen": -4.738595485687256, + "logps/rejected": -5.188597679138184, + "loss": 0.0532, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -4.738595485687256, + "rewards/margins": 0.4500022530555725, + "rewards/rejected": -5.188597679138184, + "sft_loss": 4.347311973571777, + "step": 1295 + }, + { + "epoch": 0.6957685231644087, + "grad_norm": 0.4557106046597545, + "learning_rate": 9.479444814518755e-07, + "logits/chosen": -0.342276394367218, + "logits/rejected": 0.016852790489792824, + "logps/chosen": -4.5591607093811035, + "logps/rejected": -5.212889671325684, + "loss": 0.0514, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -4.5591607093811035, + "rewards/margins": 0.6537296772003174, + "rewards/rejected": -5.212889671325684, + "sft_loss": 4.230761528015137, + "step": 1300 + }, + { + "epoch": 0.6984445559458103, + "grad_norm": 0.5654653462106812, + "learning_rate": 9.472503898067645e-07, + "logits/chosen": -0.16376064717769623, + "logits/rejected": -0.11671161651611328, + "logps/chosen": -4.611720085144043, + "logps/rejected": -4.8727312088012695, + "loss": 0.0548, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -4.611720085144043, + "rewards/margins": 0.2610107958316803, + "rewards/rejected": -4.8727312088012695, + "sft_loss": 4.201091289520264, + "step": 1305 + }, + { + "epoch": 0.701120588727212, + "grad_norm": 0.5870624108603953, + "learning_rate": 9.465519589364099e-07, + "logits/chosen": -0.18181738257408142, + "logits/rejected": -0.09554007649421692, + "logps/chosen": -4.874770164489746, + "logps/rejected": -5.174876689910889, + "loss": 0.0532, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -4.874770164489746, + "rewards/margins": 0.3001064956188202, + "rewards/rejected": -5.174876689910889, + "sft_loss": 4.453106880187988, + "step": 1310 + }, + { + "epoch": 0.7037966215086134, + "grad_norm": 0.5024960047819973, + "learning_rate": 9.458491956169914e-07, + "logits/chosen": -0.3122777044773102, + "logits/rejected": -0.08371108025312424, + "logps/chosen": -4.6728692054748535, + "logps/rejected": -5.066531181335449, + "loss": 0.0526, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -4.6728692054748535, + "rewards/margins": 0.3936619758605957, + "rewards/rejected": -5.066531181335449, + "sft_loss": 4.272462368011475, + "step": 1315 + }, + { + "epoch": 0.706472654290015, + "grad_norm": 0.5431723145779719, + "learning_rate": 9.451421066667215e-07, + "logits/chosen": -0.46850499510765076, + "logits/rejected": -0.2023623287677765, + "logps/chosen": -4.442728519439697, + "logps/rejected": -5.026824951171875, + "loss": 0.0519, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -4.442728519439697, + "rewards/margins": 0.5840964913368225, + "rewards/rejected": -5.026824951171875, + "sft_loss": 4.181841850280762, + "step": 1320 + }, + { + "epoch": 0.7091486870714167, + "grad_norm": 0.546519278936773, + "learning_rate": 9.444306989457805e-07, + "logits/chosen": -0.1584623157978058, + "logits/rejected": -0.028326964005827904, + "logps/chosen": -4.671021938323975, + "logps/rejected": -5.056215763092041, + "loss": 0.0533, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -4.671021938323975, + "rewards/margins": 0.3851930797100067, + "rewards/rejected": -5.056215763092041, + "sft_loss": 4.2725138664245605, + "step": 1325 + }, + { + "epoch": 0.7118247198528181, + "grad_norm": 0.9667671251257445, + "learning_rate": 9.437149793562489e-07, + "logits/chosen": -0.2628821134567261, + "logits/rejected": -0.10308748483657837, + "logps/chosen": -4.612080097198486, + "logps/rejected": -4.99003791809082, + "loss": 0.0539, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -4.612080097198486, + "rewards/margins": 0.3779585063457489, + "rewards/rejected": -4.99003791809082, + "sft_loss": 4.356131076812744, + "step": 1330 + }, + { + "epoch": 0.7145007526342197, + "grad_norm": 0.3846431850386535, + "learning_rate": 9.429949548420417e-07, + "logits/chosen": -0.2996244430541992, + "logits/rejected": -0.15662053227424622, + "logps/chosen": -4.658416271209717, + "logps/rejected": -5.115981578826904, + "loss": 0.0524, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -4.658416271209717, + "rewards/margins": 0.45756563544273376, + "rewards/rejected": -5.115981578826904, + "sft_loss": 4.365849494934082, + "step": 1335 + }, + { + "epoch": 0.7171767854156214, + "grad_norm": 0.665935476867161, + "learning_rate": 9.422706323888396e-07, + "logits/chosen": -0.29253220558166504, + "logits/rejected": -0.25073716044425964, + "logps/chosen": -4.465968608856201, + "logps/rejected": -4.916172981262207, + "loss": 0.0527, + "rewards/accuracies": 0.6875, + "rewards/chosen": -4.465968608856201, + "rewards/margins": 0.4502039849758148, + "rewards/rejected": -4.916172981262207, + "sft_loss": 4.195641994476318, + "step": 1340 + }, + { + "epoch": 0.719852818197023, + "grad_norm": 0.5706511414826213, + "learning_rate": 9.415420190240225e-07, + "logits/chosen": -0.26913461089134216, + "logits/rejected": 0.0276435948908329, + "logps/chosen": -4.300182819366455, + "logps/rejected": -5.042660713195801, + "loss": 0.0504, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -4.300182819366455, + "rewards/margins": 0.742477536201477, + "rewards/rejected": -5.042660713195801, + "sft_loss": 4.060831546783447, + "step": 1345 + }, + { + "epoch": 0.7225288509784245, + "grad_norm": 0.7175039337411655, + "learning_rate": 9.408091218166002e-07, + "logits/chosen": -0.21212784945964813, + "logits/rejected": -0.13273796439170837, + "logps/chosen": -4.553486347198486, + "logps/rejected": -4.936342239379883, + "loss": 0.0534, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -4.553486347198486, + "rewards/margins": 0.3828561007976532, + "rewards/rejected": -4.936342239379883, + "sft_loss": 4.191648960113525, + "step": 1350 + }, + { + "epoch": 0.7252048837598261, + "grad_norm": 0.63623683652012, + "learning_rate": 9.400719478771449e-07, + "logits/chosen": -0.28056463599205017, + "logits/rejected": 0.1325863003730774, + "logps/chosen": -4.82030725479126, + "logps/rejected": -5.4860968589782715, + "loss": 0.0518, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -4.82030725479126, + "rewards/margins": 0.6657902002334595, + "rewards/rejected": -5.4860968589782715, + "sft_loss": 4.4288787841796875, + "step": 1355 + }, + { + "epoch": 0.7278809165412277, + "grad_norm": 0.7126472216799089, + "learning_rate": 9.393305043577209e-07, + "logits/chosen": -0.26858845353126526, + "logits/rejected": -0.1267586648464203, + "logps/chosen": -4.539680480957031, + "logps/rejected": -5.149550437927246, + "loss": 0.053, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -4.539680480957031, + "rewards/margins": 0.6098700165748596, + "rewards/rejected": -5.149550437927246, + "sft_loss": 4.167618751525879, + "step": 1360 + }, + { + "epoch": 0.7305569493226292, + "grad_norm": 0.550303910887877, + "learning_rate": 9.38584798451817e-07, + "logits/chosen": -0.37861794233322144, + "logits/rejected": -0.1660412847995758, + "logps/chosen": -4.474730491638184, + "logps/rejected": -4.919530391693115, + "loss": 0.0522, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -4.474730491638184, + "rewards/margins": 0.44480031728744507, + "rewards/rejected": -4.919530391693115, + "sft_loss": 4.191833972930908, + "step": 1365 + }, + { + "epoch": 0.7332329821040308, + "grad_norm": 0.45261575047030345, + "learning_rate": 9.37834837394275e-07, + "logits/chosen": -0.33118414878845215, + "logits/rejected": -0.13877174258232117, + "logps/chosen": -4.586190223693848, + "logps/rejected": -5.225762844085693, + "loss": 0.0525, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -4.586190223693848, + "rewards/margins": 0.6395732760429382, + "rewards/rejected": -5.225762844085693, + "sft_loss": 4.352536678314209, + "step": 1370 + }, + { + "epoch": 0.7359090148854324, + "grad_norm": 0.9186138520912633, + "learning_rate": 9.370806284612203e-07, + "logits/chosen": -0.3902810513973236, + "logits/rejected": -0.2011948525905609, + "logps/chosen": -4.392946243286133, + "logps/rejected": -4.966264724731445, + "loss": 0.0523, + "rewards/accuracies": 0.625, + "rewards/chosen": -4.392946243286133, + "rewards/margins": 0.5733183026313782, + "rewards/rejected": -4.966264724731445, + "sft_loss": 4.123747825622559, + "step": 1375 + }, + { + "epoch": 0.738585047666834, + "grad_norm": 0.5968579774371027, + "learning_rate": 9.363221789699912e-07, + "logits/chosen": -0.4745398461818695, + "logits/rejected": -0.30128827691078186, + "logps/chosen": -4.585610389709473, + "logps/rejected": -5.091395854949951, + "loss": 0.053, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -4.585610389709473, + "rewards/margins": 0.5057860612869263, + "rewards/rejected": -5.091395854949951, + "sft_loss": 4.311280250549316, + "step": 1380 + }, + { + "epoch": 0.7412610804482355, + "grad_norm": 0.686411141611339, + "learning_rate": 9.355594962790682e-07, + "logits/chosen": -0.49996957182884216, + "logits/rejected": -0.3136758804321289, + "logps/chosen": -4.660679340362549, + "logps/rejected": -5.197052955627441, + "loss": 0.0523, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -4.660679340362549, + "rewards/margins": 0.5363737344741821, + "rewards/rejected": -5.197052955627441, + "sft_loss": 4.413455009460449, + "step": 1385 + }, + { + "epoch": 0.7439371132296371, + "grad_norm": 0.7098920905319643, + "learning_rate": 9.34792587788002e-07, + "logits/chosen": -0.32590216398239136, + "logits/rejected": -0.15566875040531158, + "logps/chosen": -4.4894208908081055, + "logps/rejected": -4.915673732757568, + "loss": 0.0535, + "rewards/accuracies": 0.65625, + "rewards/chosen": -4.4894208908081055, + "rewards/margins": 0.42625269293785095, + "rewards/rejected": -4.915673732757568, + "sft_loss": 4.128401279449463, + "step": 1390 + }, + { + "epoch": 0.7466131460110387, + "grad_norm": 0.7087262409431777, + "learning_rate": 9.34021460937342e-07, + "logits/chosen": -0.35985809564590454, + "logits/rejected": -0.2987968325614929, + "logps/chosen": -4.599359035491943, + "logps/rejected": -4.990583419799805, + "loss": 0.0536, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -4.599359035491943, + "rewards/margins": 0.3912242352962494, + "rewards/rejected": -4.990583419799805, + "sft_loss": 4.331721782684326, + "step": 1395 + }, + { + "epoch": 0.7492891787924402, + "grad_norm": 0.7809553755456508, + "learning_rate": 9.332461232085646e-07, + "logits/chosen": -0.6510784029960632, + "logits/rejected": -0.43074244260787964, + "logps/chosen": -4.728677272796631, + "logps/rejected": -5.084107875823975, + "loss": 0.0536, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -4.728677272796631, + "rewards/margins": 0.3554309904575348, + "rewards/rejected": -5.084107875823975, + "sft_loss": 4.5105485916137695, + "step": 1400 + }, + { + "epoch": 0.7519652115738418, + "grad_norm": 0.8865709812275551, + "learning_rate": 9.324665821239998e-07, + "logits/chosen": -0.5979700088500977, + "logits/rejected": -0.29173022508621216, + "logps/chosen": -4.386279582977295, + "logps/rejected": -4.95373010635376, + "loss": 0.0531, + "rewards/accuracies": 0.625, + "rewards/chosen": -4.386279582977295, + "rewards/margins": 0.5674503445625305, + "rewards/rejected": -4.95373010635376, + "sft_loss": 4.1684088706970215, + "step": 1405 + }, + { + "epoch": 0.7546412443552434, + "grad_norm": 0.6939758542037217, + "learning_rate": 9.316828452467583e-07, + "logits/chosen": -0.49798783659935, + "logits/rejected": -0.2839065194129944, + "logps/chosen": -4.452348232269287, + "logps/rejected": -4.926584720611572, + "loss": 0.0529, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -4.452348232269287, + "rewards/margins": 0.47423630952835083, + "rewards/rejected": -4.926584720611572, + "sft_loss": 4.255247116088867, + "step": 1410 + }, + { + "epoch": 0.7573172771366449, + "grad_norm": 0.5896298856164389, + "learning_rate": 9.30894920180659e-07, + "logits/chosen": -0.37618792057037354, + "logits/rejected": -0.2516014575958252, + "logps/chosen": -4.588209629058838, + "logps/rejected": -4.932555198669434, + "loss": 0.0524, + "rewards/accuracies": 0.625, + "rewards/chosen": -4.588209629058838, + "rewards/margins": 0.34434524178504944, + "rewards/rejected": -4.932555198669434, + "sft_loss": 4.211440086364746, + "step": 1415 + }, + { + "epoch": 0.7599933099180465, + "grad_norm": 0.5048753688209329, + "learning_rate": 9.301028145701543e-07, + "logits/chosen": -0.3792995810508728, + "logits/rejected": -0.17535944283008575, + "logps/chosen": -4.544643402099609, + "logps/rejected": -5.277196884155273, + "loss": 0.052, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -4.544643402099609, + "rewards/margins": 0.7325533032417297, + "rewards/rejected": -5.277196884155273, + "sft_loss": 4.257083415985107, + "step": 1420 + }, + { + "epoch": 0.7626693426994481, + "grad_norm": 0.5342795514278404, + "learning_rate": 9.293065361002563e-07, + "logits/chosen": -0.36811989545822144, + "logits/rejected": -0.15171898901462555, + "logps/chosen": -4.433106422424316, + "logps/rejected": -5.020786285400391, + "loss": 0.0541, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -4.433106422424316, + "rewards/margins": 0.5876799821853638, + "rewards/rejected": -5.020786285400391, + "sft_loss": 4.114101409912109, + "step": 1425 + }, + { + "epoch": 0.7653453754808497, + "grad_norm": 0.535835981919384, + "learning_rate": 9.285060924964622e-07, + "logits/chosen": -0.4679451584815979, + "logits/rejected": -0.3156459927558899, + "logps/chosen": -4.755094528198242, + "logps/rejected": -5.195936679840088, + "loss": 0.0524, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -4.755094528198242, + "rewards/margins": 0.4408422112464905, + "rewards/rejected": -5.195936679840088, + "sft_loss": 4.333351135253906, + "step": 1430 + }, + { + "epoch": 0.7680214082622512, + "grad_norm": 0.842033204355517, + "learning_rate": 9.277014915246792e-07, + "logits/chosen": -0.38147813081741333, + "logits/rejected": -0.31262868642807007, + "logps/chosen": -4.662112236022949, + "logps/rejected": -5.163732051849365, + "loss": 0.0532, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -4.662112236022949, + "rewards/margins": 0.5016202330589294, + "rewards/rejected": -5.163732051849365, + "sft_loss": 4.4246416091918945, + "step": 1435 + }, + { + "epoch": 0.7706974410436528, + "grad_norm": 0.41952140030922824, + "learning_rate": 9.268927409911498e-07, + "logits/chosen": -0.43312758207321167, + "logits/rejected": -0.32057255506515503, + "logps/chosen": -4.355666637420654, + "logps/rejected": -4.759754657745361, + "loss": 0.053, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -4.355666637420654, + "rewards/margins": 0.4040871560573578, + "rewards/rejected": -4.759754657745361, + "sft_loss": 4.02143669128418, + "step": 1440 + }, + { + "epoch": 0.7733734738250544, + "grad_norm": 0.5404524013697792, + "learning_rate": 9.260798487423749e-07, + "logits/chosen": -0.5744115114212036, + "logits/rejected": -0.2570160925388336, + "logps/chosen": -4.677066802978516, + "logps/rejected": -5.198808193206787, + "loss": 0.0522, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -4.677066802978516, + "rewards/margins": 0.521741509437561, + "rewards/rejected": -5.198808193206787, + "sft_loss": 4.395870685577393, + "step": 1445 + }, + { + "epoch": 0.7760495066064559, + "grad_norm": 0.5278130869284325, + "learning_rate": 9.252628226650389e-07, + "logits/chosen": -0.4492534101009369, + "logits/rejected": -0.31946879625320435, + "logps/chosen": -4.7667155265808105, + "logps/rejected": -4.992683410644531, + "loss": 0.0537, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -4.7667155265808105, + "rewards/margins": 0.2259684056043625, + "rewards/rejected": -4.992683410644531, + "sft_loss": 4.413228511810303, + "step": 1450 + }, + { + "epoch": 0.7787255393878575, + "grad_norm": 0.772880187311183, + "learning_rate": 9.244416706859321e-07, + "logits/chosen": -0.4478439688682556, + "logits/rejected": -0.20472392439842224, + "logps/chosen": -4.402605056762695, + "logps/rejected": -5.060262680053711, + "loss": 0.0522, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -4.402605056762695, + "rewards/margins": 0.6576577425003052, + "rewards/rejected": -5.060262680053711, + "sft_loss": 4.161447048187256, + "step": 1455 + }, + { + "epoch": 0.7814015721692591, + "grad_norm": 0.7775561810812492, + "learning_rate": 9.23616400771875e-07, + "logits/chosen": -0.42596834897994995, + "logits/rejected": -0.1455395519733429, + "logps/chosen": -4.4289679527282715, + "logps/rejected": -4.8820953369140625, + "loss": 0.0517, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -4.4289679527282715, + "rewards/margins": 0.45312729477882385, + "rewards/rejected": -4.8820953369140625, + "sft_loss": 4.115038871765137, + "step": 1460 + }, + { + "epoch": 0.7840776049506607, + "grad_norm": 0.36504463479697746, + "learning_rate": 9.227870209296395e-07, + "logits/chosen": -0.3154129981994629, + "logits/rejected": -0.11409089714288712, + "logps/chosen": -4.550224781036377, + "logps/rejected": -4.950064659118652, + "loss": 0.0531, + "rewards/accuracies": 0.65625, + "rewards/chosen": -4.550224781036377, + "rewards/margins": 0.39984145760536194, + "rewards/rejected": -4.950064659118652, + "sft_loss": 4.2174882888793945, + "step": 1465 + }, + { + "epoch": 0.7867536377320622, + "grad_norm": 0.7277940278719969, + "learning_rate": 9.219535392058728e-07, + "logits/chosen": -0.3689365088939667, + "logits/rejected": -0.37266969680786133, + "logps/chosen": -4.800908088684082, + "logps/rejected": -5.217124938964844, + "loss": 0.0536, + "rewards/accuracies": 0.625, + "rewards/chosen": -4.800908088684082, + "rewards/margins": 0.4162166118621826, + "rewards/rejected": -5.217124938964844, + "sft_loss": 4.4879150390625, + "step": 1470 + }, + { + "epoch": 0.7894296705134638, + "grad_norm": 0.5014357810674278, + "learning_rate": 9.211159636870181e-07, + "logits/chosen": -0.456494003534317, + "logits/rejected": -0.20387740433216095, + "logps/chosen": -4.5678606033325195, + "logps/rejected": -5.184647560119629, + "loss": 0.0517, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -4.5678606033325195, + "rewards/margins": 0.6167860627174377, + "rewards/rejected": -5.184647560119629, + "sft_loss": 4.280999183654785, + "step": 1475 + }, + { + "epoch": 0.7921057032948654, + "grad_norm": 0.8280312948646985, + "learning_rate": 9.202743024992367e-07, + "logits/chosen": -0.17272579669952393, + "logits/rejected": -0.06938336789608002, + "logps/chosen": -4.48028039932251, + "logps/rejected": -4.926800727844238, + "loss": 0.0535, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -4.48028039932251, + "rewards/margins": 0.4465200901031494, + "rewards/rejected": -4.926800727844238, + "sft_loss": 4.15725040435791, + "step": 1480 + }, + { + "epoch": 0.7947817360762669, + "grad_norm": 0.43610312586235883, + "learning_rate": 9.194285638083293e-07, + "logits/chosen": -0.28351375460624695, + "logits/rejected": -0.08770108222961426, + "logps/chosen": -4.448962211608887, + "logps/rejected": -4.971872329711914, + "loss": 0.0512, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -4.448962211608887, + "rewards/margins": 0.5229107141494751, + "rewards/rejected": -4.971872329711914, + "sft_loss": 3.9980177879333496, + "step": 1485 + }, + { + "epoch": 0.7974577688576685, + "grad_norm": 0.7384580305138214, + "learning_rate": 9.185787558196562e-07, + "logits/chosen": -0.317364364862442, + "logits/rejected": -0.17575505375862122, + "logps/chosen": -4.727007865905762, + "logps/rejected": -5.317923545837402, + "loss": 0.0519, + "rewards/accuracies": 0.6875, + "rewards/chosen": -4.727007865905762, + "rewards/margins": 0.5909159779548645, + "rewards/rejected": -5.317923545837402, + "sft_loss": 4.3875274658203125, + "step": 1490 + }, + { + "epoch": 0.8001338016390701, + "grad_norm": 0.4737735564077406, + "learning_rate": 9.177248867780583e-07, + "logits/chosen": -0.29782262444496155, + "logits/rejected": -0.20921726524829865, + "logps/chosen": -4.792957782745361, + "logps/rejected": -5.149683952331543, + "loss": 0.0545, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -4.792957782745361, + "rewards/margins": 0.3567260503768921, + "rewards/rejected": -5.149683952331543, + "sft_loss": 4.424458026885986, + "step": 1495 + }, + { + "epoch": 0.8028098344204716, + "grad_norm": 0.5026818798426406, + "learning_rate": 9.168669649677769e-07, + "logits/chosen": -0.405927836894989, + "logits/rejected": -0.22714976966381073, + "logps/chosen": -4.681812286376953, + "logps/rejected": -5.035951137542725, + "loss": 0.0546, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -4.681812286376953, + "rewards/margins": 0.3541390299797058, + "rewards/rejected": -5.035951137542725, + "sft_loss": 4.365417957305908, + "step": 1500 + }, + { + "epoch": 0.8054858672018732, + "grad_norm": 0.7029362410222331, + "learning_rate": 9.16004998712373e-07, + "logits/chosen": -0.41014689207077026, + "logits/rejected": -0.2868584990501404, + "logps/chosen": -4.597723960876465, + "logps/rejected": -4.962946891784668, + "loss": 0.0529, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -4.597723960876465, + "rewards/margins": 0.3652224540710449, + "rewards/rejected": -4.962946891784668, + "sft_loss": 4.283852577209473, + "step": 1505 + }, + { + "epoch": 0.8081618999832748, + "grad_norm": 0.7647895308266636, + "learning_rate": 9.151389963746472e-07, + "logits/chosen": -0.5162456631660461, + "logits/rejected": -0.043417371809482574, + "logps/chosen": -4.48927640914917, + "logps/rejected": -5.17399787902832, + "loss": 0.0519, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -4.48927640914917, + "rewards/margins": 0.6847215890884399, + "rewards/rejected": -5.17399787902832, + "sft_loss": 4.273918151855469, + "step": 1510 + }, + { + "epoch": 0.8108379327646764, + "grad_norm": 0.48886842225485777, + "learning_rate": 9.142689663565577e-07, + "logits/chosen": -0.3623657822608948, + "logits/rejected": -0.2918470501899719, + "logps/chosen": -4.423482418060303, + "logps/rejected": -4.869231700897217, + "loss": 0.0534, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -4.423482418060303, + "rewards/margins": 0.4457489550113678, + "rewards/rejected": -4.869231700897217, + "sft_loss": 4.199785232543945, + "step": 1515 + }, + { + "epoch": 0.8135139655460779, + "grad_norm": 0.5651971687665704, + "learning_rate": 9.133949170991397e-07, + "logits/chosen": -0.3085786700248718, + "logits/rejected": -0.18533708155155182, + "logps/chosen": -4.585659027099609, + "logps/rejected": -4.924344062805176, + "loss": 0.0531, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -4.585659027099609, + "rewards/margins": 0.338684618473053, + "rewards/rejected": -4.924344062805176, + "sft_loss": 4.329022407531738, + "step": 1520 + }, + { + "epoch": 0.8161899983274795, + "grad_norm": 0.34537796990734165, + "learning_rate": 9.125168570824231e-07, + "logits/chosen": -0.4657418131828308, + "logits/rejected": -0.21361199021339417, + "logps/chosen": -4.482302665710449, + "logps/rejected": -4.993915557861328, + "loss": 0.0523, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -4.482302665710449, + "rewards/margins": 0.5116127729415894, + "rewards/rejected": -4.993915557861328, + "sft_loss": 4.212442874908447, + "step": 1525 + }, + { + "epoch": 0.8188660311088811, + "grad_norm": 0.6353485867641818, + "learning_rate": 9.116347948253496e-07, + "logits/chosen": -0.3445362150669098, + "logits/rejected": -0.11221824586391449, + "logps/chosen": -4.467150688171387, + "logps/rejected": -4.882587909698486, + "loss": 0.0526, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -4.467150688171387, + "rewards/margins": 0.4154374599456787, + "rewards/rejected": -4.882587909698486, + "sft_loss": 4.144843101501465, + "step": 1530 + }, + { + "epoch": 0.8215420638902826, + "grad_norm": 0.7708747969935229, + "learning_rate": 9.107487388856916e-07, + "logits/chosen": -0.31633177399635315, + "logits/rejected": -0.053336597979068756, + "logps/chosen": -4.542233467102051, + "logps/rejected": -5.012372016906738, + "loss": 0.0522, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -4.542233467102051, + "rewards/margins": 0.47013846039772034, + "rewards/rejected": -5.012372016906738, + "sft_loss": 4.1416215896606445, + "step": 1535 + }, + { + "epoch": 0.8242180966716842, + "grad_norm": 1.159682619717436, + "learning_rate": 9.098586978599673e-07, + "logits/chosen": -0.22155523300170898, + "logits/rejected": -0.020089667290449142, + "logps/chosen": -4.597943305969238, + "logps/rejected": -5.248732089996338, + "loss": 0.0524, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -4.597943305969238, + "rewards/margins": 0.6507889628410339, + "rewards/rejected": -5.248732089996338, + "sft_loss": 4.222757816314697, + "step": 1540 + }, + { + "epoch": 0.8268941294530858, + "grad_norm": 0.8083866532979972, + "learning_rate": 9.089646803833588e-07, + "logits/chosen": -0.2432478666305542, + "logits/rejected": -0.03565299138426781, + "logps/chosen": -4.642041206359863, + "logps/rejected": -5.277072906494141, + "loss": 0.0516, + "rewards/accuracies": 0.71875, + "rewards/chosen": -4.642041206359863, + "rewards/margins": 0.635032057762146, + "rewards/rejected": -5.277072906494141, + "sft_loss": 4.315460205078125, + "step": 1545 + }, + { + "epoch": 0.8295701622344873, + "grad_norm": 0.7014158947204401, + "learning_rate": 9.080666951296276e-07, + "logits/chosen": -0.4408930242061615, + "logits/rejected": -0.046825408935546875, + "logps/chosen": -4.495620250701904, + "logps/rejected": -5.293448448181152, + "loss": 0.0511, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -4.495620250701904, + "rewards/margins": 0.7978277802467346, + "rewards/rejected": -5.293448448181152, + "sft_loss": 4.236871242523193, + "step": 1550 + }, + { + "epoch": 0.8322461950158889, + "grad_norm": 0.9608041180526156, + "learning_rate": 9.071647508110305e-07, + "logits/chosen": -0.37728947401046753, + "logits/rejected": -0.00990285724401474, + "logps/chosen": -4.229923248291016, + "logps/rejected": -5.000068664550781, + "loss": 0.0508, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -4.229923248291016, + "rewards/margins": 0.7701452970504761, + "rewards/rejected": -5.000068664550781, + "sft_loss": 3.8961243629455566, + "step": 1555 + }, + { + "epoch": 0.8349222277972905, + "grad_norm": 0.5225311878998186, + "learning_rate": 9.062588561782354e-07, + "logits/chosen": -0.20780956745147705, + "logits/rejected": -0.12729039788246155, + "logps/chosen": -4.6606974601745605, + "logps/rejected": -5.065556526184082, + "loss": 0.0538, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -4.6606974601745605, + "rewards/margins": 0.40485963225364685, + "rewards/rejected": -5.065556526184082, + "sft_loss": 4.36079740524292, + "step": 1560 + }, + { + "epoch": 0.8375982605786921, + "grad_norm": 0.5773964237041671, + "learning_rate": 9.053490200202358e-07, + "logits/chosen": -0.25536245107650757, + "logits/rejected": -0.1844777762889862, + "logps/chosen": -4.609926223754883, + "logps/rejected": -5.045127868652344, + "loss": 0.0537, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -4.609926223754883, + "rewards/margins": 0.4352017343044281, + "rewards/rejected": -5.045127868652344, + "sft_loss": 4.38587760925293, + "step": 1565 + }, + { + "epoch": 0.8402742933600936, + "grad_norm": 0.7640839157989326, + "learning_rate": 9.044352511642661e-07, + "logits/chosen": -0.3420313000679016, + "logits/rejected": -0.23492303490638733, + "logps/chosen": -4.852427005767822, + "logps/rejected": -5.167304039001465, + "loss": 0.0543, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -4.852427005767822, + "rewards/margins": 0.31487753987312317, + "rewards/rejected": -5.167304039001465, + "sft_loss": 4.611882209777832, + "step": 1570 + }, + { + "epoch": 0.8429503261414952, + "grad_norm": 0.5554897907915322, + "learning_rate": 9.03517558475716e-07, + "logits/chosen": -0.4127574861049652, + "logits/rejected": -0.2540976107120514, + "logps/chosen": -4.4274468421936035, + "logps/rejected": -4.706323146820068, + "loss": 0.055, + "rewards/accuracies": 0.625, + "rewards/chosen": -4.4274468421936035, + "rewards/margins": 0.2788761854171753, + "rewards/rejected": -4.706323146820068, + "sft_loss": 4.194740295410156, + "step": 1575 + }, + { + "epoch": 0.8456263589228968, + "grad_norm": 0.4757489938485724, + "learning_rate": 9.025959508580436e-07, + "logits/chosen": -0.38364917039871216, + "logits/rejected": -0.012453851290047169, + "logps/chosen": -4.412278652191162, + "logps/rejected": -4.889250755310059, + "loss": 0.0519, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -4.412278652191162, + "rewards/margins": 0.4769721031188965, + "rewards/rejected": -4.889250755310059, + "sft_loss": 4.152989387512207, + "step": 1580 + }, + { + "epoch": 0.8483023917042983, + "grad_norm": 0.4418324458419384, + "learning_rate": 9.016704372526905e-07, + "logits/chosen": -0.3964102864265442, + "logits/rejected": -0.14374807476997375, + "logps/chosen": -4.395001411437988, + "logps/rejected": -5.0430474281311035, + "loss": 0.0519, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -4.395001411437988, + "rewards/margins": 0.6480464935302734, + "rewards/rejected": -5.0430474281311035, + "sft_loss": 4.156543731689453, + "step": 1585 + }, + { + "epoch": 0.8509784244856999, + "grad_norm": 0.7293779185730094, + "learning_rate": 9.007410266389934e-07, + "logits/chosen": -0.42947110533714294, + "logits/rejected": -0.3990377187728882, + "logps/chosen": -4.6490654945373535, + "logps/rejected": -4.978527545928955, + "loss": 0.0535, + "rewards/accuracies": 0.625, + "rewards/chosen": -4.6490654945373535, + "rewards/margins": 0.3294626772403717, + "rewards/rejected": -4.978527545928955, + "sft_loss": 4.322227954864502, + "step": 1590 + }, + { + "epoch": 0.8536544572671015, + "grad_norm": 0.7207848366156323, + "learning_rate": 8.998077280340981e-07, + "logits/chosen": -0.3351363241672516, + "logits/rejected": -0.2901817560195923, + "logps/chosen": -4.815976619720459, + "logps/rejected": -5.248465538024902, + "loss": 0.0533, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -4.815976619720459, + "rewards/margins": 0.43248969316482544, + "rewards/rejected": -5.248465538024902, + "sft_loss": 4.382091999053955, + "step": 1595 + }, + { + "epoch": 0.8563304900485031, + "grad_norm": 0.49484538896096164, + "learning_rate": 8.988705504928722e-07, + "logits/chosen": -0.49074602127075195, + "logits/rejected": -0.2281077802181244, + "logps/chosen": -4.58762264251709, + "logps/rejected": -5.309582710266113, + "loss": 0.0508, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -4.58762264251709, + "rewards/margins": 0.7219597697257996, + "rewards/rejected": -5.309582710266113, + "sft_loss": 4.244722843170166, + "step": 1600 + }, + { + "epoch": 0.8563304900485031, + "eval_logits/chosen": 0.1344272345304489, + "eval_logits/rejected": 0.2497575730085373, + "eval_logps/chosen": -4.521986961364746, + "eval_logps/rejected": -5.008147239685059, + "eval_loss": 0.05116863548755646, + "eval_rewards/accuracies": 0.6491097807884216, + "eval_rewards/chosen": -4.521986961364746, + "eval_rewards/margins": 0.4861602187156677, + "eval_rewards/rejected": -5.008147239685059, + "eval_runtime": 46.9917, + "eval_samples_per_second": 28.622, + "eval_sft_loss": 4.069005012512207, + "eval_steps_per_second": 7.171, + "step": 1600 + }, + { + "epoch": 0.8590065228299046, + "grad_norm": 0.7700792987909814, + "learning_rate": 8.979295031078157e-07, + "logits/chosen": -0.4408310055732727, + "logits/rejected": -0.11122790724039078, + "logps/chosen": -4.3237690925598145, + "logps/rejected": -4.888923168182373, + "loss": 0.0514, + "rewards/accuracies": 0.71875, + "rewards/chosen": -4.3237690925598145, + "rewards/margins": 0.5651546716690063, + "rewards/rejected": -4.888923168182373, + "sft_loss": 3.9699337482452393, + "step": 1605 + }, + { + "epoch": 0.8616825556113062, + "grad_norm": 0.4500857637288236, + "learning_rate": 8.969845950089751e-07, + "logits/chosen": -0.44538015127182007, + "logits/rejected": -0.18109442293643951, + "logps/chosen": -4.5957112312316895, + "logps/rejected": -5.148885250091553, + "loss": 0.0529, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -4.5957112312316895, + "rewards/margins": 0.5531740784645081, + "rewards/rejected": -5.148885250091553, + "sft_loss": 4.301665306091309, + "step": 1610 + }, + { + "epoch": 0.8643585883927078, + "grad_norm": 0.4853235079400437, + "learning_rate": 8.960358353638526e-07, + "logits/chosen": -0.31681299209594727, + "logits/rejected": -0.1446036398410797, + "logps/chosen": -4.822012901306152, + "logps/rejected": -5.373905658721924, + "loss": 0.053, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -4.822012901306152, + "rewards/margins": 0.5518924593925476, + "rewards/rejected": -5.373905658721924, + "sft_loss": 4.410131931304932, + "step": 1615 + }, + { + "epoch": 0.8670346211741093, + "grad_norm": 0.6305197858891428, + "learning_rate": 8.950832333773184e-07, + "logits/chosen": -0.28197187185287476, + "logits/rejected": -0.013742757961153984, + "logps/chosen": -4.69413948059082, + "logps/rejected": -5.082732200622559, + "loss": 0.0537, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -4.69413948059082, + "rewards/margins": 0.3885928690433502, + "rewards/rejected": -5.082732200622559, + "sft_loss": 4.341333389282227, + "step": 1620 + }, + { + "epoch": 0.869710653955511, + "grad_norm": 0.8534848769692264, + "learning_rate": 8.941267982915213e-07, + "logits/chosen": -0.2706916630268097, + "logits/rejected": -0.190748929977417, + "logps/chosen": -4.614697456359863, + "logps/rejected": -4.935882091522217, + "loss": 0.0548, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -4.614697456359863, + "rewards/margins": 0.3211846649646759, + "rewards/rejected": -4.935882091522217, + "sft_loss": 4.341588973999023, + "step": 1625 + }, + { + "epoch": 0.8723866867369126, + "grad_norm": 0.5827853719471676, + "learning_rate": 8.931665393857983e-07, + "logits/chosen": -0.33987802267074585, + "logits/rejected": -0.14849236607551575, + "logps/chosen": -4.5428361892700195, + "logps/rejected": -5.143988609313965, + "loss": 0.0528, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -4.5428361892700195, + "rewards/margins": 0.6011531352996826, + "rewards/rejected": -5.143988609313965, + "sft_loss": 4.327922344207764, + "step": 1630 + }, + { + "epoch": 0.875062719518314, + "grad_norm": 0.6927078960178196, + "learning_rate": 8.922024659765861e-07, + "logits/chosen": -0.4181820750236511, + "logits/rejected": -0.19935372471809387, + "logps/chosen": -4.4195332527160645, + "logps/rejected": -5.059715747833252, + "loss": 0.0526, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -4.4195332527160645, + "rewards/margins": 0.6401824355125427, + "rewards/rejected": -5.059715747833252, + "sft_loss": 4.210007667541504, + "step": 1635 + }, + { + "epoch": 0.8777387522997157, + "grad_norm": 0.6920786026169751, + "learning_rate": 8.912345874173288e-07, + "logits/chosen": -0.4544641375541687, + "logits/rejected": -0.23299141228199005, + "logps/chosen": -4.649049282073975, + "logps/rejected": -5.273727893829346, + "loss": 0.053, + "rewards/accuracies": 0.65625, + "rewards/chosen": -4.649049282073975, + "rewards/margins": 0.624678909778595, + "rewards/rejected": -5.273727893829346, + "sft_loss": 4.335822105407715, + "step": 1640 + }, + { + "epoch": 0.8804147850811173, + "grad_norm": 0.7030352155468556, + "learning_rate": 8.902629130983885e-07, + "logits/chosen": -0.4106753468513489, + "logits/rejected": -0.3475349247455597, + "logps/chosen": -4.661510467529297, + "logps/rejected": -4.993771076202393, + "loss": 0.0545, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -4.661510467529297, + "rewards/margins": 0.3322606682777405, + "rewards/rejected": -4.993771076202393, + "sft_loss": 4.4329681396484375, + "step": 1645 + }, + { + "epoch": 0.8830908178625189, + "grad_norm": 1.0133916891769041, + "learning_rate": 8.892874524469537e-07, + "logits/chosen": -0.2660573124885559, + "logits/rejected": -0.24451692402362823, + "logps/chosen": -4.353519439697266, + "logps/rejected": -4.80262565612793, + "loss": 0.0522, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -4.353519439697266, + "rewards/margins": 0.44910645484924316, + "rewards/rejected": -4.80262565612793, + "sft_loss": 4.052321910858154, + "step": 1650 + }, + { + "epoch": 0.8857668506439204, + "grad_norm": 0.38158224822284403, + "learning_rate": 8.883082149269478e-07, + "logits/chosen": -0.4017801284790039, + "logits/rejected": -0.2663123607635498, + "logps/chosen": -4.606898307800293, + "logps/rejected": -5.036543846130371, + "loss": 0.0524, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -4.606898307800293, + "rewards/margins": 0.4296456277370453, + "rewards/rejected": -5.036543846130371, + "sft_loss": 4.341722011566162, + "step": 1655 + }, + { + "epoch": 0.888442883425322, + "grad_norm": 0.7340126012410843, + "learning_rate": 8.873252100389377e-07, + "logits/chosen": -0.39234843850135803, + "logits/rejected": -0.35262423753738403, + "logps/chosen": -4.529763221740723, + "logps/rejected": -5.034109115600586, + "loss": 0.0526, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -4.529763221740723, + "rewards/margins": 0.5043456554412842, + "rewards/rejected": -5.034109115600586, + "sft_loss": 4.276070594787598, + "step": 1660 + }, + { + "epoch": 0.8911189162067236, + "grad_norm": 0.6370790584035594, + "learning_rate": 8.863384473200411e-07, + "logits/chosen": -0.19702807068824768, + "logits/rejected": -0.1618746817111969, + "logps/chosen": -4.514230728149414, + "logps/rejected": -4.929436683654785, + "loss": 0.0537, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -4.514230728149414, + "rewards/margins": 0.4152059555053711, + "rewards/rejected": -4.929436683654785, + "sft_loss": 4.185002326965332, + "step": 1665 + }, + { + "epoch": 0.8937949489881251, + "grad_norm": 0.5247056428604062, + "learning_rate": 8.853479363438342e-07, + "logits/chosen": -0.1723632663488388, + "logits/rejected": 0.06548583507537842, + "logps/chosen": -4.680706977844238, + "logps/rejected": -5.158249855041504, + "loss": 0.0538, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -4.680706977844238, + "rewards/margins": 0.4775429666042328, + "rewards/rejected": -5.158249855041504, + "sft_loss": 4.3549485206604, + "step": 1670 + }, + { + "epoch": 0.8964709817695267, + "grad_norm": 0.6651408971999766, + "learning_rate": 8.843536867202588e-07, + "logits/chosen": -0.33580222725868225, + "logits/rejected": -0.09968128800392151, + "logps/chosen": -4.62811803817749, + "logps/rejected": -5.245832920074463, + "loss": 0.0532, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -4.62811803817749, + "rewards/margins": 0.6177145838737488, + "rewards/rejected": -5.245832920074463, + "sft_loss": 4.451796054840088, + "step": 1675 + }, + { + "epoch": 0.8991470145509283, + "grad_norm": 0.4956364134232871, + "learning_rate": 8.833557080955292e-07, + "logits/chosen": -0.38807135820388794, + "logits/rejected": -0.22442781925201416, + "logps/chosen": -4.475585460662842, + "logps/rejected": -4.869357109069824, + "loss": 0.0545, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -4.475585460662842, + "rewards/margins": 0.3937712609767914, + "rewards/rejected": -4.869357109069824, + "sft_loss": 4.2697858810424805, + "step": 1680 + }, + { + "epoch": 0.9018230473323299, + "grad_norm": 0.4533210270930005, + "learning_rate": 8.823540101520381e-07, + "logits/chosen": -0.37864798307418823, + "logits/rejected": -0.06893188506364822, + "logps/chosen": -4.537407398223877, + "logps/rejected": -5.067080497741699, + "loss": 0.0522, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -4.537407398223877, + "rewards/margins": 0.5296733975410461, + "rewards/rejected": -5.067080497741699, + "sft_loss": 4.29558801651001, + "step": 1685 + }, + { + "epoch": 0.9044990801137314, + "grad_norm": 0.456636894174162, + "learning_rate": 8.813486026082637e-07, + "logits/chosen": -0.3543255925178528, + "logits/rejected": -0.08065290749073029, + "logps/chosen": -4.4864702224731445, + "logps/rejected": -5.07802677154541, + "loss": 0.051, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -4.4864702224731445, + "rewards/margins": 0.5915566682815552, + "rewards/rejected": -5.07802677154541, + "sft_loss": 4.127684593200684, + "step": 1690 + }, + { + "epoch": 0.907175112895133, + "grad_norm": 0.7667898337451047, + "learning_rate": 8.803394952186742e-07, + "logits/chosen": -0.3511362671852112, + "logits/rejected": -0.14278550446033478, + "logps/chosen": -4.555008888244629, + "logps/rejected": -5.053010940551758, + "loss": 0.0523, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -4.555008888244629, + "rewards/margins": 0.4980013370513916, + "rewards/rejected": -5.053010940551758, + "sft_loss": 4.179459571838379, + "step": 1695 + }, + { + "epoch": 0.9098511456765346, + "grad_norm": 0.6244641339300205, + "learning_rate": 8.793266977736342e-07, + "logits/chosen": -0.15665873885154724, + "logits/rejected": -0.2923945486545563, + "logps/chosen": -4.709486961364746, + "logps/rejected": -4.942587852478027, + "loss": 0.0551, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -4.709486961364746, + "rewards/margins": 0.23310072720050812, + "rewards/rejected": -4.942587852478027, + "sft_loss": 4.443274021148682, + "step": 1700 + }, + { + "epoch": 0.9125271784579361, + "grad_norm": 0.6618974427260913, + "learning_rate": 8.783102200993085e-07, + "logits/chosen": -0.22444629669189453, + "logits/rejected": -0.08571354299783707, + "logps/chosen": -4.598001480102539, + "logps/rejected": -5.154397487640381, + "loss": 0.052, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -4.598001480102539, + "rewards/margins": 0.5563960671424866, + "rewards/rejected": -5.154397487640381, + "sft_loss": 4.261946678161621, + "step": 1705 + }, + { + "epoch": 0.9152032112393377, + "grad_norm": 0.7041285882219205, + "learning_rate": 8.772900720575683e-07, + "logits/chosen": -0.26814156770706177, + "logits/rejected": -0.09805545210838318, + "logps/chosen": -4.7671709060668945, + "logps/rejected": -5.136244773864746, + "loss": 0.0529, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -4.7671709060668945, + "rewards/margins": 0.36907365918159485, + "rewards/rejected": -5.136244773864746, + "sft_loss": 4.420782566070557, + "step": 1710 + }, + { + "epoch": 0.9178792440207393, + "grad_norm": 0.9292121767573851, + "learning_rate": 8.762662635458944e-07, + "logits/chosen": -0.3112557530403137, + "logits/rejected": -0.09291480481624603, + "logps/chosen": -4.473947048187256, + "logps/rejected": -5.047905445098877, + "loss": 0.0525, + "rewards/accuracies": 0.65625, + "rewards/chosen": -4.473947048187256, + "rewards/margins": 0.5739586353302002, + "rewards/rejected": -5.047905445098877, + "sft_loss": 4.1539106369018555, + "step": 1715 + }, + { + "epoch": 0.9205552768021408, + "grad_norm": 0.7171640716077882, + "learning_rate": 8.752388044972811e-07, + "logits/chosen": -0.17702648043632507, + "logits/rejected": -0.07772380113601685, + "logps/chosen": -4.466500282287598, + "logps/rejected": -5.065943717956543, + "loss": 0.0521, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -4.466500282287598, + "rewards/margins": 0.5994431972503662, + "rewards/rejected": -5.065943717956543, + "sft_loss": 4.121184349060059, + "step": 1720 + }, + { + "epoch": 0.9232313095835424, + "grad_norm": 0.6258952957206926, + "learning_rate": 8.74207704880141e-07, + "logits/chosen": -0.16714437305927277, + "logits/rejected": -0.03038867749273777, + "logps/chosen": -4.553987503051758, + "logps/rejected": -5.151804447174072, + "loss": 0.0518, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -4.553987503051758, + "rewards/margins": 0.5978171229362488, + "rewards/rejected": -5.151804447174072, + "sft_loss": 4.207757472991943, + "step": 1725 + }, + { + "epoch": 0.925907342364944, + "grad_norm": 0.4969764494484655, + "learning_rate": 8.731729746982068e-07, + "logits/chosen": -0.21663089096546173, + "logits/rejected": -0.16002297401428223, + "logps/chosen": -4.529235363006592, + "logps/rejected": -4.968966484069824, + "loss": 0.0529, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -4.529235363006592, + "rewards/margins": 0.4397306442260742, + "rewards/rejected": -4.968966484069824, + "sft_loss": 4.295144081115723, + "step": 1730 + }, + { + "epoch": 0.9285833751463456, + "grad_norm": 0.5729234869228408, + "learning_rate": 8.721346239904355e-07, + "logits/chosen": -0.43732938170433044, + "logits/rejected": -0.10048327594995499, + "logps/chosen": -4.420714378356934, + "logps/rejected": -5.147778511047363, + "loss": 0.0529, + "rewards/accuracies": 0.6875, + "rewards/chosen": -4.420714378356934, + "rewards/margins": 0.7270635366439819, + "rewards/rejected": -5.147778511047363, + "sft_loss": 4.189997673034668, + "step": 1735 + }, + { + "epoch": 0.9312594079277471, + "grad_norm": 0.4876023798545788, + "learning_rate": 8.710926628309101e-07, + "logits/chosen": -0.3679789900779724, + "logits/rejected": -0.10510773956775665, + "logps/chosen": -4.425709247589111, + "logps/rejected": -4.967562198638916, + "loss": 0.0517, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -4.425709247589111, + "rewards/margins": 0.5418528318405151, + "rewards/rejected": -4.967562198638916, + "sft_loss": 4.166138648986816, + "step": 1740 + }, + { + "epoch": 0.9339354407091487, + "grad_norm": 0.4988609878636019, + "learning_rate": 8.700471013287424e-07, + "logits/chosen": -0.11972793191671371, + "logits/rejected": -0.1516149938106537, + "logps/chosen": -4.627892017364502, + "logps/rejected": -5.007716178894043, + "loss": 0.0534, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -4.627892017364502, + "rewards/margins": 0.37982410192489624, + "rewards/rejected": -5.007716178894043, + "sft_loss": 4.289733409881592, + "step": 1745 + }, + { + "epoch": 0.9366114734905503, + "grad_norm": 0.6138860861212869, + "learning_rate": 8.689979496279746e-07, + "logits/chosen": -0.2044331133365631, + "logits/rejected": -0.12514927983283997, + "logps/chosen": -4.860374927520752, + "logps/rejected": -5.20972204208374, + "loss": 0.0556, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -4.860374927520752, + "rewards/margins": 0.34934720396995544, + "rewards/rejected": -5.20972204208374, + "sft_loss": 4.611658573150635, + "step": 1750 + }, + { + "epoch": 0.9392875062719518, + "grad_norm": 0.6603699568715765, + "learning_rate": 8.679452179074811e-07, + "logits/chosen": -0.27209264039993286, + "logits/rejected": -0.11125469207763672, + "logps/chosen": -4.411839008331299, + "logps/rejected": -4.948624610900879, + "loss": 0.0517, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -4.411839008331299, + "rewards/margins": 0.5367849469184875, + "rewards/rejected": -4.948624610900879, + "sft_loss": 4.160839080810547, + "step": 1755 + }, + { + "epoch": 0.9419635390533534, + "grad_norm": 0.8262536390192445, + "learning_rate": 8.668889163808698e-07, + "logits/chosen": -0.2572443187236786, + "logits/rejected": -0.04624100401997566, + "logps/chosen": -4.364460468292236, + "logps/rejected": -4.834274768829346, + "loss": 0.053, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -4.364460468292236, + "rewards/margins": 0.46981415152549744, + "rewards/rejected": -4.834274768829346, + "sft_loss": 4.102207183837891, + "step": 1760 + }, + { + "epoch": 0.944639571834755, + "grad_norm": 0.6191068674758381, + "learning_rate": 8.658290552963827e-07, + "logits/chosen": -0.15403051674365997, + "logits/rejected": -0.0583893358707428, + "logps/chosen": -4.623048782348633, + "logps/rejected": -5.179881572723389, + "loss": 0.0536, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -4.623048782348633, + "rewards/margins": 0.5568326711654663, + "rewards/rejected": -5.179881572723389, + "sft_loss": 4.381789684295654, + "step": 1765 + }, + { + "epoch": 0.9473156046161565, + "grad_norm": 0.43461183223783423, + "learning_rate": 8.647656449367966e-07, + "logits/chosen": -0.17942842841148376, + "logits/rejected": 0.036060623824596405, + "logps/chosen": -4.722434997558594, + "logps/rejected": -5.11240291595459, + "loss": 0.0538, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -4.722434997558594, + "rewards/margins": 0.3899684250354767, + "rewards/rejected": -5.11240291595459, + "sft_loss": 4.488548278808594, + "step": 1770 + }, + { + "epoch": 0.9499916373975581, + "grad_norm": 0.5292183247284945, + "learning_rate": 8.636986956193235e-07, + "logits/chosen": -0.3148379325866699, + "logits/rejected": -0.12622442841529846, + "logps/chosen": -4.515315532684326, + "logps/rejected": -5.078459739685059, + "loss": 0.0523, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -4.515315532684326, + "rewards/margins": 0.5631445646286011, + "rewards/rejected": -5.078459739685059, + "sft_loss": 4.220766067504883, + "step": 1775 + }, + { + "epoch": 0.9526676701789597, + "grad_norm": 0.8995848344447617, + "learning_rate": 8.626282176955104e-07, + "logits/chosen": -0.30298176407814026, + "logits/rejected": -0.15992002189159393, + "logps/chosen": -4.513458251953125, + "logps/rejected": -5.091177940368652, + "loss": 0.0527, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -4.513458251953125, + "rewards/margins": 0.5777191519737244, + "rewards/rejected": -5.091177940368652, + "sft_loss": 4.2787041664123535, + "step": 1780 + }, + { + "epoch": 0.9553437029603613, + "grad_norm": 0.567681451550632, + "learning_rate": 8.615542215511389e-07, + "logits/chosen": -0.2650911211967468, + "logits/rejected": -0.19625148177146912, + "logps/chosen": -4.545306205749512, + "logps/rejected": -4.82927942276001, + "loss": 0.0544, + "rewards/accuracies": 0.59375, + "rewards/chosen": -4.545306205749512, + "rewards/margins": 0.2839727997779846, + "rewards/rejected": -4.82927942276001, + "sft_loss": 4.279045104980469, + "step": 1785 + }, + { + "epoch": 0.9580197357417628, + "grad_norm": 0.4350566579254937, + "learning_rate": 8.604767176061241e-07, + "logits/chosen": -0.2149418294429779, + "logits/rejected": -0.07808341085910797, + "logps/chosen": -4.562401294708252, + "logps/rejected": -5.076463222503662, + "loss": 0.0522, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -4.562401294708252, + "rewards/margins": 0.5140615701675415, + "rewards/rejected": -5.076463222503662, + "sft_loss": 4.298698902130127, + "step": 1790 + }, + { + "epoch": 0.9606957685231644, + "grad_norm": 0.4812827472314279, + "learning_rate": 8.593957163144141e-07, + "logits/chosen": -0.35899245738983154, + "logits/rejected": -0.18311157822608948, + "logps/chosen": -4.479128837585449, + "logps/rejected": -5.071837425231934, + "loss": 0.0521, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -4.479128837585449, + "rewards/margins": 0.592708170413971, + "rewards/rejected": -5.071837425231934, + "sft_loss": 4.253784656524658, + "step": 1795 + }, + { + "epoch": 0.963371801304566, + "grad_norm": 0.5327496826275652, + "learning_rate": 8.58311228163888e-07, + "logits/chosen": -0.37001967430114746, + "logits/rejected": -0.3091837465763092, + "logps/chosen": -4.516618251800537, + "logps/rejected": -4.93643856048584, + "loss": 0.0532, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -4.516618251800537, + "rewards/margins": 0.4198206961154938, + "rewards/rejected": -4.93643856048584, + "sft_loss": 4.2589216232299805, + "step": 1800 + }, + { + "epoch": 0.9660478340859675, + "grad_norm": 0.802834245743257, + "learning_rate": 8.57223263676255e-07, + "logits/chosen": -0.4872976243495941, + "logits/rejected": -0.32987093925476074, + "logps/chosen": -4.4331231117248535, + "logps/rejected": -5.15014123916626, + "loss": 0.0518, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -4.4331231117248535, + "rewards/margins": 0.7170186042785645, + "rewards/rejected": -5.15014123916626, + "sft_loss": 4.194850921630859, + "step": 1805 + }, + { + "epoch": 0.9687238668673691, + "grad_norm": 0.6878570030804108, + "learning_rate": 8.561318334069511e-07, + "logits/chosen": -0.38983696699142456, + "logits/rejected": -0.22500252723693848, + "logps/chosen": -4.533890724182129, + "logps/rejected": -5.05610466003418, + "loss": 0.052, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -4.533890724182129, + "rewards/margins": 0.5222145318984985, + "rewards/rejected": -5.05610466003418, + "sft_loss": 4.274322032928467, + "step": 1810 + }, + { + "epoch": 0.9713998996487707, + "grad_norm": 0.5121223276944379, + "learning_rate": 8.550369479450375e-07, + "logits/chosen": -0.2688700556755066, + "logits/rejected": -0.09521832317113876, + "logps/chosen": -4.299901485443115, + "logps/rejected": -4.9609694480896, + "loss": 0.051, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -4.299901485443115, + "rewards/margins": 0.6610682606697083, + "rewards/rejected": -4.9609694480896, + "sft_loss": 4.028165817260742, + "step": 1815 + }, + { + "epoch": 0.9740759324301723, + "grad_norm": 0.6048520534653532, + "learning_rate": 8.539386179130977e-07, + "logits/chosen": -0.2053041160106659, + "logits/rejected": -0.16262459754943848, + "logps/chosen": -4.482122421264648, + "logps/rejected": -5.020371913909912, + "loss": 0.0524, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -4.482122421264648, + "rewards/margins": 0.5382490754127502, + "rewards/rejected": -5.020371913909912, + "sft_loss": 4.157798767089844, + "step": 1820 + }, + { + "epoch": 0.9767519652115738, + "grad_norm": 0.4803677280460752, + "learning_rate": 8.528368539671347e-07, + "logits/chosen": -0.30528777837753296, + "logits/rejected": -0.06126274913549423, + "logps/chosen": -4.638838768005371, + "logps/rejected": -5.484151363372803, + "loss": 0.0524, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -4.638838768005371, + "rewards/margins": 0.8453127145767212, + "rewards/rejected": -5.484151363372803, + "sft_loss": 4.383290767669678, + "step": 1825 + }, + { + "epoch": 0.9794279979929754, + "grad_norm": 0.441717668108046, + "learning_rate": 8.51731666796467e-07, + "logits/chosen": -0.1470208615064621, + "logits/rejected": -0.09497350454330444, + "logps/chosen": -4.461791515350342, + "logps/rejected": -4.9654388427734375, + "loss": 0.0524, + "rewards/accuracies": 0.65625, + "rewards/chosen": -4.461791515350342, + "rewards/margins": 0.5036473274230957, + "rewards/rejected": -4.9654388427734375, + "sft_loss": 4.162781715393066, + "step": 1830 + }, + { + "epoch": 0.982104030774377, + "grad_norm": 0.46083392095277986, + "learning_rate": 8.506230671236254e-07, + "logits/chosen": -0.2915559411048889, + "logits/rejected": -0.18692262470722198, + "logps/chosen": -4.589268684387207, + "logps/rejected": -5.0538201332092285, + "loss": 0.0521, + "rewards/accuracies": 0.65625, + "rewards/chosen": -4.589268684387207, + "rewards/margins": 0.46455103158950806, + "rewards/rejected": -5.0538201332092285, + "sft_loss": 4.349760055541992, + "step": 1835 + }, + { + "epoch": 0.9847800635557785, + "grad_norm": 0.4360499079412217, + "learning_rate": 8.495110657042488e-07, + "logits/chosen": -0.3233645558357239, + "logits/rejected": -0.06588099151849747, + "logps/chosen": -4.448300361633301, + "logps/rejected": -4.996584415435791, + "loss": 0.0535, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -4.448300361633301, + "rewards/margins": 0.5482843518257141, + "rewards/rejected": -4.996584415435791, + "sft_loss": 4.263411521911621, + "step": 1840 + }, + { + "epoch": 0.9874560963371801, + "grad_norm": 0.5306953171524185, + "learning_rate": 8.483956733269799e-07, + "logits/chosen": -0.22878269851207733, + "logits/rejected": -0.11855238676071167, + "logps/chosen": -4.591235160827637, + "logps/rejected": -4.991271018981934, + "loss": 0.0527, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -4.591235160827637, + "rewards/margins": 0.40003618597984314, + "rewards/rejected": -4.991271018981934, + "sft_loss": 4.270682334899902, + "step": 1845 + }, + { + "epoch": 0.9901321291185817, + "grad_norm": 0.3833376083394722, + "learning_rate": 8.472769008133602e-07, + "logits/chosen": -0.5186641216278076, + "logits/rejected": -0.3550383448600769, + "logps/chosen": -4.492537021636963, + "logps/rejected": -4.917145729064941, + "loss": 0.0536, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -4.492537021636963, + "rewards/margins": 0.42460840940475464, + "rewards/rejected": -4.917145729064941, + "sft_loss": 4.207838535308838, + "step": 1850 + }, + { + "epoch": 0.9928081618999832, + "grad_norm": 0.43893085734184745, + "learning_rate": 8.461547590177259e-07, + "logits/chosen": -0.3054492473602295, + "logits/rejected": -0.1130295991897583, + "logps/chosen": -4.558745861053467, + "logps/rejected": -5.160397529602051, + "loss": 0.0504, + "rewards/accuracies": 0.6875, + "rewards/chosen": -4.558745861053467, + "rewards/margins": 0.6016519069671631, + "rewards/rejected": -5.160397529602051, + "sft_loss": 4.105099201202393, + "step": 1855 + }, + { + "epoch": 0.9954841946813848, + "grad_norm": 0.5769251490857489, + "learning_rate": 8.450292588271014e-07, + "logits/chosen": -0.3264893889427185, + "logits/rejected": -0.18141944706439972, + "logps/chosen": -4.5322265625, + "logps/rejected": -5.082207679748535, + "loss": 0.0509, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -4.5322265625, + "rewards/margins": 0.5499812960624695, + "rewards/rejected": -5.082207679748535, + "sft_loss": 4.100834846496582, + "step": 1860 + }, + { + "epoch": 0.9981602274627864, + "grad_norm": 0.5133368713565196, + "learning_rate": 8.439004111610945e-07, + "logits/chosen": -0.28852778673171997, + "logits/rejected": -0.2075691968202591, + "logps/chosen": -4.686163902282715, + "logps/rejected": -5.1576972007751465, + "loss": 0.052, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -4.686163902282715, + "rewards/margins": 0.4715335965156555, + "rewards/rejected": -5.1576972007751465, + "sft_loss": 4.286208152770996, + "step": 1865 + }, + { + "epoch": 1.000836260244188, + "grad_norm": 0.4180399710199207, + "learning_rate": 8.427682269717901e-07, + "logits/chosen": -0.35929030179977417, + "logits/rejected": -0.1946285218000412, + "logps/chosen": -4.443009853363037, + "logps/rejected": -5.0500898361206055, + "loss": 0.0515, + "rewards/accuracies": 0.71875, + "rewards/chosen": -4.443009853363037, + "rewards/margins": 0.6070801019668579, + "rewards/rejected": -5.0500898361206055, + "sft_loss": 4.139133453369141, + "step": 1870 + }, + { + "epoch": 1.0035122930255895, + "grad_norm": 0.5503632976186634, + "learning_rate": 8.416327172436446e-07, + "logits/chosen": -0.4894943833351135, + "logits/rejected": -0.28924956917762756, + "logps/chosen": -4.698008060455322, + "logps/rejected": -5.052058696746826, + "loss": 0.0537, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -4.698008060455322, + "rewards/margins": 0.3540504276752472, + "rewards/rejected": -5.052058696746826, + "sft_loss": 4.405055046081543, + "step": 1875 + }, + { + "epoch": 1.0061883258069912, + "grad_norm": 0.45174404828507203, + "learning_rate": 8.404938929933778e-07, + "logits/chosen": -0.30120593309402466, + "logits/rejected": -0.164491206407547, + "logps/chosen": -4.559738636016846, + "logps/rejected": -5.136922836303711, + "loss": 0.0513, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -4.559738636016846, + "rewards/margins": 0.5771840214729309, + "rewards/rejected": -5.136922836303711, + "sft_loss": 4.184256076812744, + "step": 1880 + }, + { + "epoch": 1.0088643585883927, + "grad_norm": 0.34956396396189543, + "learning_rate": 8.39351765269868e-07, + "logits/chosen": -0.3214988112449646, + "logits/rejected": -0.2575382888317108, + "logps/chosen": -4.629532814025879, + "logps/rejected": -5.043061256408691, + "loss": 0.0534, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -4.629532814025879, + "rewards/margins": 0.4135282635688782, + "rewards/rejected": -5.043061256408691, + "sft_loss": 4.252536773681641, + "step": 1885 + }, + { + "epoch": 1.0115403913697942, + "grad_norm": 0.6764223132300197, + "learning_rate": 8.382063451540431e-07, + "logits/chosen": -0.3845653831958771, + "logits/rejected": -0.08716684579849243, + "logps/chosen": -4.3737688064575195, + "logps/rejected": -4.897772312164307, + "loss": 0.0524, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -4.3737688064575195, + "rewards/margins": 0.5240030884742737, + "rewards/rejected": -4.897772312164307, + "sft_loss": 4.154975891113281, + "step": 1890 + }, + { + "epoch": 1.014216424151196, + "grad_norm": 0.5770424148011009, + "learning_rate": 8.370576437587742e-07, + "logits/chosen": -0.27163565158843994, + "logits/rejected": -0.2584533095359802, + "logps/chosen": -4.548377990722656, + "logps/rejected": -4.94322395324707, + "loss": 0.0522, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -4.548377990722656, + "rewards/margins": 0.39484524726867676, + "rewards/rejected": -4.94322395324707, + "sft_loss": 4.188554763793945, + "step": 1895 + }, + { + "epoch": 1.0168924569325974, + "grad_norm": 0.5114431601672031, + "learning_rate": 8.359056722287674e-07, + "logits/chosen": -0.4921305775642395, + "logits/rejected": -0.08544237911701202, + "logps/chosen": -4.601258277893066, + "logps/rejected": -5.190618991851807, + "loss": 0.0514, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -4.601258277893066, + "rewards/margins": 0.5893611907958984, + "rewards/rejected": -5.190618991851807, + "sft_loss": 4.346938133239746, + "step": 1900 + }, + { + "epoch": 1.019568489713999, + "grad_norm": 0.9844192428537916, + "learning_rate": 8.347504417404553e-07, + "logits/chosen": -0.21924248337745667, + "logits/rejected": -0.037383563816547394, + "logps/chosen": -4.587491989135742, + "logps/rejected": -4.976442813873291, + "loss": 0.0542, + "rewards/accuracies": 0.59375, + "rewards/chosen": -4.587491989135742, + "rewards/margins": 0.38895124197006226, + "rewards/rejected": -4.976442813873291, + "sft_loss": 4.243929386138916, + "step": 1905 + }, + { + "epoch": 1.0222445224954007, + "grad_norm": 0.524874546523782, + "learning_rate": 8.335919635018893e-07, + "logits/chosen": -0.3437415063381195, + "logits/rejected": -0.17452563345432281, + "logps/chosen": -4.569422721862793, + "logps/rejected": -4.986387252807617, + "loss": 0.0534, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -4.569422721862793, + "rewards/margins": 0.41696444153785706, + "rewards/rejected": -4.986387252807617, + "sft_loss": 4.248903751373291, + "step": 1910 + }, + { + "epoch": 1.0249205552768021, + "grad_norm": 0.35099039329842446, + "learning_rate": 8.324302487526303e-07, + "logits/chosen": -0.32231372594833374, + "logits/rejected": -0.15310195088386536, + "logps/chosen": -4.564090728759766, + "logps/rejected": -5.047003269195557, + "loss": 0.0519, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -4.564090728759766, + "rewards/margins": 0.482913076877594, + "rewards/rejected": -5.047003269195557, + "sft_loss": 4.228408336639404, + "step": 1915 + }, + { + "epoch": 1.0275965880582036, + "grad_norm": 0.2853568267955546, + "learning_rate": 8.312653087636398e-07, + "logits/chosen": -0.3368076980113983, + "logits/rejected": -0.21735279262065887, + "logps/chosen": -4.621280670166016, + "logps/rejected": -5.160231590270996, + "loss": 0.0524, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -4.621280670166016, + "rewards/margins": 0.5389507412910461, + "rewards/rejected": -5.160231590270996, + "sft_loss": 4.306357383728027, + "step": 1920 + }, + { + "epoch": 1.0302726208396054, + "grad_norm": 0.4608071152061401, + "learning_rate": 8.300971548371711e-07, + "logits/chosen": -0.4223474860191345, + "logits/rejected": -0.10327117145061493, + "logps/chosen": -4.403355598449707, + "logps/rejected": -4.978325843811035, + "loss": 0.0511, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -4.403355598449707, + "rewards/margins": 0.5749701857566833, + "rewards/rejected": -4.978325843811035, + "sft_loss": 4.0618391036987305, + "step": 1925 + }, + { + "epoch": 1.0329486536210069, + "grad_norm": 0.5609992865362639, + "learning_rate": 8.289257983066582e-07, + "logits/chosen": -0.23218891024589539, + "logits/rejected": 1.4790147361054551e-05, + "logps/chosen": -4.453673839569092, + "logps/rejected": -5.110269546508789, + "loss": 0.0518, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -4.453673839569092, + "rewards/margins": 0.6565961837768555, + "rewards/rejected": -5.110269546508789, + "sft_loss": 4.1434006690979, + "step": 1930 + }, + { + "epoch": 1.0356246864024083, + "grad_norm": 0.5850822593289026, + "learning_rate": 8.277512505366077e-07, + "logits/chosen": -0.2841840386390686, + "logits/rejected": 0.012512536719441414, + "logps/chosen": -4.500065803527832, + "logps/rejected": -5.142590522766113, + "loss": 0.0526, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -4.500065803527832, + "rewards/margins": 0.6425246596336365, + "rewards/rejected": -5.142590522766113, + "sft_loss": 4.258237838745117, + "step": 1935 + }, + { + "epoch": 1.03830071918381, + "grad_norm": 0.7141136511961801, + "learning_rate": 8.265735229224868e-07, + "logits/chosen": -0.1625567376613617, + "logits/rejected": -0.03078523278236389, + "logps/chosen": -4.5068159103393555, + "logps/rejected": -5.23297119140625, + "loss": 0.0508, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -4.5068159103393555, + "rewards/margins": 0.7261554598808289, + "rewards/rejected": -5.23297119140625, + "sft_loss": 4.137577056884766, + "step": 1940 + }, + { + "epoch": 1.0409767519652116, + "grad_norm": 0.43605856386239633, + "learning_rate": 8.253926268906144e-07, + "logits/chosen": -0.28013330698013306, + "logits/rejected": -0.07744655758142471, + "logps/chosen": -4.623358249664307, + "logps/rejected": -5.3566060066223145, + "loss": 0.0521, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -4.623358249664307, + "rewards/margins": 0.7332478165626526, + "rewards/rejected": -5.3566060066223145, + "sft_loss": 4.287482738494873, + "step": 1945 + }, + { + "epoch": 1.043652784746613, + "grad_norm": 0.39378033583895794, + "learning_rate": 8.242085738980487e-07, + "logits/chosen": -0.21353749930858612, + "logits/rejected": 0.08030889183282852, + "logps/chosen": -4.414605140686035, + "logps/rejected": -5.014411449432373, + "loss": 0.0523, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -4.414605140686035, + "rewards/margins": 0.5998064279556274, + "rewards/rejected": -5.014411449432373, + "sft_loss": 4.133689880371094, + "step": 1950 + }, + { + "epoch": 1.0463288175280148, + "grad_norm": 0.4990210435130263, + "learning_rate": 8.230213754324772e-07, + "logits/chosen": -0.35716816782951355, + "logits/rejected": -0.269197553396225, + "logps/chosen": -4.437384605407715, + "logps/rejected": -4.960318088531494, + "loss": 0.0529, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -4.437384605407715, + "rewards/margins": 0.5229335427284241, + "rewards/rejected": -4.960318088531494, + "sft_loss": 4.239546775817871, + "step": 1955 + }, + { + "epoch": 1.0490048503094163, + "grad_norm": 0.35144504822556205, + "learning_rate": 8.218310430121045e-07, + "logits/chosen": -0.24343474209308624, + "logits/rejected": -0.2140563428401947, + "logps/chosen": -4.5842461585998535, + "logps/rejected": -4.974714756011963, + "loss": 0.0528, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -4.5842461585998535, + "rewards/margins": 0.3904687762260437, + "rewards/rejected": -4.974714756011963, + "sft_loss": 4.3355817794799805, + "step": 1960 + }, + { + "epoch": 1.051680883090818, + "grad_norm": 0.5694804322434417, + "learning_rate": 8.20637588185541e-07, + "logits/chosen": -0.2745942175388336, + "logits/rejected": -0.18785987794399261, + "logps/chosen": -4.44666051864624, + "logps/rejected": -5.071129322052002, + "loss": 0.0514, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -4.44666051864624, + "rewards/margins": 0.6244686841964722, + "rewards/rejected": -5.071129322052002, + "sft_loss": 4.212424278259277, + "step": 1965 + }, + { + "epoch": 1.0543569158722195, + "grad_norm": 0.5013521036883045, + "learning_rate": 8.194410225316906e-07, + "logits/chosen": -0.3253830075263977, + "logits/rejected": -0.11694759130477905, + "logps/chosen": -4.50570821762085, + "logps/rejected": -5.062636852264404, + "loss": 0.052, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -4.50570821762085, + "rewards/margins": 0.5569278001785278, + "rewards/rejected": -5.062636852264404, + "sft_loss": 4.15521764755249, + "step": 1970 + }, + { + "epoch": 1.057032948653621, + "grad_norm": 0.4268697531621967, + "learning_rate": 8.182413576596385e-07, + "logits/chosen": -0.14694912731647491, + "logits/rejected": -0.08554248511791229, + "logps/chosen": -4.438208103179932, + "logps/rejected": -5.099421501159668, + "loss": 0.0523, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -4.438208103179932, + "rewards/margins": 0.6612135767936707, + "rewards/rejected": -5.099421501159668, + "sft_loss": 4.202122688293457, + "step": 1975 + }, + { + "epoch": 1.0597089814350227, + "grad_norm": 0.5113103606821928, + "learning_rate": 8.170386052085389e-07, + "logits/chosen": -0.20020289719104767, + "logits/rejected": -0.05032138153910637, + "logps/chosen": -4.628384113311768, + "logps/rejected": -5.130135536193848, + "loss": 0.0525, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -4.628384113311768, + "rewards/margins": 0.5017513036727905, + "rewards/rejected": -5.130135536193848, + "sft_loss": 4.327366828918457, + "step": 1980 + }, + { + "epoch": 1.0623850142164242, + "grad_norm": 0.451200741295089, + "learning_rate": 8.158327768475008e-07, + "logits/chosen": -0.28399142622947693, + "logits/rejected": -0.098796546459198, + "logps/chosen": -4.541468620300293, + "logps/rejected": -4.977931022644043, + "loss": 0.0519, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -4.541468620300293, + "rewards/margins": 0.43646302819252014, + "rewards/rejected": -4.977931022644043, + "sft_loss": 4.1500091552734375, + "step": 1985 + }, + { + "epoch": 1.0650610469978257, + "grad_norm": 0.5048494393677178, + "learning_rate": 8.146238842754767e-07, + "logits/chosen": -0.3281692564487457, + "logits/rejected": -0.16416522860527039, + "logps/chosen": -4.572214126586914, + "logps/rejected": -5.139347076416016, + "loss": 0.0528, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -4.572214126586914, + "rewards/margins": 0.5671325922012329, + "rewards/rejected": -5.139347076416016, + "sft_loss": 4.260963439941406, + "step": 1990 + }, + { + "epoch": 1.0677370797792274, + "grad_norm": 0.7189935633369484, + "learning_rate": 8.134119392211476e-07, + "logits/chosen": -0.2590765953063965, + "logits/rejected": -0.04907204583287239, + "logps/chosen": -4.604923248291016, + "logps/rejected": -5.234789848327637, + "loss": 0.0524, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -4.604923248291016, + "rewards/margins": 0.6298665404319763, + "rewards/rejected": -5.234789848327637, + "sft_loss": 4.281820774078369, + "step": 1995 + }, + { + "epoch": 1.0704131125606289, + "grad_norm": 0.34328131015830476, + "learning_rate": 8.121969534428094e-07, + "logits/chosen": -0.40063053369522095, + "logits/rejected": -0.19102320075035095, + "logps/chosen": -4.727412700653076, + "logps/rejected": -5.302260398864746, + "loss": 0.0529, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -4.727412700653076, + "rewards/margins": 0.5748476386070251, + "rewards/rejected": -5.302260398864746, + "sft_loss": 4.363173007965088, + "step": 2000 + }, + { + "epoch": 1.0704131125606289, + "eval_logits/chosen": 0.2181275635957718, + "eval_logits/rejected": 0.32679614424705505, + "eval_logps/chosen": -4.391653060913086, + "eval_logps/rejected": -4.964575290679932, + "eval_loss": 0.050767455250024796, + "eval_rewards/accuracies": 0.6520771384239197, + "eval_rewards/chosen": -4.391653060913086, + "eval_rewards/margins": 0.5729230046272278, + "eval_rewards/rejected": -4.964575290679932, + "eval_runtime": 46.8943, + "eval_samples_per_second": 28.681, + "eval_sft_loss": 3.9195380210876465, + "eval_steps_per_second": 7.186, + "step": 2000 + }, + { + "epoch": 1.0730891453420304, + "grad_norm": 0.5010503749955464, + "learning_rate": 8.109789387282599e-07, + "logits/chosen": -0.2518996596336365, + "logits/rejected": -0.21349790692329407, + "logps/chosen": -4.4844794273376465, + "logps/rejected": -4.871996879577637, + "loss": 0.0534, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -4.4844794273376465, + "rewards/margins": 0.3875174820423126, + "rewards/rejected": -4.871996879577637, + "sft_loss": 4.148735046386719, + "step": 2005 + }, + { + "epoch": 1.075765178123432, + "grad_norm": 0.5267306082118653, + "learning_rate": 8.097579068946827e-07, + "logits/chosen": -0.3113710582256317, + "logits/rejected": -0.1672811210155487, + "logps/chosen": -4.509435176849365, + "logps/rejected": -5.137767314910889, + "loss": 0.051, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -4.509435176849365, + "rewards/margins": 0.628332257270813, + "rewards/rejected": -5.137767314910889, + "sft_loss": 4.157217979431152, + "step": 2010 + }, + { + "epoch": 1.0784412109048336, + "grad_norm": 0.6431378047075401, + "learning_rate": 8.085338697885344e-07, + "logits/chosen": -0.26741084456443787, + "logits/rejected": -0.12249056994915009, + "logps/chosen": -4.71677827835083, + "logps/rejected": -5.193902015686035, + "loss": 0.0531, + "rewards/accuracies": 0.65625, + "rewards/chosen": -4.71677827835083, + "rewards/margins": 0.4771236479282379, + "rewards/rejected": -5.193902015686035, + "sft_loss": 4.391667366027832, + "step": 2015 + }, + { + "epoch": 1.081117243686235, + "grad_norm": 0.6615586107166485, + "learning_rate": 8.073068392854282e-07, + "logits/chosen": -0.4982661306858063, + "logits/rejected": -0.18780794739723206, + "logps/chosen": -4.37764310836792, + "logps/rejected": -5.1312479972839355, + "loss": 0.0502, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.37764310836792, + "rewards/margins": 0.7536051869392395, + "rewards/rejected": -5.1312479972839355, + "sft_loss": 4.143968105316162, + "step": 2020 + }, + { + "epoch": 1.0837932764676368, + "grad_norm": 0.8018527086095957, + "learning_rate": 8.060768272900193e-07, + "logits/chosen": -0.27660831809043884, + "logits/rejected": -0.07096768170595169, + "logps/chosen": -4.368846416473389, + "logps/rejected": -5.057332515716553, + "loss": 0.0518, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -4.368846416473389, + "rewards/margins": 0.6884865164756775, + "rewards/rejected": -5.057332515716553, + "sft_loss": 4.087088584899902, + "step": 2025 + }, + { + "epoch": 1.0864693092490383, + "grad_norm": 0.39634613130322927, + "learning_rate": 8.0484384573589e-07, + "logits/chosen": -0.27290281653404236, + "logits/rejected": -0.27338218688964844, + "logps/chosen": -4.375092506408691, + "logps/rejected": -4.8426008224487305, + "loss": 0.0532, + "rewards/accuracies": 0.625, + "rewards/chosen": -4.375092506408691, + "rewards/margins": 0.4675084948539734, + "rewards/rejected": -4.8426008224487305, + "sft_loss": 4.101852893829346, + "step": 2030 + }, + { + "epoch": 1.0891453420304398, + "grad_norm": 0.41087822652882344, + "learning_rate": 8.03607906585432e-07, + "logits/chosen": -0.29975003004074097, + "logits/rejected": -0.10043933242559433, + "logps/chosen": -4.682087421417236, + "logps/rejected": -5.2238993644714355, + "loss": 0.0527, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -4.682087421417236, + "rewards/margins": 0.5418123006820679, + "rewards/rejected": -5.2238993644714355, + "sft_loss": 4.406389236450195, + "step": 2035 + }, + { + "epoch": 1.0918213748118415, + "grad_norm": 0.5169857197006602, + "learning_rate": 8.023690218297329e-07, + "logits/chosen": -0.2759687006473541, + "logits/rejected": -0.25068631768226624, + "logps/chosen": -4.628575325012207, + "logps/rejected": -5.116905689239502, + "loss": 0.0514, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -4.628575325012207, + "rewards/margins": 0.48833027482032776, + "rewards/rejected": -5.116905689239502, + "sft_loss": 4.2828264236450195, + "step": 2040 + }, + { + "epoch": 1.094497407593243, + "grad_norm": 1.3389099369822377, + "learning_rate": 8.01127203488458e-07, + "logits/chosen": -0.1845565289258957, + "logits/rejected": -0.12967592477798462, + "logps/chosen": -4.451420307159424, + "logps/rejected": -4.993684768676758, + "loss": 0.0535, + "rewards/accuracies": 0.625, + "rewards/chosen": -4.451420307159424, + "rewards/margins": 0.5422651171684265, + "rewards/rejected": -4.993684768676758, + "sft_loss": 4.071597099304199, + "step": 2045 + }, + { + "epoch": 1.0971734403746445, + "grad_norm": 0.4949625331470205, + "learning_rate": 7.998824636097339e-07, + "logits/chosen": -0.3145469129085541, + "logits/rejected": -0.1309657245874405, + "logps/chosen": -4.390885829925537, + "logps/rejected": -4.963167667388916, + "loss": 0.0533, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -4.390885829925537, + "rewards/margins": 0.5722818374633789, + "rewards/rejected": -4.963167667388916, + "sft_loss": 4.19434118270874, + "step": 2050 + }, + { + "epoch": 1.0998494731560462, + "grad_norm": 0.6295468994335813, + "learning_rate": 7.986348142700328e-07, + "logits/chosen": -0.32317155599594116, + "logits/rejected": -0.15155306458473206, + "logps/chosen": -4.628039360046387, + "logps/rejected": -5.162529945373535, + "loss": 0.0528, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -4.628039360046387, + "rewards/margins": 0.5344905257225037, + "rewards/rejected": -5.162529945373535, + "sft_loss": 4.412549018859863, + "step": 2055 + }, + { + "epoch": 1.1025255059374477, + "grad_norm": 0.4033918872706689, + "learning_rate": 7.973842675740539e-07, + "logits/chosen": -0.21534164249897003, + "logits/rejected": -0.13639280200004578, + "logps/chosen": -4.623472690582275, + "logps/rejected": -5.190655708312988, + "loss": 0.0516, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -4.623472690582275, + "rewards/margins": 0.5671836733818054, + "rewards/rejected": -5.190655708312988, + "sft_loss": 4.301222324371338, + "step": 2060 + }, + { + "epoch": 1.1052015387188494, + "grad_norm": 0.5476916192913638, + "learning_rate": 7.961308356546066e-07, + "logits/chosen": -0.27661851048469543, + "logits/rejected": -0.10544946044683456, + "logps/chosen": -4.245184898376465, + "logps/rejected": -4.931614875793457, + "loss": 0.0515, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -4.245184898376465, + "rewards/margins": 0.6864299774169922, + "rewards/rejected": -4.931614875793457, + "sft_loss": 3.9942116737365723, + "step": 2065 + }, + { + "epoch": 1.107877571500251, + "grad_norm": 0.4772096862444193, + "learning_rate": 7.948745306724931e-07, + "logits/chosen": -0.22163578867912292, + "logits/rejected": -0.03878837823867798, + "logps/chosen": -4.369515419006348, + "logps/rejected": -5.048583030700684, + "loss": 0.0507, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -4.369515419006348, + "rewards/margins": 0.6790679693222046, + "rewards/rejected": -5.048583030700684, + "sft_loss": 4.050380706787109, + "step": 2070 + }, + { + "epoch": 1.1105536042816524, + "grad_norm": 0.5373686869569195, + "learning_rate": 7.936153648163897e-07, + "logits/chosen": -0.3304591178894043, + "logits/rejected": -0.2032184600830078, + "logps/chosen": -4.617485046386719, + "logps/rejected": -5.059828758239746, + "loss": 0.053, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -4.617485046386719, + "rewards/margins": 0.44234347343444824, + "rewards/rejected": -5.059828758239746, + "sft_loss": 4.327376842498779, + "step": 2075 + }, + { + "epoch": 1.1132296370630541, + "grad_norm": 0.4664562059007877, + "learning_rate": 7.92353350302729e-07, + "logits/chosen": -0.3769041895866394, + "logits/rejected": -0.15755943953990936, + "logps/chosen": -4.339583396911621, + "logps/rejected": -4.930947780609131, + "loss": 0.0524, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -4.339583396911621, + "rewards/margins": 0.591364860534668, + "rewards/rejected": -4.930947780609131, + "sft_loss": 4.0584540367126465, + "step": 2080 + }, + { + "epoch": 1.1159056698444556, + "grad_norm": 0.6250482398641047, + "learning_rate": 7.910884993755816e-07, + "logits/chosen": -0.2822895050048828, + "logits/rejected": -0.19432392716407776, + "logps/chosen": -4.452942371368408, + "logps/rejected": -5.425656318664551, + "loss": 0.0509, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -4.452942371368408, + "rewards/margins": 0.9727136492729187, + "rewards/rejected": -5.425656318664551, + "sft_loss": 4.201419353485107, + "step": 2085 + }, + { + "epoch": 1.118581702625857, + "grad_norm": 0.8350884908004433, + "learning_rate": 7.898208243065367e-07, + "logits/chosen": -0.2402046173810959, + "logits/rejected": -0.26276570558547974, + "logps/chosen": -4.695631980895996, + "logps/rejected": -5.093986511230469, + "loss": 0.0525, + "rewards/accuracies": 0.625, + "rewards/chosen": -4.695631980895996, + "rewards/margins": 0.39835453033447266, + "rewards/rejected": -5.093986511230469, + "sft_loss": 4.272342681884766, + "step": 2090 + }, + { + "epoch": 1.1212577354072588, + "grad_norm": 0.49061162342011755, + "learning_rate": 7.88550337394583e-07, + "logits/chosen": -0.4282917380332947, + "logits/rejected": -0.2478407621383667, + "logps/chosen": -4.363981246948242, + "logps/rejected": -5.022068977355957, + "loss": 0.052, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -4.363981246948242, + "rewards/margins": 0.6580876708030701, + "rewards/rejected": -5.022068977355957, + "sft_loss": 4.1623854637146, + "step": 2095 + }, + { + "epoch": 1.1239337681886603, + "grad_norm": 0.4053383565155827, + "learning_rate": 7.872770509659905e-07, + "logits/chosen": -0.20890608429908752, + "logits/rejected": -0.1709698736667633, + "logps/chosen": -4.5597615242004395, + "logps/rejected": -4.990670204162598, + "loss": 0.0521, + "rewards/accuracies": 0.65625, + "rewards/chosen": -4.5597615242004395, + "rewards/margins": 0.43090900778770447, + "rewards/rejected": -4.990670204162598, + "sft_loss": 4.204160213470459, + "step": 2100 + }, + { + "epoch": 1.1266098009700618, + "grad_norm": 0.5077357597268557, + "learning_rate": 7.860009773741896e-07, + "logits/chosen": -0.17620953917503357, + "logits/rejected": 0.038376279175281525, + "logps/chosen": -4.530808925628662, + "logps/rejected": -5.178600788116455, + "loss": 0.0521, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -4.530808925628662, + "rewards/margins": 0.6477917432785034, + "rewards/rejected": -5.178600788116455, + "sft_loss": 4.258144378662109, + "step": 2105 + }, + { + "epoch": 1.1292858337514635, + "grad_norm": 0.535113113045043, + "learning_rate": 7.84722128999652e-07, + "logits/chosen": -0.18129639327526093, + "logits/rejected": -0.02067035436630249, + "logps/chosen": -4.638469219207764, + "logps/rejected": -5.311728000640869, + "loss": 0.053, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -4.638469219207764, + "rewards/margins": 0.6732583045959473, + "rewards/rejected": -5.311728000640869, + "sft_loss": 4.301131248474121, + "step": 2110 + }, + { + "epoch": 1.131961866532865, + "grad_norm": 0.6531795517975728, + "learning_rate": 7.834405182497699e-07, + "logits/chosen": -0.1367073506116867, + "logits/rejected": -0.10230563580989838, + "logps/chosen": -4.544414043426514, + "logps/rejected": -5.14690637588501, + "loss": 0.0526, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -4.544414043426514, + "rewards/margins": 0.6024927496910095, + "rewards/rejected": -5.14690637588501, + "sft_loss": 4.306552886962891, + "step": 2115 + }, + { + "epoch": 1.1346378993142665, + "grad_norm": 0.4472285225867335, + "learning_rate": 7.821561575587368e-07, + "logits/chosen": -0.28611913323402405, + "logits/rejected": -0.26917344331741333, + "logps/chosen": -4.406510353088379, + "logps/rejected": -4.875698566436768, + "loss": 0.0532, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -4.406510353088379, + "rewards/margins": 0.4691886305809021, + "rewards/rejected": -4.875698566436768, + "sft_loss": 4.193192481994629, + "step": 2120 + }, + { + "epoch": 1.1373139320956682, + "grad_norm": 0.44877368661612094, + "learning_rate": 7.808690593874254e-07, + "logits/chosen": -0.3733294904232025, + "logits/rejected": -0.2503949999809265, + "logps/chosen": -4.5276312828063965, + "logps/rejected": -5.178982734680176, + "loss": 0.0532, + "rewards/accuracies": 0.71875, + "rewards/chosen": -4.5276312828063965, + "rewards/margins": 0.6513513326644897, + "rewards/rejected": -5.178982734680176, + "sft_loss": 4.268059730529785, + "step": 2125 + }, + { + "epoch": 1.1399899648770697, + "grad_norm": 0.5827149114778036, + "learning_rate": 7.79579236223268e-07, + "logits/chosen": -0.21721968054771423, + "logits/rejected": 0.10426320880651474, + "logps/chosen": -4.440943717956543, + "logps/rejected": -5.223211288452148, + "loss": 0.0497, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -4.440943717956543, + "rewards/margins": 0.782267689704895, + "rewards/rejected": -5.223211288452148, + "sft_loss": 4.089264869689941, + "step": 2130 + }, + { + "epoch": 1.1426659976584714, + "grad_norm": 0.5107309445628425, + "learning_rate": 7.782867005801346e-07, + "logits/chosen": -0.24261124432086945, + "logits/rejected": 0.019307659938931465, + "logps/chosen": -4.272761821746826, + "logps/rejected": -5.059557914733887, + "loss": 0.052, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -4.272761821746826, + "rewards/margins": 0.7867968082427979, + "rewards/rejected": -5.059557914733887, + "sft_loss": 4.039781093597412, + "step": 2135 + }, + { + "epoch": 1.145342030439873, + "grad_norm": 0.6359037944598581, + "learning_rate": 7.769914649982117e-07, + "logits/chosen": -0.18060888350009918, + "logits/rejected": -0.014508080668747425, + "logps/chosen": -4.498339653015137, + "logps/rejected": -5.16357421875, + "loss": 0.0515, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -4.498339653015137, + "rewards/margins": 0.6652345657348633, + "rewards/rejected": -5.16357421875, + "sft_loss": 4.235073566436768, + "step": 2140 + }, + { + "epoch": 1.1480180632212744, + "grad_norm": 0.6393126236107872, + "learning_rate": 7.756935420438803e-07, + "logits/chosen": -0.15019895136356354, + "logits/rejected": -0.05985778570175171, + "logps/chosen": -4.496522426605225, + "logps/rejected": -5.198064804077148, + "loss": 0.0504, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -4.496522426605225, + "rewards/margins": 0.7015424966812134, + "rewards/rejected": -5.198064804077148, + "sft_loss": 4.151566505432129, + "step": 2145 + }, + { + "epoch": 1.1506940960026761, + "grad_norm": 0.5093607606357484, + "learning_rate": 7.743929443095951e-07, + "logits/chosen": -0.20116212964057922, + "logits/rejected": -0.1553642749786377, + "logps/chosen": -4.311014652252197, + "logps/rejected": -5.0506696701049805, + "loss": 0.0521, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -4.311014652252197, + "rewards/margins": 0.7396548986434937, + "rewards/rejected": -5.0506696701049805, + "sft_loss": 4.00994348526001, + "step": 2150 + }, + { + "epoch": 1.1533701287840776, + "grad_norm": 0.5657638745166378, + "learning_rate": 7.730896844137609e-07, + "logits/chosen": -0.24293391406536102, + "logits/rejected": -0.13297154009342194, + "logps/chosen": -4.716964244842529, + "logps/rejected": -5.143126487731934, + "loss": 0.0532, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -4.716964244842529, + "rewards/margins": 0.42616158723831177, + "rewards/rejected": -5.143126487731934, + "sft_loss": 4.424800872802734, + "step": 2155 + }, + { + "epoch": 1.1560461615654791, + "grad_norm": 0.4765104519295766, + "learning_rate": 7.717837750006106e-07, + "logits/chosen": -0.3335570693016052, + "logits/rejected": -0.25355204939842224, + "logps/chosen": -4.636630058288574, + "logps/rejected": -5.252082347869873, + "loss": 0.0519, + "rewards/accuracies": 0.6875, + "rewards/chosen": -4.636630058288574, + "rewards/margins": 0.6154532432556152, + "rewards/rejected": -5.252082347869873, + "sft_loss": 4.426233768463135, + "step": 2160 + }, + { + "epoch": 1.1587221943468808, + "grad_norm": 0.44084472243021816, + "learning_rate": 7.704752287400832e-07, + "logits/chosen": -0.3639964461326599, + "logits/rejected": -0.12874539196491241, + "logps/chosen": -4.40498685836792, + "logps/rejected": -4.935255527496338, + "loss": 0.0522, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -4.40498685836792, + "rewards/margins": 0.5302689671516418, + "rewards/rejected": -4.935255527496338, + "sft_loss": 4.194338798522949, + "step": 2165 + }, + { + "epoch": 1.1613982271282823, + "grad_norm": 0.4315771678550106, + "learning_rate": 7.691640583277004e-07, + "logits/chosen": -0.2753906846046448, + "logits/rejected": -0.08385102450847626, + "logps/chosen": -4.243175506591797, + "logps/rejected": -4.9434309005737305, + "loss": 0.051, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -4.243175506591797, + "rewards/margins": 0.7002550363540649, + "rewards/rejected": -4.9434309005737305, + "sft_loss": 4.0009446144104, + "step": 2170 + }, + { + "epoch": 1.1640742599096838, + "grad_norm": 0.5705012184083136, + "learning_rate": 7.678502764844433e-07, + "logits/chosen": -0.3476327359676361, + "logits/rejected": -0.07195943593978882, + "logps/chosen": -4.406363487243652, + "logps/rejected": -4.905777454376221, + "loss": 0.0521, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -4.406363487243652, + "rewards/margins": 0.4994131922721863, + "rewards/rejected": -4.905777454376221, + "sft_loss": 4.096680641174316, + "step": 2175 + }, + { + "epoch": 1.1667502926910855, + "grad_norm": 0.4734632097680218, + "learning_rate": 7.665338959566288e-07, + "logits/chosen": -0.22654542326927185, + "logits/rejected": -0.13219550251960754, + "logps/chosen": -4.476452827453613, + "logps/rejected": -5.189483642578125, + "loss": 0.0505, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.476452827453613, + "rewards/margins": 0.7130311727523804, + "rewards/rejected": -5.189483642578125, + "sft_loss": 4.207291603088379, + "step": 2180 + }, + { + "epoch": 1.169426325472487, + "grad_norm": 0.6920160565963868, + "learning_rate": 7.652149295157868e-07, + "logits/chosen": -0.16666167974472046, + "logits/rejected": 0.041519664227962494, + "logps/chosen": -4.522221565246582, + "logps/rejected": -5.004232883453369, + "loss": 0.0522, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -4.522221565246582, + "rewards/margins": 0.4820104241371155, + "rewards/rejected": -5.004232883453369, + "sft_loss": 4.196804523468018, + "step": 2185 + }, + { + "epoch": 1.1721023582538885, + "grad_norm": 0.5798926531076499, + "learning_rate": 7.638933899585354e-07, + "logits/chosen": -0.017039481550455093, + "logits/rejected": -0.031264323741197586, + "logps/chosen": -4.368385314941406, + "logps/rejected": -5.084227085113525, + "loss": 0.0512, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -4.368385314941406, + "rewards/margins": 0.7158415913581848, + "rewards/rejected": -5.084227085113525, + "sft_loss": 4.072343826293945, + "step": 2190 + }, + { + "epoch": 1.1747783910352902, + "grad_norm": 0.8352305328176775, + "learning_rate": 7.625692901064573e-07, + "logits/chosen": -0.22756394743919373, + "logits/rejected": -0.10421310365200043, + "logps/chosen": -4.519444465637207, + "logps/rejected": -5.141909599304199, + "loss": 0.0548, + "rewards/accuracies": 0.5625, + "rewards/chosen": -4.519444465637207, + "rewards/margins": 0.6224651336669922, + "rewards/rejected": -5.141909599304199, + "sft_loss": 4.257163047790527, + "step": 2195 + }, + { + "epoch": 1.1774544238166917, + "grad_norm": 0.6049852133055805, + "learning_rate": 7.61242642805975e-07, + "logits/chosen": -0.2685548663139343, + "logits/rejected": -0.305839478969574, + "logps/chosen": -4.889684677124023, + "logps/rejected": -5.2240166664123535, + "loss": 0.0537, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -4.889684677124023, + "rewards/margins": 0.334332138299942, + "rewards/rejected": -5.2240166664123535, + "sft_loss": 4.526303768157959, + "step": 2200 + }, + { + "epoch": 1.1801304565980932, + "grad_norm": 0.3697278141071938, + "learning_rate": 7.599134609282266e-07, + "logits/chosen": -0.4287821650505066, + "logits/rejected": -0.1586742401123047, + "logps/chosen": -4.467938423156738, + "logps/rejected": -5.110435485839844, + "loss": 0.0519, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -4.467938423156738, + "rewards/margins": 0.6424973607063293, + "rewards/rejected": -5.110435485839844, + "sft_loss": 4.301453590393066, + "step": 2205 + }, + { + "epoch": 1.182806489379495, + "grad_norm": 0.4839978796312147, + "learning_rate": 7.585817573689402e-07, + "logits/chosen": -0.411714643239975, + "logits/rejected": -0.2825588881969452, + "logps/chosen": -4.301059722900391, + "logps/rejected": -4.954368591308594, + "loss": 0.0513, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -4.301059722900391, + "rewards/margins": 0.6533088684082031, + "rewards/rejected": -4.954368591308594, + "sft_loss": 4.1210832595825195, + "step": 2210 + }, + { + "epoch": 1.1854825221608964, + "grad_norm": 0.4909297019707165, + "learning_rate": 7.572475450483098e-07, + "logits/chosen": -0.3669358491897583, + "logits/rejected": -0.24129393696784973, + "logps/chosen": -4.29843807220459, + "logps/rejected": -4.948997974395752, + "loss": 0.0512, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -4.29843807220459, + "rewards/margins": 0.6505595445632935, + "rewards/rejected": -4.948997974395752, + "sft_loss": 4.052264213562012, + "step": 2215 + }, + { + "epoch": 1.188158554942298, + "grad_norm": 0.43671107394456493, + "learning_rate": 7.559108369108689e-07, + "logits/chosen": -0.4114173352718353, + "logits/rejected": -0.2702207565307617, + "logps/chosen": -4.2567853927612305, + "logps/rejected": -4.826117515563965, + "loss": 0.0531, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -4.2567853927612305, + "rewards/margins": 0.5693323016166687, + "rewards/rejected": -4.826117515563965, + "sft_loss": 4.011662006378174, + "step": 2220 + }, + { + "epoch": 1.1908345877236997, + "grad_norm": 0.5556279783763788, + "learning_rate": 7.54571645925366e-07, + "logits/chosen": -0.4900270998477936, + "logits/rejected": -0.13813424110412598, + "logps/chosen": -4.605090141296387, + "logps/rejected": -5.394580841064453, + "loss": 0.052, + "rewards/accuracies": 0.71875, + "rewards/chosen": -4.605090141296387, + "rewards/margins": 0.7894911766052246, + "rewards/rejected": -5.394580841064453, + "sft_loss": 4.288597106933594, + "step": 2225 + }, + { + "epoch": 1.1935106205051011, + "grad_norm": 0.5509398392055068, + "learning_rate": 7.532299850846378e-07, + "logits/chosen": -0.40336865186691284, + "logits/rejected": -0.1709717959165573, + "logps/chosen": -4.6631364822387695, + "logps/rejected": -5.51657772064209, + "loss": 0.0516, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -4.6631364822387695, + "rewards/margins": 0.8534411191940308, + "rewards/rejected": -5.51657772064209, + "sft_loss": 4.334843158721924, + "step": 2230 + }, + { + "epoch": 1.1961866532865026, + "grad_norm": 0.5955483599486879, + "learning_rate": 7.518858674054838e-07, + "logits/chosen": -0.3279435634613037, + "logits/rejected": -0.07166824489831924, + "logps/chosen": -4.5641679763793945, + "logps/rejected": -5.14822244644165, + "loss": 0.0519, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -4.5641679763793945, + "rewards/margins": 0.5840541124343872, + "rewards/rejected": -5.14822244644165, + "sft_loss": 4.200583457946777, + "step": 2235 + }, + { + "epoch": 1.1988626860679044, + "grad_norm": 0.43430584417433116, + "learning_rate": 7.505393059285394e-07, + "logits/chosen": -0.3868328928947449, + "logits/rejected": -0.16218364238739014, + "logps/chosen": -4.473959922790527, + "logps/rejected": -4.889747619628906, + "loss": 0.0529, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -4.473959922790527, + "rewards/margins": 0.4157875180244446, + "rewards/rejected": -4.889747619628906, + "sft_loss": 4.134532451629639, + "step": 2240 + }, + { + "epoch": 1.2015387188493059, + "grad_norm": 0.5300640696357395, + "learning_rate": 7.491903137181501e-07, + "logits/chosen": -0.2527707815170288, + "logits/rejected": -0.23362918198108673, + "logps/chosen": -4.4117431640625, + "logps/rejected": -4.904858589172363, + "loss": 0.0526, + "rewards/accuracies": 0.65625, + "rewards/chosen": -4.4117431640625, + "rewards/margins": 0.4931156635284424, + "rewards/rejected": -4.904858589172363, + "sft_loss": 4.160271644592285, + "step": 2245 + }, + { + "epoch": 1.2042147516307076, + "grad_norm": 0.4918512380012406, + "learning_rate": 7.478389038622441e-07, + "logits/chosen": -0.19767886400222778, + "logits/rejected": -0.16038745641708374, + "logps/chosen": -4.483613967895508, + "logps/rejected": -5.078944206237793, + "loss": 0.0518, + "rewards/accuracies": 0.65625, + "rewards/chosen": -4.483613967895508, + "rewards/margins": 0.5953308343887329, + "rewards/rejected": -5.078944206237793, + "sft_loss": 4.205395698547363, + "step": 2250 + }, + { + "epoch": 1.206890784412109, + "grad_norm": 0.6306090411080313, + "learning_rate": 7.46485089472206e-07, + "logits/chosen": -0.279359370470047, + "logits/rejected": -0.24908506870269775, + "logps/chosen": -4.543668270111084, + "logps/rejected": -5.088433742523193, + "loss": 0.0528, + "rewards/accuracies": 0.65625, + "rewards/chosen": -4.543668270111084, + "rewards/margins": 0.5447657108306885, + "rewards/rejected": -5.088433742523193, + "sft_loss": 4.243107795715332, + "step": 2255 + }, + { + "epoch": 1.2095668171935106, + "grad_norm": 0.47948094032983946, + "learning_rate": 7.451288836827487e-07, + "logits/chosen": -0.2636973261833191, + "logits/rejected": -0.2998460531234741, + "logps/chosen": -4.743630409240723, + "logps/rejected": -5.1253662109375, + "loss": 0.0531, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -4.743630409240723, + "rewards/margins": 0.3817363381385803, + "rewards/rejected": -5.1253662109375, + "sft_loss": 4.456059455871582, + "step": 2260 + }, + { + "epoch": 1.2122428499749123, + "grad_norm": 0.5996058550320081, + "learning_rate": 7.437702996517869e-07, + "logits/chosen": -0.3166981637477875, + "logits/rejected": -0.20749597251415253, + "logps/chosen": -4.545644283294678, + "logps/rejected": -5.04119348526001, + "loss": 0.0522, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -4.545644283294678, + "rewards/margins": 0.4955490529537201, + "rewards/rejected": -5.04119348526001, + "sft_loss": 4.291190147399902, + "step": 2265 + }, + { + "epoch": 1.2149188827563138, + "grad_norm": 0.5220798479877709, + "learning_rate": 7.424093505603087e-07, + "logits/chosen": -0.4534633159637451, + "logits/rejected": -0.2128814458847046, + "logps/chosen": -4.273514747619629, + "logps/rejected": -4.954968452453613, + "loss": 0.0505, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -4.273514747619629, + "rewards/margins": 0.6814538240432739, + "rewards/rejected": -4.954968452453613, + "sft_loss": 4.040926933288574, + "step": 2270 + }, + { + "epoch": 1.2175949155377153, + "grad_norm": 0.7702132976372609, + "learning_rate": 7.410460496122482e-07, + "logits/chosen": -0.35295426845550537, + "logits/rejected": -0.16922760009765625, + "logps/chosen": -4.291215419769287, + "logps/rejected": -5.033682346343994, + "loss": 0.0505, + "rewards/accuracies": 0.6875, + "rewards/chosen": -4.291215419769287, + "rewards/margins": 0.7424668073654175, + "rewards/rejected": -5.033682346343994, + "sft_loss": 3.9390976428985596, + "step": 2275 + }, + { + "epoch": 1.220270948319117, + "grad_norm": 0.721486911740481, + "learning_rate": 7.396804100343572e-07, + "logits/chosen": -0.4789690375328064, + "logits/rejected": -0.23137912154197693, + "logps/chosen": -4.274365425109863, + "logps/rejected": -4.961302757263184, + "loss": 0.0501, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -4.274365425109863, + "rewards/margins": 0.6869370341300964, + "rewards/rejected": -4.961302757263184, + "sft_loss": 3.922863483428955, + "step": 2280 + }, + { + "epoch": 1.2229469811005185, + "grad_norm": 0.4295817185964868, + "learning_rate": 7.383124450760768e-07, + "logits/chosen": -0.4171268045902252, + "logits/rejected": -0.1910628080368042, + "logps/chosen": -4.448271751403809, + "logps/rejected": -5.207947731018066, + "loss": 0.0508, + "rewards/accuracies": 0.6875, + "rewards/chosen": -4.448271751403809, + "rewards/margins": 0.7596766352653503, + "rewards/rejected": -5.207947731018066, + "sft_loss": 4.162169456481934, + "step": 2285 + }, + { + "epoch": 1.22562301388192, + "grad_norm": 0.5017782636688317, + "learning_rate": 7.369421680094091e-07, + "logits/chosen": -0.5226677060127258, + "logits/rejected": -0.3166903853416443, + "logps/chosen": -4.627984046936035, + "logps/rejected": -5.201998710632324, + "loss": 0.0518, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -4.627984046936035, + "rewards/margins": 0.5740151405334473, + "rewards/rejected": -5.201998710632324, + "sft_loss": 4.343914985656738, + "step": 2290 + }, + { + "epoch": 1.2282990466633217, + "grad_norm": 0.622045784701378, + "learning_rate": 7.355695921287881e-07, + "logits/chosen": -0.43745899200439453, + "logits/rejected": -0.3405126631259918, + "logps/chosen": -4.604950904846191, + "logps/rejected": -5.118515968322754, + "loss": 0.0527, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -4.604950904846191, + "rewards/margins": 0.51356440782547, + "rewards/rejected": -5.118515968322754, + "sft_loss": 4.261171817779541, + "step": 2295 + }, + { + "epoch": 1.2309750794447232, + "grad_norm": 0.46280700860813107, + "learning_rate": 7.341947307509513e-07, + "logits/chosen": -0.3574795722961426, + "logits/rejected": -0.2373519241809845, + "logps/chosen": -4.473115921020508, + "logps/rejected": -5.071488857269287, + "loss": 0.0511, + "rewards/accuracies": 0.71875, + "rewards/chosen": -4.473115921020508, + "rewards/margins": 0.5983726978302002, + "rewards/rejected": -5.071488857269287, + "sft_loss": 4.186740398406982, + "step": 2300 + }, + { + "epoch": 1.233651112226125, + "grad_norm": 0.5617703312518776, + "learning_rate": 7.328175972148094e-07, + "logits/chosen": -0.3852320611476898, + "logits/rejected": -0.23650208115577698, + "logps/chosen": -4.551094055175781, + "logps/rejected": -5.2193145751953125, + "loss": 0.0526, + "rewards/accuracies": 0.65625, + "rewards/chosen": -4.551094055175781, + "rewards/margins": 0.6682202816009521, + "rewards/rejected": -5.2193145751953125, + "sft_loss": 4.264029026031494, + "step": 2305 + }, + { + "epoch": 1.2363271450075264, + "grad_norm": 0.6630442372036683, + "learning_rate": 7.314382048813185e-07, + "logits/chosen": -0.3888396918773651, + "logits/rejected": -0.055044613778591156, + "logps/chosen": -4.306995391845703, + "logps/rejected": -5.142845153808594, + "loss": 0.0496, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -4.306995391845703, + "rewards/margins": 0.8358501195907593, + "rewards/rejected": -5.142845153808594, + "sft_loss": 4.013720512390137, + "step": 2310 + }, + { + "epoch": 1.2390031777889279, + "grad_norm": 0.7976635274894721, + "learning_rate": 7.300565671333486e-07, + "logits/chosen": -0.48275822401046753, + "logits/rejected": -0.24777165055274963, + "logps/chosen": -4.506577491760254, + "logps/rejected": -5.197785377502441, + "loss": 0.0509, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -4.506577491760254, + "rewards/margins": 0.6912076473236084, + "rewards/rejected": -5.197785377502441, + "sft_loss": 4.213435173034668, + "step": 2315 + }, + { + "epoch": 1.2416792105703296, + "grad_norm": 0.38382243279532763, + "learning_rate": 7.286726973755554e-07, + "logits/chosen": -0.2914837896823883, + "logits/rejected": -0.262010395526886, + "logps/chosen": -4.344498157501221, + "logps/rejected": -5.013187408447266, + "loss": 0.0513, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -4.344498157501221, + "rewards/margins": 0.6686891317367554, + "rewards/rejected": -5.013187408447266, + "sft_loss": 4.004141807556152, + "step": 2320 + }, + { + "epoch": 1.244355243351731, + "grad_norm": 0.7023075707396644, + "learning_rate": 7.272866090342493e-07, + "logits/chosen": -0.16346505284309387, + "logits/rejected": -0.10149259865283966, + "logps/chosen": -4.34529972076416, + "logps/rejected": -5.073423862457275, + "loss": 0.0507, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -4.34529972076416, + "rewards/margins": 0.7281247973442078, + "rewards/rejected": -5.073423862457275, + "sft_loss": 3.941153049468994, + "step": 2325 + }, + { + "epoch": 1.2470312761331326, + "grad_norm": 0.49306275936732125, + "learning_rate": 7.258983155572656e-07, + "logits/chosen": -0.3227735161781311, + "logits/rejected": -0.22728899121284485, + "logps/chosen": -4.6322503089904785, + "logps/rejected": -5.29410457611084, + "loss": 0.0517, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -4.6322503089904785, + "rewards/margins": 0.6618545055389404, + "rewards/rejected": -5.29410457611084, + "sft_loss": 4.227608680725098, + "step": 2330 + }, + { + "epoch": 1.2497073089145343, + "grad_norm": 1.0035943476967324, + "learning_rate": 7.245078304138335e-07, + "logits/chosen": -0.26348575949668884, + "logits/rejected": -0.17716734111309052, + "logps/chosen": -4.563479423522949, + "logps/rejected": -5.222577095031738, + "loss": 0.0524, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -4.563479423522949, + "rewards/margins": 0.6590980291366577, + "rewards/rejected": -5.222577095031738, + "sft_loss": 4.292522430419922, + "step": 2335 + }, + { + "epoch": 1.2523833416959358, + "grad_norm": 0.45450786301419244, + "learning_rate": 7.231151670944462e-07, + "logits/chosen": -0.5200982093811035, + "logits/rejected": -0.21942846477031708, + "logps/chosen": -4.524590492248535, + "logps/rejected": -5.191534996032715, + "loss": 0.052, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -4.524590492248535, + "rewards/margins": 0.6669445633888245, + "rewards/rejected": -5.191534996032715, + "sft_loss": 4.256932258605957, + "step": 2340 + }, + { + "epoch": 1.2550593744773373, + "grad_norm": 0.45066743934900094, + "learning_rate": 7.217203391107291e-07, + "logits/chosen": -0.4577765464782715, + "logits/rejected": -0.25274404883384705, + "logps/chosen": -4.507070064544678, + "logps/rejected": -5.168764591217041, + "loss": 0.0516, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -4.507070064544678, + "rewards/margins": 0.6616944670677185, + "rewards/rejected": -5.168764591217041, + "sft_loss": 4.1500983238220215, + "step": 2345 + }, + { + "epoch": 1.257735407258739, + "grad_norm": 0.7021636941447833, + "learning_rate": 7.203233599953096e-07, + "logits/chosen": -0.4425809979438782, + "logits/rejected": -0.24780945479869843, + "logps/chosen": -4.2700958251953125, + "logps/rejected": -4.92608642578125, + "loss": 0.0517, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -4.2700958251953125, + "rewards/margins": 0.655990719795227, + "rewards/rejected": -4.92608642578125, + "sft_loss": 4.022648811340332, + "step": 2350 + }, + { + "epoch": 1.2604114400401405, + "grad_norm": 0.4364800359106503, + "learning_rate": 7.189242433016852e-07, + "logits/chosen": -0.3534063398838043, + "logits/rejected": -0.2112211287021637, + "logps/chosen": -4.486388206481934, + "logps/rejected": -5.181911468505859, + "loss": 0.052, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -4.486388206481934, + "rewards/margins": 0.6955228447914124, + "rewards/rejected": -5.181911468505859, + "sft_loss": 4.218506813049316, + "step": 2355 + }, + { + "epoch": 1.263087472821542, + "grad_norm": 0.5266734489894225, + "learning_rate": 7.17523002604092e-07, + "logits/chosen": -0.4793574810028076, + "logits/rejected": -0.2572689950466156, + "logps/chosen": -4.473581314086914, + "logps/rejected": -5.181696891784668, + "loss": 0.0511, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -4.473581314086914, + "rewards/margins": 0.7081155180931091, + "rewards/rejected": -5.181696891784668, + "sft_loss": 4.205438613891602, + "step": 2360 + }, + { + "epoch": 1.2657635056029437, + "grad_norm": 0.6791010230273304, + "learning_rate": 7.161196514973734e-07, + "logits/chosen": -0.3394157290458679, + "logits/rejected": -0.1353457123041153, + "logps/chosen": -4.384115219116211, + "logps/rejected": -5.233609676361084, + "loss": 0.0523, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -4.384115219116211, + "rewards/margins": 0.8494939804077148, + "rewards/rejected": -5.233609676361084, + "sft_loss": 4.137483596801758, + "step": 2365 + }, + { + "epoch": 1.2684395383843452, + "grad_norm": 0.4237329746059985, + "learning_rate": 7.147142035968483e-07, + "logits/chosen": -0.3028372526168823, + "logits/rejected": -0.08005464822053909, + "logps/chosen": -4.468472003936768, + "logps/rejected": -5.0935516357421875, + "loss": 0.0521, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -4.468472003936768, + "rewards/margins": 0.6250793933868408, + "rewards/rejected": -5.0935516357421875, + "sft_loss": 4.173741340637207, + "step": 2370 + }, + { + "epoch": 1.2711155711657467, + "grad_norm": 0.7904664403236191, + "learning_rate": 7.133066725381781e-07, + "logits/chosen": -0.4600791037082672, + "logits/rejected": -0.21939058601856232, + "logps/chosen": -4.389094352722168, + "logps/rejected": -5.0552263259887695, + "loss": 0.0524, + "rewards/accuracies": 0.71875, + "rewards/chosen": -4.389094352722168, + "rewards/margins": 0.6661325097084045, + "rewards/rejected": -5.0552263259887695, + "sft_loss": 4.177814960479736, + "step": 2375 + }, + { + "epoch": 1.2737916039471484, + "grad_norm": 0.5259227542270914, + "learning_rate": 7.118970719772354e-07, + "logits/chosen": -0.365754097700119, + "logits/rejected": -0.15738160908222198, + "logps/chosen": -4.536040306091309, + "logps/rejected": -5.309323310852051, + "loss": 0.0513, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -4.536040306091309, + "rewards/margins": 0.773283064365387, + "rewards/rejected": -5.309323310852051, + "sft_loss": 4.273083686828613, + "step": 2380 + }, + { + "epoch": 1.27646763672855, + "grad_norm": 0.44323002580409565, + "learning_rate": 7.104854155899711e-07, + "logits/chosen": -0.3308809697628021, + "logits/rejected": -0.21239659190177917, + "logps/chosen": -4.450998783111572, + "logps/rejected": -5.074978351593018, + "loss": 0.0515, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -4.450998783111572, + "rewards/margins": 0.6239796280860901, + "rewards/rejected": -5.074978351593018, + "sft_loss": 4.151812553405762, + "step": 2385 + }, + { + "epoch": 1.2791436695099514, + "grad_norm": 0.47793631642078366, + "learning_rate": 7.090717170722817e-07, + "logits/chosen": -0.34116891026496887, + "logits/rejected": -0.3185744881629944, + "logps/chosen": -4.456019401550293, + "logps/rejected": -5.188083648681641, + "loss": 0.0504, + "rewards/accuracies": 0.71875, + "rewards/chosen": -4.456019401550293, + "rewards/margins": 0.732064425945282, + "rewards/rejected": -5.188083648681641, + "sft_loss": 4.120429039001465, + "step": 2390 + }, + { + "epoch": 1.2818197022913531, + "grad_norm": 0.579752216710794, + "learning_rate": 7.076559901398762e-07, + "logits/chosen": -0.5160247087478638, + "logits/rejected": -0.3117043375968933, + "logps/chosen": -4.367626190185547, + "logps/rejected": -4.930412292480469, + "loss": 0.0521, + "rewards/accuracies": 0.6875, + "rewards/chosen": -4.367626190185547, + "rewards/margins": 0.5627862215042114, + "rewards/rejected": -4.930412292480469, + "sft_loss": 4.12152624130249, + "step": 2395 + }, + { + "epoch": 1.2844957350727546, + "grad_norm": 0.6163130040907793, + "learning_rate": 7.062382485281436e-07, + "logits/chosen": -0.38553833961486816, + "logits/rejected": -0.23147761821746826, + "logps/chosen": -4.212340354919434, + "logps/rejected": -4.871836185455322, + "loss": 0.0522, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -4.212340354919434, + "rewards/margins": 0.6594957709312439, + "rewards/rejected": -4.871836185455322, + "sft_loss": 3.979743480682373, + "step": 2400 + }, + { + "epoch": 1.2844957350727546, + "eval_logits/chosen": 0.16218583285808563, + "eval_logits/rejected": 0.2726520597934723, + "eval_logps/chosen": -4.613275527954102, + "eval_logps/rejected": -5.277085304260254, + "eval_loss": 0.050447020679712296, + "eval_rewards/accuracies": 0.6646884083747864, + "eval_rewards/chosen": -4.613275527954102, + "eval_rewards/margins": 0.6638097763061523, + "eval_rewards/rejected": -5.277085304260254, + "eval_runtime": 46.8637, + "eval_samples_per_second": 28.7, + "eval_sft_loss": 4.179710388183594, + "eval_steps_per_second": 7.191, + "step": 2400 + }, + { + "epoch": 1.287171767854156, + "grad_norm": 0.8125972917328307, + "learning_rate": 7.048185059920193e-07, + "logits/chosen": -0.30984312295913696, + "logits/rejected": -0.16324174404144287, + "logps/chosen": -4.691993713378906, + "logps/rejected": -5.421947479248047, + "loss": 0.0519, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -4.691993713378906, + "rewards/margins": 0.7299537062644958, + "rewards/rejected": -5.421947479248047, + "sft_loss": 4.399287223815918, + "step": 2405 + }, + { + "epoch": 1.2898478006355578, + "grad_norm": 0.6208444104515567, + "learning_rate": 7.033967763058516e-07, + "logits/chosen": -0.3938482403755188, + "logits/rejected": -0.18808431923389435, + "logps/chosen": -4.691821098327637, + "logps/rejected": -5.170048236846924, + "loss": 0.0509, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -4.691821098327637, + "rewards/margins": 0.47822675108909607, + "rewards/rejected": -5.170048236846924, + "sft_loss": 4.298820972442627, + "step": 2410 + }, + { + "epoch": 1.2925238334169593, + "grad_norm": 0.44700685429264475, + "learning_rate": 7.019730732632681e-07, + "logits/chosen": -0.23651990294456482, + "logits/rejected": -0.11198411136865616, + "logps/chosen": -4.394803524017334, + "logps/rejected": -5.207266330718994, + "loss": 0.0508, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -4.394803524017334, + "rewards/margins": 0.8124624490737915, + "rewards/rejected": -5.207266330718994, + "sft_loss": 4.091854095458984, + "step": 2415 + }, + { + "epoch": 1.2951998661983608, + "grad_norm": 0.6111476651853875, + "learning_rate": 7.005474106770418e-07, + "logits/chosen": -0.31173262000083923, + "logits/rejected": -0.18862289190292358, + "logps/chosen": -4.262498378753662, + "logps/rejected": -4.927305698394775, + "loss": 0.0511, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -4.262498378753662, + "rewards/margins": 0.6648072004318237, + "rewards/rejected": -4.927305698394775, + "sft_loss": 3.939523220062256, + "step": 2420 + }, + { + "epoch": 1.2978758989797625, + "grad_norm": 1.236864464715216, + "learning_rate": 6.991198023789577e-07, + "logits/chosen": -0.2820888161659241, + "logits/rejected": -0.20513828098773956, + "logps/chosen": -4.186476707458496, + "logps/rejected": -4.759819984436035, + "loss": 0.0534, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -4.186476707458496, + "rewards/margins": 0.5733426809310913, + "rewards/rejected": -4.759819984436035, + "sft_loss": 3.976614475250244, + "step": 2425 + }, + { + "epoch": 1.300551931761164, + "grad_norm": 0.43286785735508476, + "learning_rate": 6.976902622196776e-07, + "logits/chosen": -0.32675403356552124, + "logits/rejected": -0.23593978583812714, + "logps/chosen": -4.594229698181152, + "logps/rejected": -5.107740879058838, + "loss": 0.0532, + "rewards/accuracies": 0.65625, + "rewards/chosen": -4.594229698181152, + "rewards/margins": 0.5135103464126587, + "rewards/rejected": -5.107740879058838, + "sft_loss": 4.226545810699463, + "step": 2430 + }, + { + "epoch": 1.3032279645425655, + "grad_norm": 0.3582759153640644, + "learning_rate": 6.962588040686064e-07, + "logits/chosen": -0.4037497043609619, + "logits/rejected": -0.2016705572605133, + "logps/chosen": -4.662694454193115, + "logps/rejected": -5.2524003982543945, + "loss": 0.0525, + "rewards/accuracies": 0.6875, + "rewards/chosen": -4.662694454193115, + "rewards/margins": 0.5897052884101868, + "rewards/rejected": -5.2524003982543945, + "sft_loss": 4.4913787841796875, + "step": 2435 + }, + { + "epoch": 1.3059039973239672, + "grad_norm": 0.4573925711303691, + "learning_rate": 6.948254418137573e-07, + "logits/chosen": -0.3449918329715729, + "logits/rejected": -0.1870274394750595, + "logps/chosen": -4.6676716804504395, + "logps/rejected": -5.286660194396973, + "loss": 0.0525, + "rewards/accuracies": 0.65625, + "rewards/chosen": -4.6676716804504395, + "rewards/margins": 0.6189885139465332, + "rewards/rejected": -5.286660194396973, + "sft_loss": 4.378539085388184, + "step": 2440 + }, + { + "epoch": 1.3085800301053687, + "grad_norm": 0.48802271760572685, + "learning_rate": 6.933901893616174e-07, + "logits/chosen": -0.39901071786880493, + "logits/rejected": -0.22823762893676758, + "logps/chosen": -4.416820526123047, + "logps/rejected": -5.092315673828125, + "loss": 0.0515, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -4.416820526123047, + "rewards/margins": 0.6754951477050781, + "rewards/rejected": -5.092315673828125, + "sft_loss": 4.0865936279296875, + "step": 2445 + }, + { + "epoch": 1.3112560628867704, + "grad_norm": 0.508227844853999, + "learning_rate": 6.919530606370121e-07, + "logits/chosen": -0.3008289933204651, + "logits/rejected": -0.10734491050243378, + "logps/chosen": -4.298119068145752, + "logps/rejected": -4.975314617156982, + "loss": 0.051, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -4.298119068145752, + "rewards/margins": 0.6771960854530334, + "rewards/rejected": -4.975314617156982, + "sft_loss": 4.031597137451172, + "step": 2450 + }, + { + "epoch": 1.313932095668172, + "grad_norm": 0.44280748250592333, + "learning_rate": 6.905140695829706e-07, + "logits/chosen": -0.3537518382072449, + "logits/rejected": 0.0016300469869747758, + "logps/chosen": -4.296114444732666, + "logps/rejected": -5.029053211212158, + "loss": 0.0505, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -4.296114444732666, + "rewards/margins": 0.7329393029212952, + "rewards/rejected": -5.029053211212158, + "sft_loss": 4.010401725769043, + "step": 2455 + }, + { + "epoch": 1.3166081284495736, + "grad_norm": 0.5994814714419845, + "learning_rate": 6.890732301605904e-07, + "logits/chosen": -0.14458681643009186, + "logits/rejected": -0.024362895637750626, + "logps/chosen": -4.518597602844238, + "logps/rejected": -5.020750999450684, + "loss": 0.0524, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -4.518597602844238, + "rewards/margins": 0.502153217792511, + "rewards/rejected": -5.020750999450684, + "sft_loss": 4.172475814819336, + "step": 2460 + }, + { + "epoch": 1.3192841612309751, + "grad_norm": 0.4992944547596079, + "learning_rate": 6.876305563489021e-07, + "logits/chosen": -0.1841050386428833, + "logits/rejected": -0.08634983003139496, + "logps/chosen": -4.33657169342041, + "logps/rejected": -5.232463836669922, + "loss": 0.0494, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -4.33657169342041, + "rewards/margins": 0.8958921432495117, + "rewards/rejected": -5.232463836669922, + "sft_loss": 3.983189344406128, + "step": 2465 + }, + { + "epoch": 1.3219601940123766, + "grad_norm": 0.5969208799043295, + "learning_rate": 6.861860621447331e-07, + "logits/chosen": -0.3868961036205292, + "logits/rejected": -0.24560889601707458, + "logps/chosen": -4.6813249588012695, + "logps/rejected": -5.233216285705566, + "loss": 0.053, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -4.6813249588012695, + "rewards/margins": 0.5518918037414551, + "rewards/rejected": -5.233216285705566, + "sft_loss": 4.4259934425354, + "step": 2470 + }, + { + "epoch": 1.3246362267937783, + "grad_norm": 1.0979867390491649, + "learning_rate": 6.847397615625725e-07, + "logits/chosen": -0.16251157224178314, + "logits/rejected": -0.1308099925518036, + "logps/chosen": -4.580937385559082, + "logps/rejected": -5.119907855987549, + "loss": 0.0518, + "rewards/accuracies": 0.6875, + "rewards/chosen": -4.580937385559082, + "rewards/margins": 0.5389704704284668, + "rewards/rejected": -5.119907855987549, + "sft_loss": 4.249577045440674, + "step": 2475 + }, + { + "epoch": 1.3273122595751798, + "grad_norm": 0.7299460616177387, + "learning_rate": 6.83291668634435e-07, + "logits/chosen": -0.4401113986968994, + "logits/rejected": -0.20187684893608093, + "logps/chosen": -4.588586330413818, + "logps/rejected": -5.211333751678467, + "loss": 0.052, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -4.588586330413818, + "rewards/margins": 0.6227480173110962, + "rewards/rejected": -5.211333751678467, + "sft_loss": 4.332615852355957, + "step": 2480 + }, + { + "epoch": 1.3299882923565813, + "grad_norm": 0.6580906178724784, + "learning_rate": 6.818417974097246e-07, + "logits/chosen": -0.22082960605621338, + "logits/rejected": -0.05857197567820549, + "logps/chosen": -4.458443641662598, + "logps/rejected": -5.258761882781982, + "loss": 0.0508, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -4.458443641662598, + "rewards/margins": 0.8003188967704773, + "rewards/rejected": -5.258761882781982, + "sft_loss": 4.194399833679199, + "step": 2485 + }, + { + "epoch": 1.332664325137983, + "grad_norm": 0.44049659030132926, + "learning_rate": 6.803901619550981e-07, + "logits/chosen": -0.3119185268878937, + "logits/rejected": -0.24192802608013153, + "logps/chosen": -4.311916828155518, + "logps/rejected": -4.910783290863037, + "loss": 0.0523, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -4.311916828155518, + "rewards/margins": 0.5988671183586121, + "rewards/rejected": -4.910783290863037, + "sft_loss": 4.0275068283081055, + "step": 2490 + }, + { + "epoch": 1.3353403579193845, + "grad_norm": 0.7358415647474225, + "learning_rate": 6.789367763543292e-07, + "logits/chosen": -0.2163584679365158, + "logits/rejected": -0.20647704601287842, + "logps/chosen": -4.386069297790527, + "logps/rejected": -4.962353706359863, + "loss": 0.0538, + "rewards/accuracies": 0.6875, + "rewards/chosen": -4.386069297790527, + "rewards/margins": 0.5762845277786255, + "rewards/rejected": -4.962353706359863, + "sft_loss": 4.141005039215088, + "step": 2495 + }, + { + "epoch": 1.338016390700786, + "grad_norm": 0.7678855187302202, + "learning_rate": 6.774816547081714e-07, + "logits/chosen": -0.28678449988365173, + "logits/rejected": -0.059916090220212936, + "logps/chosen": -4.479846477508545, + "logps/rejected": -5.117179870605469, + "loss": 0.0526, + "rewards/accuracies": 0.625, + "rewards/chosen": -4.479846477508545, + "rewards/margins": 0.6373331546783447, + "rewards/rejected": -5.117179870605469, + "sft_loss": 4.278372287750244, + "step": 2500 + }, + { + "epoch": 1.3406924234821878, + "grad_norm": 0.38028014441452435, + "learning_rate": 6.760248111342211e-07, + "logits/chosen": -0.35091131925582886, + "logits/rejected": -0.14311781525611877, + "logps/chosen": -4.657089710235596, + "logps/rejected": -5.283184051513672, + "loss": 0.0513, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -4.657089710235596, + "rewards/margins": 0.6260942220687866, + "rewards/rejected": -5.283184051513672, + "sft_loss": 4.375996112823486, + "step": 2505 + }, + { + "epoch": 1.3433684562635893, + "grad_norm": 0.4178525242758897, + "learning_rate": 6.745662597667813e-07, + "logits/chosen": -0.3786674439907074, + "logits/rejected": -0.20341435074806213, + "logps/chosen": -4.496813774108887, + "logps/rejected": -5.082845687866211, + "loss": 0.0512, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -4.496813774108887, + "rewards/margins": 0.5860317945480347, + "rewards/rejected": -5.082845687866211, + "sft_loss": 4.184346675872803, + "step": 2510 + }, + { + "epoch": 1.3460444890449907, + "grad_norm": 0.5131854014657652, + "learning_rate": 6.731060147567236e-07, + "logits/chosen": -0.2360183298587799, + "logits/rejected": -0.15669827163219452, + "logps/chosen": -4.303060531616211, + "logps/rejected": -4.955399990081787, + "loss": 0.0515, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -4.303060531616211, + "rewards/margins": 0.6523396968841553, + "rewards/rejected": -4.955399990081787, + "sft_loss": 4.033090114593506, + "step": 2515 + }, + { + "epoch": 1.3487205218263925, + "grad_norm": 1.1599307981261955, + "learning_rate": 6.716440902713515e-07, + "logits/chosen": -0.2967987358570099, + "logits/rejected": -0.22975853085517883, + "logps/chosen": -4.2588067054748535, + "logps/rejected": -4.761368274688721, + "loss": 0.0527, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -4.2588067054748535, + "rewards/margins": 0.5025621652603149, + "rewards/rejected": -4.761368274688721, + "sft_loss": 3.913691997528076, + "step": 2520 + }, + { + "epoch": 1.351396554607794, + "grad_norm": 0.41847911237686436, + "learning_rate": 6.701805004942627e-07, + "logits/chosen": -0.2883647382259369, + "logits/rejected": -0.22551052272319794, + "logps/chosen": -4.588094234466553, + "logps/rejected": -5.287965774536133, + "loss": 0.0523, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -4.588094234466553, + "rewards/margins": 0.6998715400695801, + "rewards/rejected": -5.287965774536133, + "sft_loss": 4.375986099243164, + "step": 2525 + }, + { + "epoch": 1.3540725873891954, + "grad_norm": 0.7019822923715036, + "learning_rate": 6.687152596252119e-07, + "logits/chosen": -0.34520524740219116, + "logits/rejected": -0.28489288687705994, + "logps/chosen": -4.859744071960449, + "logps/rejected": -5.2669358253479, + "loss": 0.0535, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -4.859744071960449, + "rewards/margins": 0.40719157457351685, + "rewards/rejected": -5.2669358253479, + "sft_loss": 4.574738025665283, + "step": 2530 + }, + { + "epoch": 1.3567486201705972, + "grad_norm": 0.4557018145797472, + "learning_rate": 6.672483818799722e-07, + "logits/chosen": -0.4234161376953125, + "logits/rejected": -0.2507956922054291, + "logps/chosen": -4.543206214904785, + "logps/rejected": -5.178067207336426, + "loss": 0.0522, + "rewards/accuracies": 0.6875, + "rewards/chosen": -4.543206214904785, + "rewards/margins": 0.6348603367805481, + "rewards/rejected": -5.178067207336426, + "sft_loss": 4.318524360656738, + "step": 2535 + }, + { + "epoch": 1.3594246529519987, + "grad_norm": 0.4933166093135709, + "learning_rate": 6.657798814901978e-07, + "logits/chosen": -0.3509432375431061, + "logits/rejected": -0.1096065416932106, + "logps/chosen": -4.508671760559082, + "logps/rejected": -4.966963768005371, + "loss": 0.0515, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -4.508671760559082, + "rewards/margins": 0.4582923352718353, + "rewards/rejected": -4.966963768005371, + "sft_loss": 4.152871608734131, + "step": 2540 + }, + { + "epoch": 1.3621006857334002, + "grad_norm": 0.4803004498103421, + "learning_rate": 6.643097727032863e-07, + "logits/chosen": -0.36170434951782227, + "logits/rejected": -0.10494896024465561, + "logps/chosen": -4.229575157165527, + "logps/rejected": -5.020062446594238, + "loss": 0.0509, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -4.229575157165527, + "rewards/margins": 0.7904866933822632, + "rewards/rejected": -5.020062446594238, + "sft_loss": 4.025331497192383, + "step": 2545 + }, + { + "epoch": 1.3647767185148019, + "grad_norm": 0.5985182635532804, + "learning_rate": 6.628380697822392e-07, + "logits/chosen": -0.405205100774765, + "logits/rejected": -0.22326946258544922, + "logps/chosen": -4.403514862060547, + "logps/rejected": -4.895939826965332, + "loss": 0.0526, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -4.403514862060547, + "rewards/margins": 0.4924253523349762, + "rewards/rejected": -4.895939826965332, + "sft_loss": 4.0891618728637695, + "step": 2550 + }, + { + "epoch": 1.3674527512962034, + "grad_norm": 0.6600807629722721, + "learning_rate": 6.61364787005525e-07, + "logits/chosen": -0.3793318569660187, + "logits/rejected": -0.21826522052288055, + "logps/chosen": -4.522066116333008, + "logps/rejected": -5.230228900909424, + "loss": 0.0522, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -4.522066116333008, + "rewards/margins": 0.7081626653671265, + "rewards/rejected": -5.230228900909424, + "sft_loss": 4.325783729553223, + "step": 2555 + }, + { + "epoch": 1.3701287840776049, + "grad_norm": 0.5657421936672914, + "learning_rate": 6.598899386669395e-07, + "logits/chosen": -0.40977898240089417, + "logits/rejected": -0.2763887345790863, + "logps/chosen": -4.552574157714844, + "logps/rejected": -5.204157829284668, + "loss": 0.0523, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -4.552574157714844, + "rewards/margins": 0.6515840291976929, + "rewards/rejected": -5.204157829284668, + "sft_loss": 4.2583231925964355, + "step": 2560 + }, + { + "epoch": 1.3728048168590066, + "grad_norm": 0.4706038058812201, + "learning_rate": 6.584135390754679e-07, + "logits/chosen": -0.43373212218284607, + "logits/rejected": -0.2594808042049408, + "logps/chosen": -4.4295654296875, + "logps/rejected": -5.267152786254883, + "loss": 0.0511, + "rewards/accuracies": 0.6875, + "rewards/chosen": -4.4295654296875, + "rewards/margins": 0.8375871777534485, + "rewards/rejected": -5.267152786254883, + "sft_loss": 4.168534278869629, + "step": 2565 + }, + { + "epoch": 1.375480849640408, + "grad_norm": 0.44872008369044214, + "learning_rate": 6.569356025551454e-07, + "logits/chosen": -0.36206430196762085, + "logits/rejected": -0.28401902318000793, + "logps/chosen": -4.553805828094482, + "logps/rejected": -5.051944255828857, + "loss": 0.0513, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -4.553805828094482, + "rewards/margins": 0.4981384873390198, + "rewards/rejected": -5.051944255828857, + "sft_loss": 4.1244049072265625, + "step": 2570 + }, + { + "epoch": 1.3781568824218096, + "grad_norm": 0.4122542208426088, + "learning_rate": 6.554561434449186e-07, + "logits/chosen": -0.46242958307266235, + "logits/rejected": -0.24346823990345, + "logps/chosen": -4.514195919036865, + "logps/rejected": -5.153885841369629, + "loss": 0.0517, + "rewards/accuracies": 0.71875, + "rewards/chosen": -4.514195919036865, + "rewards/margins": 0.6396892070770264, + "rewards/rejected": -5.153885841369629, + "sft_loss": 4.257824897766113, + "step": 2575 + }, + { + "epoch": 1.3808329152032113, + "grad_norm": 0.5988318190800053, + "learning_rate": 6.539751760985063e-07, + "logits/chosen": -0.387522429227829, + "logits/rejected": -0.3059254288673401, + "logps/chosen": -4.457314968109131, + "logps/rejected": -4.824244499206543, + "loss": 0.0538, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -4.457314968109131, + "rewards/margins": 0.3669296205043793, + "rewards/rejected": -4.824244499206543, + "sft_loss": 4.090696811676025, + "step": 2580 + }, + { + "epoch": 1.3835089479846128, + "grad_norm": 0.4148136422508877, + "learning_rate": 6.524927148842602e-07, + "logits/chosen": -0.2866331934928894, + "logits/rejected": -0.07610544562339783, + "logps/chosen": -4.626020431518555, + "logps/rejected": -5.238821506500244, + "loss": 0.0509, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -4.626020431518555, + "rewards/margins": 0.6128014922142029, + "rewards/rejected": -5.238821506500244, + "sft_loss": 4.2085371017456055, + "step": 2585 + }, + { + "epoch": 1.3861849807660143, + "grad_norm": 0.567187780871907, + "learning_rate": 6.510087741850254e-07, + "logits/chosen": -0.36029210686683655, + "logits/rejected": -0.22232429683208466, + "logps/chosen": -4.536396503448486, + "logps/rejected": -5.154208183288574, + "loss": 0.0519, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -4.536396503448486, + "rewards/margins": 0.6178122758865356, + "rewards/rejected": -5.154208183288574, + "sft_loss": 4.228185653686523, + "step": 2590 + }, + { + "epoch": 1.388861013547416, + "grad_norm": 0.4092259618625428, + "learning_rate": 6.495233683980012e-07, + "logits/chosen": -0.41812753677368164, + "logits/rejected": -0.36150071024894714, + "logps/chosen": -4.364471435546875, + "logps/rejected": -4.96616792678833, + "loss": 0.0522, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -4.364471435546875, + "rewards/margins": 0.6016958951950073, + "rewards/rejected": -4.96616792678833, + "sft_loss": 4.140119552612305, + "step": 2595 + }, + { + "epoch": 1.3915370463288175, + "grad_norm": 0.6595402733474404, + "learning_rate": 6.480365119346011e-07, + "logits/chosen": -0.2546065151691437, + "logits/rejected": -0.09192848950624466, + "logps/chosen": -4.478928565979004, + "logps/rejected": -4.960371494293213, + "loss": 0.0517, + "rewards/accuracies": 0.65625, + "rewards/chosen": -4.478928565979004, + "rewards/margins": 0.48144254088401794, + "rewards/rejected": -4.960371494293213, + "sft_loss": 4.136466026306152, + "step": 2600 + }, + { + "epoch": 1.394213079110219, + "grad_norm": 0.5332527476480814, + "learning_rate": 6.465482192203129e-07, + "logits/chosen": -0.17541834712028503, + "logits/rejected": -0.09706973284482956, + "logps/chosen": -4.556529998779297, + "logps/rejected": -5.159417629241943, + "loss": 0.0528, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -4.556529998779297, + "rewards/margins": 0.6028882265090942, + "rewards/rejected": -5.159417629241943, + "sft_loss": 4.306204319000244, + "step": 2605 + }, + { + "epoch": 1.3968891118916207, + "grad_norm": 0.4498464591100737, + "learning_rate": 6.45058504694559e-07, + "logits/chosen": -0.16866278648376465, + "logits/rejected": -0.141506165266037, + "logps/chosen": -4.507957935333252, + "logps/rejected": -5.1205620765686035, + "loss": 0.0528, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -4.507957935333252, + "rewards/margins": 0.6126040816307068, + "rewards/rejected": -5.1205620765686035, + "sft_loss": 4.23211145401001, + "step": 2610 + }, + { + "epoch": 1.3995651446730222, + "grad_norm": 0.8215085878156593, + "learning_rate": 6.435673828105564e-07, + "logits/chosen": -0.35125938057899475, + "logits/rejected": -0.16566213965415955, + "logps/chosen": -4.5039963722229, + "logps/rejected": -5.095160007476807, + "loss": 0.0533, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -4.5039963722229, + "rewards/margins": 0.5911641716957092, + "rewards/rejected": -5.095160007476807, + "sft_loss": 4.220867156982422, + "step": 2615 + }, + { + "epoch": 1.402241177454424, + "grad_norm": 0.6073875924204214, + "learning_rate": 6.420748680351763e-07, + "logits/chosen": -0.27543169260025024, + "logits/rejected": -0.3689972162246704, + "logps/chosen": -4.526713848114014, + "logps/rejected": -5.047951698303223, + "loss": 0.0536, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -4.526713848114014, + "rewards/margins": 0.5212381482124329, + "rewards/rejected": -5.047951698303223, + "sft_loss": 4.3203959465026855, + "step": 2620 + }, + { + "epoch": 1.4049172102358254, + "grad_norm": 0.3938937307686175, + "learning_rate": 6.405809748488032e-07, + "logits/chosen": -0.4955802857875824, + "logits/rejected": -0.3202812373638153, + "logps/chosen": -4.622469425201416, + "logps/rejected": -5.246644973754883, + "loss": 0.0514, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -4.622469425201416, + "rewards/margins": 0.6241754293441772, + "rewards/rejected": -5.246644973754883, + "sft_loss": 4.328534126281738, + "step": 2625 + }, + { + "epoch": 1.4075932430172269, + "grad_norm": 0.5272944726316406, + "learning_rate": 6.390857177451956e-07, + "logits/chosen": -0.5755268931388855, + "logits/rejected": -0.33584824204444885, + "logps/chosen": -4.600456714630127, + "logps/rejected": -5.08190393447876, + "loss": 0.0528, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -4.600456714630127, + "rewards/margins": 0.4814472794532776, + "rewards/rejected": -5.08190393447876, + "sft_loss": 4.373833179473877, + "step": 2630 + }, + { + "epoch": 1.4102692757986286, + "grad_norm": 0.4155006152783869, + "learning_rate": 6.375891112313445e-07, + "logits/chosen": -0.551335334777832, + "logits/rejected": -0.4132702350616455, + "logps/chosen": -4.421041011810303, + "logps/rejected": -4.975304126739502, + "loss": 0.0508, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -4.421041011810303, + "rewards/margins": 0.554263174533844, + "rewards/rejected": -4.975304126739502, + "sft_loss": 4.110798358917236, + "step": 2635 + }, + { + "epoch": 1.41294530858003, + "grad_norm": 0.6515975463345118, + "learning_rate": 6.360911698273326e-07, + "logits/chosen": -0.401276171207428, + "logits/rejected": -0.25514617562294006, + "logps/chosen": -4.445773124694824, + "logps/rejected": -4.844512462615967, + "loss": 0.0539, + "rewards/accuracies": 0.6875, + "rewards/chosen": -4.445773124694824, + "rewards/margins": 0.398739218711853, + "rewards/rejected": -4.844512462615967, + "sft_loss": 4.16897439956665, + "step": 2640 + }, + { + "epoch": 1.4156213413614318, + "grad_norm": 0.3446000707611468, + "learning_rate": 6.345919080661944e-07, + "logits/chosen": -0.43346720933914185, + "logits/rejected": -0.35360580682754517, + "logps/chosen": -4.3459696769714355, + "logps/rejected": -5.048262596130371, + "loss": 0.0513, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -4.3459696769714355, + "rewards/margins": 0.7022929191589355, + "rewards/rejected": -5.048262596130371, + "sft_loss": 4.152453422546387, + "step": 2645 + }, + { + "epoch": 1.4182973741428333, + "grad_norm": 0.3217984812990353, + "learning_rate": 6.330913404937737e-07, + "logits/chosen": -0.4467714726924896, + "logits/rejected": -0.2542513608932495, + "logps/chosen": -4.547066688537598, + "logps/rejected": -5.322503566741943, + "loss": 0.0506, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -4.547066688537598, + "rewards/margins": 0.7754372358322144, + "rewards/rejected": -5.322503566741943, + "sft_loss": 4.235215187072754, + "step": 2650 + }, + { + "epoch": 1.4209734069242348, + "grad_norm": 0.6050168275087299, + "learning_rate": 6.315894816685838e-07, + "logits/chosen": -0.3104148209095001, + "logits/rejected": -0.10100110620260239, + "logps/chosen": -4.59012508392334, + "logps/rejected": -5.028843879699707, + "loss": 0.0522, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -4.59012508392334, + "rewards/margins": 0.4387180209159851, + "rewards/rejected": -5.028843879699707, + "sft_loss": 4.259760856628418, + "step": 2655 + }, + { + "epoch": 1.4236494397056365, + "grad_norm": 0.5526542417463417, + "learning_rate": 6.300863461616657e-07, + "logits/chosen": -0.27798131108283997, + "logits/rejected": -0.16789737343788147, + "logps/chosen": -4.456320762634277, + "logps/rejected": -5.029903411865234, + "loss": 0.0524, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -4.456320762634277, + "rewards/margins": 0.5735821723937988, + "rewards/rejected": -5.029903411865234, + "sft_loss": 4.188068389892578, + "step": 2660 + }, + { + "epoch": 1.426325472487038, + "grad_norm": 0.6725358519055418, + "learning_rate": 6.285819485564465e-07, + "logits/chosen": -0.43549829721450806, + "logits/rejected": -0.23555438220500946, + "logps/chosen": -4.349188804626465, + "logps/rejected": -5.042940139770508, + "loss": 0.0517, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -4.349188804626465, + "rewards/margins": 0.6937510371208191, + "rewards/rejected": -5.042940139770508, + "sft_loss": 4.0453596115112305, + "step": 2665 + }, + { + "epoch": 1.4290015052684395, + "grad_norm": 0.47214900467098986, + "learning_rate": 6.270763034485986e-07, + "logits/chosen": -0.2399250715970993, + "logits/rejected": -0.09774953126907349, + "logps/chosen": -4.491705417633057, + "logps/rejected": -5.064525604248047, + "loss": 0.051, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -4.491705417633057, + "rewards/margins": 0.5728203654289246, + "rewards/rejected": -5.064525604248047, + "sft_loss": 4.136842727661133, + "step": 2670 + }, + { + "epoch": 1.4316775380498412, + "grad_norm": 0.4972109814218113, + "learning_rate": 6.255694254458972e-07, + "logits/chosen": -0.3452068865299225, + "logits/rejected": -0.15056590735912323, + "logps/chosen": -4.566656589508057, + "logps/rejected": -5.146246433258057, + "loss": 0.0521, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -4.566656589508057, + "rewards/margins": 0.5795894861221313, + "rewards/rejected": -5.146246433258057, + "sft_loss": 4.118401050567627, + "step": 2675 + }, + { + "epoch": 1.4343535708312427, + "grad_norm": 0.6882142186528432, + "learning_rate": 6.240613291680795e-07, + "logits/chosen": -0.367424339056015, + "logits/rejected": -0.16041013598442078, + "logps/chosen": -4.63809061050415, + "logps/rejected": -5.268001079559326, + "loss": 0.052, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -4.63809061050415, + "rewards/margins": 0.6299105286598206, + "rewards/rejected": -5.268001079559326, + "sft_loss": 4.292515754699707, + "step": 2680 + }, + { + "epoch": 1.4370296036126442, + "grad_norm": 0.48308726153623693, + "learning_rate": 6.225520292467021e-07, + "logits/chosen": -0.40587466955184937, + "logits/rejected": -0.11217250674962997, + "logps/chosen": -4.291191577911377, + "logps/rejected": -5.193698883056641, + "loss": 0.0491, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -4.291191577911377, + "rewards/margins": 0.9025076627731323, + "rewards/rejected": -5.193698883056641, + "sft_loss": 4.034631729125977, + "step": 2685 + }, + { + "epoch": 1.439705636394046, + "grad_norm": 0.5935174412481173, + "learning_rate": 6.210415403249993e-07, + "logits/chosen": -0.4540501534938812, + "logits/rejected": -0.1173338070511818, + "logps/chosen": -4.344797611236572, + "logps/rejected": -5.267854690551758, + "loss": 0.0505, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -4.344797611236572, + "rewards/margins": 0.9230567216873169, + "rewards/rejected": -5.267854690551758, + "sft_loss": 3.9546589851379395, + "step": 2690 + }, + { + "epoch": 1.4423816691754474, + "grad_norm": 0.6750078977580464, + "learning_rate": 6.195298770577415e-07, + "logits/chosen": -0.19961309432983398, + "logits/rejected": -0.2153279036283493, + "logps/chosen": -4.354008674621582, + "logps/rejected": -4.975162506103516, + "loss": 0.0528, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -4.354008674621582, + "rewards/margins": 0.6211541891098022, + "rewards/rejected": -4.975162506103516, + "sft_loss": 4.119563102722168, + "step": 2695 + }, + { + "epoch": 1.445057701956849, + "grad_norm": 0.43839706775658527, + "learning_rate": 6.180170541110923e-07, + "logits/chosen": -0.35946568846702576, + "logits/rejected": -0.12679320573806763, + "logps/chosen": -4.462148189544678, + "logps/rejected": -5.072259902954102, + "loss": 0.0532, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -4.462148189544678, + "rewards/margins": 0.6101123690605164, + "rewards/rejected": -5.072259902954102, + "sft_loss": 4.199645519256592, + "step": 2700 + }, + { + "epoch": 1.4477337347382506, + "grad_norm": 0.5240237174493583, + "learning_rate": 6.165030861624663e-07, + "logits/chosen": -0.5560758709907532, + "logits/rejected": -0.2003978192806244, + "logps/chosen": -4.430878639221191, + "logps/rejected": -5.3489251136779785, + "loss": 0.0511, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -4.430878639221191, + "rewards/margins": 0.9180465936660767, + "rewards/rejected": -5.3489251136779785, + "sft_loss": 4.257626533508301, + "step": 2705 + }, + { + "epoch": 1.4504097675196521, + "grad_norm": 0.34362372364773286, + "learning_rate": 6.149879879003876e-07, + "logits/chosen": -0.37839382886886597, + "logits/rejected": -0.4028445780277252, + "logps/chosen": -4.604657173156738, + "logps/rejected": -5.0622429847717285, + "loss": 0.0518, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -4.604657173156738, + "rewards/margins": 0.45758503675460815, + "rewards/rejected": -5.0622429847717285, + "sft_loss": 4.296733856201172, + "step": 2710 + }, + { + "epoch": 1.4530858003010536, + "grad_norm": 0.39810904002106345, + "learning_rate": 6.13471774024346e-07, + "logits/chosen": -0.5207768082618713, + "logits/rejected": -0.3638109266757965, + "logps/chosen": -4.3686041831970215, + "logps/rejected": -5.053372383117676, + "loss": 0.0505, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.3686041831970215, + "rewards/margins": 0.684768795967102, + "rewards/rejected": -5.053372383117676, + "sft_loss": 4.103168964385986, + "step": 2715 + }, + { + "epoch": 1.4557618330824553, + "grad_norm": 0.4173210251113381, + "learning_rate": 6.119544592446551e-07, + "logits/chosen": -0.3816406726837158, + "logits/rejected": -0.2449624091386795, + "logps/chosen": -4.267149448394775, + "logps/rejected": -4.792271614074707, + "loss": 0.0516, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -4.267149448394775, + "rewards/margins": 0.525122344493866, + "rewards/rejected": -4.792271614074707, + "sft_loss": 3.9861807823181152, + "step": 2720 + }, + { + "epoch": 1.4584378658638568, + "grad_norm": 0.5206031699456903, + "learning_rate": 6.104360582823096e-07, + "logits/chosen": -0.3659888505935669, + "logits/rejected": -0.22819384932518005, + "logps/chosen": -4.522797584533691, + "logps/rejected": -5.074974060058594, + "loss": 0.0523, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -4.522797584533691, + "rewards/margins": 0.5521765351295471, + "rewards/rejected": -5.074974060058594, + "sft_loss": 4.2375807762146, + "step": 2725 + }, + { + "epoch": 1.4611138986452583, + "grad_norm": 0.5951169620080391, + "learning_rate": 6.089165858688423e-07, + "logits/chosen": -0.32757097482681274, + "logits/rejected": -0.1094558984041214, + "logps/chosen": -4.466285705566406, + "logps/rejected": -5.261303424835205, + "loss": 0.0501, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -4.466285705566406, + "rewards/margins": 0.7950171232223511, + "rewards/rejected": -5.261303424835205, + "sft_loss": 4.161604881286621, + "step": 2730 + }, + { + "epoch": 1.46378993142666, + "grad_norm": 0.5900488732663051, + "learning_rate": 6.073960567461811e-07, + "logits/chosen": -0.33634185791015625, + "logits/rejected": -0.11354222148656845, + "logps/chosen": -4.338430881500244, + "logps/rejected": -5.108603000640869, + "loss": 0.0502, + "rewards/accuracies": 0.71875, + "rewards/chosen": -4.338430881500244, + "rewards/margins": 0.7701722383499146, + "rewards/rejected": -5.108603000640869, + "sft_loss": 4.010077953338623, + "step": 2735 + }, + { + "epoch": 1.4664659642080615, + "grad_norm": 0.43817436520379927, + "learning_rate": 6.058744856665065e-07, + "logits/chosen": -0.26562008261680603, + "logits/rejected": -0.16547879576683044, + "logps/chosen": -4.503973484039307, + "logps/rejected": -5.440014839172363, + "loss": 0.0501, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -4.503973484039307, + "rewards/margins": 0.9360410571098328, + "rewards/rejected": -5.440014839172363, + "sft_loss": 4.1721086502075195, + "step": 2740 + }, + { + "epoch": 1.469141996989463, + "grad_norm": 0.4595898472433441, + "learning_rate": 6.043518873921074e-07, + "logits/chosen": -0.3069925606250763, + "logits/rejected": -0.14767572283744812, + "logps/chosen": -4.208481311798096, + "logps/rejected": -4.952236652374268, + "loss": 0.0507, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -4.208481311798096, + "rewards/margins": 0.7437552213668823, + "rewards/rejected": -4.952236652374268, + "sft_loss": 3.812131404876709, + "step": 2745 + }, + { + "epoch": 1.4718180297708647, + "grad_norm": 0.5305346877025717, + "learning_rate": 6.028282766952393e-07, + "logits/chosen": -0.1933150589466095, + "logits/rejected": -0.0845588892698288, + "logps/chosen": -4.497615814208984, + "logps/rejected": -5.3453803062438965, + "loss": 0.0495, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -4.497615814208984, + "rewards/margins": 0.8477641344070435, + "rewards/rejected": -5.3453803062438965, + "sft_loss": 4.077464580535889, + "step": 2750 + }, + { + "epoch": 1.4744940625522662, + "grad_norm": 0.6410263342308636, + "learning_rate": 6.013036683579798e-07, + "logits/chosen": -0.276964008808136, + "logits/rejected": -0.16606521606445312, + "logps/chosen": -4.369429588317871, + "logps/rejected": -5.116156578063965, + "loss": 0.0511, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -4.369429588317871, + "rewards/margins": 0.7467272281646729, + "rewards/rejected": -5.116156578063965, + "sft_loss": 4.034445762634277, + "step": 2755 + }, + { + "epoch": 1.4771700953336677, + "grad_norm": 0.44830176640192815, + "learning_rate": 5.997780771720854e-07, + "logits/chosen": -0.4733208119869232, + "logits/rejected": -0.18805237114429474, + "logps/chosen": -4.483094692230225, + "logps/rejected": -5.300882339477539, + "loss": 0.0518, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -4.483094692230225, + "rewards/margins": 0.8177868127822876, + "rewards/rejected": -5.300882339477539, + "sft_loss": 4.2453718185424805, + "step": 2760 + }, + { + "epoch": 1.4798461281150694, + "grad_norm": 0.6409870632514963, + "learning_rate": 5.982515179388486e-07, + "logits/chosen": -0.314197838306427, + "logits/rejected": -0.17623493075370789, + "logps/chosen": -4.647237777709961, + "logps/rejected": -5.2594404220581055, + "loss": 0.0522, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -4.647237777709961, + "rewards/margins": 0.612203061580658, + "rewards/rejected": -5.2594404220581055, + "sft_loss": 4.35164737701416, + "step": 2765 + }, + { + "epoch": 1.482522160896471, + "grad_norm": 0.3504734308346918, + "learning_rate": 5.967240054689541e-07, + "logits/chosen": -0.5192325115203857, + "logits/rejected": -0.46321648359298706, + "logps/chosen": -4.322511672973633, + "logps/rejected": -4.9438066482543945, + "loss": 0.0517, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -4.322511672973633, + "rewards/margins": 0.6212958097457886, + "rewards/rejected": -4.9438066482543945, + "sft_loss": 4.152332305908203, + "step": 2770 + }, + { + "epoch": 1.4851981936778724, + "grad_norm": 0.5039716273254905, + "learning_rate": 5.951955545823342e-07, + "logits/chosen": -0.42465925216674805, + "logits/rejected": -0.3103640377521515, + "logps/chosen": -4.393340110778809, + "logps/rejected": -5.021480560302734, + "loss": 0.0531, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -4.393340110778809, + "rewards/margins": 0.6281408071517944, + "rewards/rejected": -5.021480560302734, + "sft_loss": 4.2171502113342285, + "step": 2775 + }, + { + "epoch": 1.4878742264592741, + "grad_norm": 0.32450018350172033, + "learning_rate": 5.936661801080263e-07, + "logits/chosen": -0.39585283398628235, + "logits/rejected": -0.3093074560165405, + "logps/chosen": -4.389555931091309, + "logps/rejected": -4.971273899078369, + "loss": 0.052, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -4.389555931091309, + "rewards/margins": 0.5817176103591919, + "rewards/rejected": -4.971273899078369, + "sft_loss": 4.138664245605469, + "step": 2780 + }, + { + "epoch": 1.4905502592406756, + "grad_norm": 0.4160051311346219, + "learning_rate": 5.92135896884028e-07, + "logits/chosen": -0.48266953229904175, + "logits/rejected": -0.3032703101634979, + "logps/chosen": -4.55446720123291, + "logps/rejected": -5.380070209503174, + "loss": 0.0515, + "rewards/accuracies": 0.71875, + "rewards/chosen": -4.55446720123291, + "rewards/margins": 0.8256031274795532, + "rewards/rejected": -5.380070209503174, + "sft_loss": 4.303145408630371, + "step": 2785 + }, + { + "epoch": 1.4932262920220774, + "grad_norm": 0.7984546744437679, + "learning_rate": 5.906047197571541e-07, + "logits/chosen": -0.34332793951034546, + "logits/rejected": -0.4161340594291687, + "logps/chosen": -4.649590492248535, + "logps/rejected": -5.079288482666016, + "loss": 0.0541, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -4.649590492248535, + "rewards/margins": 0.42969760298728943, + "rewards/rejected": -5.079288482666016, + "sft_loss": 4.418673515319824, + "step": 2790 + }, + { + "epoch": 1.4959023248034788, + "grad_norm": 0.522883901950886, + "learning_rate": 5.890726635828919e-07, + "logits/chosen": -0.34629374742507935, + "logits/rejected": -0.37964576482772827, + "logps/chosen": -4.412274360656738, + "logps/rejected": -4.9246368408203125, + "loss": 0.0531, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -4.412274360656738, + "rewards/margins": 0.5123627781867981, + "rewards/rejected": -4.9246368408203125, + "sft_loss": 4.1936140060424805, + "step": 2795 + }, + { + "epoch": 1.4985783575848803, + "grad_norm": 0.46071677852464993, + "learning_rate": 5.875397432252569e-07, + "logits/chosen": -0.4360761046409607, + "logits/rejected": -0.4055995047092438, + "logps/chosen": -4.450671195983887, + "logps/rejected": -5.006396293640137, + "loss": 0.0515, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -4.450671195983887, + "rewards/margins": 0.5557257533073425, + "rewards/rejected": -5.006396293640137, + "sft_loss": 4.149048805236816, + "step": 2800 + }, + { + "epoch": 1.4985783575848803, + "eval_logits/chosen": 0.09837622940540314, + "eval_logits/rejected": 0.20496821403503418, + "eval_logps/chosen": -4.444206237792969, + "eval_logps/rejected": -5.07861852645874, + "eval_loss": 0.05041274055838585, + "eval_rewards/accuracies": 0.6824925541877747, + "eval_rewards/chosen": -4.444206237792969, + "eval_rewards/margins": 0.6344121694564819, + "eval_rewards/rejected": -5.07861852645874, + "eval_runtime": 46.5888, + "eval_samples_per_second": 28.87, + "eval_sft_loss": 4.093306541442871, + "eval_steps_per_second": 7.233, + "step": 2800 + }, + { + "epoch": 1.5012543903662818, + "grad_norm": 0.6572158687306725, + "learning_rate": 5.860059735566491e-07, + "logits/chosen": -0.605633020401001, + "logits/rejected": -0.3718469738960266, + "logps/chosen": -4.480620384216309, + "logps/rejected": -5.0697431564331055, + "loss": 0.0519, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -4.480620384216309, + "rewards/margins": 0.5891224145889282, + "rewards/rejected": -5.0697431564331055, + "sft_loss": 4.248944282531738, + "step": 2805 + }, + { + "epoch": 1.5039304231476835, + "grad_norm": 0.6664663344359709, + "learning_rate": 5.844713694577087e-07, + "logits/chosen": -0.3627350926399231, + "logits/rejected": -0.24719932675361633, + "logps/chosen": -4.511195182800293, + "logps/rejected": -5.193454265594482, + "loss": 0.0512, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -4.511195182800293, + "rewards/margins": 0.6822598576545715, + "rewards/rejected": -5.193454265594482, + "sft_loss": 4.303326606750488, + "step": 2810 + }, + { + "epoch": 1.5066064559290853, + "grad_norm": 0.5120494751105008, + "learning_rate": 5.829359458171714e-07, + "logits/chosen": -0.34684649109840393, + "logits/rejected": -0.24992990493774414, + "logps/chosen": -4.363380432128906, + "logps/rejected": -5.179116249084473, + "loss": 0.0505, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -4.363380432128906, + "rewards/margins": 0.8157358169555664, + "rewards/rejected": -5.179116249084473, + "sft_loss": 4.205404758453369, + "step": 2815 + }, + { + "epoch": 1.5092824887104868, + "grad_norm": 0.6874585270699716, + "learning_rate": 5.81399717531724e-07, + "logits/chosen": -0.4058879017829895, + "logits/rejected": -0.17549274861812592, + "logps/chosen": -4.187855243682861, + "logps/rejected": -4.797568321228027, + "loss": 0.0525, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -4.187855243682861, + "rewards/margins": 0.6097137928009033, + "rewards/rejected": -4.797568321228027, + "sft_loss": 4.016551494598389, + "step": 2820 + }, + { + "epoch": 1.5119585214918883, + "grad_norm": 0.6060820378492465, + "learning_rate": 5.798626995058602e-07, + "logits/chosen": -0.4957265257835388, + "logits/rejected": -0.2640833258628845, + "logps/chosen": -4.30916690826416, + "logps/rejected": -5.123547554016113, + "loss": 0.0515, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -4.30916690826416, + "rewards/margins": 0.8143804669380188, + "rewards/rejected": -5.123547554016113, + "sft_loss": 4.097870826721191, + "step": 2825 + }, + { + "epoch": 1.51463455427329, + "grad_norm": 0.5151875125094709, + "learning_rate": 5.783249066517354e-07, + "logits/chosen": -0.3500848412513733, + "logits/rejected": -0.2688957750797272, + "logps/chosen": -4.340577602386475, + "logps/rejected": -5.0534515380859375, + "loss": 0.0489, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -4.340577602386475, + "rewards/margins": 0.7128733396530151, + "rewards/rejected": -5.0534515380859375, + "sft_loss": 3.984767198562622, + "step": 2830 + }, + { + "epoch": 1.5173105870546915, + "grad_norm": 0.46145166440756263, + "learning_rate": 5.767863538890228e-07, + "logits/chosen": -0.4007895588874817, + "logits/rejected": -0.2340659350156784, + "logps/chosen": -4.39790678024292, + "logps/rejected": -5.234405517578125, + "loss": 0.0514, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -4.39790678024292, + "rewards/margins": 0.8364987373352051, + "rewards/rejected": -5.234405517578125, + "sft_loss": 4.1350483894348145, + "step": 2835 + }, + { + "epoch": 1.519986619836093, + "grad_norm": 0.5800643184003756, + "learning_rate": 5.75247056144768e-07, + "logits/chosen": -0.38760238885879517, + "logits/rejected": -0.33849984407424927, + "logps/chosen": -4.464108467102051, + "logps/rejected": -5.05782413482666, + "loss": 0.0527, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -4.464108467102051, + "rewards/margins": 0.5937153100967407, + "rewards/rejected": -5.05782413482666, + "sft_loss": 4.192983150482178, + "step": 2840 + }, + { + "epoch": 1.5226626526174947, + "grad_norm": 0.489683355986247, + "learning_rate": 5.737070283532444e-07, + "logits/chosen": -0.4335315227508545, + "logits/rejected": -0.3097968101501465, + "logps/chosen": -4.670001029968262, + "logps/rejected": -5.261584281921387, + "loss": 0.0528, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -4.670001029968262, + "rewards/margins": 0.5915828347206116, + "rewards/rejected": -5.261584281921387, + "sft_loss": 4.404904365539551, + "step": 2845 + }, + { + "epoch": 1.5253386853988962, + "grad_norm": 0.7102902511949915, + "learning_rate": 5.721662854558084e-07, + "logits/chosen": -0.45332375168800354, + "logits/rejected": -0.36583977937698364, + "logps/chosen": -4.492857933044434, + "logps/rejected": -5.338987827301025, + "loss": 0.0499, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -4.492857933044434, + "rewards/margins": 0.8461304903030396, + "rewards/rejected": -5.338987827301025, + "sft_loss": 4.102261066436768, + "step": 2850 + }, + { + "epoch": 1.5280147181802977, + "grad_norm": 0.579184068772222, + "learning_rate": 5.706248424007545e-07, + "logits/chosen": -0.5107121467590332, + "logits/rejected": -0.27316388487815857, + "logps/chosen": -4.263310432434082, + "logps/rejected": -4.881662368774414, + "loss": 0.0516, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -4.263310432434082, + "rewards/margins": 0.6183524131774902, + "rewards/rejected": -4.881662368774414, + "sft_loss": 4.044577598571777, + "step": 2855 + }, + { + "epoch": 1.5306907509616994, + "grad_norm": 0.42738608993363675, + "learning_rate": 5.690827141431699e-07, + "logits/chosen": -0.5125621557235718, + "logits/rejected": -0.2538798749446869, + "logps/chosen": -4.523799896240234, + "logps/rejected": -5.066987037658691, + "loss": 0.0513, + "rewards/accuracies": 0.6875, + "rewards/chosen": -4.523799896240234, + "rewards/margins": 0.5431872010231018, + "rewards/rejected": -5.066987037658691, + "sft_loss": 4.190533638000488, + "step": 2860 + }, + { + "epoch": 1.5333667837431009, + "grad_norm": 0.681260006283264, + "learning_rate": 5.675399156447897e-07, + "logits/chosen": -0.48108816146850586, + "logits/rejected": -0.30100640654563904, + "logps/chosen": -4.549483299255371, + "logps/rejected": -5.0618977546691895, + "loss": 0.0527, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -4.549483299255371, + "rewards/margins": 0.5124139785766602, + "rewards/rejected": -5.0618977546691895, + "sft_loss": 4.250813007354736, + "step": 2865 + }, + { + "epoch": 1.5360428165245024, + "grad_norm": 0.5920717815446642, + "learning_rate": 5.659964618738515e-07, + "logits/chosen": -0.45581454038619995, + "logits/rejected": -0.31429341435432434, + "logps/chosen": -4.41359806060791, + "logps/rejected": -5.109930515289307, + "loss": 0.0505, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -4.41359806060791, + "rewards/margins": 0.6963319778442383, + "rewards/rejected": -5.109930515289307, + "sft_loss": 4.015780448913574, + "step": 2870 + }, + { + "epoch": 1.538718849305904, + "grad_norm": 0.6330137390349065, + "learning_rate": 5.644523678049509e-07, + "logits/chosen": -0.3992709219455719, + "logits/rejected": -0.28337493538856506, + "logps/chosen": -4.330594539642334, + "logps/rejected": -4.930819034576416, + "loss": 0.0501, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -4.330594539642334, + "rewards/margins": 0.6002241969108582, + "rewards/rejected": -4.930819034576416, + "sft_loss": 3.878619432449341, + "step": 2875 + }, + { + "epoch": 1.5413948820873056, + "grad_norm": 0.8940253461870524, + "learning_rate": 5.629076484188952e-07, + "logits/chosen": -0.24059781432151794, + "logits/rejected": -0.12366914749145508, + "logps/chosen": -4.571654796600342, + "logps/rejected": -5.292092800140381, + "loss": 0.052, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -4.571654796600342, + "rewards/margins": 0.7204381823539734, + "rewards/rejected": -5.292092800140381, + "sft_loss": 4.188701152801514, + "step": 2880 + }, + { + "epoch": 1.544070914868707, + "grad_norm": 0.4704393131487359, + "learning_rate": 5.613623187025587e-07, + "logits/chosen": -0.3739601969718933, + "logits/rejected": -0.23108768463134766, + "logps/chosen": -4.504766464233398, + "logps/rejected": -5.312743663787842, + "loss": 0.051, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -4.504766464233398, + "rewards/margins": 0.807977557182312, + "rewards/rejected": -5.312743663787842, + "sft_loss": 4.1565680503845215, + "step": 2885 + }, + { + "epoch": 1.5467469476501088, + "grad_norm": 0.511447960826902, + "learning_rate": 5.598163936487369e-07, + "logits/chosen": -0.3512064814567566, + "logits/rejected": -0.1589089035987854, + "logps/chosen": -4.425866603851318, + "logps/rejected": -5.213771820068359, + "loss": 0.0507, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -4.425866603851318, + "rewards/margins": 0.7879055738449097, + "rewards/rejected": -5.213771820068359, + "sft_loss": 4.079559326171875, + "step": 2890 + }, + { + "epoch": 1.5494229804315103, + "grad_norm": 0.5089415103306051, + "learning_rate": 5.582698882560017e-07, + "logits/chosen": -0.38519638776779175, + "logits/rejected": -0.18898412585258484, + "logps/chosen": -4.655232906341553, + "logps/rejected": -5.344581604003906, + "loss": 0.0525, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -4.655232906341553, + "rewards/margins": 0.6893488168716431, + "rewards/rejected": -5.344581604003906, + "sft_loss": 4.351537704467773, + "step": 2895 + }, + { + "epoch": 1.5520990132129118, + "grad_norm": 0.5171631219205263, + "learning_rate": 5.567228175285549e-07, + "logits/chosen": -0.30115431547164917, + "logits/rejected": -0.20378553867340088, + "logps/chosen": -4.3300395011901855, + "logps/rejected": -5.086686134338379, + "loss": 0.0488, + "rewards/accuracies": 0.6875, + "rewards/chosen": -4.3300395011901855, + "rewards/margins": 0.7566461563110352, + "rewards/rejected": -5.086686134338379, + "sft_loss": 3.8978729248046875, + "step": 2900 + }, + { + "epoch": 1.5547750459943135, + "grad_norm": 0.7414383342459527, + "learning_rate": 5.551751964760838e-07, + "logits/chosen": -0.2479126900434494, + "logits/rejected": -0.26643773913383484, + "logps/chosen": -4.493429660797119, + "logps/rejected": -5.09151554107666, + "loss": 0.0526, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -4.493429660797119, + "rewards/margins": 0.5980857610702515, + "rewards/rejected": -5.09151554107666, + "sft_loss": 4.203255653381348, + "step": 2905 + }, + { + "epoch": 1.557451078775715, + "grad_norm": 0.43566242923597975, + "learning_rate": 5.536270401136145e-07, + "logits/chosen": -0.36846572160720825, + "logits/rejected": -0.24956099689006805, + "logps/chosen": -4.325127601623535, + "logps/rejected": -4.973796367645264, + "loss": 0.0505, + "rewards/accuracies": 0.6875, + "rewards/chosen": -4.325127601623535, + "rewards/margins": 0.6486689448356628, + "rewards/rejected": -4.973796367645264, + "sft_loss": 3.9762065410614014, + "step": 2910 + }, + { + "epoch": 1.5601271115571165, + "grad_norm": 0.4748215414635767, + "learning_rate": 5.520783634613667e-07, + "logits/chosen": -0.2849575877189636, + "logits/rejected": -0.05325264856219292, + "logps/chosen": -4.543246269226074, + "logps/rejected": -5.216692924499512, + "loss": 0.0515, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -4.543246269226074, + "rewards/margins": 0.6734462976455688, + "rewards/rejected": -5.216692924499512, + "sft_loss": 4.21115779876709, + "step": 2915 + }, + { + "epoch": 1.5628031443385182, + "grad_norm": 1.3413675309441973, + "learning_rate": 5.505291815446082e-07, + "logits/chosen": -0.3605079650878906, + "logits/rejected": -0.22670654952526093, + "logps/chosen": -4.357509136199951, + "logps/rejected": -5.064937591552734, + "loss": 0.0519, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -4.357509136199951, + "rewards/margins": 0.7074285745620728, + "rewards/rejected": -5.064937591552734, + "sft_loss": 4.133220195770264, + "step": 2920 + }, + { + "epoch": 1.5654791771199197, + "grad_norm": 0.6364875128270325, + "learning_rate": 5.489795093935089e-07, + "logits/chosen": -0.27434051036834717, + "logits/rejected": -0.1490970402956009, + "logps/chosen": -4.532397747039795, + "logps/rejected": -5.171026706695557, + "loss": 0.0512, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -4.532397747039795, + "rewards/margins": 0.638629138469696, + "rewards/rejected": -5.171026706695557, + "sft_loss": 4.182089328765869, + "step": 2925 + }, + { + "epoch": 1.5681552099013212, + "grad_norm": 0.5736099451418228, + "learning_rate": 5.474293620429946e-07, + "logits/chosen": -0.45021653175354004, + "logits/rejected": -0.2502950429916382, + "logps/chosen": -4.460026741027832, + "logps/rejected": -5.434047222137451, + "loss": 0.0498, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -4.460026741027832, + "rewards/margins": 0.9740206003189087, + "rewards/rejected": -5.434047222137451, + "sft_loss": 4.120427131652832, + "step": 2930 + }, + { + "epoch": 1.570831242682723, + "grad_norm": 0.46758669071726616, + "learning_rate": 5.458787545326018e-07, + "logits/chosen": -0.44528883695602417, + "logits/rejected": -0.3131914734840393, + "logps/chosen": -4.494815349578857, + "logps/rejected": -5.001442909240723, + "loss": 0.0533, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -4.494815349578857, + "rewards/margins": 0.5066278576850891, + "rewards/rejected": -5.001442909240723, + "sft_loss": 4.199819564819336, + "step": 2935 + }, + { + "epoch": 1.5735072754641244, + "grad_norm": 0.49191933154206485, + "learning_rate": 5.443277019063311e-07, + "logits/chosen": -0.45389944314956665, + "logits/rejected": -0.2852258086204529, + "logps/chosen": -4.434971809387207, + "logps/rejected": -5.1843976974487305, + "loss": 0.0513, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -4.434971809387207, + "rewards/margins": 0.7494255900382996, + "rewards/rejected": -5.1843976974487305, + "sft_loss": 4.208171844482422, + "step": 2940 + }, + { + "epoch": 1.5761833082455259, + "grad_norm": 0.9037410118950826, + "learning_rate": 5.427762192125023e-07, + "logits/chosen": -0.3565642237663269, + "logits/rejected": -0.24279490113258362, + "logps/chosen": -4.221377372741699, + "logps/rejected": -4.9192304611206055, + "loss": 0.0509, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -4.221377372741699, + "rewards/margins": 0.6978529691696167, + "rewards/rejected": -4.9192304611206055, + "sft_loss": 3.895989179611206, + "step": 2945 + }, + { + "epoch": 1.5788593410269276, + "grad_norm": 0.5071819263495008, + "learning_rate": 5.41224321503607e-07, + "logits/chosen": -0.3867717981338501, + "logits/rejected": -0.0971466600894928, + "logps/chosen": -4.355823516845703, + "logps/rejected": -5.244298934936523, + "loss": 0.0505, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -4.355823516845703, + "rewards/margins": 0.8884755969047546, + "rewards/rejected": -5.244298934936523, + "sft_loss": 4.127333164215088, + "step": 2950 + }, + { + "epoch": 1.5815353738083293, + "grad_norm": 0.9240705189349777, + "learning_rate": 5.396720238361637e-07, + "logits/chosen": -0.27731746435165405, + "logits/rejected": -0.11383461952209473, + "logps/chosen": -4.596616268157959, + "logps/rejected": -5.186906814575195, + "loss": 0.0518, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -4.596616268157959, + "rewards/margins": 0.590290904045105, + "rewards/rejected": -5.186906814575195, + "sft_loss": 4.284438610076904, + "step": 2955 + }, + { + "epoch": 1.5842114065897306, + "grad_norm": 0.6753498654428448, + "learning_rate": 5.381193412705711e-07, + "logits/chosen": -0.40660548210144043, + "logits/rejected": -0.21776506304740906, + "logps/chosen": -4.394054412841797, + "logps/rejected": -5.0748724937438965, + "loss": 0.0516, + "rewards/accuracies": 0.71875, + "rewards/chosen": -4.394054412841797, + "rewards/margins": 0.6808184385299683, + "rewards/rejected": -5.0748724937438965, + "sft_loss": 4.122300148010254, + "step": 2960 + }, + { + "epoch": 1.5868874393711323, + "grad_norm": 0.4671400500652704, + "learning_rate": 5.365662888709622e-07, + "logits/chosen": -0.37436243891716003, + "logits/rejected": -0.19707582890987396, + "logps/chosen": -4.417657375335693, + "logps/rejected": -5.068818092346191, + "loss": 0.0521, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -4.417657375335693, + "rewards/margins": 0.6511603593826294, + "rewards/rejected": -5.068818092346191, + "sft_loss": 4.166541576385498, + "step": 2965 + }, + { + "epoch": 1.589563472152534, + "grad_norm": 0.49230232744365754, + "learning_rate": 5.350128817050585e-07, + "logits/chosen": -0.42794451117515564, + "logits/rejected": -0.20655164122581482, + "logps/chosen": -4.463809013366699, + "logps/rejected": -5.186387062072754, + "loss": 0.0506, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -4.463809013366699, + "rewards/margins": 0.7225781679153442, + "rewards/rejected": -5.186387062072754, + "sft_loss": 4.121039867401123, + "step": 2970 + }, + { + "epoch": 1.5922395049339353, + "grad_norm": 0.8790707326411614, + "learning_rate": 5.334591348440229e-07, + "logits/chosen": -0.33565372228622437, + "logits/rejected": -0.13639724254608154, + "logps/chosen": -4.411907196044922, + "logps/rejected": -5.098509788513184, + "loss": 0.0518, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -4.411907196044922, + "rewards/margins": 0.6866029500961304, + "rewards/rejected": -5.098509788513184, + "sft_loss": 4.14210319519043, + "step": 2975 + }, + { + "epoch": 1.594915537715337, + "grad_norm": 0.4537497662691024, + "learning_rate": 5.319050633623141e-07, + "logits/chosen": -0.3935709595680237, + "logits/rejected": -0.16922712326049805, + "logps/chosen": -4.428333282470703, + "logps/rejected": -5.005343437194824, + "loss": 0.052, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -4.428333282470703, + "rewards/margins": 0.5770100355148315, + "rewards/rejected": -5.005343437194824, + "sft_loss": 4.094406604766846, + "step": 2980 + }, + { + "epoch": 1.5975915704967387, + "grad_norm": 0.49349269120138967, + "learning_rate": 5.303506823375409e-07, + "logits/chosen": -0.4444296360015869, + "logits/rejected": -0.17299816012382507, + "logps/chosen": -4.411839485168457, + "logps/rejected": -5.164982318878174, + "loss": 0.051, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -4.411839485168457, + "rewards/margins": 0.7531424760818481, + "rewards/rejected": -5.164982318878174, + "sft_loss": 4.166609764099121, + "step": 2985 + }, + { + "epoch": 1.60026760327814, + "grad_norm": 0.5341732858975665, + "learning_rate": 5.287960068503143e-07, + "logits/chosen": -0.44806498289108276, + "logits/rejected": -0.20182207226753235, + "logps/chosen": -4.432277679443359, + "logps/rejected": -5.187029838562012, + "loss": 0.0513, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -4.432277679443359, + "rewards/margins": 0.7547519207000732, + "rewards/rejected": -5.187029838562012, + "sft_loss": 4.210070610046387, + "step": 2990 + }, + { + "epoch": 1.6029436360595417, + "grad_norm": 0.6009087279793405, + "learning_rate": 5.272410519841032e-07, + "logits/chosen": -0.26394909620285034, + "logits/rejected": -0.14600220322608948, + "logps/chosen": -4.402077674865723, + "logps/rejected": -5.284658908843994, + "loss": 0.0494, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -4.402077674865723, + "rewards/margins": 0.8825809359550476, + "rewards/rejected": -5.284658908843994, + "sft_loss": 4.052593231201172, + "step": 2995 + }, + { + "epoch": 1.6056196688409434, + "grad_norm": 0.49808890555063334, + "learning_rate": 5.256858328250861e-07, + "logits/chosen": -0.4313860535621643, + "logits/rejected": -0.18899060785770416, + "logps/chosen": -4.493370056152344, + "logps/rejected": -5.064239501953125, + "loss": 0.0513, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -4.493370056152344, + "rewards/margins": 0.5708690881729126, + "rewards/rejected": -5.064239501953125, + "sft_loss": 4.171633720397949, + "step": 3000 + }, + { + "epoch": 1.608295701622345, + "grad_norm": 0.8289595837730992, + "learning_rate": 5.241303644620063e-07, + "logits/chosen": -0.5128465294837952, + "logits/rejected": -0.2724844217300415, + "logps/chosen": -4.469786643981934, + "logps/rejected": -4.959803104400635, + "loss": 0.0536, + "rewards/accuracies": 0.625, + "rewards/chosen": -4.469786643981934, + "rewards/margins": 0.490016371011734, + "rewards/rejected": -4.959803104400635, + "sft_loss": 4.14853572845459, + "step": 3005 + }, + { + "epoch": 1.6109717344037464, + "grad_norm": 0.5032451243230273, + "learning_rate": 5.225746619860248e-07, + "logits/chosen": -0.5017607808113098, + "logits/rejected": -0.3467402160167694, + "logps/chosen": -4.393344879150391, + "logps/rejected": -5.1140851974487305, + "loss": 0.0535, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -4.393344879150391, + "rewards/margins": 0.7207397222518921, + "rewards/rejected": -5.1140851974487305, + "sft_loss": 4.095994472503662, + "step": 3010 + }, + { + "epoch": 1.6136477671851481, + "grad_norm": 0.4694756225943738, + "learning_rate": 5.210187404905735e-07, + "logits/chosen": -0.2644937336444855, + "logits/rejected": -0.187953382730484, + "logps/chosen": -4.710282802581787, + "logps/rejected": -5.166679382324219, + "loss": 0.0528, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -4.710282802581787, + "rewards/margins": 0.45639634132385254, + "rewards/rejected": -5.166679382324219, + "sft_loss": 4.37774133682251, + "step": 3015 + }, + { + "epoch": 1.6163237999665496, + "grad_norm": 0.357064549599628, + "learning_rate": 5.194626150712098e-07, + "logits/chosen": -0.5153255462646484, + "logits/rejected": -0.35782045125961304, + "logps/chosen": -4.570986270904541, + "logps/rejected": -5.14137601852417, + "loss": 0.0526, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -4.570986270904541, + "rewards/margins": 0.570389986038208, + "rewards/rejected": -5.14137601852417, + "sft_loss": 4.399497032165527, + "step": 3020 + }, + { + "epoch": 1.6189998327479511, + "grad_norm": 0.3821581130831781, + "learning_rate": 5.179063008254695e-07, + "logits/chosen": -0.5005272626876831, + "logits/rejected": -0.26194092631340027, + "logps/chosen": -4.353894233703613, + "logps/rejected": -4.978847980499268, + "loss": 0.0513, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -4.353894233703613, + "rewards/margins": 0.6249544024467468, + "rewards/rejected": -4.978847980499268, + "sft_loss": 4.077733516693115, + "step": 3025 + }, + { + "epoch": 1.6216758655293528, + "grad_norm": 0.6565617472724057, + "learning_rate": 5.163498128527199e-07, + "logits/chosen": -0.34044063091278076, + "logits/rejected": -0.15807709097862244, + "logps/chosen": -4.35715389251709, + "logps/rejected": -4.989835262298584, + "loss": 0.0517, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -4.35715389251709, + "rewards/margins": 0.6326818466186523, + "rewards/rejected": -4.989835262298584, + "sft_loss": 4.096672534942627, + "step": 3030 + }, + { + "epoch": 1.6243518983107543, + "grad_norm": 0.5764274735030329, + "learning_rate": 5.147931662540144e-07, + "logits/chosen": -0.21918360888957977, + "logits/rejected": -0.06928624957799911, + "logps/chosen": -4.338745594024658, + "logps/rejected": -4.947409152984619, + "loss": 0.05, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -4.338745594024658, + "rewards/margins": 0.6086626648902893, + "rewards/rejected": -4.947409152984619, + "sft_loss": 3.980414628982544, + "step": 3035 + }, + { + "epoch": 1.6270279310921558, + "grad_norm": 0.5353367303320445, + "learning_rate": 5.132363761319449e-07, + "logits/chosen": -0.24074383080005646, + "logits/rejected": -0.1598784625530243, + "logps/chosen": -4.4777021408081055, + "logps/rejected": -5.272585391998291, + "loss": 0.0508, + "rewards/accuracies": 0.65625, + "rewards/chosen": -4.4777021408081055, + "rewards/margins": 0.7948837280273438, + "rewards/rejected": -5.272585391998291, + "sft_loss": 4.107278823852539, + "step": 3040 + }, + { + "epoch": 1.6297039638735575, + "grad_norm": 0.9053716419441293, + "learning_rate": 5.116794575904962e-07, + "logits/chosen": -0.11156382411718369, + "logits/rejected": -0.016821326687932014, + "logps/chosen": -4.582097053527832, + "logps/rejected": -5.265927791595459, + "loss": 0.0509, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -4.582097053527832, + "rewards/margins": 0.6838306784629822, + "rewards/rejected": -5.265927791595459, + "sft_loss": 4.143289566040039, + "step": 3045 + }, + { + "epoch": 1.632379996654959, + "grad_norm": 0.38002133797992227, + "learning_rate": 5.101224257348987e-07, + "logits/chosen": -0.2311353236436844, + "logits/rejected": -0.1018177717924118, + "logps/chosen": -4.600157737731934, + "logps/rejected": -5.246756076812744, + "loss": 0.0507, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -4.600157737731934, + "rewards/margins": 0.6465979814529419, + "rewards/rejected": -5.246756076812744, + "sft_loss": 4.1230292320251465, + "step": 3050 + }, + { + "epoch": 1.6350560294363605, + "grad_norm": 0.47238484247484197, + "learning_rate": 5.085652956714823e-07, + "logits/chosen": -0.4059460759162903, + "logits/rejected": -0.1782340258359909, + "logps/chosen": -4.562694549560547, + "logps/rejected": -5.263379096984863, + "loss": 0.053, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -4.562694549560547, + "rewards/margins": 0.7006848454475403, + "rewards/rejected": -5.263379096984863, + "sft_loss": 4.330922603607178, + "step": 3055 + }, + { + "epoch": 1.6377320622177622, + "grad_norm": 0.46008827011632475, + "learning_rate": 5.070080825075298e-07, + "logits/chosen": -0.30883657932281494, + "logits/rejected": -0.08638667315244675, + "logps/chosen": -4.29897403717041, + "logps/rejected": -5.054502010345459, + "loss": 0.0519, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -4.29897403717041, + "rewards/margins": 0.7555279731750488, + "rewards/rejected": -5.054502010345459, + "sft_loss": 4.01223611831665, + "step": 3060 + }, + { + "epoch": 1.6404080949991637, + "grad_norm": 0.49637131140727475, + "learning_rate": 5.0545080135113e-07, + "logits/chosen": -0.32126617431640625, + "logits/rejected": -0.2086419314146042, + "logps/chosen": -4.370186805725098, + "logps/rejected": -5.012415409088135, + "loss": 0.0524, + "rewards/accuracies": 0.65625, + "rewards/chosen": -4.370186805725098, + "rewards/margins": 0.6422282457351685, + "rewards/rejected": -5.012415409088135, + "sft_loss": 4.082181453704834, + "step": 3065 + }, + { + "epoch": 1.6430841277805652, + "grad_norm": 0.5200535244364285, + "learning_rate": 5.038934673110316e-07, + "logits/chosen": -0.43776053190231323, + "logits/rejected": -0.3125496804714203, + "logps/chosen": -4.482710361480713, + "logps/rejected": -5.104378700256348, + "loss": 0.052, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -4.482710361480713, + "rewards/margins": 0.6216682195663452, + "rewards/rejected": -5.104378700256348, + "sft_loss": 4.186079025268555, + "step": 3070 + }, + { + "epoch": 1.645760160561967, + "grad_norm": 0.41934804007219456, + "learning_rate": 5.023360954964963e-07, + "logits/chosen": -0.504966139793396, + "logits/rejected": -0.4237605035305023, + "logps/chosen": -4.654808044433594, + "logps/rejected": -5.295411586761475, + "loss": 0.0517, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -4.654808044433594, + "rewards/margins": 0.6406036019325256, + "rewards/rejected": -5.295411586761475, + "sft_loss": 4.294847011566162, + "step": 3075 + }, + { + "epoch": 1.6484361933433684, + "grad_norm": 0.4787343001068853, + "learning_rate": 5.007787010171524e-07, + "logits/chosen": -0.5634989738464355, + "logits/rejected": -0.3325952887535095, + "logps/chosen": -4.372393608093262, + "logps/rejected": -5.1716814041137695, + "loss": 0.0505, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -4.372393608093262, + "rewards/margins": 0.7992880344390869, + "rewards/rejected": -5.1716814041137695, + "sft_loss": 4.124917030334473, + "step": 3080 + }, + { + "epoch": 1.65111222612477, + "grad_norm": 0.33250809246420404, + "learning_rate": 4.992212989828477e-07, + "logits/chosen": -0.40238314867019653, + "logits/rejected": -0.3741799294948578, + "logps/chosen": -4.596774101257324, + "logps/rejected": -5.119741439819336, + "loss": 0.0522, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -4.596774101257324, + "rewards/margins": 0.5229678750038147, + "rewards/rejected": -5.119741439819336, + "sft_loss": 4.286962509155273, + "step": 3085 + }, + { + "epoch": 1.6537882589061716, + "grad_norm": 0.517760589433127, + "learning_rate": 4.976639045035036e-07, + "logits/chosen": -0.326946496963501, + "logits/rejected": -0.2680968642234802, + "logps/chosen": -4.346710205078125, + "logps/rejected": -4.902353763580322, + "loss": 0.0524, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -4.346710205078125, + "rewards/margins": 0.5556432604789734, + "rewards/rejected": -4.902353763580322, + "sft_loss": 4.1214799880981445, + "step": 3090 + }, + { + "epoch": 1.6564642916875731, + "grad_norm": 0.6963941422892898, + "learning_rate": 4.961065326889683e-07, + "logits/chosen": -0.36718350648880005, + "logits/rejected": -0.1814633309841156, + "logps/chosen": -4.445253372192383, + "logps/rejected": -4.956208229064941, + "loss": 0.053, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -4.445253372192383, + "rewards/margins": 0.5109549760818481, + "rewards/rejected": -4.956208229064941, + "sft_loss": 4.123419284820557, + "step": 3095 + }, + { + "epoch": 1.6591403244689746, + "grad_norm": 0.47932502119566067, + "learning_rate": 4.9454919864887e-07, + "logits/chosen": -0.46029338240623474, + "logits/rejected": -0.3233875632286072, + "logps/chosen": -4.486559867858887, + "logps/rejected": -5.101731300354004, + "loss": 0.0521, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -4.486559867858887, + "rewards/margins": 0.6151722073554993, + "rewards/rejected": -5.101731300354004, + "sft_loss": 4.225186347961426, + "step": 3100 + }, + { + "epoch": 1.6618163572503764, + "grad_norm": 0.5705615653143404, + "learning_rate": 4.929919174924701e-07, + "logits/chosen": -0.4992721676826477, + "logits/rejected": -0.22667022049427032, + "logps/chosen": -4.493367671966553, + "logps/rejected": -5.135613441467285, + "loss": 0.0526, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -4.493367671966553, + "rewards/margins": 0.6422454118728638, + "rewards/rejected": -5.135613441467285, + "sft_loss": 4.319209098815918, + "step": 3105 + }, + { + "epoch": 1.6644923900317778, + "grad_norm": 0.4273187134282065, + "learning_rate": 4.914347043285177e-07, + "logits/chosen": -0.3602706789970398, + "logits/rejected": -0.19205181300640106, + "logps/chosen": -4.5518798828125, + "logps/rejected": -5.212324142456055, + "loss": 0.0506, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -4.5518798828125, + "rewards/margins": 0.6604443788528442, + "rewards/rejected": -5.212324142456055, + "sft_loss": 4.110279560089111, + "step": 3110 + }, + { + "epoch": 1.6671684228131793, + "grad_norm": 0.4942906202027163, + "learning_rate": 4.898775742651013e-07, + "logits/chosen": -0.353047639131546, + "logits/rejected": -0.21400539577007294, + "logps/chosen": -4.460274696350098, + "logps/rejected": -5.068617820739746, + "loss": 0.0509, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -4.460274696350098, + "rewards/margins": 0.6083430647850037, + "rewards/rejected": -5.068617820739746, + "sft_loss": 4.063004493713379, + "step": 3115 + }, + { + "epoch": 1.669844455594581, + "grad_norm": 0.46608384889709426, + "learning_rate": 4.883205424095037e-07, + "logits/chosen": -0.4946712553501129, + "logits/rejected": -0.3003917634487152, + "logps/chosen": -4.2645416259765625, + "logps/rejected": -5.04714298248291, + "loss": 0.0511, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -4.2645416259765625, + "rewards/margins": 0.7826014757156372, + "rewards/rejected": -5.04714298248291, + "sft_loss": 4.0179572105407715, + "step": 3120 + }, + { + "epoch": 1.6725204883759828, + "grad_norm": 0.5031334271882354, + "learning_rate": 4.86763623868055e-07, + "logits/chosen": -0.43839550018310547, + "logits/rejected": -0.28657081723213196, + "logps/chosen": -4.571595191955566, + "logps/rejected": -5.225712776184082, + "loss": 0.0515, + "rewards/accuracies": 0.6875, + "rewards/chosen": -4.571595191955566, + "rewards/margins": 0.6541174054145813, + "rewards/rejected": -5.225712776184082, + "sft_loss": 4.217996597290039, + "step": 3125 + }, + { + "epoch": 1.675196521157384, + "grad_norm": 0.47945064630778916, + "learning_rate": 4.852068337459856e-07, + "logits/chosen": -0.350267231464386, + "logits/rejected": -0.14085647463798523, + "logps/chosen": -4.361026287078857, + "logps/rejected": -5.13167667388916, + "loss": 0.0502, + "rewards/accuracies": 0.71875, + "rewards/chosen": -4.361026287078857, + "rewards/margins": 0.7706495523452759, + "rewards/rejected": -5.13167667388916, + "sft_loss": 4.0194172859191895, + "step": 3130 + }, + { + "epoch": 1.6778725539387858, + "grad_norm": 0.48807203107233454, + "learning_rate": 4.8365018714728e-07, + "logits/chosen": -0.28777769207954407, + "logits/rejected": -0.19730152189731598, + "logps/chosen": -4.527838230133057, + "logps/rejected": -5.101526737213135, + "loss": 0.053, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -4.527838230133057, + "rewards/margins": 0.5736882090568542, + "rewards/rejected": -5.101526737213135, + "sft_loss": 4.241372108459473, + "step": 3135 + }, + { + "epoch": 1.6805485867201875, + "grad_norm": 0.3600013132470304, + "learning_rate": 4.820936991745304e-07, + "logits/chosen": -0.5948072671890259, + "logits/rejected": -0.4410117566585541, + "logps/chosen": -4.5470781326293945, + "logps/rejected": -5.075103282928467, + "loss": 0.0523, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -4.5470781326293945, + "rewards/margins": 0.528024435043335, + "rewards/rejected": -5.075103282928467, + "sft_loss": 4.2060041427612305, + "step": 3140 + }, + { + "epoch": 1.6832246195015887, + "grad_norm": 0.5094808818966736, + "learning_rate": 4.8053738492879e-07, + "logits/chosen": -0.35384225845336914, + "logits/rejected": -0.19477471709251404, + "logps/chosen": -4.505183219909668, + "logps/rejected": -5.261287689208984, + "loss": 0.0508, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -4.505183219909668, + "rewards/margins": 0.7561042904853821, + "rewards/rejected": -5.261287689208984, + "sft_loss": 4.159135341644287, + "step": 3145 + }, + { + "epoch": 1.6859006522829905, + "grad_norm": 0.5864782893861172, + "learning_rate": 4.789812595094265e-07, + "logits/chosen": -0.49953025579452515, + "logits/rejected": -0.3851977586746216, + "logps/chosen": -4.435602188110352, + "logps/rejected": -5.099587440490723, + "loss": 0.0515, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -4.435602188110352, + "rewards/margins": 0.6639851927757263, + "rewards/rejected": -5.099587440490723, + "sft_loss": 4.142877578735352, + "step": 3150 + }, + { + "epoch": 1.6885766850643922, + "grad_norm": 0.4774100379036388, + "learning_rate": 4.774253380139752e-07, + "logits/chosen": -0.5168375372886658, + "logits/rejected": -0.3822152018547058, + "logps/chosen": -4.490707874298096, + "logps/rejected": -5.120068550109863, + "loss": 0.0515, + "rewards/accuracies": 0.71875, + "rewards/chosen": -4.490707874298096, + "rewards/margins": 0.6293607950210571, + "rewards/rejected": -5.120068550109863, + "sft_loss": 4.182126998901367, + "step": 3155 + }, + { + "epoch": 1.6912527178457935, + "grad_norm": 0.5003474642863843, + "learning_rate": 4.758696355379936e-07, + "logits/chosen": -0.262535035610199, + "logits/rejected": -0.36348050832748413, + "logps/chosen": -4.541584014892578, + "logps/rejected": -5.141721725463867, + "loss": 0.0519, + "rewards/accuracies": 0.65625, + "rewards/chosen": -4.541584014892578, + "rewards/margins": 0.6001380681991577, + "rewards/rejected": -5.141721725463867, + "sft_loss": 4.230230331420898, + "step": 3160 + }, + { + "epoch": 1.6939287506271952, + "grad_norm": 0.7576816731387156, + "learning_rate": 4.743141671749138e-07, + "logits/chosen": -0.5238488912582397, + "logits/rejected": -0.3342793881893158, + "logps/chosen": -4.28743314743042, + "logps/rejected": -4.850545883178711, + "loss": 0.0534, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -4.28743314743042, + "rewards/margins": 0.5631122589111328, + "rewards/rejected": -4.850545883178711, + "sft_loss": 4.096673011779785, + "step": 3165 + }, + { + "epoch": 1.6966047834085969, + "grad_norm": 0.3851983061345649, + "learning_rate": 4.727589480158968e-07, + "logits/chosen": -0.39444658160209656, + "logits/rejected": -0.2814629077911377, + "logps/chosen": -4.476624488830566, + "logps/rejected": -5.280016899108887, + "loss": 0.0508, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -4.476624488830566, + "rewards/margins": 0.803392767906189, + "rewards/rejected": -5.280016899108887, + "sft_loss": 4.236693859100342, + "step": 3170 + }, + { + "epoch": 1.6992808161899984, + "grad_norm": 0.47151248691810815, + "learning_rate": 4.712039931496855e-07, + "logits/chosen": -0.3986544609069824, + "logits/rejected": -0.18574748933315277, + "logps/chosen": -4.561751365661621, + "logps/rejected": -5.0480241775512695, + "loss": 0.052, + "rewards/accuracies": 0.65625, + "rewards/chosen": -4.561751365661621, + "rewards/margins": 0.4862731993198395, + "rewards/rejected": -5.0480241775512695, + "sft_loss": 4.235430717468262, + "step": 3175 + }, + { + "epoch": 1.7019568489713999, + "grad_norm": 0.49421059769461206, + "learning_rate": 4.6964931766245905e-07, + "logits/chosen": -0.2525349259376526, + "logits/rejected": -0.20519249141216278, + "logps/chosen": -4.362771034240723, + "logps/rejected": -5.0366387367248535, + "loss": 0.0531, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -4.362771034240723, + "rewards/margins": 0.6738678216934204, + "rewards/rejected": -5.0366387367248535, + "sft_loss": 4.0702033042907715, + "step": 3180 + }, + { + "epoch": 1.7046328817528016, + "grad_norm": 0.4749868502685346, + "learning_rate": 4.6809493663768575e-07, + "logits/chosen": -0.36834999918937683, + "logits/rejected": -0.3293028771877289, + "logps/chosen": -4.631998538970947, + "logps/rejected": -4.988096237182617, + "loss": 0.0519, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -4.631998538970947, + "rewards/margins": 0.3560978174209595, + "rewards/rejected": -4.988096237182617, + "sft_loss": 4.213697910308838, + "step": 3185 + }, + { + "epoch": 1.707308914534203, + "grad_norm": 0.4898756104693347, + "learning_rate": 4.6654086515597716e-07, + "logits/chosen": -0.5464175343513489, + "logits/rejected": -0.31873512268066406, + "logps/chosen": -4.390994071960449, + "logps/rejected": -5.097039222717285, + "loss": 0.051, + "rewards/accuracies": 0.71875, + "rewards/chosen": -4.390994071960449, + "rewards/margins": 0.7060455083847046, + "rewards/rejected": -5.097039222717285, + "sft_loss": 4.161412715911865, + "step": 3190 + }, + { + "epoch": 1.7099849473156046, + "grad_norm": 0.4561997507441212, + "learning_rate": 4.6498711829494154e-07, + "logits/chosen": -0.5197268128395081, + "logits/rejected": -0.39064133167266846, + "logps/chosen": -4.570242881774902, + "logps/rejected": -5.085498809814453, + "loss": 0.0534, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -4.570242881774902, + "rewards/margins": 0.5152562260627747, + "rewards/rejected": -5.085498809814453, + "sft_loss": 4.3351335525512695, + "step": 3195 + }, + { + "epoch": 1.7126609800970063, + "grad_norm": 0.5648070703248154, + "learning_rate": 4.6343371112903777e-07, + "logits/chosen": -0.4234795570373535, + "logits/rejected": -0.21430261433124542, + "logps/chosen": -4.63162088394165, + "logps/rejected": -5.301953315734863, + "loss": 0.0526, + "rewards/accuracies": 0.65625, + "rewards/chosen": -4.63162088394165, + "rewards/margins": 0.6703327298164368, + "rewards/rejected": -5.301953315734863, + "sft_loss": 4.310842990875244, + "step": 3200 + }, + { + "epoch": 1.7126609800970063, + "eval_logits/chosen": 0.09203866124153137, + "eval_logits/rejected": 0.20023885369300842, + "eval_logps/chosen": -4.494305610656738, + "eval_logps/rejected": -5.15374755859375, + "eval_loss": 0.05033748596906662, + "eval_rewards/accuracies": 0.6750741600990295, + "eval_rewards/chosen": -4.494305610656738, + "eval_rewards/margins": 0.6594412326812744, + "eval_rewards/rejected": -5.15374755859375, + "eval_runtime": 44.6023, + "eval_samples_per_second": 30.155, + "eval_sft_loss": 4.088565826416016, + "eval_steps_per_second": 7.556, + "step": 3200 + }, + { + "epoch": 1.7153370128784078, + "grad_norm": 0.6012174576350848, + "learning_rate": 4.618806587294291e-07, + "logits/chosen": -0.4575120508670807, + "logits/rejected": -0.3348286747932434, + "logps/chosen": -4.449155330657959, + "logps/rejected": -5.158486366271973, + "loss": 0.0512, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -4.449155330657959, + "rewards/margins": 0.7093305587768555, + "rewards/rejected": -5.158486366271973, + "sft_loss": 4.231721878051758, + "step": 3205 + }, + { + "epoch": 1.7180130456598093, + "grad_norm": 0.4060400100494984, + "learning_rate": 4.603279761638365e-07, + "logits/chosen": -0.49344611167907715, + "logits/rejected": -0.3567366898059845, + "logps/chosen": -4.435418128967285, + "logps/rejected": -5.056334495544434, + "loss": 0.0518, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -4.435418128967285, + "rewards/margins": 0.6209160089492798, + "rewards/rejected": -5.056334495544434, + "sft_loss": 4.158322811126709, + "step": 3210 + }, + { + "epoch": 1.720689078441211, + "grad_norm": 0.440167549424582, + "learning_rate": 4.5877567849639315e-07, + "logits/chosen": -0.35082921385765076, + "logits/rejected": -0.24494728446006775, + "logps/chosen": -4.462352752685547, + "logps/rejected": -5.083427429199219, + "loss": 0.052, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -4.462352752685547, + "rewards/margins": 0.6210747361183167, + "rewards/rejected": -5.083427429199219, + "sft_loss": 4.217515468597412, + "step": 3215 + }, + { + "epoch": 1.7233651112226125, + "grad_norm": 0.5313251152060041, + "learning_rate": 4.572237807874979e-07, + "logits/chosen": -0.5096575617790222, + "logits/rejected": -0.15080931782722473, + "logps/chosen": -4.45919942855835, + "logps/rejected": -5.160483360290527, + "loss": 0.0509, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -4.45919942855835, + "rewards/margins": 0.7012836337089539, + "rewards/rejected": -5.160483360290527, + "sft_loss": 4.170629501342773, + "step": 3220 + }, + { + "epoch": 1.726041144004014, + "grad_norm": 0.48971300206193963, + "learning_rate": 4.5567229809366895e-07, + "logits/chosen": -0.3851815462112427, + "logits/rejected": -0.22465872764587402, + "logps/chosen": -4.357975959777832, + "logps/rejected": -5.021440505981445, + "loss": 0.0518, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -4.357975959777832, + "rewards/margins": 0.6634647846221924, + "rewards/rejected": -5.021440505981445, + "sft_loss": 4.036141395568848, + "step": 3225 + }, + { + "epoch": 1.7287171767854157, + "grad_norm": 0.6060982224485721, + "learning_rate": 4.541212454673984e-07, + "logits/chosen": -0.4702853560447693, + "logits/rejected": -0.2630925178527832, + "logps/chosen": -4.5082268714904785, + "logps/rejected": -5.44030237197876, + "loss": 0.0508, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -4.5082268714904785, + "rewards/margins": 0.9320752024650574, + "rewards/rejected": -5.44030237197876, + "sft_loss": 4.23955774307251, + "step": 3230 + }, + { + "epoch": 1.7313932095668172, + "grad_norm": 0.5406642244933184, + "learning_rate": 4.525706379570055e-07, + "logits/chosen": -0.46813541650772095, + "logits/rejected": -0.36023473739624023, + "logps/chosen": -4.2994184494018555, + "logps/rejected": -5.022599220275879, + "loss": 0.0514, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -4.2994184494018555, + "rewards/margins": 0.7231807708740234, + "rewards/rejected": -5.022599220275879, + "sft_loss": 4.133441925048828, + "step": 3235 + }, + { + "epoch": 1.7340692423482187, + "grad_norm": 0.5646861972185111, + "learning_rate": 4.510204906064911e-07, + "logits/chosen": -0.40759220719337463, + "logits/rejected": -0.21394577622413635, + "logps/chosen": -4.382226467132568, + "logps/rejected": -5.132818698883057, + "loss": 0.0505, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.382226467132568, + "rewards/margins": 0.7505923509597778, + "rewards/rejected": -5.132818698883057, + "sft_loss": 4.103928565979004, + "step": 3240 + }, + { + "epoch": 1.7367452751296204, + "grad_norm": 1.1862512605254292, + "learning_rate": 4.4947081845539177e-07, + "logits/chosen": -0.5004664659500122, + "logits/rejected": -0.3564170002937317, + "logps/chosen": -4.307888507843018, + "logps/rejected": -5.052638053894043, + "loss": 0.0515, + "rewards/accuracies": 0.65625, + "rewards/chosen": -4.307888507843018, + "rewards/margins": 0.7447504997253418, + "rewards/rejected": -5.052638053894043, + "sft_loss": 4.0162553787231445, + "step": 3245 + }, + { + "epoch": 1.739421307911022, + "grad_norm": 0.44291728240441197, + "learning_rate": 4.479216365386333e-07, + "logits/chosen": -0.34964537620544434, + "logits/rejected": -0.12260773032903671, + "logps/chosen": -4.081252574920654, + "logps/rejected": -4.9591217041015625, + "loss": 0.0499, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -4.081252574920654, + "rewards/margins": 0.8778694272041321, + "rewards/rejected": -4.9591217041015625, + "sft_loss": 3.7887864112854004, + "step": 3250 + }, + { + "epoch": 1.7420973406924234, + "grad_norm": 0.4728837633840602, + "learning_rate": 4.4637295988638555e-07, + "logits/chosen": -0.37212151288986206, + "logits/rejected": -0.2665833532810211, + "logps/chosen": -4.327893257141113, + "logps/rejected": -5.067246913909912, + "loss": 0.0506, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -4.327893257141113, + "rewards/margins": 0.7393532991409302, + "rewards/rejected": -5.067246913909912, + "sft_loss": 4.035815238952637, + "step": 3255 + }, + { + "epoch": 1.744773373473825, + "grad_norm": 0.6663758806493927, + "learning_rate": 4.4482480352391623e-07, + "logits/chosen": -0.35964614152908325, + "logits/rejected": -0.21862125396728516, + "logps/chosen": -4.549116611480713, + "logps/rejected": -5.252104759216309, + "loss": 0.053, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -4.549116611480713, + "rewards/margins": 0.7029882669448853, + "rewards/rejected": -5.252104759216309, + "sft_loss": 4.308150291442871, + "step": 3260 + }, + { + "epoch": 1.7474494062552266, + "grad_norm": 0.6937239177601965, + "learning_rate": 4.4327718247144507e-07, + "logits/chosen": -0.30185604095458984, + "logits/rejected": -0.1327410191297531, + "logps/chosen": -4.816344738006592, + "logps/rejected": -5.434140682220459, + "loss": 0.0528, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -4.816344738006592, + "rewards/margins": 0.6177955865859985, + "rewards/rejected": -5.434140682220459, + "sft_loss": 4.486063480377197, + "step": 3265 + }, + { + "epoch": 1.750125439036628, + "grad_norm": 0.5476027899361571, + "learning_rate": 4.417301117439984e-07, + "logits/chosen": -0.48964279890060425, + "logits/rejected": -0.3335990309715271, + "logps/chosen": -4.413262844085693, + "logps/rejected": -5.010111331939697, + "loss": 0.0523, + "rewards/accuracies": 0.65625, + "rewards/chosen": -4.413262844085693, + "rewards/margins": 0.5968478322029114, + "rewards/rejected": -5.010111331939697, + "sft_loss": 4.17691707611084, + "step": 3270 + }, + { + "epoch": 1.7528014718180298, + "grad_norm": 0.4816724842744238, + "learning_rate": 4.401836063512631e-07, + "logits/chosen": -0.5395382642745972, + "logits/rejected": -0.15304972231388092, + "logps/chosen": -4.421982765197754, + "logps/rejected": -5.152639865875244, + "loss": 0.0504, + "rewards/accuracies": 0.6875, + "rewards/chosen": -4.421982765197754, + "rewards/margins": 0.7306567430496216, + "rewards/rejected": -5.152639865875244, + "sft_loss": 4.0851616859436035, + "step": 3275 + }, + { + "epoch": 1.7554775045994313, + "grad_norm": 0.7336416483819101, + "learning_rate": 4.386376812974413e-07, + "logits/chosen": -0.4231603741645813, + "logits/rejected": -0.38552913069725037, + "logps/chosen": -4.532313346862793, + "logps/rejected": -5.2002739906311035, + "loss": 0.0516, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -4.532313346862793, + "rewards/margins": 0.6679608225822449, + "rewards/rejected": -5.2002739906311035, + "sft_loss": 4.266327857971191, + "step": 3280 + }, + { + "epoch": 1.7581535373808328, + "grad_norm": 0.42548735895031536, + "learning_rate": 4.370923515811048e-07, + "logits/chosen": -0.48353782296180725, + "logits/rejected": -0.22564320266246796, + "logps/chosen": -4.283061981201172, + "logps/rejected": -5.007942199707031, + "loss": 0.0511, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -4.283061981201172, + "rewards/margins": 0.724880039691925, + "rewards/rejected": -5.007942199707031, + "sft_loss": 3.973630905151367, + "step": 3285 + }, + { + "epoch": 1.7608295701622345, + "grad_norm": 0.4704875029146959, + "learning_rate": 4.35547632195049e-07, + "logits/chosen": -0.3822460472583771, + "logits/rejected": -0.2654896378517151, + "logps/chosen": -4.6180033683776855, + "logps/rejected": -5.288962364196777, + "loss": 0.0516, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -4.6180033683776855, + "rewards/margins": 0.6709581613540649, + "rewards/rejected": -5.288962364196777, + "sft_loss": 4.231356143951416, + "step": 3290 + }, + { + "epoch": 1.763505602943636, + "grad_norm": 0.5886824609412865, + "learning_rate": 4.340035381261484e-07, + "logits/chosen": -0.42286020517349243, + "logits/rejected": -0.34611302614212036, + "logps/chosen": -4.418186664581299, + "logps/rejected": -5.195922374725342, + "loss": 0.0505, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -4.418186664581299, + "rewards/margins": 0.7777358293533325, + "rewards/rejected": -5.195922374725342, + "sft_loss": 4.094481945037842, + "step": 3295 + }, + { + "epoch": 1.7661816357250375, + "grad_norm": 0.4488080995370675, + "learning_rate": 4.324600843552104e-07, + "logits/chosen": -0.552751898765564, + "logits/rejected": -0.40471887588500977, + "logps/chosen": -4.430189609527588, + "logps/rejected": -5.224255084991455, + "loss": 0.0506, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -4.430189609527588, + "rewards/margins": 0.7940656542778015, + "rewards/rejected": -5.224255084991455, + "sft_loss": 4.111329555511475, + "step": 3300 + }, + { + "epoch": 1.7688576685064392, + "grad_norm": 0.42602730666500793, + "learning_rate": 4.309172858568302e-07, + "logits/chosen": -0.5452768206596375, + "logits/rejected": -0.3276907801628113, + "logps/chosen": -4.437650680541992, + "logps/rejected": -5.14551305770874, + "loss": 0.0507, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -4.437650680541992, + "rewards/margins": 0.7078622579574585, + "rewards/rejected": -5.14551305770874, + "sft_loss": 4.128241539001465, + "step": 3305 + }, + { + "epoch": 1.771533701287841, + "grad_norm": 0.6390670976562808, + "learning_rate": 4.293751575992455e-07, + "logits/chosen": -0.35805755853652954, + "logits/rejected": -0.33667880296707153, + "logps/chosen": -4.5810627937316895, + "logps/rejected": -5.146379470825195, + "loss": 0.0521, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -4.5810627937316895, + "rewards/margins": 0.5653164982795715, + "rewards/rejected": -5.146379470825195, + "sft_loss": 4.303463935852051, + "step": 3310 + }, + { + "epoch": 1.7742097340692422, + "grad_norm": 0.3752889320968409, + "learning_rate": 4.278337145441916e-07, + "logits/chosen": -0.5173454284667969, + "logits/rejected": -0.3171837329864502, + "logps/chosen": -4.367125511169434, + "logps/rejected": -5.153825283050537, + "loss": 0.0506, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -4.367125511169434, + "rewards/margins": 0.7866994142532349, + "rewards/rejected": -5.153825283050537, + "sft_loss": 4.1063642501831055, + "step": 3315 + }, + { + "epoch": 1.776885766850644, + "grad_norm": 0.5181567088724439, + "learning_rate": 4.262929716467556e-07, + "logits/chosen": -0.42447957396507263, + "logits/rejected": -0.12471141666173935, + "logps/chosen": -4.273622035980225, + "logps/rejected": -5.119785308837891, + "loss": 0.0515, + "rewards/accuracies": 0.71875, + "rewards/chosen": -4.273622035980225, + "rewards/margins": 0.8461631536483765, + "rewards/rejected": -5.119785308837891, + "sft_loss": 4.039144992828369, + "step": 3320 + }, + { + "epoch": 1.7795617996320456, + "grad_norm": 0.39367776087848066, + "learning_rate": 4.247529438552321e-07, + "logits/chosen": -0.49922627210617065, + "logits/rejected": -0.2579634487628937, + "logps/chosen": -4.542778968811035, + "logps/rejected": -5.047327995300293, + "loss": 0.053, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -4.542778968811035, + "rewards/margins": 0.5045495629310608, + "rewards/rejected": -5.047327995300293, + "sft_loss": 4.305514335632324, + "step": 3325 + }, + { + "epoch": 1.782237832413447, + "grad_norm": 0.5683064013318566, + "learning_rate": 4.232136461109773e-07, + "logits/chosen": -0.4508441388607025, + "logits/rejected": -0.32689857482910156, + "logps/chosen": -4.481637001037598, + "logps/rejected": -5.132130146026611, + "loss": 0.0508, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -4.481637001037598, + "rewards/margins": 0.6504932045936584, + "rewards/rejected": -5.132130146026611, + "sft_loss": 4.178994178771973, + "step": 3330 + }, + { + "epoch": 1.7849138651948486, + "grad_norm": 0.4845698813878629, + "learning_rate": 4.216750933482646e-07, + "logits/chosen": -0.4532322883605957, + "logits/rejected": -0.2531808614730835, + "logps/chosen": -4.547987461090088, + "logps/rejected": -5.118274688720703, + "loss": 0.0522, + "rewards/accuracies": 0.6875, + "rewards/chosen": -4.547987461090088, + "rewards/margins": 0.5702873468399048, + "rewards/rejected": -5.118274688720703, + "sft_loss": 4.188256740570068, + "step": 3335 + }, + { + "epoch": 1.7875898979762503, + "grad_norm": 0.5898990946897025, + "learning_rate": 4.2013730049413986e-07, + "logits/chosen": -0.4140399396419525, + "logits/rejected": -0.18528516590595245, + "logps/chosen": -4.273344039916992, + "logps/rejected": -5.252946376800537, + "loss": 0.0501, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -4.273344039916992, + "rewards/margins": 0.9796028137207031, + "rewards/rejected": -5.252946376800537, + "sft_loss": 4.037383556365967, + "step": 3340 + }, + { + "epoch": 1.7902659307576518, + "grad_norm": 0.32678783019208785, + "learning_rate": 4.1860028246827594e-07, + "logits/chosen": -0.46477198600769043, + "logits/rejected": -0.18900129199028015, + "logps/chosen": -4.3688130378723145, + "logps/rejected": -4.982115745544434, + "loss": 0.0516, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -4.3688130378723145, + "rewards/margins": 0.6133025288581848, + "rewards/rejected": -4.982115745544434, + "sft_loss": 4.144090175628662, + "step": 3345 + }, + { + "epoch": 1.7929419635390533, + "grad_norm": 0.5801523203742189, + "learning_rate": 4.170640541828285e-07, + "logits/chosen": -0.5855122804641724, + "logits/rejected": -0.41854292154312134, + "logps/chosen": -4.312270164489746, + "logps/rejected": -4.93634557723999, + "loss": 0.0521, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -4.312270164489746, + "rewards/margins": 0.6240752935409546, + "rewards/rejected": -4.93634557723999, + "sft_loss": 4.130881309509277, + "step": 3350 + }, + { + "epoch": 1.795617996320455, + "grad_norm": 0.5156421662577778, + "learning_rate": 4.1552863054229116e-07, + "logits/chosen": -0.3728213906288147, + "logits/rejected": -0.3222619891166687, + "logps/chosen": -4.4992995262146, + "logps/rejected": -5.148545265197754, + "loss": 0.0516, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -4.4992995262146, + "rewards/margins": 0.6492457985877991, + "rewards/rejected": -5.148545265197754, + "sft_loss": 4.185006141662598, + "step": 3355 + }, + { + "epoch": 1.7982940291018565, + "grad_norm": 0.6314975765154605, + "learning_rate": 4.139940264433508e-07, + "logits/chosen": -0.4350285530090332, + "logits/rejected": -0.21461530029773712, + "logps/chosen": -4.306244373321533, + "logps/rejected": -5.169526100158691, + "loss": 0.0507, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -4.306244373321533, + "rewards/margins": 0.86328125, + "rewards/rejected": -5.169526100158691, + "sft_loss": 4.004863739013672, + "step": 3360 + }, + { + "epoch": 1.800970061883258, + "grad_norm": 0.34425815053776077, + "learning_rate": 4.1246025677474303e-07, + "logits/chosen": -0.5348860621452332, + "logits/rejected": -0.2687731385231018, + "logps/chosen": -4.5383830070495605, + "logps/rejected": -5.160616874694824, + "loss": 0.0527, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -4.5383830070495605, + "rewards/margins": 0.6222342252731323, + "rewards/rejected": -5.160616874694824, + "sft_loss": 4.3074493408203125, + "step": 3365 + }, + { + "epoch": 1.8036460946646597, + "grad_norm": 0.44953749468074905, + "learning_rate": 4.10927336417108e-07, + "logits/chosen": -0.46770724654197693, + "logits/rejected": -0.22437289357185364, + "logps/chosen": -4.51656436920166, + "logps/rejected": -5.041363716125488, + "loss": 0.0524, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -4.51656436920166, + "rewards/margins": 0.5247992277145386, + "rewards/rejected": -5.041363716125488, + "sft_loss": 4.19726037979126, + "step": 3370 + }, + { + "epoch": 1.8063221274460612, + "grad_norm": 0.7984227678138108, + "learning_rate": 4.093952802428457e-07, + "logits/chosen": -0.2765560746192932, + "logits/rejected": -0.23209819197654724, + "logps/chosen": -4.682974338531494, + "logps/rejected": -5.259125232696533, + "loss": 0.0531, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -4.682974338531494, + "rewards/margins": 0.5761508941650391, + "rewards/rejected": -5.259125232696533, + "sft_loss": 4.388473987579346, + "step": 3375 + }, + { + "epoch": 1.8089981602274627, + "grad_norm": 0.5857258417880921, + "learning_rate": 4.0786410311597184e-07, + "logits/chosen": -0.539823591709137, + "logits/rejected": -0.2948388159275055, + "logps/chosen": -4.513506889343262, + "logps/rejected": -5.19875955581665, + "loss": 0.0513, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -4.513506889343262, + "rewards/margins": 0.685252845287323, + "rewards/rejected": -5.19875955581665, + "sft_loss": 4.188086986541748, + "step": 3380 + }, + { + "epoch": 1.8116741930088645, + "grad_norm": 0.41345892065077483, + "learning_rate": 4.063338198919737e-07, + "logits/chosen": -0.4432690739631653, + "logits/rejected": -0.4649744927883148, + "logps/chosen": -4.3670573234558105, + "logps/rejected": -4.942203044891357, + "loss": 0.0522, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -4.3670573234558105, + "rewards/margins": 0.575145423412323, + "rewards/rejected": -4.942203044891357, + "sft_loss": 4.128762245178223, + "step": 3385 + }, + { + "epoch": 1.814350225790266, + "grad_norm": 0.3807537260151555, + "learning_rate": 4.0480444541766575e-07, + "logits/chosen": -0.517729640007019, + "logits/rejected": -0.34260427951812744, + "logps/chosen": -4.635345458984375, + "logps/rejected": -5.255611419677734, + "loss": 0.0526, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -4.635345458984375, + "rewards/margins": 0.6202660202980042, + "rewards/rejected": -5.255611419677734, + "sft_loss": 4.328315258026123, + "step": 3390 + }, + { + "epoch": 1.8170262585716674, + "grad_norm": 0.6093769811207337, + "learning_rate": 4.0327599453104606e-07, + "logits/chosen": -0.602250337600708, + "logits/rejected": -0.39874905347824097, + "logps/chosen": -4.449357986450195, + "logps/rejected": -5.050063610076904, + "loss": 0.0505, + "rewards/accuracies": 0.6875, + "rewards/chosen": -4.449357986450195, + "rewards/margins": 0.6007059812545776, + "rewards/rejected": -5.050063610076904, + "sft_loss": 4.129879474639893, + "step": 3395 + }, + { + "epoch": 1.8197022913530692, + "grad_norm": 0.5274513119746579, + "learning_rate": 4.017484820611514e-07, + "logits/chosen": -0.4147399961948395, + "logits/rejected": -0.24769814312458038, + "logps/chosen": -4.3410162925720215, + "logps/rejected": -5.063584804534912, + "loss": 0.0501, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -4.3410162925720215, + "rewards/margins": 0.7225686311721802, + "rewards/rejected": -5.063584804534912, + "sft_loss": 3.992316484451294, + "step": 3400 + }, + { + "epoch": 1.8223783241344707, + "grad_norm": 0.8969781254234718, + "learning_rate": 4.002219228279148e-07, + "logits/chosen": -0.4810507297515869, + "logits/rejected": -0.3219291567802429, + "logps/chosen": -4.2392802238464355, + "logps/rejected": -4.895898342132568, + "loss": 0.0508, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -4.2392802238464355, + "rewards/margins": 0.6566182374954224, + "rewards/rejected": -4.895898342132568, + "sft_loss": 3.9705615043640137, + "step": 3405 + }, + { + "epoch": 1.8250543569158721, + "grad_norm": 0.5333461902605714, + "learning_rate": 3.9869633164202045e-07, + "logits/chosen": -0.5034727454185486, + "logits/rejected": -0.1998063623905182, + "logps/chosen": -4.280855655670166, + "logps/rejected": -5.128933906555176, + "loss": 0.05, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.280855655670166, + "rewards/margins": 0.8480777740478516, + "rewards/rejected": -5.128933906555176, + "sft_loss": 3.991110324859619, + "step": 3410 + }, + { + "epoch": 1.8277303896972739, + "grad_norm": 0.7210380554678926, + "learning_rate": 3.9717172330476077e-07, + "logits/chosen": -0.503537654876709, + "logits/rejected": -0.36525729298591614, + "logps/chosen": -4.475733280181885, + "logps/rejected": -5.242091655731201, + "loss": 0.0515, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -4.475733280181885, + "rewards/margins": 0.7663584351539612, + "rewards/rejected": -5.242091655731201, + "sft_loss": 4.2255539894104, + "step": 3415 + }, + { + "epoch": 1.8304064224786754, + "grad_norm": 0.5579904639421918, + "learning_rate": 3.956481126078927e-07, + "logits/chosen": -0.31057581305503845, + "logits/rejected": -0.19793424010276794, + "logps/chosen": -4.797817230224609, + "logps/rejected": -5.580362796783447, + "loss": 0.0542, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -4.797817230224609, + "rewards/margins": 0.7825452089309692, + "rewards/rejected": -5.580362796783447, + "sft_loss": 4.528318881988525, + "step": 3420 + }, + { + "epoch": 1.8330824552600768, + "grad_norm": 0.6518358573852266, + "learning_rate": 3.941255143334937e-07, + "logits/chosen": -0.48437052965164185, + "logits/rejected": -0.4504765570163727, + "logps/chosen": -4.491649627685547, + "logps/rejected": -5.1009202003479, + "loss": 0.0509, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -4.491649627685547, + "rewards/margins": 0.609270453453064, + "rewards/rejected": -5.1009202003479, + "sft_loss": 4.175753116607666, + "step": 3425 + }, + { + "epoch": 1.8357584880414786, + "grad_norm": 0.38888651929066054, + "learning_rate": 3.9260394325381895e-07, + "logits/chosen": -0.4671882688999176, + "logits/rejected": -0.33713823556900024, + "logps/chosen": -4.218931198120117, + "logps/rejected": -5.193713188171387, + "loss": 0.0493, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -4.218931198120117, + "rewards/margins": 0.9747824668884277, + "rewards/rejected": -5.193713188171387, + "sft_loss": 3.91839599609375, + "step": 3430 + }, + { + "epoch": 1.83843452082288, + "grad_norm": 0.45745043157722315, + "learning_rate": 3.9108341413115784e-07, + "logits/chosen": -0.3741157650947571, + "logits/rejected": -0.24425411224365234, + "logps/chosen": -4.257815361022949, + "logps/rejected": -4.900847434997559, + "loss": 0.0507, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -4.257815361022949, + "rewards/margins": 0.6430323719978333, + "rewards/rejected": -4.900847434997559, + "sft_loss": 3.8886466026306152, + "step": 3435 + }, + { + "epoch": 1.8411105536042816, + "grad_norm": 0.5181841104189797, + "learning_rate": 3.895639417176905e-07, + "logits/chosen": -0.463533490896225, + "logits/rejected": -0.34041827917099, + "logps/chosen": -4.538870334625244, + "logps/rejected": -5.209536075592041, + "loss": 0.0548, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -4.538870334625244, + "rewards/margins": 0.670665442943573, + "rewards/rejected": -5.209536075592041, + "sft_loss": 4.259721755981445, + "step": 3440 + }, + { + "epoch": 1.8437865863856833, + "grad_norm": 0.7463694923350966, + "learning_rate": 3.8804554075534497e-07, + "logits/chosen": -0.4446185231208801, + "logits/rejected": -0.18138308823108673, + "logps/chosen": -4.314180850982666, + "logps/rejected": -5.201914310455322, + "loss": 0.0509, + "rewards/accuracies": 0.71875, + "rewards/chosen": -4.314180850982666, + "rewards/margins": 0.8877336382865906, + "rewards/rejected": -5.201914310455322, + "sft_loss": 4.067704200744629, + "step": 3445 + }, + { + "epoch": 1.8464626191670848, + "grad_norm": 0.7305408844038221, + "learning_rate": 3.8652822597565403e-07, + "logits/chosen": -0.5151158571243286, + "logits/rejected": -0.2955884337425232, + "logps/chosen": -4.570265293121338, + "logps/rejected": -5.323184967041016, + "loss": 0.052, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -4.570265293121338, + "rewards/margins": 0.7529199719429016, + "rewards/rejected": -5.323184967041016, + "sft_loss": 4.289970397949219, + "step": 3450 + }, + { + "epoch": 1.8491386519484863, + "grad_norm": 0.6296820490589102, + "learning_rate": 3.850120120996123e-07, + "logits/chosen": -0.456859290599823, + "logits/rejected": -0.181920126080513, + "logps/chosen": -4.556793689727783, + "logps/rejected": -5.294541835784912, + "loss": 0.0516, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -4.556793689727783, + "rewards/margins": 0.7377482652664185, + "rewards/rejected": -5.294541835784912, + "sft_loss": 4.333632469177246, + "step": 3455 + }, + { + "epoch": 1.851814684729888, + "grad_norm": 0.5484676674948301, + "learning_rate": 3.8349691383753356e-07, + "logits/chosen": -0.31831836700439453, + "logits/rejected": -0.17829912900924683, + "logps/chosen": -4.420655727386475, + "logps/rejected": -5.155341625213623, + "loss": 0.0511, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -4.420655727386475, + "rewards/margins": 0.7346860766410828, + "rewards/rejected": -5.155341625213623, + "sft_loss": 4.081669807434082, + "step": 3460 + }, + { + "epoch": 1.8544907175112895, + "grad_norm": 0.4479879814971447, + "learning_rate": 3.819829458889078e-07, + "logits/chosen": -0.4166865944862366, + "logits/rejected": -0.3156904876232147, + "logps/chosen": -4.233908176422119, + "logps/rejected": -4.787781715393066, + "loss": 0.0518, + "rewards/accuracies": 0.65625, + "rewards/chosen": -4.233908176422119, + "rewards/margins": 0.5538742542266846, + "rewards/rejected": -4.787781715393066, + "sft_loss": 3.992141008377075, + "step": 3465 + }, + { + "epoch": 1.857166750292691, + "grad_norm": 0.5758825106294463, + "learning_rate": 3.804701229422585e-07, + "logits/chosen": -0.4493212103843689, + "logits/rejected": -0.36382657289505005, + "logps/chosen": -4.502279281616211, + "logps/rejected": -5.006508827209473, + "loss": 0.0539, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -4.502279281616211, + "rewards/margins": 0.5042295455932617, + "rewards/rejected": -5.006508827209473, + "sft_loss": 4.146180152893066, + "step": 3470 + }, + { + "epoch": 1.8598427830740927, + "grad_norm": 0.6106162004698598, + "learning_rate": 3.789584596750007e-07, + "logits/chosen": -0.4591015875339508, + "logits/rejected": -0.4287734925746918, + "logps/chosen": -4.42536735534668, + "logps/rejected": -5.106713771820068, + "loss": 0.0519, + "rewards/accuracies": 0.6875, + "rewards/chosen": -4.42536735534668, + "rewards/margins": 0.6813467741012573, + "rewards/rejected": -5.106713771820068, + "sft_loss": 4.198317527770996, + "step": 3475 + }, + { + "epoch": 1.8625188158554944, + "grad_norm": 0.3729145732710297, + "learning_rate": 3.77447970753298e-07, + "logits/chosen": -0.32077568769454956, + "logits/rejected": -0.311382919549942, + "logps/chosen": -4.4926886558532715, + "logps/rejected": -5.12239933013916, + "loss": 0.0523, + "rewards/accuracies": 0.71875, + "rewards/chosen": -4.4926886558532715, + "rewards/margins": 0.6297103762626648, + "rewards/rejected": -5.12239933013916, + "sft_loss": 4.272998809814453, + "step": 3480 + }, + { + "epoch": 1.8651948486368957, + "grad_norm": 0.49448934482541285, + "learning_rate": 3.7593867083192057e-07, + "logits/chosen": -0.4439376890659332, + "logits/rejected": -0.2607240676879883, + "logps/chosen": -4.512078285217285, + "logps/rejected": -5.103994369506836, + "loss": 0.052, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -4.512078285217285, + "rewards/margins": 0.5919159054756165, + "rewards/rejected": -5.103994369506836, + "sft_loss": 4.274177551269531, + "step": 3485 + }, + { + "epoch": 1.8678708814182974, + "grad_norm": 0.4570421660226491, + "learning_rate": 3.7443057455410276e-07, + "logits/chosen": -0.36230507493019104, + "logits/rejected": -0.2706686556339264, + "logps/chosen": -4.175656318664551, + "logps/rejected": -5.063697814941406, + "loss": 0.05, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -4.175656318664551, + "rewards/margins": 0.8880417943000793, + "rewards/rejected": -5.063697814941406, + "sft_loss": 3.9765090942382812, + "step": 3490 + }, + { + "epoch": 1.870546914199699, + "grad_norm": 0.4449564164610078, + "learning_rate": 3.7292369655140145e-07, + "logits/chosen": -0.5528706312179565, + "logits/rejected": -0.3249674141407013, + "logps/chosen": -4.501574516296387, + "logps/rejected": -5.052069664001465, + "loss": 0.0533, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -4.501574516296387, + "rewards/margins": 0.5504950284957886, + "rewards/rejected": -5.052069664001465, + "sft_loss": 4.305262088775635, + "step": 3495 + }, + { + "epoch": 1.8732229469811004, + "grad_norm": 0.6068412485725911, + "learning_rate": 3.714180514435534e-07, + "logits/chosen": -0.35553085803985596, + "logits/rejected": -0.16507504880428314, + "logps/chosen": -4.477541923522949, + "logps/rejected": -5.131036758422852, + "loss": 0.0519, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -4.477541923522949, + "rewards/margins": 0.6534945964813232, + "rewards/rejected": -5.131036758422852, + "sft_loss": 4.123941421508789, + "step": 3500 + }, + { + "epoch": 1.875898979762502, + "grad_norm": 0.417124958508913, + "learning_rate": 3.6991365383833426e-07, + "logits/chosen": -0.4458470940589905, + "logits/rejected": -0.25313228368759155, + "logps/chosen": -4.447249412536621, + "logps/rejected": -5.175409317016602, + "loss": 0.0508, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -4.447249412536621, + "rewards/margins": 0.7281599640846252, + "rewards/rejected": -5.175409317016602, + "sft_loss": 4.110104084014893, + "step": 3505 + }, + { + "epoch": 1.8785750125439038, + "grad_norm": 0.5824911789053943, + "learning_rate": 3.684105183314162e-07, + "logits/chosen": -0.4364047944545746, + "logits/rejected": -0.34159305691719055, + "logps/chosen": -4.4093122482299805, + "logps/rejected": -5.021022319793701, + "loss": 0.0514, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -4.4093122482299805, + "rewards/margins": 0.6117098331451416, + "rewards/rejected": -5.021022319793701, + "sft_loss": 4.1440229415893555, + "step": 3510 + }, + { + "epoch": 1.881251045325305, + "grad_norm": 0.5231461376544928, + "learning_rate": 3.669086595062263e-07, + "logits/chosen": -0.42022261023521423, + "logits/rejected": -0.18159890174865723, + "logps/chosen": -4.356125831604004, + "logps/rejected": -5.121804237365723, + "loss": 0.0514, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -4.356125831604004, + "rewards/margins": 0.7656790018081665, + "rewards/rejected": -5.121804237365723, + "sft_loss": 4.129373550415039, + "step": 3515 + }, + { + "epoch": 1.8839270781067068, + "grad_norm": 0.4080394786532409, + "learning_rate": 3.654080919338056e-07, + "logits/chosen": -0.44390755891799927, + "logits/rejected": -0.237187460064888, + "logps/chosen": -4.44614315032959, + "logps/rejected": -5.215703010559082, + "loss": 0.051, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -4.44614315032959, + "rewards/margins": 0.7695599794387817, + "rewards/rejected": -5.215703010559082, + "sft_loss": 4.167542457580566, + "step": 3520 + }, + { + "epoch": 1.8866031108881085, + "grad_norm": 1.1253008447763322, + "learning_rate": 3.639088301726673e-07, + "logits/chosen": -0.33227282762527466, + "logits/rejected": -0.08558958768844604, + "logps/chosen": -4.393829345703125, + "logps/rejected": -5.135227680206299, + "loss": 0.0521, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -4.393829345703125, + "rewards/margins": 0.7413985133171082, + "rewards/rejected": -5.135227680206299, + "sft_loss": 4.12273645401001, + "step": 3525 + }, + { + "epoch": 1.88927914366951, + "grad_norm": 0.3619456418861775, + "learning_rate": 3.624108887686556e-07, + "logits/chosen": -0.32817691564559937, + "logits/rejected": -0.26207447052001953, + "logps/chosen": -4.479277610778809, + "logps/rejected": -4.9810686111450195, + "loss": 0.0521, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -4.479277610778809, + "rewards/margins": 0.5017910599708557, + "rewards/rejected": -4.9810686111450195, + "sft_loss": 4.196653842926025, + "step": 3530 + }, + { + "epoch": 1.8919551764509115, + "grad_norm": 0.8233495991712592, + "learning_rate": 3.6091428225480433e-07, + "logits/chosen": -0.48102402687072754, + "logits/rejected": -0.3378520607948303, + "logps/chosen": -4.401326656341553, + "logps/rejected": -5.100113868713379, + "loss": 0.0507, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -4.401326656341553, + "rewards/margins": 0.6987876892089844, + "rewards/rejected": -5.100113868713379, + "sft_loss": 4.070215225219727, + "step": 3535 + }, + { + "epoch": 1.8946312092323132, + "grad_norm": 0.4385078125633207, + "learning_rate": 3.5941902515119674e-07, + "logits/chosen": -0.4696807861328125, + "logits/rejected": -0.17892661690711975, + "logps/chosen": -4.4610276222229, + "logps/rejected": -5.119901657104492, + "loss": 0.0523, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -4.4610276222229, + "rewards/margins": 0.6588743925094604, + "rewards/rejected": -5.119901657104492, + "sft_loss": 4.196292400360107, + "step": 3540 + }, + { + "epoch": 1.8973072420137147, + "grad_norm": 0.8458093070187234, + "learning_rate": 3.5792513196482373e-07, + "logits/chosen": -0.5620048642158508, + "logits/rejected": -0.19954653084278107, + "logps/chosen": -4.403501033782959, + "logps/rejected": -5.049036502838135, + "loss": 0.0502, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -4.403501033782959, + "rewards/margins": 0.6455354690551758, + "rewards/rejected": -5.049036502838135, + "sft_loss": 4.065199851989746, + "step": 3545 + }, + { + "epoch": 1.8999832747951162, + "grad_norm": 0.4704386506569429, + "learning_rate": 3.5643261718944346e-07, + "logits/chosen": -0.2901856303215027, + "logits/rejected": -0.190904900431633, + "logps/chosen": -4.472724914550781, + "logps/rejected": -5.0304365158081055, + "loss": 0.0528, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -4.472724914550781, + "rewards/margins": 0.5577119588851929, + "rewards/rejected": -5.0304365158081055, + "sft_loss": 4.144157886505127, + "step": 3550 + }, + { + "epoch": 1.902659307576518, + "grad_norm": 0.6006091165206281, + "learning_rate": 3.5494149530544087e-07, + "logits/chosen": -0.432685911655426, + "logits/rejected": -0.2743949294090271, + "logps/chosen": -4.504183769226074, + "logps/rejected": -5.2633185386657715, + "loss": 0.0513, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -4.504183769226074, + "rewards/margins": 0.7591356635093689, + "rewards/rejected": -5.2633185386657715, + "sft_loss": 4.193944454193115, + "step": 3555 + }, + { + "epoch": 1.9053353403579194, + "grad_norm": 0.721435017311742, + "learning_rate": 3.534517807796871e-07, + "logits/chosen": -0.39169448614120483, + "logits/rejected": -0.26618900895118713, + "logps/chosen": -4.552404880523682, + "logps/rejected": -5.208046913146973, + "loss": 0.0517, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -4.552404880523682, + "rewards/margins": 0.6556424498558044, + "rewards/rejected": -5.208046913146973, + "sft_loss": 4.27425479888916, + "step": 3560 + }, + { + "epoch": 1.908011373139321, + "grad_norm": 0.4939259198559468, + "learning_rate": 3.519634880653988e-07, + "logits/chosen": -0.33243894577026367, + "logits/rejected": -0.23660437762737274, + "logps/chosen": -4.485566139221191, + "logps/rejected": -5.194786071777344, + "loss": 0.0506, + "rewards/accuracies": 0.65625, + "rewards/chosen": -4.485566139221191, + "rewards/margins": 0.7092195749282837, + "rewards/rejected": -5.194786071777344, + "sft_loss": 4.126955509185791, + "step": 3565 + }, + { + "epoch": 1.9106874059207226, + "grad_norm": 0.5249982928488253, + "learning_rate": 3.504766316019987e-07, + "logits/chosen": -0.4098309874534607, + "logits/rejected": -0.1714651882648468, + "logps/chosen": -4.175349712371826, + "logps/rejected": -5.012117385864258, + "loss": 0.0487, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -4.175349712371826, + "rewards/margins": 0.8367677927017212, + "rewards/rejected": -5.012117385864258, + "sft_loss": 3.794126510620117, + "step": 3570 + }, + { + "epoch": 1.913363438702124, + "grad_norm": 0.5075978035965267, + "learning_rate": 3.489912258149745e-07, + "logits/chosen": -0.26183438301086426, + "logits/rejected": -0.12572081387043, + "logps/chosen": -4.214966297149658, + "logps/rejected": -5.030123710632324, + "loss": 0.0508, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -4.214966297149658, + "rewards/margins": 0.8151571154594421, + "rewards/rejected": -5.030123710632324, + "sft_loss": 3.8706488609313965, + "step": 3575 + }, + { + "epoch": 1.9160394714835256, + "grad_norm": 0.43746094254176354, + "learning_rate": 3.475072851157397e-07, + "logits/chosen": -0.23620197176933289, + "logits/rejected": -0.1973074972629547, + "logps/chosen": -4.420254707336426, + "logps/rejected": -5.177483558654785, + "loss": 0.0514, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -4.420254707336426, + "rewards/margins": 0.7572286128997803, + "rewards/rejected": -5.177483558654785, + "sft_loss": 4.062037467956543, + "step": 3580 + }, + { + "epoch": 1.9187155042649273, + "grad_norm": 0.5481978185722167, + "learning_rate": 3.460248239014936e-07, + "logits/chosen": -0.1542886197566986, + "logits/rejected": -0.11566226184368134, + "logps/chosen": -4.590366840362549, + "logps/rejected": -5.307346343994141, + "loss": 0.0512, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -4.590366840362549, + "rewards/margins": 0.7169798612594604, + "rewards/rejected": -5.307346343994141, + "sft_loss": 4.328564167022705, + "step": 3585 + }, + { + "epoch": 1.9213915370463288, + "grad_norm": 1.1503822120526854, + "learning_rate": 3.4454385655508134e-07, + "logits/chosen": -0.2250882089138031, + "logits/rejected": -0.185527965426445, + "logps/chosen": -4.529903411865234, + "logps/rejected": -5.08640718460083, + "loss": 0.0539, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -4.529903411865234, + "rewards/margins": 0.5565038323402405, + "rewards/rejected": -5.08640718460083, + "sft_loss": 4.295632839202881, + "step": 3590 + }, + { + "epoch": 1.9240675698277303, + "grad_norm": 0.8891651435865691, + "learning_rate": 3.4306439744485447e-07, + "logits/chosen": -0.3869754374027252, + "logits/rejected": -0.1528097689151764, + "logps/chosen": -4.39151668548584, + "logps/rejected": -5.24567174911499, + "loss": 0.0515, + "rewards/accuracies": 0.71875, + "rewards/chosen": -4.39151668548584, + "rewards/margins": 0.8541552424430847, + "rewards/rejected": -5.24567174911499, + "sft_loss": 4.075733184814453, + "step": 3595 + }, + { + "epoch": 1.926743602609132, + "grad_norm": 0.681986111329172, + "learning_rate": 3.415864609245322e-07, + "logits/chosen": -0.3255468010902405, + "logits/rejected": -0.09248501062393188, + "logps/chosen": -4.518547058105469, + "logps/rejected": -5.2352681159973145, + "loss": 0.0533, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -4.518547058105469, + "rewards/margins": 0.7167209982872009, + "rewards/rejected": -5.2352681159973145, + "sft_loss": 4.26555871963501, + "step": 3600 + }, + { + "epoch": 1.926743602609132, + "eval_logits/chosen": 0.042076222598552704, + "eval_logits/rejected": 0.13484692573547363, + "eval_logps/chosen": -4.380853652954102, + "eval_logps/rejected": -5.1003336906433105, + "eval_loss": 0.05012309178709984, + "eval_rewards/accuracies": 0.6824925541877747, + "eval_rewards/chosen": -4.380853652954102, + "eval_rewards/margins": 0.7194797992706299, + "eval_rewards/rejected": -5.1003336906433105, + "eval_runtime": 53.2854, + "eval_samples_per_second": 25.241, + "eval_sft_loss": 3.985746145248413, + "eval_steps_per_second": 6.324, + "step": 3600 + }, + { + "epoch": 1.9294196353905335, + "grad_norm": 0.5549631315016357, + "learning_rate": 3.401100613330605e-07, + "logits/chosen": -0.413614422082901, + "logits/rejected": -0.4114568829536438, + "logps/chosen": -4.466488361358643, + "logps/rejected": -5.028023719787598, + "loss": 0.0521, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -4.466488361358643, + "rewards/margins": 0.5615357160568237, + "rewards/rejected": -5.028023719787598, + "sft_loss": 4.167389392852783, + "step": 3605 + }, + { + "epoch": 1.932095668171935, + "grad_norm": 0.3925853049073582, + "learning_rate": 3.3863521299447514e-07, + "logits/chosen": -0.4991392493247986, + "logits/rejected": -0.3091749846935272, + "logps/chosen": -4.473031044006348, + "logps/rejected": -5.131730556488037, + "loss": 0.0518, + "rewards/accuracies": 0.71875, + "rewards/chosen": -4.473031044006348, + "rewards/margins": 0.6587000489234924, + "rewards/rejected": -5.131730556488037, + "sft_loss": 4.212399482727051, + "step": 3610 + }, + { + "epoch": 1.9347717009533367, + "grad_norm": 0.4248304376205085, + "learning_rate": 3.371619302177609e-07, + "logits/chosen": -0.3598117232322693, + "logits/rejected": -0.2084875851869583, + "logps/chosen": -4.257334232330322, + "logps/rejected": -4.964080810546875, + "loss": 0.052, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -4.257334232330322, + "rewards/margins": 0.7067463397979736, + "rewards/rejected": -4.964080810546875, + "sft_loss": 4.068849563598633, + "step": 3615 + }, + { + "epoch": 1.9374477337347382, + "grad_norm": 0.6211146011480987, + "learning_rate": 3.3569022729671393e-07, + "logits/chosen": -0.4285960793495178, + "logits/rejected": -0.3357129395008087, + "logps/chosen": -4.479846000671387, + "logps/rejected": -4.998563766479492, + "loss": 0.0536, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -4.479846000671387, + "rewards/margins": 0.5187180638313293, + "rewards/rejected": -4.998563766479492, + "sft_loss": 4.278697490692139, + "step": 3620 + }, + { + "epoch": 1.9401237665161397, + "grad_norm": 0.5518821657498258, + "learning_rate": 3.342201185098024e-07, + "logits/chosen": -0.2991160452365875, + "logits/rejected": -0.3689224123954773, + "logps/chosen": -4.462956428527832, + "logps/rejected": -4.9926347732543945, + "loss": 0.0516, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -4.462956428527832, + "rewards/margins": 0.529678463935852, + "rewards/rejected": -4.9926347732543945, + "sft_loss": 4.177762985229492, + "step": 3625 + }, + { + "epoch": 1.9427997992975414, + "grad_norm": 0.4413001057484683, + "learning_rate": 3.3275161812002807e-07, + "logits/chosen": -0.418252557516098, + "logits/rejected": -0.40913906693458557, + "logps/chosen": -4.441468715667725, + "logps/rejected": -5.178438186645508, + "loss": 0.0522, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -4.441468715667725, + "rewards/margins": 0.7369694709777832, + "rewards/rejected": -5.178438186645508, + "sft_loss": 4.229043006896973, + "step": 3630 + }, + { + "epoch": 1.945475832078943, + "grad_norm": 0.43302853267366176, + "learning_rate": 3.312847403747883e-07, + "logits/chosen": -0.49432092905044556, + "logits/rejected": -0.35859179496765137, + "logps/chosen": -4.379761219024658, + "logps/rejected": -5.066256523132324, + "loss": 0.052, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -4.379761219024658, + "rewards/margins": 0.6864956617355347, + "rewards/rejected": -5.066256523132324, + "sft_loss": 4.174038887023926, + "step": 3635 + }, + { + "epoch": 1.9481518648603444, + "grad_norm": 0.40957106935824633, + "learning_rate": 3.2981949950573733e-07, + "logits/chosen": -0.4507400393486023, + "logits/rejected": -0.3858460783958435, + "logps/chosen": -4.515069961547852, + "logps/rejected": -4.925932884216309, + "loss": 0.053, + "rewards/accuracies": 0.6875, + "rewards/chosen": -4.515069961547852, + "rewards/margins": 0.41086310148239136, + "rewards/rejected": -4.925932884216309, + "sft_loss": 4.25916862487793, + "step": 3640 + }, + { + "epoch": 1.9508278976417461, + "grad_norm": 0.3396296787353694, + "learning_rate": 3.283559097286486e-07, + "logits/chosen": -0.4683416485786438, + "logits/rejected": -0.3325764536857605, + "logps/chosen": -4.533778190612793, + "logps/rejected": -5.040543079376221, + "loss": 0.0518, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -4.533778190612793, + "rewards/margins": 0.5067647695541382, + "rewards/rejected": -5.040543079376221, + "sft_loss": 4.286801815032959, + "step": 3645 + }, + { + "epoch": 1.9535039304231478, + "grad_norm": 0.45233647238819286, + "learning_rate": 3.268939852432765e-07, + "logits/chosen": -0.4944976270198822, + "logits/rejected": -0.3757014870643616, + "logps/chosen": -4.554055213928223, + "logps/rejected": -5.018446445465088, + "loss": 0.0534, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -4.554055213928223, + "rewards/margins": 0.4643916189670563, + "rewards/rejected": -5.018446445465088, + "sft_loss": 4.320440292358398, + "step": 3650 + }, + { + "epoch": 1.9561799632045491, + "grad_norm": 0.5370729215061321, + "learning_rate": 3.254337402332187e-07, + "logits/chosen": -0.3831983208656311, + "logits/rejected": -0.26895269751548767, + "logps/chosen": -4.414761543273926, + "logps/rejected": -5.030882358551025, + "loss": 0.0516, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -4.414761543273926, + "rewards/margins": 0.6161209344863892, + "rewards/rejected": -5.030882358551025, + "sft_loss": 4.083725929260254, + "step": 3655 + }, + { + "epoch": 1.9588559959859508, + "grad_norm": 0.772444816008, + "learning_rate": 3.239751888657788e-07, + "logits/chosen": -0.39643681049346924, + "logits/rejected": -0.23135873675346375, + "logps/chosen": -4.39790153503418, + "logps/rejected": -5.099810600280762, + "loss": 0.0518, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -4.39790153503418, + "rewards/margins": 0.7019084095954895, + "rewards/rejected": -5.099810600280762, + "sft_loss": 4.100924491882324, + "step": 3660 + }, + { + "epoch": 1.9615320287673526, + "grad_norm": 0.5423150933685754, + "learning_rate": 3.2251834529182856e-07, + "logits/chosen": -0.4202393591403961, + "logits/rejected": -0.3306739330291748, + "logps/chosen": -4.583292007446289, + "logps/rejected": -5.248047828674316, + "loss": 0.0513, + "rewards/accuracies": 0.625, + "rewards/chosen": -4.583292007446289, + "rewards/margins": 0.6647554636001587, + "rewards/rejected": -5.248047828674316, + "sft_loss": 4.254556179046631, + "step": 3665 + }, + { + "epoch": 1.9642080615487538, + "grad_norm": 0.5804743436026095, + "learning_rate": 3.2106322364567075e-07, + "logits/chosen": -0.4745996594429016, + "logits/rejected": -0.3122026324272156, + "logps/chosen": -4.478249549865723, + "logps/rejected": -5.229377746582031, + "loss": 0.0523, + "rewards/accuracies": 0.71875, + "rewards/chosen": -4.478249549865723, + "rewards/margins": 0.7511278390884399, + "rewards/rejected": -5.229377746582031, + "sft_loss": 4.368980884552002, + "step": 3670 + }, + { + "epoch": 1.9668840943301555, + "grad_norm": 0.38822005969042495, + "learning_rate": 3.1960983804490183e-07, + "logits/chosen": -0.4124499261379242, + "logits/rejected": -0.25161346793174744, + "logps/chosen": -4.488820552825928, + "logps/rejected": -5.219266414642334, + "loss": 0.0525, + "rewards/accuracies": 0.6875, + "rewards/chosen": -4.488820552825928, + "rewards/margins": 0.7304463982582092, + "rewards/rejected": -5.219266414642334, + "sft_loss": 4.217720985412598, + "step": 3675 + }, + { + "epoch": 1.9695601271115573, + "grad_norm": 0.6205345104070085, + "learning_rate": 3.1815820259027537e-07, + "logits/chosen": -0.36543330550193787, + "logits/rejected": -0.25120458006858826, + "logps/chosen": -4.348297119140625, + "logps/rejected": -4.99812126159668, + "loss": 0.0503, + "rewards/accuracies": 0.6875, + "rewards/chosen": -4.348297119140625, + "rewards/margins": 0.6498240232467651, + "rewards/rejected": -4.99812126159668, + "sft_loss": 3.9996657371520996, + "step": 3680 + }, + { + "epoch": 1.9722361598929585, + "grad_norm": 0.4684774215516626, + "learning_rate": 3.16708331365565e-07, + "logits/chosen": -0.4492974877357483, + "logits/rejected": -0.3519330322742462, + "logps/chosen": -4.489088535308838, + "logps/rejected": -5.102923393249512, + "loss": 0.0521, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -4.489088535308838, + "rewards/margins": 0.6138354539871216, + "rewards/rejected": -5.102923393249512, + "sft_loss": 4.269659519195557, + "step": 3685 + }, + { + "epoch": 1.9749121926743602, + "grad_norm": 0.4352793387940146, + "learning_rate": 3.152602384374275e-07, + "logits/chosen": -0.4023476243019104, + "logits/rejected": -0.18105700612068176, + "logps/chosen": -4.410039901733398, + "logps/rejected": -5.142436981201172, + "loss": 0.0508, + "rewards/accuracies": 0.71875, + "rewards/chosen": -4.410039901733398, + "rewards/margins": 0.7323965430259705, + "rewards/rejected": -5.142436981201172, + "sft_loss": 4.056788921356201, + "step": 3690 + }, + { + "epoch": 1.977588225455762, + "grad_norm": 0.7221959376956284, + "learning_rate": 3.1381393785526697e-07, + "logits/chosen": -0.38285067677497864, + "logits/rejected": -0.2815985679626465, + "logps/chosen": -4.474632740020752, + "logps/rejected": -5.1458845138549805, + "loss": 0.0513, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -4.474632740020752, + "rewards/margins": 0.6712522506713867, + "rewards/rejected": -5.1458845138549805, + "sft_loss": 4.184819221496582, + "step": 3695 + }, + { + "epoch": 1.9802642582371635, + "grad_norm": 0.42364467886896584, + "learning_rate": 3.123694436510979e-07, + "logits/chosen": -0.3717033267021179, + "logits/rejected": -0.2091069519519806, + "logps/chosen": -4.41146993637085, + "logps/rejected": -5.03031587600708, + "loss": 0.0527, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -4.41146993637085, + "rewards/margins": 0.6188455820083618, + "rewards/rejected": -5.03031587600708, + "sft_loss": 4.186913967132568, + "step": 3700 + }, + { + "epoch": 1.982940291018565, + "grad_norm": 0.503213529046553, + "learning_rate": 3.1092676983940946e-07, + "logits/chosen": -0.3578525185585022, + "logits/rejected": -0.2924673557281494, + "logps/chosen": -4.406094074249268, + "logps/rejected": -5.103278160095215, + "loss": 0.051, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -4.406094074249268, + "rewards/margins": 0.6971846222877502, + "rewards/rejected": -5.103278160095215, + "sft_loss": 4.0739641189575195, + "step": 3705 + }, + { + "epoch": 1.9856163237999667, + "grad_norm": 0.38135488875923174, + "learning_rate": 3.094859304170293e-07, + "logits/chosen": -0.22055137157440186, + "logits/rejected": -0.2213733196258545, + "logps/chosen": -4.404345989227295, + "logps/rejected": -5.086258888244629, + "loss": 0.0516, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -4.404345989227295, + "rewards/margins": 0.6819120049476624, + "rewards/rejected": -5.086258888244629, + "sft_loss": 4.12283992767334, + "step": 3710 + }, + { + "epoch": 1.9882923565813682, + "grad_norm": 0.3876450118385628, + "learning_rate": 3.0804693936298795e-07, + "logits/chosen": -0.40424853563308716, + "logits/rejected": -0.36117544770240784, + "logps/chosen": -4.4008331298828125, + "logps/rejected": -5.137930870056152, + "loss": 0.0514, + "rewards/accuracies": 0.71875, + "rewards/chosen": -4.4008331298828125, + "rewards/margins": 0.7370980381965637, + "rewards/rejected": -5.137930870056152, + "sft_loss": 4.242586612701416, + "step": 3715 + }, + { + "epoch": 1.9909683893627697, + "grad_norm": 0.5861867090556139, + "learning_rate": 3.066098106383826e-07, + "logits/chosen": -0.41305360198020935, + "logits/rejected": -0.2922312617301941, + "logps/chosen": -4.416529655456543, + "logps/rejected": -5.068317413330078, + "loss": 0.0508, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -4.416529655456543, + "rewards/margins": 0.6517875790596008, + "rewards/rejected": -5.068317413330078, + "sft_loss": 4.030583381652832, + "step": 3720 + }, + { + "epoch": 1.9936444221441714, + "grad_norm": 0.5166229517646469, + "learning_rate": 3.0517455818624263e-07, + "logits/chosen": -0.45256465673446655, + "logits/rejected": -0.35922300815582275, + "logps/chosen": -4.412965774536133, + "logps/rejected": -5.209670066833496, + "loss": 0.0516, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -4.412965774536133, + "rewards/margins": 0.7967040538787842, + "rewards/rejected": -5.209670066833496, + "sft_loss": 4.189126014709473, + "step": 3725 + }, + { + "epoch": 1.9963204549255729, + "grad_norm": 0.6824688697780613, + "learning_rate": 3.037411959313936e-07, + "logits/chosen": -0.34892401099205017, + "logits/rejected": -0.196553036570549, + "logps/chosen": -4.41003942489624, + "logps/rejected": -5.10931921005249, + "loss": 0.0512, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -4.41003942489624, + "rewards/margins": 0.6992799043655396, + "rewards/rejected": -5.10931921005249, + "sft_loss": 4.171704292297363, + "step": 3730 + }, + { + "epoch": 1.9989964877069744, + "grad_norm": 1.0901331051063583, + "learning_rate": 3.023097377803224e-07, + "logits/chosen": -0.3078027069568634, + "logits/rejected": -0.23698917031288147, + "logps/chosen": -4.411991596221924, + "logps/rejected": -5.105666637420654, + "loss": 0.0524, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -4.411991596221924, + "rewards/margins": 0.6936756372451782, + "rewards/rejected": -5.105666637420654, + "sft_loss": 4.146667003631592, + "step": 3735 + }, + { + "epoch": 2.001672520488376, + "grad_norm": 0.5188798925629451, + "learning_rate": 3.008801976210423e-07, + "logits/chosen": -0.3119719922542572, + "logits/rejected": -0.2788045406341553, + "logps/chosen": -4.284983158111572, + "logps/rejected": -4.85286283493042, + "loss": 0.0519, + "rewards/accuracies": 0.6875, + "rewards/chosen": -4.284983158111572, + "rewards/margins": 0.5678800344467163, + "rewards/rejected": -4.85286283493042, + "sft_loss": 3.9691109657287598, + "step": 3740 + }, + { + "epoch": 2.0043485532697773, + "grad_norm": 0.4564540243960645, + "learning_rate": 2.994525893229581e-07, + "logits/chosen": -0.2935374677181244, + "logits/rejected": -0.216166689991951, + "logps/chosen": -4.339650630950928, + "logps/rejected": -5.14935827255249, + "loss": 0.0491, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -4.339650630950928, + "rewards/margins": 0.8097079396247864, + "rewards/rejected": -5.14935827255249, + "sft_loss": 4.013418674468994, + "step": 3745 + }, + { + "epoch": 2.007024586051179, + "grad_norm": 0.6048823479276922, + "learning_rate": 2.98026926736732e-07, + "logits/chosen": -0.41443270444869995, + "logits/rejected": -0.29372677206993103, + "logps/chosen": -4.4289727210998535, + "logps/rejected": -5.105106830596924, + "loss": 0.052, + "rewards/accuracies": 0.71875, + "rewards/chosen": -4.4289727210998535, + "rewards/margins": 0.6761346459388733, + "rewards/rejected": -5.105106830596924, + "sft_loss": 4.135364532470703, + "step": 3750 + }, + { + "epoch": 2.0097006188325808, + "grad_norm": 0.5046145294593994, + "learning_rate": 2.9660322369414846e-07, + "logits/chosen": -0.35901492834091187, + "logits/rejected": -0.1820385754108429, + "logps/chosen": -4.4694504737854, + "logps/rejected": -5.2601518630981445, + "loss": 0.052, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -4.4694504737854, + "rewards/margins": 0.7907018065452576, + "rewards/rejected": -5.2601518630981445, + "sft_loss": 4.254968166351318, + "step": 3755 + }, + { + "epoch": 2.0123766516139825, + "grad_norm": 0.29403349636257936, + "learning_rate": 2.9518149400798063e-07, + "logits/chosen": -0.42322462797164917, + "logits/rejected": -0.38704365491867065, + "logps/chosen": -4.426640510559082, + "logps/rejected": -5.285890102386475, + "loss": 0.0505, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -4.426640510559082, + "rewards/margins": 0.8592498898506165, + "rewards/rejected": -5.285890102386475, + "sft_loss": 4.212055206298828, + "step": 3760 + }, + { + "epoch": 2.0150526843953838, + "grad_norm": 0.4409370642018633, + "learning_rate": 2.9376175147185633e-07, + "logits/chosen": -0.4164283871650696, + "logits/rejected": -0.16731376945972443, + "logps/chosen": -4.487136363983154, + "logps/rejected": -5.313013076782227, + "loss": 0.05, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -4.487136363983154, + "rewards/margins": 0.8258762359619141, + "rewards/rejected": -5.313013076782227, + "sft_loss": 4.19671630859375, + "step": 3765 + }, + { + "epoch": 2.0177287171767855, + "grad_norm": 0.4638398222614792, + "learning_rate": 2.9234400986012376e-07, + "logits/chosen": -0.4540809690952301, + "logits/rejected": -0.242467001080513, + "logps/chosen": -4.405378341674805, + "logps/rejected": -5.238399028778076, + "loss": 0.05, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -4.405378341674805, + "rewards/margins": 0.8330209851264954, + "rewards/rejected": -5.238399028778076, + "sft_loss": 4.111074447631836, + "step": 3770 + }, + { + "epoch": 2.020404749958187, + "grad_norm": 0.43954904547247514, + "learning_rate": 2.9092828292771817e-07, + "logits/chosen": -0.36221879720687866, + "logits/rejected": -0.3303019106388092, + "logps/chosen": -4.343339920043945, + "logps/rejected": -5.14573860168457, + "loss": 0.0506, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -4.343339920043945, + "rewards/margins": 0.8023991584777832, + "rewards/rejected": -5.14573860168457, + "sft_loss": 4.104333877563477, + "step": 3775 + }, + { + "epoch": 2.0230807827395885, + "grad_norm": 0.5069367462388984, + "learning_rate": 2.8951458441002875e-07, + "logits/chosen": -0.31020763516426086, + "logits/rejected": -0.31572312116622925, + "logps/chosen": -4.166377067565918, + "logps/rejected": -4.930800437927246, + "loss": 0.0498, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -4.166377067565918, + "rewards/margins": 0.7644233703613281, + "rewards/rejected": -4.930800437927246, + "sft_loss": 3.880671977996826, + "step": 3780 + }, + { + "epoch": 2.02575681552099, + "grad_norm": 0.5671140716617928, + "learning_rate": 2.881029280227643e-07, + "logits/chosen": -0.37395811080932617, + "logits/rejected": -0.17182935774326324, + "logps/chosen": -4.164129734039307, + "logps/rejected": -4.959773063659668, + "loss": 0.0502, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -4.164129734039307, + "rewards/margins": 0.7956432104110718, + "rewards/rejected": -4.959773063659668, + "sft_loss": 3.9170658588409424, + "step": 3785 + }, + { + "epoch": 2.028432848302392, + "grad_norm": 0.5316215456404498, + "learning_rate": 2.8669332746182177e-07, + "logits/chosen": -0.4127592444419861, + "logits/rejected": -0.2285337895154953, + "logps/chosen": -4.2911787033081055, + "logps/rejected": -5.131359100341797, + "loss": 0.0516, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -4.2911787033081055, + "rewards/margins": 0.8401795625686646, + "rewards/rejected": -5.131359100341797, + "sft_loss": 4.116988658905029, + "step": 3790 + }, + { + "epoch": 2.031108881083793, + "grad_norm": 0.5071073040531272, + "learning_rate": 2.8528579640315156e-07, + "logits/chosen": -0.275922954082489, + "logits/rejected": -0.2805738151073456, + "logps/chosen": -4.463419437408447, + "logps/rejected": -5.0827202796936035, + "loss": 0.0508, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -4.463419437408447, + "rewards/margins": 0.6193008422851562, + "rewards/rejected": -5.0827202796936035, + "sft_loss": 4.152365207672119, + "step": 3795 + }, + { + "epoch": 2.033784913865195, + "grad_norm": 0.6290278382399703, + "learning_rate": 2.8388034850262646e-07, + "logits/chosen": -0.31285542249679565, + "logits/rejected": -0.14409422874450684, + "logps/chosen": -4.381528377532959, + "logps/rejected": -5.171832084655762, + "loss": 0.0512, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -4.381528377532959, + "rewards/margins": 0.790303111076355, + "rewards/rejected": -5.171832084655762, + "sft_loss": 4.130490303039551, + "step": 3800 + }, + { + "epoch": 2.0364609466465966, + "grad_norm": 0.4750673772059999, + "learning_rate": 2.824769973959079e-07, + "logits/chosen": -0.3556492030620575, + "logits/rejected": -0.19416716694831848, + "logps/chosen": -4.3436150550842285, + "logps/rejected": -5.072421073913574, + "loss": 0.0511, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -4.3436150550842285, + "rewards/margins": 0.7288060188293457, + "rewards/rejected": -5.072421073913574, + "sft_loss": 4.075199127197266, + "step": 3805 + }, + { + "epoch": 2.039136979427998, + "grad_norm": 0.561423135841397, + "learning_rate": 2.81075756698315e-07, + "logits/chosen": -0.19012321531772614, + "logits/rejected": -0.11848640441894531, + "logps/chosen": -4.485342979431152, + "logps/rejected": -5.298763751983643, + "loss": 0.0503, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -4.485342979431152, + "rewards/margins": 0.8134201169013977, + "rewards/rejected": -5.298763751983643, + "sft_loss": 4.0772199630737305, + "step": 3810 + }, + { + "epoch": 2.0418130122093996, + "grad_norm": 0.4536463672222037, + "learning_rate": 2.7967664000469035e-07, + "logits/chosen": -0.42413783073425293, + "logits/rejected": -0.32068854570388794, + "logps/chosen": -4.322854995727539, + "logps/rejected": -5.041677474975586, + "loss": 0.0495, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -4.322854995727539, + "rewards/margins": 0.7188224792480469, + "rewards/rejected": -5.041677474975586, + "sft_loss": 4.019509315490723, + "step": 3815 + }, + { + "epoch": 2.0444890449908013, + "grad_norm": 0.42784473263060707, + "learning_rate": 2.7827966088927095e-07, + "logits/chosen": -0.4775485396385193, + "logits/rejected": -0.22211973369121552, + "logps/chosen": -4.356131076812744, + "logps/rejected": -5.343842506408691, + "loss": 0.0497, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -4.356131076812744, + "rewards/margins": 0.9877112507820129, + "rewards/rejected": -5.343842506408691, + "sft_loss": 4.108757019042969, + "step": 3820 + }, + { + "epoch": 2.0471650777722026, + "grad_norm": 0.591812952535654, + "learning_rate": 2.768848329055538e-07, + "logits/chosen": -0.38019734621047974, + "logits/rejected": -0.32415252923965454, + "logps/chosen": -4.308173179626465, + "logps/rejected": -5.0770392417907715, + "loss": 0.0514, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -4.308173179626465, + "rewards/margins": 0.7688660025596619, + "rewards/rejected": -5.0770392417907715, + "sft_loss": 4.112570285797119, + "step": 3825 + }, + { + "epoch": 2.0498411105536043, + "grad_norm": 0.6807512245577299, + "learning_rate": 2.7549216958616657e-07, + "logits/chosen": -0.4845009744167328, + "logits/rejected": -0.31688985228538513, + "logps/chosen": -4.246060371398926, + "logps/rejected": -5.092269420623779, + "loss": 0.0495, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -4.246060371398926, + "rewards/margins": 0.8462090492248535, + "rewards/rejected": -5.092269420623779, + "sft_loss": 3.8919384479522705, + "step": 3830 + }, + { + "epoch": 2.052517143335006, + "grad_norm": 0.47888776950849704, + "learning_rate": 2.741016844427344e-07, + "logits/chosen": -0.3477453589439392, + "logits/rejected": -0.1463039219379425, + "logps/chosen": -4.308255195617676, + "logps/rejected": -5.2045207023620605, + "loss": 0.0509, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -4.308255195617676, + "rewards/margins": 0.8962651491165161, + "rewards/rejected": -5.2045207023620605, + "sft_loss": 4.06205415725708, + "step": 3835 + }, + { + "epoch": 2.0551931761164073, + "grad_norm": 0.7940416668853761, + "learning_rate": 2.7271339096575073e-07, + "logits/chosen": -0.27007609605789185, + "logits/rejected": -0.10254746675491333, + "logps/chosen": -4.4379963874816895, + "logps/rejected": -5.277739524841309, + "loss": 0.0517, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -4.4379963874816895, + "rewards/margins": 0.8397432565689087, + "rewards/rejected": -5.277739524841309, + "sft_loss": 4.179162502288818, + "step": 3840 + }, + { + "epoch": 2.057869208897809, + "grad_norm": 0.5044049210520891, + "learning_rate": 2.713273026244446e-07, + "logits/chosen": -0.41755181550979614, + "logits/rejected": -0.15795069932937622, + "logps/chosen": -4.296069622039795, + "logps/rejected": -5.277471542358398, + "loss": 0.0494, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -4.296069622039795, + "rewards/margins": 0.981401801109314, + "rewards/rejected": -5.277471542358398, + "sft_loss": 4.078981876373291, + "step": 3845 + }, + { + "epoch": 2.0605452416792107, + "grad_norm": 0.5570283652118182, + "learning_rate": 2.6994343286665156e-07, + "logits/chosen": -0.4102238118648529, + "logits/rejected": -0.18241597712039948, + "logps/chosen": -4.257302284240723, + "logps/rejected": -5.118211269378662, + "loss": 0.0512, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -4.257302284240723, + "rewards/margins": 0.8609098196029663, + "rewards/rejected": -5.118211269378662, + "sft_loss": 4.112555503845215, + "step": 3850 + }, + { + "epoch": 2.063221274460612, + "grad_norm": 0.5648628736720877, + "learning_rate": 2.6856179511868156e-07, + "logits/chosen": -0.32026559114456177, + "logits/rejected": -0.08667844533920288, + "logps/chosen": -4.301587104797363, + "logps/rejected": -5.258988857269287, + "loss": 0.0502, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -4.301587104797363, + "rewards/margins": 0.9574017524719238, + "rewards/rejected": -5.258988857269287, + "sft_loss": 4.079500675201416, + "step": 3855 + }, + { + "epoch": 2.0658973072420137, + "grad_norm": 1.2682819436152235, + "learning_rate": 2.6718240278519056e-07, + "logits/chosen": -0.2664087414741516, + "logits/rejected": -0.10435505211353302, + "logps/chosen": -4.1911211013793945, + "logps/rejected": -5.21523380279541, + "loss": 0.0512, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -4.1911211013793945, + "rewards/margins": 1.0241124629974365, + "rewards/rejected": -5.21523380279541, + "sft_loss": 3.9725563526153564, + "step": 3860 + }, + { + "epoch": 2.0685733400234154, + "grad_norm": 0.5482276671902016, + "learning_rate": 2.6580526924904866e-07, + "logits/chosen": -0.4165882468223572, + "logits/rejected": -0.23353514075279236, + "logps/chosen": -4.504220008850098, + "logps/rejected": -5.191248893737793, + "loss": 0.0513, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -4.504220008850098, + "rewards/margins": 0.6870293617248535, + "rewards/rejected": -5.191248893737793, + "sft_loss": 4.223007678985596, + "step": 3865 + }, + { + "epoch": 2.0712493728048167, + "grad_norm": 0.7151427005992033, + "learning_rate": 2.6443040787121186e-07, + "logits/chosen": -0.41871920228004456, + "logits/rejected": -0.31977134943008423, + "logps/chosen": -4.269133567810059, + "logps/rejected": -5.074051380157471, + "loss": 0.05, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -4.269133567810059, + "rewards/margins": 0.8049181699752808, + "rewards/rejected": -5.074051380157471, + "sft_loss": 4.028173923492432, + "step": 3870 + }, + { + "epoch": 2.0739254055862184, + "grad_norm": 0.5973736394357382, + "learning_rate": 2.6305783199059084e-07, + "logits/chosen": -0.3259030282497406, + "logits/rejected": -0.230748251080513, + "logps/chosen": -4.2867631912231445, + "logps/rejected": -4.970796585083008, + "loss": 0.0523, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -4.2867631912231445, + "rewards/margins": 0.6840331554412842, + "rewards/rejected": -4.970796585083008, + "sft_loss": 3.9719643592834473, + "step": 3875 + }, + { + "epoch": 2.07660143836762, + "grad_norm": 0.5511602661782866, + "learning_rate": 2.6168755492392324e-07, + "logits/chosen": -0.3292701542377472, + "logits/rejected": -0.1453513205051422, + "logps/chosen": -4.196750640869141, + "logps/rejected": -5.174393653869629, + "loss": 0.0477, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -4.196750640869141, + "rewards/margins": 0.9776426553726196, + "rewards/rejected": -5.174393653869629, + "sft_loss": 3.8621106147766113, + "step": 3880 + }, + { + "epoch": 2.0792774711490214, + "grad_norm": 0.5512517052338067, + "learning_rate": 2.6031958996564274e-07, + "logits/chosen": -0.34448251128196716, + "logits/rejected": -0.22929009795188904, + "logps/chosen": -4.3367204666137695, + "logps/rejected": -5.269116401672363, + "loss": 0.0519, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -4.3367204666137695, + "rewards/margins": 0.9323955774307251, + "rewards/rejected": -5.269116401672363, + "sft_loss": 4.058518409729004, + "step": 3885 + }, + { + "epoch": 2.081953503930423, + "grad_norm": 0.46558012602017285, + "learning_rate": 2.589539503877518e-07, + "logits/chosen": -0.25459569692611694, + "logits/rejected": -0.14168240129947662, + "logps/chosen": -4.452088356018066, + "logps/rejected": -5.236243724822998, + "loss": 0.0505, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -4.452088356018066, + "rewards/margins": 0.7841559648513794, + "rewards/rejected": -5.236243724822998, + "sft_loss": 4.1351799964904785, + "step": 3890 + }, + { + "epoch": 2.084629536711825, + "grad_norm": 0.6347829987966642, + "learning_rate": 2.5759064943969125e-07, + "logits/chosen": -0.4141598343849182, + "logits/rejected": -0.1650456190109253, + "logps/chosen": -4.456624507904053, + "logps/rejected": -5.327982425689697, + "loss": 0.0493, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.456624507904053, + "rewards/margins": 0.8713573217391968, + "rewards/rejected": -5.327982425689697, + "sft_loss": 4.143157958984375, + "step": 3895 + }, + { + "epoch": 2.087305569493226, + "grad_norm": 0.3941280329888256, + "learning_rate": 2.562297003482131e-07, + "logits/chosen": -0.28264540433883667, + "logits/rejected": -0.29245123267173767, + "logps/chosen": -4.255073070526123, + "logps/rejected": -5.082621097564697, + "loss": 0.0499, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -4.255073070526123, + "rewards/margins": 0.827547550201416, + "rewards/rejected": -5.082621097564697, + "sft_loss": 4.036890983581543, + "step": 3900 + }, + { + "epoch": 2.089981602274628, + "grad_norm": 0.4219232787295329, + "learning_rate": 2.548711163172512e-07, + "logits/chosen": -0.309459388256073, + "logits/rejected": -0.24101197719573975, + "logps/chosen": -4.4378767013549805, + "logps/rejected": -5.039052486419678, + "loss": 0.0523, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -4.4378767013549805, + "rewards/margins": 0.6011752486228943, + "rewards/rejected": -5.039052486419678, + "sft_loss": 4.130825042724609, + "step": 3905 + }, + { + "epoch": 2.0926576350560295, + "grad_norm": 0.482405415426906, + "learning_rate": 2.53514910527794e-07, + "logits/chosen": -0.31963956356048584, + "logits/rejected": -0.180974081158638, + "logps/chosen": -4.413501739501953, + "logps/rejected": -5.067274570465088, + "loss": 0.0514, + "rewards/accuracies": 0.65625, + "rewards/chosen": -4.413501739501953, + "rewards/margins": 0.6537727117538452, + "rewards/rejected": -5.067274570465088, + "sft_loss": 4.154252529144287, + "step": 3910 + }, + { + "epoch": 2.095333667837431, + "grad_norm": 0.3865437479050332, + "learning_rate": 2.5216109613775573e-07, + "logits/chosen": -0.44079723954200745, + "logits/rejected": -0.24643990397453308, + "logps/chosen": -4.45873498916626, + "logps/rejected": -5.161642074584961, + "loss": 0.052, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -4.45873498916626, + "rewards/margins": 0.7029072046279907, + "rewards/rejected": -5.161642074584961, + "sft_loss": 4.244243144989014, + "step": 3915 + }, + { + "epoch": 2.0980097006188325, + "grad_norm": 0.4862165718551747, + "learning_rate": 2.5080968628184993e-07, + "logits/chosen": -0.3659209907054901, + "logits/rejected": -0.1901930421590805, + "logps/chosen": -4.335224151611328, + "logps/rejected": -5.234259605407715, + "loss": 0.0499, + "rewards/accuracies": 0.71875, + "rewards/chosen": -4.335224151611328, + "rewards/margins": 0.8990362286567688, + "rewards/rejected": -5.234259605407715, + "sft_loss": 4.113430500030518, + "step": 3920 + }, + { + "epoch": 2.1006857334002342, + "grad_norm": 0.589991768059557, + "learning_rate": 2.494606940714605e-07, + "logits/chosen": -0.35370510816574097, + "logits/rejected": -0.28823089599609375, + "logps/chosen": -4.255982398986816, + "logps/rejected": -5.0965423583984375, + "loss": 0.0497, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -4.255982398986816, + "rewards/margins": 0.8405606150627136, + "rewards/rejected": -5.0965423583984375, + "sft_loss": 3.981487274169922, + "step": 3925 + }, + { + "epoch": 2.103361766181636, + "grad_norm": 0.6335308159808326, + "learning_rate": 2.4811413259451625e-07, + "logits/chosen": -0.46438127756118774, + "logits/rejected": -0.2801642417907715, + "logps/chosen": -4.140517234802246, + "logps/rejected": -5.057478427886963, + "loss": 0.05, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -4.140517234802246, + "rewards/margins": 0.9169610142707825, + "rewards/rejected": -5.057478427886963, + "sft_loss": 3.9590744972229004, + "step": 3930 + }, + { + "epoch": 2.106037798963037, + "grad_norm": 0.7515168620245619, + "learning_rate": 2.46770014915362e-07, + "logits/chosen": -0.3501364290714264, + "logits/rejected": -0.2728483974933624, + "logps/chosen": -4.320809364318848, + "logps/rejected": -5.093451499938965, + "loss": 0.0518, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -4.320809364318848, + "rewards/margins": 0.7726426720619202, + "rewards/rejected": -5.093451499938965, + "sft_loss": 4.071271896362305, + "step": 3935 + }, + { + "epoch": 2.108713831744439, + "grad_norm": 0.6183918453023575, + "learning_rate": 2.45428354074634e-07, + "logits/chosen": -0.3462718427181244, + "logits/rejected": -0.2552764415740967, + "logps/chosen": -4.334949970245361, + "logps/rejected": -5.172145366668701, + "loss": 0.0507, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -4.334949970245361, + "rewards/margins": 0.8371955752372742, + "rewards/rejected": -5.172145366668701, + "sft_loss": 4.093982219696045, + "step": 3940 + }, + { + "epoch": 2.1113898645258407, + "grad_norm": 0.7821774872769638, + "learning_rate": 2.4408916308913105e-07, + "logits/chosen": -0.39049679040908813, + "logits/rejected": -0.17837652564048767, + "logps/chosen": -4.518766403198242, + "logps/rejected": -5.061732292175293, + "loss": 0.0538, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -4.518766403198242, + "rewards/margins": 0.542966365814209, + "rewards/rejected": -5.061732292175293, + "sft_loss": 4.259708404541016, + "step": 3945 + }, + { + "epoch": 2.114065897307242, + "grad_norm": 0.4422456622841627, + "learning_rate": 2.4275245495169025e-07, + "logits/chosen": -0.2510150372982025, + "logits/rejected": -0.09285564720630646, + "logps/chosen": -4.417247772216797, + "logps/rejected": -5.220688819885254, + "loss": 0.0506, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -4.417247772216797, + "rewards/margins": 0.8034406900405884, + "rewards/rejected": -5.220688819885254, + "sft_loss": 4.077648639678955, + "step": 3950 + }, + { + "epoch": 2.1167419300886436, + "grad_norm": 0.5447054778045387, + "learning_rate": 2.414182426310597e-07, + "logits/chosen": -0.37670284509658813, + "logits/rejected": -0.33116498589515686, + "logps/chosen": -4.425228118896484, + "logps/rejected": -5.350491046905518, + "loss": 0.0509, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -4.425228118896484, + "rewards/margins": 0.9252630472183228, + "rewards/rejected": -5.350491046905518, + "sft_loss": 4.21567440032959, + "step": 3955 + }, + { + "epoch": 2.1194179628700454, + "grad_norm": 0.6839068942676009, + "learning_rate": 2.400865390717734e-07, + "logits/chosen": -0.332570344209671, + "logits/rejected": -0.22967591881752014, + "logps/chosen": -4.309033393859863, + "logps/rejected": -5.366685390472412, + "loss": 0.0505, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -4.309033393859863, + "rewards/margins": 1.0576521158218384, + "rewards/rejected": -5.366685390472412, + "sft_loss": 4.095768928527832, + "step": 3960 + }, + { + "epoch": 2.1220939956514466, + "grad_norm": 0.539794604604235, + "learning_rate": 2.3875735719402475e-07, + "logits/chosen": -0.3633222281932831, + "logits/rejected": -0.20267243683338165, + "logps/chosen": -4.284216403961182, + "logps/rejected": -5.179417610168457, + "loss": 0.0499, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -4.284216403961182, + "rewards/margins": 0.8952015042304993, + "rewards/rejected": -5.179417610168457, + "sft_loss": 4.089019298553467, + "step": 3965 + }, + { + "epoch": 2.1247700284328483, + "grad_norm": 0.6650863893790436, + "learning_rate": 2.3743070989354258e-07, + "logits/chosen": -0.2811715602874756, + "logits/rejected": -0.19412469863891602, + "logps/chosen": -4.374194622039795, + "logps/rejected": -5.159496784210205, + "loss": 0.05, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -4.374194622039795, + "rewards/margins": 0.7853022217750549, + "rewards/rejected": -5.159496784210205, + "sft_loss": 4.077193737030029, + "step": 3970 + }, + { + "epoch": 2.12744606121425, + "grad_norm": 0.7570690390018039, + "learning_rate": 2.3610661004146454e-07, + "logits/chosen": -0.2788086533546448, + "logits/rejected": -0.1510922908782959, + "logps/chosen": -4.137418746948242, + "logps/rejected": -4.947587013244629, + "loss": 0.0486, + "rewards/accuracies": 0.71875, + "rewards/chosen": -4.137418746948242, + "rewards/margins": 0.8101680874824524, + "rewards/rejected": -4.947587013244629, + "sft_loss": 3.85473895072937, + "step": 3975 + }, + { + "epoch": 2.1301220939956513, + "grad_norm": 0.6922445687240398, + "learning_rate": 2.3478507048421314e-07, + "logits/chosen": -0.3353812098503113, + "logits/rejected": -0.20698490738868713, + "logps/chosen": -4.306082248687744, + "logps/rejected": -5.15787410736084, + "loss": 0.0506, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.306082248687744, + "rewards/margins": 0.8517919778823853, + "rewards/rejected": -5.15787410736084, + "sft_loss": 4.074376106262207, + "step": 3980 + }, + { + "epoch": 2.132798126777053, + "grad_norm": 0.5903877756187921, + "learning_rate": 2.334661040433713e-07, + "logits/chosen": -0.4253089427947998, + "logits/rejected": -0.28170305490493774, + "logps/chosen": -4.395491123199463, + "logps/rejected": -5.1986823081970215, + "loss": 0.0503, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -4.395491123199463, + "rewards/margins": 0.8031916618347168, + "rewards/rejected": -5.1986823081970215, + "sft_loss": 4.087505340576172, + "step": 3985 + }, + { + "epoch": 2.1354741595584548, + "grad_norm": 0.5932205535715832, + "learning_rate": 2.321497235155568e-07, + "logits/chosen": -0.40591302514076233, + "logits/rejected": -0.23701974749565125, + "logps/chosen": -4.1090288162231445, + "logps/rejected": -4.979941368103027, + "loss": 0.0509, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -4.1090288162231445, + "rewards/margins": 0.8709122538566589, + "rewards/rejected": -4.979941368103027, + "sft_loss": 3.8675315380096436, + "step": 3990 + }, + { + "epoch": 2.138150192339856, + "grad_norm": 0.37999548877256567, + "learning_rate": 2.3083594167229965e-07, + "logits/chosen": -0.4235003590583801, + "logits/rejected": -0.10297024250030518, + "logps/chosen": -4.314941883087158, + "logps/rejected": -5.224982738494873, + "loss": 0.0501, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.314941883087158, + "rewards/margins": 0.9100410342216492, + "rewards/rejected": -5.224982738494873, + "sft_loss": 4.035270690917969, + "step": 3995 + }, + { + "epoch": 2.1408262251212578, + "grad_norm": 0.5848375242126514, + "learning_rate": 2.295247712599167e-07, + "logits/chosen": -0.23933526873588562, + "logits/rejected": -0.15588855743408203, + "logps/chosen": -4.382521629333496, + "logps/rejected": -5.272342205047607, + "loss": 0.0493, + "rewards/accuracies": 0.71875, + "rewards/chosen": -4.382521629333496, + "rewards/margins": 0.88982093334198, + "rewards/rejected": -5.272342205047607, + "sft_loss": 4.00167179107666, + "step": 4000 + }, + { + "epoch": 2.1408262251212578, + "eval_logits/chosen": 0.19795073568820953, + "eval_logits/rejected": 0.30290400981903076, + "eval_logps/chosen": -4.395362377166748, + "eval_logps/rejected": -5.153711795806885, + "eval_loss": 0.04997352510690689, + "eval_rewards/accuracies": 0.6839762330055237, + "eval_rewards/chosen": -4.395362377166748, + "eval_rewards/margins": 0.7583494782447815, + "eval_rewards/rejected": -5.153711795806885, + "eval_runtime": 45.0603, + "eval_samples_per_second": 29.849, + "eval_sft_loss": 3.9750618934631348, + "eval_steps_per_second": 7.479, + "step": 4000 + }, + { + "epoch": 2.1435022579026595, + "grad_norm": 0.6938806035793248, + "learning_rate": 2.2821622499938948e-07, + "logits/chosen": -0.2542392611503601, + "logits/rejected": 0.00998584646731615, + "logps/chosen": -4.379458427429199, + "logps/rejected": -5.155462265014648, + "loss": 0.0499, + "rewards/accuracies": 0.71875, + "rewards/chosen": -4.379458427429199, + "rewards/margins": 0.7760039567947388, + "rewards/rejected": -5.155462265014648, + "sft_loss": 4.045472621917725, + "step": 4005 + }, + { + "epoch": 2.1461782906840607, + "grad_norm": 0.39509876824712153, + "learning_rate": 2.269103155862391e-07, + "logits/chosen": -0.3175296187400818, + "logits/rejected": -0.20715491473674774, + "logps/chosen": -4.360111236572266, + "logps/rejected": -5.166841983795166, + "loss": 0.0489, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -4.360111236572266, + "rewards/margins": 0.8067308664321899, + "rewards/rejected": -5.166841983795166, + "sft_loss": 3.986266613006592, + "step": 4010 + }, + { + "epoch": 2.1488543234654625, + "grad_norm": 0.66114171413954, + "learning_rate": 2.2560705569040483e-07, + "logits/chosen": -0.31307560205459595, + "logits/rejected": -0.008983058854937553, + "logps/chosen": -4.3839335441589355, + "logps/rejected": -5.171870231628418, + "loss": 0.0522, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -4.3839335441589355, + "rewards/margins": 0.7879377603530884, + "rewards/rejected": -5.171870231628418, + "sft_loss": 4.155789375305176, + "step": 4015 + }, + { + "epoch": 2.151530356246864, + "grad_norm": 0.3597383400103683, + "learning_rate": 2.2430645795611963e-07, + "logits/chosen": -0.39556217193603516, + "logits/rejected": -0.25675544142723083, + "logps/chosen": -4.207012176513672, + "logps/rejected": -5.153992652893066, + "loss": 0.0488, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -4.207012176513672, + "rewards/margins": 0.9469804763793945, + "rewards/rejected": -5.153992652893066, + "sft_loss": 3.9583792686462402, + "step": 4020 + }, + { + "epoch": 2.1542063890282654, + "grad_norm": 0.5681000111895875, + "learning_rate": 2.230085350017884e-07, + "logits/chosen": -0.2569750249385834, + "logits/rejected": -0.15023109316825867, + "logps/chosen": -4.433220386505127, + "logps/rejected": -5.140045642852783, + "loss": 0.0514, + "rewards/accuracies": 0.6875, + "rewards/chosen": -4.433220386505127, + "rewards/margins": 0.7068256139755249, + "rewards/rejected": -5.140045642852783, + "sft_loss": 4.159654140472412, + "step": 4025 + }, + { + "epoch": 2.156882421809667, + "grad_norm": 0.40353186945349734, + "learning_rate": 2.2171329941986554e-07, + "logits/chosen": -0.3387718200683594, + "logits/rejected": -0.278081476688385, + "logps/chosen": -4.410946846008301, + "logps/rejected": -5.245885372161865, + "loss": 0.0485, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -4.410946846008301, + "rewards/margins": 0.8349380493164062, + "rewards/rejected": -5.245885372161865, + "sft_loss": 3.995271682739258, + "step": 4030 + }, + { + "epoch": 2.159558454591069, + "grad_norm": 0.38721194269692477, + "learning_rate": 2.2042076377673202e-07, + "logits/chosen": -0.2565738260746002, + "logits/rejected": -0.2947031259536743, + "logps/chosen": -4.291686058044434, + "logps/rejected": -4.931029796600342, + "loss": 0.0505, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -4.291686058044434, + "rewards/margins": 0.6393446922302246, + "rewards/rejected": -4.931029796600342, + "sft_loss": 3.965325117111206, + "step": 4035 + }, + { + "epoch": 2.16223448737247, + "grad_norm": 0.44677508552918804, + "learning_rate": 2.1913094061257476e-07, + "logits/chosen": -0.2852485775947571, + "logits/rejected": -0.32384148240089417, + "logps/chosen": -4.3653764724731445, + "logps/rejected": -5.132533073425293, + "loss": 0.0501, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -4.3653764724731445, + "rewards/margins": 0.7671566009521484, + "rewards/rejected": -5.132533073425293, + "sft_loss": 4.074067115783691, + "step": 4040 + }, + { + "epoch": 2.164910520153872, + "grad_norm": 0.5021427968737899, + "learning_rate": 2.178438424412633e-07, + "logits/chosen": -0.29809674620628357, + "logits/rejected": -0.14820165932178497, + "logps/chosen": -4.290149211883545, + "logps/rejected": -5.1410932540893555, + "loss": 0.0503, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -4.290149211883545, + "rewards/margins": 0.8509443998336792, + "rewards/rejected": -5.1410932540893555, + "sft_loss": 4.025235176086426, + "step": 4045 + }, + { + "epoch": 2.1675865529352736, + "grad_norm": 0.5799504566757765, + "learning_rate": 2.165594817502302e-07, + "logits/chosen": -0.3963567912578583, + "logits/rejected": -0.2322741001844406, + "logps/chosen": -4.519765377044678, + "logps/rejected": -5.176243782043457, + "loss": 0.0544, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -4.519765377044678, + "rewards/margins": 0.6564784049987793, + "rewards/rejected": -5.176243782043457, + "sft_loss": 4.351537704467773, + "step": 4050 + }, + { + "epoch": 2.170262585716675, + "grad_norm": 0.8047811292467494, + "learning_rate": 2.1527787100034806e-07, + "logits/chosen": -0.2328902930021286, + "logits/rejected": -0.17223525047302246, + "logps/chosen": -4.405119895935059, + "logps/rejected": -4.998657703399658, + "loss": 0.0513, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -4.405119895935059, + "rewards/margins": 0.593537449836731, + "rewards/rejected": -4.998657703399658, + "sft_loss": 4.159225940704346, + "step": 4055 + }, + { + "epoch": 2.1729386184980766, + "grad_norm": 0.5192972561919211, + "learning_rate": 2.1399902262581037e-07, + "logits/chosen": -0.2871648371219635, + "logits/rejected": -0.09363778680562973, + "logps/chosen": -4.515463352203369, + "logps/rejected": -5.090214729309082, + "loss": 0.0532, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -4.515463352203369, + "rewards/margins": 0.5747517347335815, + "rewards/rejected": -5.090214729309082, + "sft_loss": 4.21968936920166, + "step": 4060 + }, + { + "epoch": 2.1756146512794783, + "grad_norm": 0.4878248040983854, + "learning_rate": 2.127229490340094e-07, + "logits/chosen": -0.39410749077796936, + "logits/rejected": -0.3096895217895508, + "logps/chosen": -4.484684467315674, + "logps/rejected": -5.331465721130371, + "loss": 0.0505, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -4.484684467315674, + "rewards/margins": 0.8467812538146973, + "rewards/rejected": -5.331465721130371, + "sft_loss": 4.192440986633301, + "step": 4065 + }, + { + "epoch": 2.1782906840608796, + "grad_norm": 0.514093539313008, + "learning_rate": 2.1144966260541698e-07, + "logits/chosen": -0.314320832490921, + "logits/rejected": -0.07475896179676056, + "logps/chosen": -4.39267635345459, + "logps/rejected": -5.261028289794922, + "loss": 0.0502, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -4.39267635345459, + "rewards/margins": 0.8683518171310425, + "rewards/rejected": -5.261028289794922, + "sft_loss": 4.083760738372803, + "step": 4070 + }, + { + "epoch": 2.1809667168422813, + "grad_norm": 0.5910254433311389, + "learning_rate": 2.1017917569346332e-07, + "logits/chosen": -0.33221128582954407, + "logits/rejected": -0.12248317152261734, + "logps/chosen": -4.258050918579102, + "logps/rejected": -5.079594135284424, + "loss": 0.0484, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -4.258050918579102, + "rewards/margins": 0.8215433955192566, + "rewards/rejected": -5.079594135284424, + "sft_loss": 3.9350600242614746, + "step": 4075 + }, + { + "epoch": 2.183642749623683, + "grad_norm": 0.44331518926666863, + "learning_rate": 2.0891150062441837e-07, + "logits/chosen": -0.3698522448539734, + "logits/rejected": -0.24156637489795685, + "logps/chosen": -4.3765363693237305, + "logps/rejected": -5.2998046875, + "loss": 0.0497, + "rewards/accuracies": 0.71875, + "rewards/chosen": -4.3765363693237305, + "rewards/margins": 0.9232684373855591, + "rewards/rejected": -5.2998046875, + "sft_loss": 4.07401704788208, + "step": 4080 + }, + { + "epoch": 2.1863187824050843, + "grad_norm": 0.4360409311344564, + "learning_rate": 2.0764664969727086e-07, + "logits/chosen": -0.2376057207584381, + "logits/rejected": -0.1950663924217224, + "logps/chosen": -4.3534393310546875, + "logps/rejected": -5.132110595703125, + "loss": 0.0508, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -4.3534393310546875, + "rewards/margins": 0.7786713242530823, + "rewards/rejected": -5.132110595703125, + "sft_loss": 4.0486626625061035, + "step": 4085 + }, + { + "epoch": 2.188994815186486, + "grad_norm": 0.4611340496677084, + "learning_rate": 2.0638463518361033e-07, + "logits/chosen": -0.42794519662857056, + "logits/rejected": -0.22650738060474396, + "logps/chosen": -4.329638481140137, + "logps/rejected": -5.147965431213379, + "loss": 0.0496, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -4.329638481140137, + "rewards/margins": 0.8183272480964661, + "rewards/rejected": -5.147965431213379, + "sft_loss": 3.998382568359375, + "step": 4090 + }, + { + "epoch": 2.1916708479678877, + "grad_norm": 0.5195705902795844, + "learning_rate": 2.0512546932750702e-07, + "logits/chosen": -0.4067610800266266, + "logits/rejected": -0.32761862874031067, + "logps/chosen": -4.3563666343688965, + "logps/rejected": -5.1253862380981445, + "loss": 0.0497, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -4.3563666343688965, + "rewards/margins": 0.769019603729248, + "rewards/rejected": -5.1253862380981445, + "sft_loss": 4.0309929847717285, + "step": 4095 + }, + { + "epoch": 2.194346880749289, + "grad_norm": 0.7515484520612266, + "learning_rate": 2.0386916434539343e-07, + "logits/chosen": -0.2933884561061859, + "logits/rejected": -0.10745501518249512, + "logps/chosen": -4.388808250427246, + "logps/rejected": -5.196818828582764, + "loss": 0.05, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -4.388808250427246, + "rewards/margins": 0.8080108761787415, + "rewards/rejected": -5.196818828582764, + "sft_loss": 4.126204490661621, + "step": 4100 + }, + { + "epoch": 2.1970229135306907, + "grad_norm": 0.36793126931525066, + "learning_rate": 2.0261573242594627e-07, + "logits/chosen": -0.3496021330356598, + "logits/rejected": -0.12837150692939758, + "logps/chosen": -4.346577167510986, + "logps/rejected": -5.146768569946289, + "loss": 0.0498, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -4.346577167510986, + "rewards/margins": 0.8001911044120789, + "rewards/rejected": -5.146768569946289, + "sft_loss": 4.027698040008545, + "step": 4105 + }, + { + "epoch": 2.1996989463120924, + "grad_norm": 0.5183161146135171, + "learning_rate": 2.0136518572996724e-07, + "logits/chosen": -0.31241917610168457, + "logits/rejected": -0.09111049026250839, + "logps/chosen": -4.2632646560668945, + "logps/rejected": -5.173994064331055, + "loss": 0.0496, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -4.2632646560668945, + "rewards/margins": 0.910729706287384, + "rewards/rejected": -5.173994064331055, + "sft_loss": 3.9972128868103027, + "step": 4110 + }, + { + "epoch": 2.202374979093494, + "grad_norm": 0.960953323451343, + "learning_rate": 2.0011753639026617e-07, + "logits/chosen": -0.2670236825942993, + "logits/rejected": -0.18157057464122772, + "logps/chosen": -4.247032642364502, + "logps/rejected": -5.117427825927734, + "loss": 0.05, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -4.247032642364502, + "rewards/margins": 0.8703948855400085, + "rewards/rejected": -5.117427825927734, + "sft_loss": 3.9730453491210938, + "step": 4115 + }, + { + "epoch": 2.2050510118748954, + "grad_norm": 0.9825229794639762, + "learning_rate": 1.988727965115421e-07, + "logits/chosen": -0.32241255044937134, + "logits/rejected": -0.21362285315990448, + "logps/chosen": -4.210694313049316, + "logps/rejected": -4.990067481994629, + "loss": 0.0508, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -4.210694313049316, + "rewards/margins": 0.779373824596405, + "rewards/rejected": -4.990067481994629, + "sft_loss": 3.9055423736572266, + "step": 4120 + }, + { + "epoch": 2.207727044656297, + "grad_norm": 0.487116120875464, + "learning_rate": 1.9763097817026713e-07, + "logits/chosen": -0.3949047327041626, + "logits/rejected": -0.17741958796977997, + "logps/chosen": -4.219511032104492, + "logps/rejected": -5.2257795333862305, + "loss": 0.049, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -4.219511032104492, + "rewards/margins": 1.0062682628631592, + "rewards/rejected": -5.2257795333862305, + "sft_loss": 3.941300630569458, + "step": 4125 + }, + { + "epoch": 2.210403077437699, + "grad_norm": 0.4971791224235736, + "learning_rate": 1.9639209341456796e-07, + "logits/chosen": -0.26797419786453247, + "logits/rejected": -0.17251023650169373, + "logps/chosen": -4.320847034454346, + "logps/rejected": -5.197128772735596, + "loss": 0.0508, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -4.320847034454346, + "rewards/margins": 0.8762819170951843, + "rewards/rejected": -5.197128772735596, + "sft_loss": 4.089552879333496, + "step": 4130 + }, + { + "epoch": 2.2130791102191, + "grad_norm": 0.6298909129841176, + "learning_rate": 1.951561542641102e-07, + "logits/chosen": -0.20016708970069885, + "logits/rejected": -0.24042925238609314, + "logps/chosen": -4.610543727874756, + "logps/rejected": -5.33798885345459, + "loss": 0.0527, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -4.610543727874756, + "rewards/margins": 0.727445125579834, + "rewards/rejected": -5.33798885345459, + "sft_loss": 4.340829372406006, + "step": 4135 + }, + { + "epoch": 2.215755143000502, + "grad_norm": 0.433532923165645, + "learning_rate": 1.939231727099806e-07, + "logits/chosen": -0.4819692075252533, + "logits/rejected": -0.42997080087661743, + "logps/chosen": -4.405976295471191, + "logps/rejected": -5.104135990142822, + "loss": 0.0522, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -4.405976295471191, + "rewards/margins": 0.6981590986251831, + "rewards/rejected": -5.104135990142822, + "sft_loss": 4.199334144592285, + "step": 4140 + }, + { + "epoch": 2.2184311757819035, + "grad_norm": 0.5595706664938976, + "learning_rate": 1.926931607145719e-07, + "logits/chosen": -0.23996420204639435, + "logits/rejected": -0.0859023854136467, + "logps/chosen": -4.4849348068237305, + "logps/rejected": -5.22530460357666, + "loss": 0.0511, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -4.4849348068237305, + "rewards/margins": 0.7403702735900879, + "rewards/rejected": -5.22530460357666, + "sft_loss": 4.229413032531738, + "step": 4145 + }, + { + "epoch": 2.221107208563305, + "grad_norm": 0.47235470682786457, + "learning_rate": 1.9146613021146564e-07, + "logits/chosen": -0.36756059527397156, + "logits/rejected": -0.25558412075042725, + "logps/chosen": -4.3525214195251465, + "logps/rejected": -5.156813621520996, + "loss": 0.0504, + "rewards/accuracies": 0.71875, + "rewards/chosen": -4.3525214195251465, + "rewards/margins": 0.8042919039726257, + "rewards/rejected": -5.156813621520996, + "sft_loss": 4.11196231842041, + "step": 4150 + }, + { + "epoch": 2.2237832413447065, + "grad_norm": 0.5165025523535854, + "learning_rate": 1.9024209310531736e-07, + "logits/chosen": -0.3705408275127411, + "logits/rejected": -0.38917452096939087, + "logps/chosen": -4.170284271240234, + "logps/rejected": -4.961236000061035, + "loss": 0.0493, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -4.170284271240234, + "rewards/margins": 0.790952205657959, + "rewards/rejected": -4.961236000061035, + "sft_loss": 3.8401451110839844, + "step": 4155 + }, + { + "epoch": 2.2264592741261082, + "grad_norm": 0.6027686935337975, + "learning_rate": 1.890210612717401e-07, + "logits/chosen": -0.3659024238586426, + "logits/rejected": -0.23701664805412292, + "logps/chosen": -4.248166561126709, + "logps/rejected": -5.038237571716309, + "loss": 0.0504, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -4.248166561126709, + "rewards/margins": 0.7900711894035339, + "rewards/rejected": -5.038237571716309, + "sft_loss": 3.9721550941467285, + "step": 4160 + }, + { + "epoch": 2.2291353069075095, + "grad_norm": 0.7255411038290942, + "learning_rate": 1.8780304655719054e-07, + "logits/chosen": -0.35837799310684204, + "logits/rejected": -0.22272975742816925, + "logps/chosen": -4.344782829284668, + "logps/rejected": -5.208219051361084, + "loss": 0.0506, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -4.344782829284668, + "rewards/margins": 0.8634363412857056, + "rewards/rejected": -5.208219051361084, + "sft_loss": 4.040823936462402, + "step": 4165 + }, + { + "epoch": 2.231811339688911, + "grad_norm": 0.6179482287804432, + "learning_rate": 1.865880607788523e-07, + "logits/chosen": -0.17336727678775787, + "logits/rejected": -0.10471125692129135, + "logps/chosen": -4.413411617279053, + "logps/rejected": -5.257023334503174, + "loss": 0.051, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -4.413411617279053, + "rewards/margins": 0.8436113595962524, + "rewards/rejected": -5.257023334503174, + "sft_loss": 4.22342586517334, + "step": 4170 + }, + { + "epoch": 2.234487372470313, + "grad_norm": 0.9128972719000168, + "learning_rate": 1.8537611572452316e-07, + "logits/chosen": -0.32853183150291443, + "logits/rejected": -0.23889176547527313, + "logps/chosen": -4.317111015319824, + "logps/rejected": -5.020560264587402, + "loss": 0.0511, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -4.317111015319824, + "rewards/margins": 0.703448474407196, + "rewards/rejected": -5.020560264587402, + "sft_loss": 4.063889503479004, + "step": 4175 + }, + { + "epoch": 2.237163405251714, + "grad_norm": 0.570277555661351, + "learning_rate": 1.84167223152499e-07, + "logits/chosen": -0.37603339552879333, + "logits/rejected": -0.12415879964828491, + "logps/chosen": -4.372494697570801, + "logps/rejected": -5.246346950531006, + "loss": 0.051, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -4.372494697570801, + "rewards/margins": 0.8738527297973633, + "rewards/rejected": -5.246346950531006, + "sft_loss": 4.095280647277832, + "step": 4180 + }, + { + "epoch": 2.239839438033116, + "grad_norm": 0.5063146939111651, + "learning_rate": 1.8296139479146112e-07, + "logits/chosen": -0.33638912439346313, + "logits/rejected": -0.32763671875, + "logps/chosen": -4.277950286865234, + "logps/rejected": -5.035035133361816, + "loss": 0.049, + "rewards/accuracies": 0.71875, + "rewards/chosen": -4.277950286865234, + "rewards/margins": 0.757083535194397, + "rewards/rejected": -5.035035133361816, + "sft_loss": 3.931530714035034, + "step": 4185 + }, + { + "epoch": 2.2425154708145176, + "grad_norm": 0.5668311239347966, + "learning_rate": 1.8175864234036132e-07, + "logits/chosen": -0.17369453608989716, + "logits/rejected": -0.08982907980680466, + "logps/chosen": -4.383388519287109, + "logps/rejected": -5.2250165939331055, + "loss": 0.0517, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -4.383388519287109, + "rewards/margins": 0.8416286706924438, + "rewards/rejected": -5.2250165939331055, + "sft_loss": 4.108694553375244, + "step": 4190 + }, + { + "epoch": 2.245191503595919, + "grad_norm": 0.4524352019006803, + "learning_rate": 1.805589774683094e-07, + "logits/chosen": -0.462289422750473, + "logits/rejected": -0.32472777366638184, + "logps/chosen": -4.465978145599365, + "logps/rejected": -5.1302995681762695, + "loss": 0.0505, + "rewards/accuracies": 0.6875, + "rewards/chosen": -4.465978145599365, + "rewards/margins": 0.6643209457397461, + "rewards/rejected": -5.1302995681762695, + "sft_loss": 4.117940902709961, + "step": 4195 + }, + { + "epoch": 2.2478675363773206, + "grad_norm": 0.7152486203815478, + "learning_rate": 1.79362411814459e-07, + "logits/chosen": -0.17068356275558472, + "logits/rejected": -0.21633978188037872, + "logps/chosen": -4.524289131164551, + "logps/rejected": -5.111501216888428, + "loss": 0.0535, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -4.524289131164551, + "rewards/margins": 0.5872117877006531, + "rewards/rejected": -5.111501216888428, + "sft_loss": 4.23223352432251, + "step": 4200 + }, + { + "epoch": 2.2505435691587223, + "grad_norm": 0.4968827477148222, + "learning_rate": 1.7816895698789552e-07, + "logits/chosen": -0.39958643913269043, + "logits/rejected": -0.3283675014972687, + "logps/chosen": -4.347672462463379, + "logps/rejected": -5.024527072906494, + "loss": 0.0508, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -4.347672462463379, + "rewards/margins": 0.6768544912338257, + "rewards/rejected": -5.024527072906494, + "sft_loss": 3.984272003173828, + "step": 4205 + }, + { + "epoch": 2.2532196019401236, + "grad_norm": 0.6034610216258647, + "learning_rate": 1.7697862456752271e-07, + "logits/chosen": -0.35456377267837524, + "logits/rejected": -0.23129554092884064, + "logps/chosen": -4.35485315322876, + "logps/rejected": -5.382556438446045, + "loss": 0.0501, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -4.35485315322876, + "rewards/margins": 1.0277034044265747, + "rewards/rejected": -5.382556438446045, + "sft_loss": 4.1364922523498535, + "step": 4210 + }, + { + "epoch": 2.2558956347215253, + "grad_norm": 0.7072110788697915, + "learning_rate": 1.7579142610195124e-07, + "logits/chosen": -0.34522002935409546, + "logits/rejected": -0.18638448417186737, + "logps/chosen": -4.422163963317871, + "logps/rejected": -5.283125400543213, + "loss": 0.0513, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -4.422163963317871, + "rewards/margins": 0.8609609603881836, + "rewards/rejected": -5.283125400543213, + "sft_loss": 4.139298915863037, + "step": 4215 + }, + { + "epoch": 2.258571667502927, + "grad_norm": 0.45836262120476573, + "learning_rate": 1.7460737310938568e-07, + "logits/chosen": -0.41230830550193787, + "logits/rejected": -0.20210537314414978, + "logps/chosen": -4.222951889038086, + "logps/rejected": -5.2063188552856445, + "loss": 0.0487, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -4.222951889038086, + "rewards/margins": 0.9833674430847168, + "rewards/rejected": -5.2063188552856445, + "sft_loss": 4.025243282318115, + "step": 4220 + }, + { + "epoch": 2.2612477002843283, + "grad_norm": 0.3997972564343064, + "learning_rate": 1.734264770775133e-07, + "logits/chosen": -0.41125649213790894, + "logits/rejected": -0.12870237231254578, + "logps/chosen": -4.29024600982666, + "logps/rejected": -5.098462104797363, + "loss": 0.0514, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -4.29024600982666, + "rewards/margins": 0.8082154393196106, + "rewards/rejected": -5.098462104797363, + "sft_loss": 4.028970718383789, + "step": 4225 + }, + { + "epoch": 2.26392373306573, + "grad_norm": 0.4035893113625815, + "learning_rate": 1.7224874946339241e-07, + "logits/chosen": -0.3744090795516968, + "logits/rejected": -0.30211079120635986, + "logps/chosen": -4.280365467071533, + "logps/rejected": -5.217028617858887, + "loss": 0.0488, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -4.280365467071533, + "rewards/margins": 0.9366633296012878, + "rewards/rejected": -5.217028617858887, + "sft_loss": 3.9095451831817627, + "step": 4230 + }, + { + "epoch": 2.2665997658471317, + "grad_norm": 0.6207283136955817, + "learning_rate": 1.7107420169334186e-07, + "logits/chosen": -0.34626153111457825, + "logits/rejected": -0.2757788300514221, + "logps/chosen": -4.46042537689209, + "logps/rejected": -5.151759147644043, + "loss": 0.0504, + "rewards/accuracies": 0.6875, + "rewards/chosen": -4.46042537689209, + "rewards/margins": 0.691334068775177, + "rewards/rejected": -5.151759147644043, + "sft_loss": 4.131369590759277, + "step": 4235 + }, + { + "epoch": 2.269275798628533, + "grad_norm": 0.48534289801784386, + "learning_rate": 1.6990284516282893e-07, + "logits/chosen": -0.3493006229400635, + "logits/rejected": -0.25627556443214417, + "logps/chosen": -4.3235392570495605, + "logps/rejected": -5.203847885131836, + "loss": 0.0491, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -4.3235392570495605, + "rewards/margins": 0.8803087472915649, + "rewards/rejected": -5.203847885131836, + "sft_loss": 4.0077996253967285, + "step": 4240 + }, + { + "epoch": 2.2719518314099347, + "grad_norm": 0.4853526836957504, + "learning_rate": 1.687346912363602e-07, + "logits/chosen": -0.3779717683792114, + "logits/rejected": -0.22099003195762634, + "logps/chosen": -4.265778541564941, + "logps/rejected": -5.1664605140686035, + "loss": 0.0493, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -4.265778541564941, + "rewards/margins": 0.90068119764328, + "rewards/rejected": -5.1664605140686035, + "sft_loss": 3.9554412364959717, + "step": 4245 + }, + { + "epoch": 2.2746278641913364, + "grad_norm": 0.37946669743407174, + "learning_rate": 1.675697512473697e-07, + "logits/chosen": -0.37565523386001587, + "logits/rejected": -0.16238412261009216, + "logps/chosen": -4.434185981750488, + "logps/rejected": -5.394456386566162, + "loss": 0.0486, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.434185981750488, + "rewards/margins": 0.9602702856063843, + "rewards/rejected": -5.394456386566162, + "sft_loss": 4.023019313812256, + "step": 4250 + }, + { + "epoch": 2.2773038969727377, + "grad_norm": 0.5047515792520335, + "learning_rate": 1.6640803649811087e-07, + "logits/chosen": -0.40330666303634644, + "logits/rejected": -0.1070922389626503, + "logps/chosen": -4.3636579513549805, + "logps/rejected": -5.487493991851807, + "loss": 0.049, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -4.3636579513549805, + "rewards/margins": 1.1238361597061157, + "rewards/rejected": -5.487493991851807, + "sft_loss": 3.9812254905700684, + "step": 4255 + }, + { + "epoch": 2.2799799297541394, + "grad_norm": 0.49986291771664965, + "learning_rate": 1.6524955825954472e-07, + "logits/chosen": -0.3085281252861023, + "logits/rejected": -0.23384499549865723, + "logps/chosen": -4.470911979675293, + "logps/rejected": -5.105937957763672, + "loss": 0.0522, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -4.470911979675293, + "rewards/margins": 0.6350253820419312, + "rewards/rejected": -5.105937957763672, + "sft_loss": 4.192760467529297, + "step": 4260 + }, + { + "epoch": 2.282655962535541, + "grad_norm": 0.419551409827489, + "learning_rate": 1.6409432777123277e-07, + "logits/chosen": -0.45216822624206543, + "logits/rejected": -0.2786790728569031, + "logps/chosen": -4.20875883102417, + "logps/rejected": -5.2486252784729, + "loss": 0.0478, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -4.20875883102417, + "rewards/margins": 1.03986656665802, + "rewards/rejected": -5.2486252784729, + "sft_loss": 3.9220237731933594, + "step": 4265 + }, + { + "epoch": 2.285331995316943, + "grad_norm": 0.672454413679323, + "learning_rate": 1.6294235624122577e-07, + "logits/chosen": -0.3137684464454651, + "logits/rejected": -0.031981952488422394, + "logps/chosen": -4.239866256713867, + "logps/rejected": -5.073857307434082, + "loss": 0.0506, + "rewards/accuracies": 0.65625, + "rewards/chosen": -4.239866256713867, + "rewards/margins": 0.8339906930923462, + "rewards/rejected": -5.073857307434082, + "sft_loss": 3.968554973602295, + "step": 4270 + }, + { + "epoch": 2.288008028098344, + "grad_norm": 0.47486382919668607, + "learning_rate": 1.6179365484595697e-07, + "logits/chosen": -0.34231531620025635, + "logits/rejected": -0.23783119022846222, + "logps/chosen": -4.323024749755859, + "logps/rejected": -5.134915828704834, + "loss": 0.0506, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -4.323024749755859, + "rewards/margins": 0.8118915557861328, + "rewards/rejected": -5.134915828704834, + "sft_loss": 4.072056770324707, + "step": 4275 + }, + { + "epoch": 2.290684060879746, + "grad_norm": 0.8756257355383923, + "learning_rate": 1.60648234730132e-07, + "logits/chosen": -0.41580209136009216, + "logits/rejected": -0.3289106488227844, + "logps/chosen": -4.2255096435546875, + "logps/rejected": -5.234991550445557, + "loss": 0.0488, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -4.2255096435546875, + "rewards/margins": 1.009481430053711, + "rewards/rejected": -5.234991550445557, + "sft_loss": 3.9910926818847656, + "step": 4280 + }, + { + "epoch": 2.293360093661147, + "grad_norm": 0.5727277350002951, + "learning_rate": 1.595061070066222e-07, + "logits/chosen": -0.341047465801239, + "logits/rejected": -0.3902260363101959, + "logps/chosen": -4.409988880157471, + "logps/rejected": -5.2129974365234375, + "loss": 0.0505, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -4.409988880157471, + "rewards/margins": 0.8030092120170593, + "rewards/rejected": -5.2129974365234375, + "sft_loss": 4.154358863830566, + "step": 4285 + }, + { + "epoch": 2.296036126442549, + "grad_norm": 0.5076883038357715, + "learning_rate": 1.5836728275635542e-07, + "logits/chosen": -0.4511869549751282, + "logits/rejected": -0.26104557514190674, + "logps/chosen": -4.3491363525390625, + "logps/rejected": -5.162402153015137, + "loss": 0.05, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.3491363525390625, + "rewards/margins": 0.8132661581039429, + "rewards/rejected": -5.162402153015137, + "sft_loss": 4.109154224395752, + "step": 4290 + }, + { + "epoch": 2.2987121592239506, + "grad_norm": 0.4023153185094249, + "learning_rate": 1.5723177302820984e-07, + "logits/chosen": -0.43699079751968384, + "logits/rejected": -0.3607856035232544, + "logps/chosen": -4.445807456970215, + "logps/rejected": -5.105084419250488, + "loss": 0.0518, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -4.445807456970215, + "rewards/margins": 0.6592772603034973, + "rewards/rejected": -5.105084419250488, + "sft_loss": 4.227627754211426, + "step": 4295 + }, + { + "epoch": 2.3013881920053523, + "grad_norm": 0.47451049109887844, + "learning_rate": 1.5609958883890544e-07, + "logits/chosen": -0.33742186427116394, + "logits/rejected": -0.22219757735729218, + "logps/chosen": -4.269178867340088, + "logps/rejected": -4.994236469268799, + "loss": 0.0499, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -4.269178867340088, + "rewards/margins": 0.7250576019287109, + "rewards/rejected": -4.994236469268799, + "sft_loss": 3.927827835083008, + "step": 4300 + }, + { + "epoch": 2.3040642247867535, + "grad_norm": 0.537249755204217, + "learning_rate": 1.5497074117289865e-07, + "logits/chosen": -0.4085536003112793, + "logits/rejected": -0.2702638506889343, + "logps/chosen": -4.376568794250488, + "logps/rejected": -5.194339752197266, + "loss": 0.0506, + "rewards/accuracies": 0.6875, + "rewards/chosen": -4.376568794250488, + "rewards/margins": 0.817771315574646, + "rewards/rejected": -5.194339752197266, + "sft_loss": 4.102890968322754, + "step": 4305 + }, + { + "epoch": 2.3067402575681553, + "grad_norm": 0.4232606940574936, + "learning_rate": 1.5384524098227402e-07, + "logits/chosen": -0.42444998025894165, + "logits/rejected": -0.20567944645881653, + "logps/chosen": -4.219876289367676, + "logps/rejected": -5.182650566101074, + "loss": 0.0495, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.219876289367676, + "rewards/margins": 0.9627736210823059, + "rewards/rejected": -5.182650566101074, + "sft_loss": 3.9575088024139404, + "step": 4310 + }, + { + "epoch": 2.3094162903495565, + "grad_norm": 0.6045181843568596, + "learning_rate": 1.5272309918663974e-07, + "logits/chosen": -0.42669230699539185, + "logits/rejected": -0.246909499168396, + "logps/chosen": -4.394600868225098, + "logps/rejected": -5.036022186279297, + "loss": 0.0522, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -4.394600868225098, + "rewards/margins": 0.6414215564727783, + "rewards/rejected": -5.036022186279297, + "sft_loss": 4.132719039916992, + "step": 4315 + }, + { + "epoch": 2.3120923231309582, + "grad_norm": 0.5688567714557625, + "learning_rate": 1.516043266730201e-07, + "logits/chosen": -0.4220002591609955, + "logits/rejected": -0.2799118161201477, + "logps/chosen": -4.280217170715332, + "logps/rejected": -5.047442436218262, + "loss": 0.0518, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -4.280217170715332, + "rewards/margins": 0.7672249674797058, + "rewards/rejected": -5.047442436218262, + "sft_loss": 3.9711861610412598, + "step": 4320 + }, + { + "epoch": 2.31476835591236, + "grad_norm": 0.6394676393882484, + "learning_rate": 1.504889342957512e-07, + "logits/chosen": -0.4407338500022888, + "logits/rejected": -0.241370290517807, + "logps/chosen": -4.418463706970215, + "logps/rejected": -5.3320698738098145, + "loss": 0.0513, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -4.418463706970215, + "rewards/margins": 0.9136059880256653, + "rewards/rejected": -5.3320698738098145, + "sft_loss": 4.146149635314941, + "step": 4325 + }, + { + "epoch": 2.3174443886937617, + "grad_norm": 0.47999784311707305, + "learning_rate": 1.4937693287637453e-07, + "logits/chosen": -0.4439243674278259, + "logits/rejected": -0.3043748438358307, + "logps/chosen": -4.377346992492676, + "logps/rejected": -5.098249912261963, + "loss": 0.0505, + "rewards/accuracies": 0.71875, + "rewards/chosen": -4.377346992492676, + "rewards/margins": 0.7209030389785767, + "rewards/rejected": -5.098249912261963, + "sft_loss": 4.071615695953369, + "step": 4330 + }, + { + "epoch": 2.320120421475163, + "grad_norm": 0.3861928086843125, + "learning_rate": 1.4826833320353305e-07, + "logits/chosen": -0.4043838083744049, + "logits/rejected": -0.31296491622924805, + "logps/chosen": -4.497230529785156, + "logps/rejected": -5.2701849937438965, + "loss": 0.0503, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -4.497230529785156, + "rewards/margins": 0.7729544639587402, + "rewards/rejected": -5.2701849937438965, + "sft_loss": 4.045097827911377, + "step": 4335 + }, + { + "epoch": 2.3227964542565647, + "grad_norm": 0.51884864533075, + "learning_rate": 1.4716314603286528e-07, + "logits/chosen": -0.4912940561771393, + "logits/rejected": -0.2375914603471756, + "logps/chosen": -4.4356689453125, + "logps/rejected": -5.331031799316406, + "loss": 0.0503, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -4.4356689453125, + "rewards/margins": 0.8953633308410645, + "rewards/rejected": -5.331031799316406, + "sft_loss": 4.134769916534424, + "step": 4340 + }, + { + "epoch": 2.3254724870379664, + "grad_norm": 0.6199903820759686, + "learning_rate": 1.4606138208690233e-07, + "logits/chosen": -0.4292561113834381, + "logits/rejected": -0.3562420606613159, + "logps/chosen": -4.502501010894775, + "logps/rejected": -5.053229331970215, + "loss": 0.0516, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -4.502501010894775, + "rewards/margins": 0.5507287383079529, + "rewards/rejected": -5.053229331970215, + "sft_loss": 4.21864652633667, + "step": 4345 + }, + { + "epoch": 2.3281485198193677, + "grad_norm": 0.3695485210076071, + "learning_rate": 1.4496305205496251e-07, + "logits/chosen": -0.4103737473487854, + "logits/rejected": -0.3381521999835968, + "logps/chosen": -4.395519256591797, + "logps/rejected": -5.27461576461792, + "loss": 0.0499, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -4.395519256591797, + "rewards/margins": 0.8790962100028992, + "rewards/rejected": -5.27461576461792, + "sft_loss": 4.1917924880981445, + "step": 4350 + }, + { + "epoch": 2.3308245526007694, + "grad_norm": 0.4610615645602974, + "learning_rate": 1.4386816659304895e-07, + "logits/chosen": -0.5034217238426208, + "logits/rejected": -0.34082144498825073, + "logps/chosen": -4.428948402404785, + "logps/rejected": -5.152653694152832, + "loss": 0.0513, + "rewards/accuracies": 0.71875, + "rewards/chosen": -4.428948402404785, + "rewards/margins": 0.723704993724823, + "rewards/rejected": -5.152653694152832, + "sft_loss": 4.1761956214904785, + "step": 4355 + }, + { + "epoch": 2.333500585382171, + "grad_norm": 0.6469321968896236, + "learning_rate": 1.4277673632374492e-07, + "logits/chosen": -0.45382094383239746, + "logits/rejected": -0.18945330381393433, + "logps/chosen": -4.2880859375, + "logps/rejected": -5.107758045196533, + "loss": 0.0496, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -4.2880859375, + "rewards/margins": 0.819671630859375, + "rewards/rejected": -5.107758045196533, + "sft_loss": 3.9440059661865234, + "step": 4360 + }, + { + "epoch": 2.3361766181635724, + "grad_norm": 0.46564406778831696, + "learning_rate": 1.416887718361119e-07, + "logits/chosen": -0.370443195104599, + "logits/rejected": -0.3640519678592682, + "logps/chosen": -4.384427547454834, + "logps/rejected": -5.047237396240234, + "loss": 0.0531, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -4.384427547454834, + "rewards/margins": 0.6628104448318481, + "rewards/rejected": -5.047237396240234, + "sft_loss": 4.172974109649658, + "step": 4365 + }, + { + "epoch": 2.338852650944974, + "grad_norm": 0.501041684548939, + "learning_rate": 1.406042836855859e-07, + "logits/chosen": -0.3562886416912079, + "logits/rejected": -0.22665898501873016, + "logps/chosen": -4.420121669769287, + "logps/rejected": -5.273455619812012, + "loss": 0.0506, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -4.420121669769287, + "rewards/margins": 0.8533342480659485, + "rewards/rejected": -5.273455619812012, + "sft_loss": 4.2023725509643555, + "step": 4370 + }, + { + "epoch": 2.341528683726376, + "grad_norm": 0.44752436228931725, + "learning_rate": 1.3952328239387595e-07, + "logits/chosen": -0.507021963596344, + "logits/rejected": -0.23198696970939636, + "logps/chosen": -4.363643169403076, + "logps/rejected": -5.239960670471191, + "loss": 0.0511, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -4.363643169403076, + "rewards/margins": 0.8763176798820496, + "rewards/rejected": -5.239960670471191, + "sft_loss": 4.179306983947754, + "step": 4375 + }, + { + "epoch": 2.344204716507777, + "grad_norm": 0.4764172344332767, + "learning_rate": 1.3844577844886109e-07, + "logits/chosen": -0.46979475021362305, + "logits/rejected": -0.2243788242340088, + "logps/chosen": -4.079625129699707, + "logps/rejected": -4.999655723571777, + "loss": 0.0494, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -4.079625129699707, + "rewards/margins": 0.9200307726860046, + "rewards/rejected": -4.999655723571777, + "sft_loss": 3.936373233795166, + "step": 4380 + }, + { + "epoch": 2.346880749289179, + "grad_norm": 0.6228198121848917, + "learning_rate": 1.3737178230448955e-07, + "logits/chosen": -0.49106574058532715, + "logits/rejected": -0.3500491976737976, + "logps/chosen": -4.392767429351807, + "logps/rejected": -5.033437728881836, + "loss": 0.0509, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -4.392767429351807, + "rewards/margins": 0.6406702995300293, + "rewards/rejected": -5.033437728881836, + "sft_loss": 4.037944316864014, + "step": 4385 + }, + { + "epoch": 2.3495567820705805, + "grad_norm": 0.4087330269846568, + "learning_rate": 1.363013043806764e-07, + "logits/chosen": -0.4462736248970032, + "logits/rejected": -0.3225329518318176, + "logps/chosen": -4.157266139984131, + "logps/rejected": -5.061835289001465, + "loss": 0.0495, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -4.157266139984131, + "rewards/margins": 0.9045697450637817, + "rewards/rejected": -5.061835289001465, + "sft_loss": 3.9843857288360596, + "step": 4390 + }, + { + "epoch": 2.3522328148519818, + "grad_norm": 0.8124937399329678, + "learning_rate": 1.352343550632034e-07, + "logits/chosen": -0.38373181223869324, + "logits/rejected": -0.19799697399139404, + "logps/chosen": -4.203919887542725, + "logps/rejected": -5.222175598144531, + "loss": 0.0502, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -4.203919887542725, + "rewards/margins": 1.0182548761367798, + "rewards/rejected": -5.222175598144531, + "sft_loss": 3.9921231269836426, + "step": 4395 + }, + { + "epoch": 2.3549088476333835, + "grad_norm": 0.42055836896384047, + "learning_rate": 1.3417094470361722e-07, + "logits/chosen": -0.43498653173446655, + "logits/rejected": -0.27146443724632263, + "logps/chosen": -4.396726131439209, + "logps/rejected": -5.123176574707031, + "loss": 0.0522, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -4.396726131439209, + "rewards/margins": 0.7264498472213745, + "rewards/rejected": -5.123176574707031, + "sft_loss": 4.128332614898682, + "step": 4400 + }, + { + "epoch": 2.3549088476333835, + "eval_logits/chosen": 0.11306402832269669, + "eval_logits/rejected": 0.2139400690793991, + "eval_logps/chosen": -4.401327133178711, + "eval_logps/rejected": -5.163181781768799, + "eval_loss": 0.049954961985349655, + "eval_rewards/accuracies": 0.6869435906410217, + "eval_rewards/chosen": -4.401327133178711, + "eval_rewards/margins": 0.7618544697761536, + "eval_rewards/rejected": -5.163181781768799, + "eval_runtime": 44.8695, + "eval_samples_per_second": 29.976, + "eval_sft_loss": 3.981999635696411, + "eval_steps_per_second": 7.511, + "step": 4400 + }, + { + "epoch": 2.357584880414785, + "grad_norm": 0.5317123135345402, + "learning_rate": 1.3311108361913015e-07, + "logits/chosen": -0.4011858105659485, + "logits/rejected": -0.4122949242591858, + "logps/chosen": -4.417490005493164, + "logps/rejected": -5.078069686889648, + "loss": 0.0514, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -4.417490005493164, + "rewards/margins": 0.6605796813964844, + "rewards/rejected": -5.078069686889648, + "sft_loss": 4.199349403381348, + "step": 4405 + }, + { + "epoch": 2.3602609131961865, + "grad_norm": 0.4186884267154328, + "learning_rate": 1.3205478209251874e-07, + "logits/chosen": -0.34007394313812256, + "logits/rejected": -0.25113579630851746, + "logps/chosen": -4.509642124176025, + "logps/rejected": -5.317172050476074, + "loss": 0.0512, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -4.509642124176025, + "rewards/margins": 0.8075307607650757, + "rewards/rejected": -5.317172050476074, + "sft_loss": 4.248913764953613, + "step": 4410 + }, + { + "epoch": 2.362936945977588, + "grad_norm": 0.6066617953485327, + "learning_rate": 1.310020503720254e-07, + "logits/chosen": -0.394034206867218, + "logits/rejected": -0.20599737763404846, + "logps/chosen": -4.279552459716797, + "logps/rejected": -5.119963645935059, + "loss": 0.0503, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -4.279552459716797, + "rewards/margins": 0.8404117822647095, + "rewards/rejected": -5.119963645935059, + "sft_loss": 3.9960410594940186, + "step": 4415 + }, + { + "epoch": 2.36561297875899, + "grad_norm": 0.43807345354435145, + "learning_rate": 1.2995289867125752e-07, + "logits/chosen": -0.392520010471344, + "logits/rejected": -0.2974608838558197, + "logps/chosen": -4.408247470855713, + "logps/rejected": -5.05702543258667, + "loss": 0.0511, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -4.408247470855713, + "rewards/margins": 0.6487780809402466, + "rewards/rejected": -5.05702543258667, + "sft_loss": 4.111867427825928, + "step": 4420 + }, + { + "epoch": 2.368289011540391, + "grad_norm": 0.3919286648470138, + "learning_rate": 1.2890733716908986e-07, + "logits/chosen": -0.3907695710659027, + "logits/rejected": -0.32728058099746704, + "logps/chosen": -4.114865303039551, + "logps/rejected": -4.888617515563965, + "loss": 0.0516, + "rewards/accuracies": 0.71875, + "rewards/chosen": -4.114865303039551, + "rewards/margins": 0.7737522125244141, + "rewards/rejected": -4.888617515563965, + "sft_loss": 3.919823408126831, + "step": 4425 + }, + { + "epoch": 2.370965044321793, + "grad_norm": 0.42910296321292596, + "learning_rate": 1.2786537600956454e-07, + "logits/chosen": -0.4506562352180481, + "logits/rejected": -0.2691783607006073, + "logps/chosen": -4.15285587310791, + "logps/rejected": -5.211766719818115, + "loss": 0.0474, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -4.15285587310791, + "rewards/margins": 1.0589115619659424, + "rewards/rejected": -5.211766719818115, + "sft_loss": 3.842996120452881, + "step": 4430 + }, + { + "epoch": 2.3736410771031946, + "grad_norm": 0.5039120295826071, + "learning_rate": 1.268270253017933e-07, + "logits/chosen": -0.42511478066444397, + "logits/rejected": -0.2042577713727951, + "logps/chosen": -4.339873790740967, + "logps/rejected": -5.093599319458008, + "loss": 0.0515, + "rewards/accuracies": 0.71875, + "rewards/chosen": -4.339873790740967, + "rewards/margins": 0.7537254095077515, + "rewards/rejected": -5.093599319458008, + "sft_loss": 4.0721588134765625, + "step": 4435 + }, + { + "epoch": 2.376317109884596, + "grad_norm": 0.4743203519633735, + "learning_rate": 1.257922951198591e-07, + "logits/chosen": -0.5358638763427734, + "logits/rejected": -0.219042107462883, + "logps/chosen": -4.194911956787109, + "logps/rejected": -5.071628570556641, + "loss": 0.0496, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.194911956787109, + "rewards/margins": 0.8767167925834656, + "rewards/rejected": -5.071628570556641, + "sft_loss": 3.974562168121338, + "step": 4440 + }, + { + "epoch": 2.3789931426659976, + "grad_norm": 0.5337490751429477, + "learning_rate": 1.24761195502719e-07, + "logits/chosen": -0.43740472197532654, + "logits/rejected": -0.1700814664363861, + "logps/chosen": -4.525032043457031, + "logps/rejected": -5.163545608520508, + "loss": 0.0503, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -4.525032043457031, + "rewards/margins": 0.6385140419006348, + "rewards/rejected": -5.163545608520508, + "sft_loss": 4.158745765686035, + "step": 4445 + }, + { + "epoch": 2.3816691754473993, + "grad_norm": 0.6497343103889583, + "learning_rate": 1.2373373645410573e-07, + "logits/chosen": -0.3816669285297394, + "logits/rejected": -0.23741519451141357, + "logps/chosen": -4.496787071228027, + "logps/rejected": -5.277434349060059, + "loss": 0.0508, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -4.496787071228027, + "rewards/margins": 0.7806466221809387, + "rewards/rejected": -5.277434349060059, + "sft_loss": 4.196100234985352, + "step": 4450 + }, + { + "epoch": 2.384345208228801, + "grad_norm": 0.5955899070785665, + "learning_rate": 1.2270992794243175e-07, + "logits/chosen": -0.4977712631225586, + "logits/rejected": -0.3810668885707855, + "logps/chosen": -4.355957508087158, + "logps/rejected": -5.177579879760742, + "loss": 0.05, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -4.355957508087158, + "rewards/margins": 0.8216217756271362, + "rewards/rejected": -5.177579879760742, + "sft_loss": 4.06763219833374, + "step": 4455 + }, + { + "epoch": 2.3870212410102023, + "grad_norm": 0.7554544600132199, + "learning_rate": 1.2168977990069147e-07, + "logits/chosen": -0.5054645538330078, + "logits/rejected": -0.2624374032020569, + "logps/chosen": -4.189089298248291, + "logps/rejected": -5.132569789886475, + "loss": 0.0507, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -4.189089298248291, + "rewards/margins": 0.9434806108474731, + "rewards/rejected": -5.132569789886475, + "sft_loss": 3.9219775199890137, + "step": 4460 + }, + { + "epoch": 2.389697273791604, + "grad_norm": 0.4472598222234297, + "learning_rate": 1.206733022263659e-07, + "logits/chosen": -0.45849332213401794, + "logits/rejected": -0.2598671615123749, + "logps/chosen": -4.236002445220947, + "logps/rejected": -5.057923316955566, + "loss": 0.0493, + "rewards/accuracies": 0.71875, + "rewards/chosen": -4.236002445220947, + "rewards/margins": 0.8219209909439087, + "rewards/rejected": -5.057923316955566, + "sft_loss": 3.9211764335632324, + "step": 4465 + }, + { + "epoch": 2.3923733065730053, + "grad_norm": 0.5938477095529445, + "learning_rate": 1.1966050478132572e-07, + "logits/chosen": -0.3556726574897766, + "logits/rejected": -0.27085644006729126, + "logps/chosen": -4.353777885437012, + "logps/rejected": -5.105501651763916, + "loss": 0.0513, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -4.353777885437012, + "rewards/margins": 0.751723051071167, + "rewards/rejected": -5.105501651763916, + "sft_loss": 4.091824531555176, + "step": 4470 + }, + { + "epoch": 2.395049339354407, + "grad_norm": 0.46956475804542436, + "learning_rate": 1.1865139739173635e-07, + "logits/chosen": -0.4694312512874603, + "logits/rejected": -0.23797054588794708, + "logps/chosen": -4.494842529296875, + "logps/rejected": -5.201089382171631, + "loss": 0.0504, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -4.494842529296875, + "rewards/margins": 0.7062473297119141, + "rewards/rejected": -5.201089382171631, + "sft_loss": 4.138050079345703, + "step": 4475 + }, + { + "epoch": 2.3977253721358087, + "grad_norm": 0.46317836970348397, + "learning_rate": 1.1764598984796187e-07, + "logits/chosen": -0.5320299863815308, + "logits/rejected": -0.3951790928840637, + "logps/chosen": -4.233944416046143, + "logps/rejected": -5.062338829040527, + "loss": 0.0491, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -4.233944416046143, + "rewards/margins": 0.8283944129943848, + "rewards/rejected": -5.062338829040527, + "sft_loss": 3.9799435138702393, + "step": 4480 + }, + { + "epoch": 2.4004014049172104, + "grad_norm": 0.500716513352662, + "learning_rate": 1.1664429190447095e-07, + "logits/chosen": -0.36238226294517517, + "logits/rejected": -0.28217214345932007, + "logps/chosen": -4.403859615325928, + "logps/rejected": -5.291438102722168, + "loss": 0.0505, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -4.403859615325928, + "rewards/margins": 0.8875784873962402, + "rewards/rejected": -5.291438102722168, + "sft_loss": 4.151859760284424, + "step": 4485 + }, + { + "epoch": 2.4030774376986117, + "grad_norm": 0.4413271451808516, + "learning_rate": 1.1564631327974122e-07, + "logits/chosen": -0.46627339720726013, + "logits/rejected": -0.21761274337768555, + "logps/chosen": -4.465592384338379, + "logps/rejected": -5.2193803787231445, + "loss": 0.0517, + "rewards/accuracies": 0.71875, + "rewards/chosen": -4.465592384338379, + "rewards/margins": 0.7537881135940552, + "rewards/rejected": -5.2193803787231445, + "sft_loss": 4.165318965911865, + "step": 4490 + }, + { + "epoch": 2.4057534704800134, + "grad_norm": 0.5698078550422322, + "learning_rate": 1.1465206365616587e-07, + "logits/chosen": -0.5476104021072388, + "logits/rejected": -0.30981510877609253, + "logps/chosen": -4.382649898529053, + "logps/rejected": -5.232707500457764, + "loss": 0.0497, + "rewards/accuracies": 0.71875, + "rewards/chosen": -4.382649898529053, + "rewards/margins": 0.8500572443008423, + "rewards/rejected": -5.232707500457764, + "sft_loss": 4.069231033325195, + "step": 4495 + }, + { + "epoch": 2.408429503261415, + "grad_norm": 0.7446967723465798, + "learning_rate": 1.1366155267995887e-07, + "logits/chosen": -0.36129865050315857, + "logits/rejected": -0.34973570704460144, + "logps/chosen": -4.401464939117432, + "logps/rejected": -5.0461273193359375, + "loss": 0.051, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -4.401464939117432, + "rewards/margins": 0.6446620225906372, + "rewards/rejected": -5.0461273193359375, + "sft_loss": 4.061834812164307, + "step": 4500 + }, + { + "epoch": 2.4111055360428164, + "grad_norm": 0.44889912577423824, + "learning_rate": 1.1267478996106228e-07, + "logits/chosen": -0.4160916805267334, + "logits/rejected": -0.18251463770866394, + "logps/chosen": -4.292811393737793, + "logps/rejected": -5.121218681335449, + "loss": 0.0511, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -4.292811393737793, + "rewards/margins": 0.8284076452255249, + "rewards/rejected": -5.121218681335449, + "sft_loss": 3.972756862640381, + "step": 4505 + }, + { + "epoch": 2.413781568824218, + "grad_norm": 0.5480919233036169, + "learning_rate": 1.116917850730521e-07, + "logits/chosen": -0.4768191874027252, + "logits/rejected": -0.3089585602283478, + "logps/chosen": -4.455610752105713, + "logps/rejected": -5.065629005432129, + "loss": 0.0522, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -4.455610752105713, + "rewards/margins": 0.6100180745124817, + "rewards/rejected": -5.065629005432129, + "sft_loss": 4.14192533493042, + "step": 4510 + }, + { + "epoch": 2.41645760160562, + "grad_norm": 0.6830158913780228, + "learning_rate": 1.1071254755304637e-07, + "logits/chosen": -0.4352652132511139, + "logits/rejected": -0.38772186636924744, + "logps/chosen": -4.220919609069824, + "logps/rejected": -5.060526371002197, + "loss": 0.0481, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -4.220919609069824, + "rewards/margins": 0.8396071195602417, + "rewards/rejected": -5.060526371002197, + "sft_loss": 3.8675715923309326, + "step": 4515 + }, + { + "epoch": 2.419133634387021, + "grad_norm": 0.5852249757949145, + "learning_rate": 1.0973708690161143e-07, + "logits/chosen": -0.45020991563796997, + "logits/rejected": -0.32614919543266296, + "logps/chosen": -4.4303412437438965, + "logps/rejected": -5.190096378326416, + "loss": 0.0503, + "rewards/accuracies": 0.71875, + "rewards/chosen": -4.4303412437438965, + "rewards/margins": 0.7597540616989136, + "rewards/rejected": -5.190096378326416, + "sft_loss": 4.109742164611816, + "step": 4520 + }, + { + "epoch": 2.421809667168423, + "grad_norm": 0.4980895649611331, + "learning_rate": 1.0876541258267119e-07, + "logits/chosen": -0.4938434660434723, + "logits/rejected": -0.2938065826892853, + "logps/chosen": -4.283869743347168, + "logps/rejected": -5.195856094360352, + "loss": 0.0492, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -4.283869743347168, + "rewards/margins": 0.9119867086410522, + "rewards/rejected": -5.195856094360352, + "sft_loss": 4.028626441955566, + "step": 4525 + }, + { + "epoch": 2.4244856999498245, + "grad_norm": 0.9526004589440363, + "learning_rate": 1.0779753402341379e-07, + "logits/chosen": -0.5190502405166626, + "logits/rejected": -0.43536219000816345, + "logps/chosen": -4.259387493133545, + "logps/rejected": -5.101330280303955, + "loss": 0.0513, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -4.259387493133545, + "rewards/margins": 0.8419429659843445, + "rewards/rejected": -5.101330280303955, + "sft_loss": 4.020884037017822, + "step": 4530 + }, + { + "epoch": 2.427161732731226, + "grad_norm": 0.595068276479099, + "learning_rate": 1.0683346061420157e-07, + "logits/chosen": -0.36445021629333496, + "logits/rejected": -0.2759942412376404, + "logps/chosen": -4.400937080383301, + "logps/rejected": -5.269299507141113, + "loss": 0.0515, + "rewards/accuracies": 0.6875, + "rewards/chosen": -4.400937080383301, + "rewards/margins": 0.8683616518974304, + "rewards/rejected": -5.269299507141113, + "sft_loss": 4.176723957061768, + "step": 4535 + }, + { + "epoch": 2.4298377655126275, + "grad_norm": 0.45636661581557164, + "learning_rate": 1.0587320170847874e-07, + "logits/chosen": -0.4203532636165619, + "logits/rejected": -0.3165586590766907, + "logps/chosen": -4.449545383453369, + "logps/rejected": -5.1142096519470215, + "loss": 0.0524, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -4.449545383453369, + "rewards/margins": 0.6646645069122314, + "rewards/rejected": -5.1142096519470215, + "sft_loss": 4.189427375793457, + "step": 4540 + }, + { + "epoch": 2.4325137982940293, + "grad_norm": 0.37891154011258427, + "learning_rate": 1.0491676662268156e-07, + "logits/chosen": -0.3171940743923187, + "logits/rejected": -0.21769234538078308, + "logps/chosen": -4.44630241394043, + "logps/rejected": -5.259999752044678, + "loss": 0.0498, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -4.44630241394043, + "rewards/margins": 0.8136976957321167, + "rewards/rejected": -5.259999752044678, + "sft_loss": 4.108068943023682, + "step": 4545 + }, + { + "epoch": 2.4351898310754305, + "grad_norm": 0.6043546626443211, + "learning_rate": 1.0396416463614732e-07, + "logits/chosen": -0.4760221838951111, + "logits/rejected": -0.37830227613449097, + "logps/chosen": -4.349831581115723, + "logps/rejected": -5.125363826751709, + "loss": 0.0512, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -4.349831581115723, + "rewards/margins": 0.7755329012870789, + "rewards/rejected": -5.125363826751709, + "sft_loss": 4.081582069396973, + "step": 4550 + }, + { + "epoch": 2.4378658638568322, + "grad_norm": 0.8255678355187496, + "learning_rate": 1.0301540499102479e-07, + "logits/chosen": -0.41388097405433655, + "logits/rejected": -0.33308395743370056, + "logps/chosen": -4.39818811416626, + "logps/rejected": -4.987654209136963, + "loss": 0.0524, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -4.39818811416626, + "rewards/margins": 0.5894662141799927, + "rewards/rejected": -4.987654209136963, + "sft_loss": 4.104317665100098, + "step": 4555 + }, + { + "epoch": 2.440541896638234, + "grad_norm": 0.5862689384145306, + "learning_rate": 1.0207049689218405e-07, + "logits/chosen": -0.49319133162498474, + "logits/rejected": -0.22184018790721893, + "logps/chosen": -4.365403175354004, + "logps/rejected": -5.162243366241455, + "loss": 0.0513, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -4.365403175354004, + "rewards/margins": 0.7968395352363586, + "rewards/rejected": -5.162243366241455, + "sft_loss": 4.030394554138184, + "step": 4560 + }, + { + "epoch": 2.4432179294196352, + "grad_norm": 0.5586358557159208, + "learning_rate": 1.0112944950712782e-07, + "logits/chosen": -0.41878741979599, + "logits/rejected": -0.3136092722415924, + "logps/chosen": -4.293999671936035, + "logps/rejected": -5.158097743988037, + "loss": 0.049, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -4.293999671936035, + "rewards/margins": 0.8640983700752258, + "rewards/rejected": -5.158097743988037, + "sft_loss": 3.9552371501922607, + "step": 4565 + }, + { + "epoch": 2.445893962201037, + "grad_norm": 0.6768516428300098, + "learning_rate": 1.0019227196590174e-07, + "logits/chosen": -0.4009891152381897, + "logits/rejected": -0.22887440025806427, + "logps/chosen": -4.390179634094238, + "logps/rejected": -5.233609199523926, + "loss": 0.0508, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -4.390179634094238, + "rewards/margins": 0.8434289693832397, + "rewards/rejected": -5.233609199523926, + "sft_loss": 4.147332191467285, + "step": 4570 + }, + { + "epoch": 2.4485699949824387, + "grad_norm": 0.39904184672015913, + "learning_rate": 9.925897336100664e-08, + "logits/chosen": -0.33844825625419617, + "logits/rejected": -0.28487923741340637, + "logps/chosen": -4.381079196929932, + "logps/rejected": -5.2011590003967285, + "loss": 0.0511, + "rewards/accuracies": 0.65625, + "rewards/chosen": -4.381079196929932, + "rewards/margins": 0.8200796246528625, + "rewards/rejected": -5.2011590003967285, + "sft_loss": 4.086765766143799, + "step": 4575 + }, + { + "epoch": 2.45124602776384, + "grad_norm": 0.6386156784622389, + "learning_rate": 9.832956274730946e-08, + "logits/chosen": -0.3708297312259674, + "logits/rejected": -0.315926194190979, + "logps/chosen": -4.607503414154053, + "logps/rejected": -5.189393520355225, + "loss": 0.0515, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -4.607503414154053, + "rewards/margins": 0.5818904638290405, + "rewards/rejected": -5.189393520355225, + "sft_loss": 4.188039302825928, + "step": 4580 + }, + { + "epoch": 2.4539220605452416, + "grad_norm": 0.7690041418422596, + "learning_rate": 9.740404914195633e-08, + "logits/chosen": -0.3973682224750519, + "logits/rejected": -0.2335738241672516, + "logps/chosen": -4.327630519866943, + "logps/rejected": -5.160770416259766, + "loss": 0.05, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -4.327630519866943, + "rewards/margins": 0.8331397771835327, + "rewards/rejected": -5.160770416259766, + "sft_loss": 4.095370769500732, + "step": 4585 + }, + { + "epoch": 2.4565980933266434, + "grad_norm": 0.7700375694598965, + "learning_rate": 9.648244152428392e-08, + "logits/chosen": -0.4294312596321106, + "logits/rejected": -0.28892675042152405, + "logps/chosen": -4.411342620849609, + "logps/rejected": -5.047314167022705, + "loss": 0.0523, + "rewards/accuracies": 0.65625, + "rewards/chosen": -4.411342620849609, + "rewards/margins": 0.6359715461730957, + "rewards/rejected": -5.047314167022705, + "sft_loss": 4.160157203674316, + "step": 4590 + }, + { + "epoch": 2.4592741261080446, + "grad_norm": 0.40676704281363285, + "learning_rate": 9.556474883573379e-08, + "logits/chosen": -0.4509078562259674, + "logits/rejected": -0.3071804940700531, + "logps/chosen": -4.243309497833252, + "logps/rejected": -5.143277168273926, + "loss": 0.0508, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -4.243309497833252, + "rewards/margins": 0.899968147277832, + "rewards/rejected": -5.143277168273926, + "sft_loss": 3.960758924484253, + "step": 4595 + }, + { + "epoch": 2.4619501588894463, + "grad_norm": 0.598721559427233, + "learning_rate": 9.465097997976412e-08, + "logits/chosen": -0.4274400770664215, + "logits/rejected": -0.17729714512825012, + "logps/chosen": -4.204309940338135, + "logps/rejected": -5.192355155944824, + "loss": 0.0494, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.204309940338135, + "rewards/margins": 0.9880453944206238, + "rewards/rejected": -5.192355155944824, + "sft_loss": 3.941906690597534, + "step": 4600 + }, + { + "epoch": 2.464626191670848, + "grad_norm": 0.7290877157371126, + "learning_rate": 9.374114382176457e-08, + "logits/chosen": -0.42297524213790894, + "logits/rejected": -0.22855360805988312, + "logps/chosen": -4.441008567810059, + "logps/rejected": -5.380014419555664, + "loss": 0.0495, + "rewards/accuracies": 0.71875, + "rewards/chosen": -4.441008567810059, + "rewards/margins": 0.9390062093734741, + "rewards/rejected": -5.380014419555664, + "sft_loss": 4.100342273712158, + "step": 4605 + }, + { + "epoch": 2.46730222445225, + "grad_norm": 0.5176782637957663, + "learning_rate": 9.283524918896945e-08, + "logits/chosen": -0.46048182249069214, + "logits/rejected": -0.2981131970882416, + "logps/chosen": -4.4200263023376465, + "logps/rejected": -5.22593355178833, + "loss": 0.0509, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -4.4200263023376465, + "rewards/margins": 0.8059074282646179, + "rewards/rejected": -5.22593355178833, + "sft_loss": 4.118147850036621, + "step": 4610 + }, + { + "epoch": 2.469978257233651, + "grad_norm": 0.5672219301450849, + "learning_rate": 9.193330487037232e-08, + "logits/chosen": -0.4102095067501068, + "logits/rejected": -0.22788353264331818, + "logps/chosen": -4.337491989135742, + "logps/rejected": -5.260950088500977, + "loss": 0.0507, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.337491989135742, + "rewards/margins": 0.9234585762023926, + "rewards/rejected": -5.260950088500977, + "sft_loss": 4.1589765548706055, + "step": 4615 + }, + { + "epoch": 2.4726542900150528, + "grad_norm": 0.4917175367313671, + "learning_rate": 9.103531961664118e-08, + "logits/chosen": -0.39620310068130493, + "logits/rejected": -0.20798726379871368, + "logps/chosen": -4.377896308898926, + "logps/rejected": -5.2211503982543945, + "loss": 0.0491, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -4.377896308898926, + "rewards/margins": 0.8432537317276001, + "rewards/rejected": -5.2211503982543945, + "sft_loss": 4.037580966949463, + "step": 4620 + }, + { + "epoch": 2.475330322796454, + "grad_norm": 0.5310625596499312, + "learning_rate": 9.014130214003269e-08, + "logits/chosen": -0.4279257357120514, + "logits/rejected": -0.46391788125038147, + "logps/chosen": -4.196773052215576, + "logps/rejected": -5.05009651184082, + "loss": 0.0496, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -4.196773052215576, + "rewards/margins": 0.8533236384391785, + "rewards/rejected": -5.05009651184082, + "sft_loss": 3.8642678260803223, + "step": 4625 + }, + { + "epoch": 2.4780063555778558, + "grad_norm": 0.4117856488441471, + "learning_rate": 8.925126111430848e-08, + "logits/chosen": -0.2937605381011963, + "logits/rejected": -0.20567739009857178, + "logps/chosen": -4.333211421966553, + "logps/rejected": -5.020572662353516, + "loss": 0.0507, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -4.333211421966553, + "rewards/margins": 0.687361478805542, + "rewards/rejected": -5.020572662353516, + "sft_loss": 3.9512009620666504, + "step": 4630 + }, + { + "epoch": 2.4806823883592575, + "grad_norm": 0.5959908535076706, + "learning_rate": 8.83652051746504e-08, + "logits/chosen": -0.2766355574131012, + "logits/rejected": -0.09923405945301056, + "logps/chosen": -4.339729309082031, + "logps/rejected": -5.314494609832764, + "loss": 0.0499, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -4.339729309082031, + "rewards/margins": 0.9747658967971802, + "rewards/rejected": -5.314494609832764, + "sft_loss": 4.076969623565674, + "step": 4635 + }, + { + "epoch": 2.483358421140659, + "grad_norm": 0.42216745478121104, + "learning_rate": 8.748314291757696e-08, + "logits/chosen": -0.32927799224853516, + "logits/rejected": -0.20864331722259521, + "logps/chosen": -4.405019283294678, + "logps/rejected": -5.173530101776123, + "loss": 0.0512, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -4.405019283294678, + "rewards/margins": 0.7685114145278931, + "rewards/rejected": -5.173530101776123, + "sft_loss": 4.068688869476318, + "step": 4640 + }, + { + "epoch": 2.4860344539220605, + "grad_norm": 0.5712246307382178, + "learning_rate": 8.660508290086032e-08, + "logits/chosen": -0.37075644731521606, + "logits/rejected": -0.20269623398780823, + "logps/chosen": -4.265493869781494, + "logps/rejected": -5.115437030792236, + "loss": 0.0501, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -4.265493869781494, + "rewards/margins": 0.8499435186386108, + "rewards/rejected": -5.115437030792236, + "sft_loss": 4.042442321777344, + "step": 4645 + }, + { + "epoch": 2.488710486703462, + "grad_norm": 0.6427814813306205, + "learning_rate": 8.573103364344231e-08, + "logits/chosen": -0.44174280762672424, + "logits/rejected": -0.18351683020591736, + "logps/chosen": -4.300289154052734, + "logps/rejected": -5.178165912628174, + "loss": 0.0495, + "rewards/accuracies": 0.71875, + "rewards/chosen": -4.300289154052734, + "rewards/margins": 0.8778765797615051, + "rewards/rejected": -5.178165912628174, + "sft_loss": 3.9347262382507324, + "step": 4650 + }, + { + "epoch": 2.4913865194848634, + "grad_norm": 0.463317227859052, + "learning_rate": 8.486100362535292e-08, + "logits/chosen": -0.4231947362422943, + "logits/rejected": -0.255962073802948, + "logps/chosen": -4.436898708343506, + "logps/rejected": -5.069366931915283, + "loss": 0.051, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -4.436898708343506, + "rewards/margins": 0.6324674487113953, + "rewards/rejected": -5.069366931915283, + "sft_loss": 4.138705253601074, + "step": 4655 + }, + { + "epoch": 2.494062552266265, + "grad_norm": 0.5303310864210551, + "learning_rate": 8.399500128762693e-08, + "logits/chosen": -0.38676175475120544, + "logits/rejected": -0.2671288549900055, + "logps/chosen": -4.541159152984619, + "logps/rejected": -5.180571556091309, + "loss": 0.0506, + "rewards/accuracies": 0.6875, + "rewards/chosen": -4.541159152984619, + "rewards/margins": 0.639412522315979, + "rewards/rejected": -5.180571556091309, + "sft_loss": 4.238387107849121, + "step": 4660 + }, + { + "epoch": 2.496738585047667, + "grad_norm": 0.49009820184561415, + "learning_rate": 8.313303503222313e-08, + "logits/chosen": -0.3614255487918854, + "logits/rejected": -0.30020448565483093, + "logps/chosen": -4.45319128036499, + "logps/rejected": -5.10440731048584, + "loss": 0.0512, + "rewards/accuracies": 0.65625, + "rewards/chosen": -4.45319128036499, + "rewards/margins": 0.6512158513069153, + "rewards/rejected": -5.10440731048584, + "sft_loss": 4.143372535705566, + "step": 4665 + }, + { + "epoch": 2.4994146178290686, + "grad_norm": 0.4592325156306962, + "learning_rate": 8.227511322194164e-08, + "logits/chosen": -0.3967505991458893, + "logits/rejected": -0.2671595811843872, + "logps/chosen": -4.2683258056640625, + "logps/rejected": -4.9582109451293945, + "loss": 0.0523, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -4.2683258056640625, + "rewards/margins": 0.6898849010467529, + "rewards/rejected": -4.9582109451293945, + "sft_loss": 3.975896120071411, + "step": 4670 + }, + { + "epoch": 2.50209065061047, + "grad_norm": 0.4543301716930907, + "learning_rate": 8.142124418034385e-08, + "logits/chosen": -0.3207847774028778, + "logits/rejected": -0.1029786691069603, + "logps/chosen": -4.457320213317871, + "logps/rejected": -5.187180519104004, + "loss": 0.0512, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -4.457320213317871, + "rewards/margins": 0.7298603057861328, + "rewards/rejected": -5.187180519104004, + "sft_loss": 4.13206148147583, + "step": 4675 + }, + { + "epoch": 2.5047666833918716, + "grad_norm": 0.46938640425653955, + "learning_rate": 8.057143619167073e-08, + "logits/chosen": -0.27763500809669495, + "logits/rejected": -0.14167213439941406, + "logps/chosen": -4.377208232879639, + "logps/rejected": -5.241822242736816, + "loss": 0.0506, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -4.377208232879639, + "rewards/margins": 0.8646138310432434, + "rewards/rejected": -5.241822242736816, + "sft_loss": 4.1298441886901855, + "step": 4680 + }, + { + "epoch": 2.507442716173273, + "grad_norm": 0.6623288312592525, + "learning_rate": 7.97256975007633e-08, + "logits/chosen": -0.36637941002845764, + "logits/rejected": -0.12512700259685516, + "logps/chosen": -4.084376335144043, + "logps/rejected": -5.053424835205078, + "loss": 0.0496, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.084376335144043, + "rewards/margins": 0.969048798084259, + "rewards/rejected": -5.053424835205078, + "sft_loss": 3.852780818939209, + "step": 4685 + }, + { + "epoch": 2.5101187489546746, + "grad_norm": 0.45516242129923573, + "learning_rate": 7.888403631298186e-08, + "logits/chosen": -0.270879864692688, + "logits/rejected": -0.23090717196464539, + "logps/chosen": -4.380623817443848, + "logps/rejected": -5.0403642654418945, + "loss": 0.0523, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -4.380623817443848, + "rewards/margins": 0.6597407460212708, + "rewards/rejected": -5.0403642654418945, + "sft_loss": 4.06832218170166, + "step": 4690 + }, + { + "epoch": 2.5127947817360763, + "grad_norm": 0.5497102945676162, + "learning_rate": 7.804646079412719e-08, + "logits/chosen": -0.34165748953819275, + "logits/rejected": -0.1425810605287552, + "logps/chosen": -4.432342052459717, + "logps/rejected": -5.2141642570495605, + "loss": 0.0504, + "rewards/accuracies": 0.71875, + "rewards/chosen": -4.432342052459717, + "rewards/margins": 0.7818223237991333, + "rewards/rejected": -5.2141642570495605, + "sft_loss": 4.182814121246338, + "step": 4695 + }, + { + "epoch": 2.515470814517478, + "grad_norm": 0.5186126688524669, + "learning_rate": 7.72129790703604e-08, + "logits/chosen": -0.41190558671951294, + "logits/rejected": -0.26131734251976013, + "logps/chosen": -4.429064750671387, + "logps/rejected": -5.08976936340332, + "loss": 0.0513, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -4.429064750671387, + "rewards/margins": 0.660704493522644, + "rewards/rejected": -5.08976936340332, + "sft_loss": 4.123114585876465, + "step": 4700 + }, + { + "epoch": 2.5181468472988793, + "grad_norm": 0.5294964456734004, + "learning_rate": 7.638359922812504e-08, + "logits/chosen": -0.30540773272514343, + "logits/rejected": -0.25562867522239685, + "logps/chosen": -4.42539119720459, + "logps/rejected": -5.28076171875, + "loss": 0.0491, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -4.42539119720459, + "rewards/margins": 0.855370819568634, + "rewards/rejected": -5.28076171875, + "sft_loss": 4.095969200134277, + "step": 4705 + }, + { + "epoch": 2.520822880080281, + "grad_norm": 0.5460324987819931, + "learning_rate": 7.555832931406774e-08, + "logits/chosen": -0.4230508804321289, + "logits/rejected": -0.1935521364212036, + "logps/chosen": -4.407500267028809, + "logps/rejected": -5.24190616607666, + "loss": 0.0509, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -4.407500267028809, + "rewards/margins": 0.834405243396759, + "rewards/rejected": -5.24190616607666, + "sft_loss": 4.136740684509277, + "step": 4710 + }, + { + "epoch": 2.5234989128616827, + "grad_norm": 0.4644225159425639, + "learning_rate": 7.47371773349611e-08, + "logits/chosen": -0.3028753697872162, + "logits/rejected": -0.3180554211139679, + "logps/chosen": -4.3475775718688965, + "logps/rejected": -5.271296501159668, + "loss": 0.0486, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -4.3475775718688965, + "rewards/margins": 0.9237188100814819, + "rewards/rejected": -5.271296501159668, + "sft_loss": 3.9962737560272217, + "step": 4715 + }, + { + "epoch": 2.526174945643084, + "grad_norm": 0.4587484730030635, + "learning_rate": 7.392015125762496e-08, + "logits/chosen": -0.3752499222755432, + "logits/rejected": -0.21907536685466766, + "logps/chosen": -4.364150047302246, + "logps/rejected": -5.151577472686768, + "loss": 0.0494, + "rewards/accuracies": 0.71875, + "rewards/chosen": -4.364150047302246, + "rewards/margins": 0.7874273061752319, + "rewards/rejected": -5.151577472686768, + "sft_loss": 4.01193904876709, + "step": 4720 + }, + { + "epoch": 2.5288509784244857, + "grad_norm": 0.5348042270404899, + "learning_rate": 7.310725900885018e-08, + "logits/chosen": -0.3915547728538513, + "logits/rejected": -0.35350117087364197, + "logps/chosen": -4.341032981872559, + "logps/rejected": -5.086256980895996, + "loss": 0.0526, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -4.341032981872559, + "rewards/margins": 0.74522465467453, + "rewards/rejected": -5.086256980895996, + "sft_loss": 4.098066329956055, + "step": 4725 + }, + { + "epoch": 2.5315270112058874, + "grad_norm": 0.5572563588972967, + "learning_rate": 7.229850847532076e-08, + "logits/chosen": -0.2931649088859558, + "logits/rejected": -0.10043120384216309, + "logps/chosen": -4.398486137390137, + "logps/rejected": -5.248486518859863, + "loss": 0.0496, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.398486137390137, + "rewards/margins": 0.8499997854232788, + "rewards/rejected": -5.248486518859863, + "sft_loss": 4.0921549797058105, + "step": 4730 + }, + { + "epoch": 2.5342030439872887, + "grad_norm": 0.8956166487217501, + "learning_rate": 7.149390750353779e-08, + "logits/chosen": -0.25029462575912476, + "logits/rejected": -0.37777963280677795, + "logps/chosen": -4.2648515701293945, + "logps/rejected": -4.919008255004883, + "loss": 0.0502, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -4.2648515701293945, + "rewards/margins": 0.6541560888290405, + "rewards/rejected": -4.919008255004883, + "sft_loss": 3.9281792640686035, + "step": 4735 + }, + { + "epoch": 2.5368790767686904, + "grad_norm": 0.4677871805435953, + "learning_rate": 7.069346389974374e-08, + "logits/chosen": -0.40193086862564087, + "logits/rejected": -0.22237029671669006, + "logps/chosen": -4.344948768615723, + "logps/rejected": -5.091097831726074, + "loss": 0.0515, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -4.344948768615723, + "rewards/margins": 0.7461491823196411, + "rewards/rejected": -5.091097831726074, + "sft_loss": 4.087722301483154, + "step": 4740 + }, + { + "epoch": 2.539555109550092, + "grad_norm": 0.47711087871367364, + "learning_rate": 6.989718542984563e-08, + "logits/chosen": -0.3756980299949646, + "logits/rejected": -0.34971341490745544, + "logps/chosen": -4.448171615600586, + "logps/rejected": -5.076592445373535, + "loss": 0.0516, + "rewards/accuracies": 0.6875, + "rewards/chosen": -4.448171615600586, + "rewards/margins": 0.6284207105636597, + "rewards/rejected": -5.076592445373535, + "sft_loss": 4.206480026245117, + "step": 4745 + }, + { + "epoch": 2.5422311423314934, + "grad_norm": 0.6378803019183643, + "learning_rate": 6.9105079819341e-08, + "logits/chosen": -0.31666165590286255, + "logits/rejected": -0.04280168563127518, + "logps/chosen": -4.166003227233887, + "logps/rejected": -5.370185852050781, + "loss": 0.0484, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.166003227233887, + "rewards/margins": 1.2041823863983154, + "rewards/rejected": -5.370185852050781, + "sft_loss": 3.9951655864715576, + "step": 4750 + }, + { + "epoch": 2.544907175112895, + "grad_norm": 0.4774801536345404, + "learning_rate": 6.831715475324163e-08, + "logits/chosen": -0.4547714293003082, + "logits/rejected": -0.2548532783985138, + "logps/chosen": -4.340898036956787, + "logps/rejected": -5.196542263031006, + "loss": 0.0508, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -4.340898036956787, + "rewards/margins": 0.8556438684463501, + "rewards/rejected": -5.196542263031006, + "sft_loss": 4.143154144287109, + "step": 4755 + }, + { + "epoch": 2.547583207894297, + "grad_norm": 1.1243775716679412, + "learning_rate": 6.753341787600026e-08, + "logits/chosen": -0.38904356956481934, + "logits/rejected": -0.289012610912323, + "logps/chosen": -4.332601547241211, + "logps/rejected": -5.166747570037842, + "loss": 0.0507, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -4.332601547241211, + "rewards/margins": 0.8341460227966309, + "rewards/rejected": -5.166747570037842, + "sft_loss": 4.06862735748291, + "step": 4760 + }, + { + "epoch": 2.5502592406756985, + "grad_norm": 0.5001429058189203, + "learning_rate": 6.67538767914353e-08, + "logits/chosen": -0.38545042276382446, + "logits/rejected": -0.16294018924236298, + "logps/chosen": -4.417631149291992, + "logps/rejected": -5.205663204193115, + "loss": 0.0536, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -4.417631149291992, + "rewards/margins": 0.7880316972732544, + "rewards/rejected": -5.205663204193115, + "sft_loss": 4.275336265563965, + "step": 4765 + }, + { + "epoch": 2.5529352734571, + "grad_norm": 0.7599250222037716, + "learning_rate": 6.597853906265793e-08, + "logits/chosen": -0.3082340955734253, + "logits/rejected": -0.15593746304512024, + "logps/chosen": -4.3028974533081055, + "logps/rejected": -5.224949836730957, + "loss": 0.05, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -4.3028974533081055, + "rewards/margins": 0.9220517873764038, + "rewards/rejected": -5.224949836730957, + "sft_loss": 4.00002908706665, + "step": 4770 + }, + { + "epoch": 2.5556113062385015, + "grad_norm": 0.4147075847544824, + "learning_rate": 6.5207412211998e-08, + "logits/chosen": -0.3403518795967102, + "logits/rejected": -0.2342265546321869, + "logps/chosen": -4.296928882598877, + "logps/rejected": -5.133492469787598, + "loss": 0.0512, + "rewards/accuracies": 0.6875, + "rewards/chosen": -4.296928882598877, + "rewards/margins": 0.836563766002655, + "rewards/rejected": -5.133492469787598, + "sft_loss": 3.9882049560546875, + "step": 4775 + }, + { + "epoch": 2.558287339019903, + "grad_norm": 0.4884901044848451, + "learning_rate": 6.444050372093186e-08, + "logits/chosen": -0.4230726659297943, + "logits/rejected": -0.30629175901412964, + "logps/chosen": -4.268218040466309, + "logps/rejected": -5.060492038726807, + "loss": 0.0508, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -4.268218040466309, + "rewards/margins": 0.7922734022140503, + "rewards/rejected": -5.060492038726807, + "sft_loss": 4.090802192687988, + "step": 4780 + }, + { + "epoch": 2.5609633718013045, + "grad_norm": 0.503796507703663, + "learning_rate": 6.367782103000873e-08, + "logits/chosen": -0.35504621267318726, + "logits/rejected": -0.3044523596763611, + "logps/chosen": -4.365761756896973, + "logps/rejected": -5.030825138092041, + "loss": 0.0517, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -4.365761756896973, + "rewards/margins": 0.6650637984275818, + "rewards/rejected": -5.030825138092041, + "sft_loss": 4.040749549865723, + "step": 4785 + }, + { + "epoch": 2.5636394045827062, + "grad_norm": 0.5400763074236795, + "learning_rate": 6.29193715387798e-08, + "logits/chosen": -0.4038558900356293, + "logits/rejected": -0.28878653049468994, + "logps/chosen": -4.315426826477051, + "logps/rejected": -5.185266494750977, + "loss": 0.0513, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -4.315426826477051, + "rewards/margins": 0.8698400259017944, + "rewards/rejected": -5.185266494750977, + "sft_loss": 4.082798957824707, + "step": 4790 + }, + { + "epoch": 2.566315437364108, + "grad_norm": 0.7780856726072443, + "learning_rate": 6.216516260572502e-08, + "logits/chosen": -0.3635145425796509, + "logits/rejected": -0.24427835643291473, + "logps/chosen": -4.404698848724365, + "logps/rejected": -5.149372100830078, + "loss": 0.0512, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -4.404698848724365, + "rewards/margins": 0.7446734309196472, + "rewards/rejected": -5.149372100830078, + "sft_loss": 4.210453987121582, + "step": 4795 + }, + { + "epoch": 2.568991470145509, + "grad_norm": 0.5490396914459478, + "learning_rate": 6.141520154818297e-08, + "logits/chosen": -0.359261691570282, + "logits/rejected": -0.23605218529701233, + "logps/chosen": -4.36661434173584, + "logps/rejected": -5.151679515838623, + "loss": 0.0513, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -4.36661434173584, + "rewards/margins": 0.7850648164749146, + "rewards/rejected": -5.151679515838623, + "sft_loss": 4.143317699432373, + "step": 4800 + }, + { + "epoch": 2.568991470145509, + "eval_logits/chosen": 0.07847874611616135, + "eval_logits/rejected": 0.17865531146526337, + "eval_logps/chosen": -4.370919227600098, + "eval_logps/rejected": -5.116001605987549, + "eval_loss": 0.04999160394072533, + "eval_rewards/accuracies": 0.6943620443344116, + "eval_rewards/chosen": -4.370919227600098, + "eval_rewards/margins": 0.7450823187828064, + "eval_rewards/rejected": -5.116001605987549, + "eval_runtime": 44.4974, + "eval_samples_per_second": 30.226, + "eval_sft_loss": 3.973156213760376, + "eval_steps_per_second": 7.573, + "step": 4800 + }, + { + "epoch": 2.571667502926911, + "grad_norm": 0.4366784779733865, + "learning_rate": 6.066949564227897e-08, + "logits/chosen": -0.4094223976135254, + "logits/rejected": -0.3121436834335327, + "logps/chosen": -4.305196285247803, + "logps/rejected": -5.099860191345215, + "loss": 0.0508, + "rewards/accuracies": 0.6875, + "rewards/chosen": -4.305196285247803, + "rewards/margins": 0.7946635484695435, + "rewards/rejected": -5.099860191345215, + "sft_loss": 4.0197625160217285, + "step": 4805 + }, + { + "epoch": 2.574343535708312, + "grad_norm": 0.785271503992536, + "learning_rate": 5.992805212285523e-08, + "logits/chosen": -0.3667059540748596, + "logits/rejected": -0.289445161819458, + "logps/chosen": -4.2451043128967285, + "logps/rejected": -5.16410493850708, + "loss": 0.0501, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -4.2451043128967285, + "rewards/margins": 0.9190011024475098, + "rewards/rejected": -5.16410493850708, + "sft_loss": 3.976490020751953, + "step": 4810 + }, + { + "epoch": 2.577019568489714, + "grad_norm": 0.657010538300424, + "learning_rate": 5.9190878183399684e-08, + "logits/chosen": -0.36434513330459595, + "logits/rejected": -0.198884516954422, + "logps/chosen": -4.356508731842041, + "logps/rejected": -5.266931056976318, + "loss": 0.0515, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -4.356508731842041, + "rewards/margins": 0.9104223251342773, + "rewards/rejected": -5.266931056976318, + "sft_loss": 4.159215927124023, + "step": 4815 + }, + { + "epoch": 2.5796956012711156, + "grad_norm": 0.5736893794094124, + "learning_rate": 5.845798097597748e-08, + "logits/chosen": -0.3771006464958191, + "logits/rejected": -0.2650943994522095, + "logps/chosen": -4.496792793273926, + "logps/rejected": -5.096777439117432, + "loss": 0.0523, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -4.496792793273926, + "rewards/margins": 0.5999849438667297, + "rewards/rejected": -5.096777439117432, + "sft_loss": 4.190961837768555, + "step": 4820 + }, + { + "epoch": 2.5823716340525174, + "grad_norm": 0.5535433811032741, + "learning_rate": 5.772936761116026e-08, + "logits/chosen": -0.31344643235206604, + "logits/rejected": -0.15652044117450714, + "logps/chosen": -4.29534387588501, + "logps/rejected": -5.13301944732666, + "loss": 0.0496, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -4.29534387588501, + "rewards/margins": 0.837675929069519, + "rewards/rejected": -5.13301944732666, + "sft_loss": 4.011329174041748, + "step": 4825 + }, + { + "epoch": 2.5850476668339186, + "grad_norm": 0.5464049154304894, + "learning_rate": 5.700504515795829e-08, + "logits/chosen": -0.3962657153606415, + "logits/rejected": -0.20990128815174103, + "logps/chosen": -4.331116676330566, + "logps/rejected": -5.164947509765625, + "loss": 0.0504, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -4.331116676330566, + "rewards/margins": 0.8338314890861511, + "rewards/rejected": -5.164947509765625, + "sft_loss": 4.151193618774414, + "step": 4830 + }, + { + "epoch": 2.5877236996153203, + "grad_norm": 0.5312457533985412, + "learning_rate": 5.628502064375101e-08, + "logits/chosen": -0.44756752252578735, + "logits/rejected": -0.2316436767578125, + "logps/chosen": -4.406530857086182, + "logps/rejected": -5.2714948654174805, + "loss": 0.0499, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -4.406530857086182, + "rewards/margins": 0.8649638295173645, + "rewards/rejected": -5.2714948654174805, + "sft_loss": 4.123922824859619, + "step": 4835 + }, + { + "epoch": 2.5903997323967216, + "grad_norm": 0.6044695941107733, + "learning_rate": 5.55693010542197e-08, + "logits/chosen": -0.4090364873409271, + "logits/rejected": -0.15606291592121124, + "logps/chosen": -4.223238945007324, + "logps/rejected": -5.190573692321777, + "loss": 0.0475, + "rewards/accuracies": 0.71875, + "rewards/chosen": -4.223238945007324, + "rewards/margins": 0.9673342704772949, + "rewards/rejected": -5.190573692321777, + "sft_loss": 3.814497709274292, + "step": 4840 + }, + { + "epoch": 2.5930757651781233, + "grad_norm": 0.9832175459629173, + "learning_rate": 5.485789333327856e-08, + "logits/chosen": -0.3239671587944031, + "logits/rejected": -0.2865348160266876, + "logps/chosen": -4.409046173095703, + "logps/rejected": -5.107858657836914, + "loss": 0.0512, + "rewards/accuracies": 0.6875, + "rewards/chosen": -4.409046173095703, + "rewards/margins": 0.69881272315979, + "rewards/rejected": -5.107858657836914, + "sft_loss": 4.158252716064453, + "step": 4845 + }, + { + "epoch": 2.595751797959525, + "grad_norm": 0.5454129043795762, + "learning_rate": 5.4150804383008675e-08, + "logits/chosen": -0.5119531750679016, + "logits/rejected": -0.30409321188926697, + "logps/chosen": -4.390221118927002, + "logps/rejected": -5.251105785369873, + "loss": 0.0517, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -4.390221118927002, + "rewards/margins": 0.8608850240707397, + "rewards/rejected": -5.251105785369873, + "sft_loss": 4.141351222991943, + "step": 4850 + }, + { + "epoch": 2.5984278307409268, + "grad_norm": 0.5888971213716986, + "learning_rate": 5.344804106359002e-08, + "logits/chosen": -0.28511926531791687, + "logits/rejected": -0.11991135776042938, + "logps/chosen": -4.358060359954834, + "logps/rejected": -5.080539703369141, + "loss": 0.0517, + "rewards/accuracies": 0.6875, + "rewards/chosen": -4.358060359954834, + "rewards/margins": 0.7224793434143066, + "rewards/rejected": -5.080539703369141, + "sft_loss": 4.084187030792236, + "step": 4855 + }, + { + "epoch": 2.601103863522328, + "grad_norm": 0.7700086644895395, + "learning_rate": 5.274961019323559e-08, + "logits/chosen": -0.3371310234069824, + "logits/rejected": -0.21796874701976776, + "logps/chosen": -4.2935686111450195, + "logps/rejected": -5.074094295501709, + "loss": 0.0521, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -4.2935686111450195, + "rewards/margins": 0.7805261611938477, + "rewards/rejected": -5.074094295501709, + "sft_loss": 4.1205830574035645, + "step": 4860 + }, + { + "epoch": 2.6037798963037297, + "grad_norm": 0.49366993906433826, + "learning_rate": 5.205551854812451e-08, + "logits/chosen": -0.45760640501976013, + "logits/rejected": -0.35235825181007385, + "logps/chosen": -4.228287220001221, + "logps/rejected": -5.1750922203063965, + "loss": 0.049, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -4.228287220001221, + "rewards/margins": 0.9468050003051758, + "rewards/rejected": -5.1750922203063965, + "sft_loss": 4.011025428771973, + "step": 4865 + }, + { + "epoch": 2.606455929085131, + "grad_norm": 0.6999895260792679, + "learning_rate": 5.1365772862337177e-08, + "logits/chosen": -0.2858087420463562, + "logits/rejected": -0.16768726706504822, + "logps/chosen": -4.2022809982299805, + "logps/rejected": -5.1616997718811035, + "loss": 0.0494, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -4.2022809982299805, + "rewards/margins": 0.9594185948371887, + "rewards/rejected": -5.1616997718811035, + "sft_loss": 3.9046719074249268, + "step": 4870 + }, + { + "epoch": 2.6091319618665327, + "grad_norm": 0.731734899158958, + "learning_rate": 5.068037982778905e-08, + "logits/chosen": -0.2684374451637268, + "logits/rejected": -0.16356366872787476, + "logps/chosen": -4.374823093414307, + "logps/rejected": -5.219290733337402, + "loss": 0.0511, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -4.374823093414307, + "rewards/margins": 0.8444677591323853, + "rewards/rejected": -5.219290733337402, + "sft_loss": 4.082675457000732, + "step": 4875 + }, + { + "epoch": 2.6118079946479344, + "grad_norm": 0.6332443526161032, + "learning_rate": 4.999934609416656e-08, + "logits/chosen": -0.27860182523727417, + "logits/rejected": -0.15365949273109436, + "logps/chosen": -4.211956024169922, + "logps/rejected": -5.227756023406982, + "loss": 0.0498, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -4.211956024169922, + "rewards/margins": 1.01580011844635, + "rewards/rejected": -5.227756023406982, + "sft_loss": 3.9831924438476562, + "step": 4880 + }, + { + "epoch": 2.614484027429336, + "grad_norm": 0.5668796452945385, + "learning_rate": 4.932267826886183e-08, + "logits/chosen": -0.27431243658065796, + "logits/rejected": -0.22958464920520782, + "logps/chosen": -4.336443901062012, + "logps/rejected": -5.17434024810791, + "loss": 0.0511, + "rewards/accuracies": 0.71875, + "rewards/chosen": -4.336443901062012, + "rewards/margins": 0.8378962278366089, + "rewards/rejected": -5.17434024810791, + "sft_loss": 4.1342644691467285, + "step": 4885 + }, + { + "epoch": 2.6171600602107374, + "grad_norm": 0.5156007934403071, + "learning_rate": 4.8650382916909206e-08, + "logits/chosen": -0.4549378752708435, + "logits/rejected": -0.2466200590133667, + "logps/chosen": -4.278968811035156, + "logps/rejected": -5.1642656326293945, + "loss": 0.0507, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.278968811035156, + "rewards/margins": 0.8852967023849487, + "rewards/rejected": -5.1642656326293945, + "sft_loss": 4.013367652893066, + "step": 4890 + }, + { + "epoch": 2.619836092992139, + "grad_norm": 0.5975897826628281, + "learning_rate": 4.7982466560920976e-08, + "logits/chosen": -0.3374733030796051, + "logits/rejected": -0.2785305976867676, + "logps/chosen": -4.33203649520874, + "logps/rejected": -5.043580055236816, + "loss": 0.051, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -4.33203649520874, + "rewards/margins": 0.7115433812141418, + "rewards/rejected": -5.043580055236816, + "sft_loss": 4.101061820983887, + "step": 4895 + }, + { + "epoch": 2.622512125773541, + "grad_norm": 0.7935012047266726, + "learning_rate": 4.7318935681024685e-08, + "logits/chosen": -0.30603188276290894, + "logits/rejected": -0.1277734935283661, + "logps/chosen": -4.321929931640625, + "logps/rejected": -5.279784202575684, + "loss": 0.05, + "rewards/accuracies": 0.71875, + "rewards/chosen": -4.321929931640625, + "rewards/margins": 0.9578543901443481, + "rewards/rejected": -5.279784202575684, + "sft_loss": 4.039951801300049, + "step": 4900 + }, + { + "epoch": 2.625188158554942, + "grad_norm": 0.5670757081729894, + "learning_rate": 4.6659796714799745e-08, + "logits/chosen": -0.32785436511039734, + "logits/rejected": -0.17707887291908264, + "logps/chosen": -4.282454013824463, + "logps/rejected": -5.355355739593506, + "loss": 0.0481, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -4.282454013824463, + "rewards/margins": 1.072901964187622, + "rewards/rejected": -5.355355739593506, + "sft_loss": 4.057803630828857, + "step": 4905 + }, + { + "epoch": 2.627864191336344, + "grad_norm": 0.7615789792899859, + "learning_rate": 4.60050560572155e-08, + "logits/chosen": -0.3168491721153259, + "logits/rejected": -0.37255367636680603, + "logps/chosen": -4.4251322746276855, + "logps/rejected": -5.246760368347168, + "loss": 0.0515, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -4.4251322746276855, + "rewards/margins": 0.8216277360916138, + "rewards/rejected": -5.246760368347168, + "sft_loss": 4.182235240936279, + "step": 4910 + }, + { + "epoch": 2.6305402241177456, + "grad_norm": 0.4919973143825708, + "learning_rate": 4.535472006056834e-08, + "logits/chosen": -0.3505459725856781, + "logits/rejected": -0.1527949571609497, + "logps/chosen": -4.304749488830566, + "logps/rejected": -5.080583095550537, + "loss": 0.0504, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -4.304749488830566, + "rewards/margins": 0.7758339643478394, + "rewards/rejected": -5.080583095550537, + "sft_loss": 4.022045612335205, + "step": 4915 + }, + { + "epoch": 2.6332162568991473, + "grad_norm": 0.677617849418434, + "learning_rate": 4.470879503442132e-08, + "logits/chosen": -0.37543028593063354, + "logits/rejected": -0.2667499780654907, + "logps/chosen": -4.324775218963623, + "logps/rejected": -5.045456409454346, + "loss": 0.0503, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -4.324775218963623, + "rewards/margins": 0.7206807136535645, + "rewards/rejected": -5.045456409454346, + "sft_loss": 4.059634208679199, + "step": 4920 + }, + { + "epoch": 2.6358922896805486, + "grad_norm": 0.573609389327282, + "learning_rate": 4.406728724554154e-08, + "logits/chosen": -0.5110586881637573, + "logits/rejected": -0.22267436981201172, + "logps/chosen": -4.291056156158447, + "logps/rejected": -5.223393440246582, + "loss": 0.0495, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.291056156158447, + "rewards/margins": 0.9323371648788452, + "rewards/rejected": -5.223393440246582, + "sft_loss": 4.041526794433594, + "step": 4925 + }, + { + "epoch": 2.6385683224619503, + "grad_norm": 0.5475086704999267, + "learning_rate": 4.3430202917840664e-08, + "logits/chosen": -0.3410526216030121, + "logits/rejected": -0.11093126237392426, + "logps/chosen": -4.463144779205322, + "logps/rejected": -5.343132019042969, + "loss": 0.0511, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -4.463144779205322, + "rewards/margins": 0.8799868822097778, + "rewards/rejected": -5.343132019042969, + "sft_loss": 4.178741455078125, + "step": 4930 + }, + { + "epoch": 2.6412443552433515, + "grad_norm": 0.48781828384348774, + "learning_rate": 4.279754823231346e-08, + "logits/chosen": -0.42462921142578125, + "logits/rejected": -0.2016540765762329, + "logps/chosen": -4.22618293762207, + "logps/rejected": -5.108307838439941, + "loss": 0.0498, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -4.22618293762207, + "rewards/margins": 0.8821243047714233, + "rewards/rejected": -5.108307838439941, + "sft_loss": 3.8731770515441895, + "step": 4935 + }, + { + "epoch": 2.6439203880247533, + "grad_norm": 0.5380379219048013, + "learning_rate": 4.216932932697859e-08, + "logits/chosen": -0.4162037968635559, + "logits/rejected": -0.32498764991760254, + "logps/chosen": -4.193484306335449, + "logps/rejected": -4.916443824768066, + "loss": 0.0501, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -4.193484306335449, + "rewards/margins": 0.7229597568511963, + "rewards/rejected": -4.916443824768066, + "sft_loss": 3.9398722648620605, + "step": 4940 + }, + { + "epoch": 2.646596420806155, + "grad_norm": 0.553196750733812, + "learning_rate": 4.154555229681844e-08, + "logits/chosen": -0.3653312027454376, + "logits/rejected": -0.14281252026557922, + "logps/chosen": -4.307368278503418, + "logps/rejected": -5.142277240753174, + "loss": 0.0498, + "rewards/accuracies": 0.71875, + "rewards/chosen": -4.307368278503418, + "rewards/margins": 0.8349090814590454, + "rewards/rejected": -5.142277240753174, + "sft_loss": 3.9290771484375, + "step": 4945 + }, + { + "epoch": 2.6492724535875567, + "grad_norm": 0.6956238773053149, + "learning_rate": 4.092622319372069e-08, + "logits/chosen": -0.32470768690109253, + "logits/rejected": -0.12303704023361206, + "logps/chosen": -4.257458686828613, + "logps/rejected": -5.020730018615723, + "loss": 0.0512, + "rewards/accuracies": 0.71875, + "rewards/chosen": -4.257458686828613, + "rewards/margins": 0.763271689414978, + "rewards/rejected": -5.020730018615723, + "sft_loss": 3.9516940116882324, + "step": 4950 + }, + { + "epoch": 2.651948486368958, + "grad_norm": 0.5982140764367317, + "learning_rate": 4.031134802641889e-08, + "logits/chosen": -0.3422200381755829, + "logits/rejected": -0.31732505559921265, + "logps/chosen": -4.543044567108154, + "logps/rejected": -5.1809258460998535, + "loss": 0.0505, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -4.543044567108154, + "rewards/margins": 0.6378811001777649, + "rewards/rejected": -5.1809258460998535, + "sft_loss": 4.218386650085449, + "step": 4955 + }, + { + "epoch": 2.6546245191503597, + "grad_norm": 0.5497994656475738, + "learning_rate": 3.970093276043468e-08, + "logits/chosen": -0.3104451298713684, + "logits/rejected": -0.2130269706249237, + "logps/chosen": -4.269969940185547, + "logps/rejected": -5.149529457092285, + "loss": 0.0509, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -4.269969940185547, + "rewards/margins": 0.8795592188835144, + "rewards/rejected": -5.149529457092285, + "sft_loss": 4.051764965057373, + "step": 4960 + }, + { + "epoch": 2.657300551931761, + "grad_norm": 0.445435483211514, + "learning_rate": 3.9094983318019584e-08, + "logits/chosen": -0.4530177712440491, + "logits/rejected": -0.3011803925037384, + "logps/chosen": -4.275309085845947, + "logps/rejected": -5.169306755065918, + "loss": 0.0502, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -4.275309085845947, + "rewards/margins": 0.8939980268478394, + "rewards/rejected": -5.169306755065918, + "sft_loss": 4.088404655456543, + "step": 4965 + }, + { + "epoch": 2.6599765847131627, + "grad_norm": 0.5856523820842301, + "learning_rate": 3.849350557809789e-08, + "logits/chosen": -0.23590047657489777, + "logits/rejected": -0.20851969718933105, + "logps/chosen": -4.351941108703613, + "logps/rejected": -5.117374897003174, + "loss": 0.0489, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -4.351941108703613, + "rewards/margins": 0.7654340863227844, + "rewards/rejected": -5.117374897003174, + "sft_loss": 3.916269302368164, + "step": 4970 + }, + { + "epoch": 2.6626526174945644, + "grad_norm": 0.6735940707026237, + "learning_rate": 3.789650537620903e-08, + "logits/chosen": -0.29202502965927124, + "logits/rejected": -0.2616046965122223, + "logps/chosen": -4.410584926605225, + "logps/rejected": -5.258556365966797, + "loss": 0.0508, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -4.410584926605225, + "rewards/margins": 0.8479716181755066, + "rewards/rejected": -5.258556365966797, + "sft_loss": 4.140559196472168, + "step": 4975 + }, + { + "epoch": 2.665328650275966, + "grad_norm": 0.45364675526602016, + "learning_rate": 3.730398850445182e-08, + "logits/chosen": -0.18361981213092804, + "logits/rejected": -0.17511440813541412, + "logps/chosen": -4.39959716796875, + "logps/rejected": -5.171971321105957, + "loss": 0.0512, + "rewards/accuracies": 0.6875, + "rewards/chosen": -4.39959716796875, + "rewards/margins": 0.7723743915557861, + "rewards/rejected": -5.171971321105957, + "sft_loss": 4.043013572692871, + "step": 4980 + }, + { + "epoch": 2.6680046830573674, + "grad_norm": 0.5784270412666574, + "learning_rate": 3.671596071142735e-08, + "logits/chosen": -0.2834405303001404, + "logits/rejected": -0.0836217850446701, + "logps/chosen": -4.3537139892578125, + "logps/rejected": -5.3170366287231445, + "loss": 0.0495, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -4.3537139892578125, + "rewards/margins": 0.9633221626281738, + "rewards/rejected": -5.3170366287231445, + "sft_loss": 4.033775329589844, + "step": 4985 + }, + { + "epoch": 2.670680715838769, + "grad_norm": 0.530877020771359, + "learning_rate": 3.6132427702183996e-08, + "logits/chosen": -0.4802896976470947, + "logits/rejected": -0.29643306136131287, + "logps/chosen": -4.233144760131836, + "logps/rejected": -5.321981906890869, + "loss": 0.0488, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -4.233144760131836, + "rewards/margins": 1.088836669921875, + "rewards/rejected": -5.321981906890869, + "sft_loss": 4.009449481964111, + "step": 4990 + }, + { + "epoch": 2.6733567486201704, + "grad_norm": 0.6747135053910271, + "learning_rate": 3.555339513816147e-08, + "logits/chosen": -0.385011225938797, + "logits/rejected": -0.4125341773033142, + "logps/chosen": -4.425052642822266, + "logps/rejected": -5.143741607666016, + "loss": 0.0518, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -4.425052642822266, + "rewards/margins": 0.7186892628669739, + "rewards/rejected": -5.143741607666016, + "sft_loss": 4.189584732055664, + "step": 4995 + }, + { + "epoch": 2.676032781401572, + "grad_norm": 0.5537563795116268, + "learning_rate": 3.497886863713639e-08, + "logits/chosen": -0.35988473892211914, + "logits/rejected": -0.3347950577735901, + "logps/chosen": -4.537070274353027, + "logps/rejected": -5.29421854019165, + "loss": 0.0512, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -4.537070274353027, + "rewards/margins": 0.7571475505828857, + "rewards/rejected": -5.29421854019165, + "sft_loss": 4.247420310974121, + "step": 5000 + }, + { + "epoch": 2.678708814182974, + "grad_norm": 0.6340292788783681, + "learning_rate": 3.440885377316721e-08, + "logits/chosen": -0.31119558215141296, + "logits/rejected": -0.24769540131092072, + "logps/chosen": -4.337439060211182, + "logps/rejected": -5.011359214782715, + "loss": 0.0505, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -4.337439060211182, + "rewards/margins": 0.673919677734375, + "rewards/rejected": -5.011359214782715, + "sft_loss": 4.067869186401367, + "step": 5005 + }, + { + "epoch": 2.6813848469643755, + "grad_norm": 0.4995947109233343, + "learning_rate": 3.384335607654082e-08, + "logits/chosen": -0.27810150384902954, + "logits/rejected": -0.22744593024253845, + "logps/chosen": -4.439764976501465, + "logps/rejected": -5.297387599945068, + "loss": 0.0498, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -4.439764976501465, + "rewards/margins": 0.8576227426528931, + "rewards/rejected": -5.297387599945068, + "sft_loss": 4.06976842880249, + "step": 5010 + }, + { + "epoch": 2.684060879745777, + "grad_norm": 0.4910614447723621, + "learning_rate": 3.328238103371811e-08, + "logits/chosen": -0.3293796181678772, + "logits/rejected": -0.2629985809326172, + "logps/chosen": -4.394562244415283, + "logps/rejected": -5.318913459777832, + "loss": 0.0489, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -4.394562244415283, + "rewards/margins": 0.9243508577346802, + "rewards/rejected": -5.318913459777832, + "sft_loss": 4.018808364868164, + "step": 5015 + }, + { + "epoch": 2.6867369125271785, + "grad_norm": 0.5700669196553817, + "learning_rate": 3.272593408728169e-08, + "logits/chosen": -0.42885392904281616, + "logits/rejected": -0.15153753757476807, + "logps/chosen": -4.297393798828125, + "logps/rejected": -5.1750288009643555, + "loss": 0.0496, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -4.297393798828125, + "rewards/margins": 0.87763512134552, + "rewards/rejected": -5.1750288009643555, + "sft_loss": 4.046809196472168, + "step": 5020 + }, + { + "epoch": 2.6894129453085798, + "grad_norm": 0.4570361869947965, + "learning_rate": 3.217402063588204e-08, + "logits/chosen": -0.41207781434059143, + "logits/rejected": -0.2339569628238678, + "logps/chosen": -4.3805341720581055, + "logps/rejected": -5.119570255279541, + "loss": 0.0515, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -4.3805341720581055, + "rewards/margins": 0.7390362024307251, + "rewards/rejected": -5.119570255279541, + "sft_loss": 4.159833908081055, + "step": 5025 + }, + { + "epoch": 2.6920889780899815, + "grad_norm": 0.6189906085316598, + "learning_rate": 3.162664603418608e-08, + "logits/chosen": -0.3554043173789978, + "logits/rejected": -0.2785646915435791, + "logps/chosen": -4.2589263916015625, + "logps/rejected": -5.20426082611084, + "loss": 0.0502, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -4.2589263916015625, + "rewards/margins": 0.9453340768814087, + "rewards/rejected": -5.20426082611084, + "sft_loss": 4.017521858215332, + "step": 5030 + }, + { + "epoch": 2.694765010871383, + "grad_norm": 0.4158614258443814, + "learning_rate": 3.1083815592824416e-08, + "logits/chosen": -0.3380299210548401, + "logits/rejected": -0.24163658916950226, + "logps/chosen": -4.41104793548584, + "logps/rejected": -5.293055534362793, + "loss": 0.0509, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -4.41104793548584, + "rewards/margins": 0.8820083737373352, + "rewards/rejected": -5.293055534362793, + "sft_loss": 4.177367210388184, + "step": 5035 + }, + { + "epoch": 2.697441043652785, + "grad_norm": 0.6648398728304554, + "learning_rate": 3.054553457834053e-08, + "logits/chosen": -0.1934209167957306, + "logits/rejected": -0.25234749913215637, + "logps/chosen": -4.489556312561035, + "logps/rejected": -5.1544575691223145, + "loss": 0.0514, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -4.489556312561035, + "rewards/margins": 0.6649015545845032, + "rewards/rejected": -5.1544575691223145, + "sft_loss": 4.170198917388916, + "step": 5040 + }, + { + "epoch": 2.700117076434186, + "grad_norm": 0.8852667353919006, + "learning_rate": 3.0011808213139036e-08, + "logits/chosen": -0.22296173870563507, + "logits/rejected": -0.2605739235877991, + "logps/chosen": -4.362579345703125, + "logps/rejected": -5.030747413635254, + "loss": 0.05, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -4.362579345703125, + "rewards/margins": 0.6681679487228394, + "rewards/rejected": -5.030747413635254, + "sft_loss": 4.08138370513916, + "step": 5045 + }, + { + "epoch": 2.702793109215588, + "grad_norm": 0.865852246921178, + "learning_rate": 2.948264167543568e-08, + "logits/chosen": -0.32960090041160583, + "logits/rejected": -0.24417448043823242, + "logps/chosen": -4.399921894073486, + "logps/rejected": -5.093794822692871, + "loss": 0.0506, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -4.399921894073486, + "rewards/margins": 0.6938729882240295, + "rewards/rejected": -5.093794822692871, + "sft_loss": 4.056646347045898, + "step": 5050 + }, + { + "epoch": 2.7054691419969896, + "grad_norm": 0.5888597395995303, + "learning_rate": 2.8958040099206216e-08, + "logits/chosen": -0.4537014067173004, + "logits/rejected": -0.37281662225723267, + "logps/chosen": -4.381228446960449, + "logps/rejected": -5.203554630279541, + "loss": 0.0502, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -4.381228446960449, + "rewards/margins": 0.8223265409469604, + "rewards/rejected": -5.203554630279541, + "sft_loss": 4.094564914703369, + "step": 5055 + }, + { + "epoch": 2.708145174778391, + "grad_norm": 0.7002768460470424, + "learning_rate": 2.843800857413775e-08, + "logits/chosen": -0.28774571418762207, + "logits/rejected": -0.24743354320526123, + "logps/chosen": -4.308173179626465, + "logps/rejected": -5.016545295715332, + "loss": 0.0519, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -4.308173179626465, + "rewards/margins": 0.7083726525306702, + "rewards/rejected": -5.016545295715332, + "sft_loss": 4.048774242401123, + "step": 5060 + }, + { + "epoch": 2.7108212075597926, + "grad_norm": 0.5846364610229815, + "learning_rate": 2.7922552145578203e-08, + "logits/chosen": -0.31515955924987793, + "logits/rejected": -0.04892424866557121, + "logps/chosen": -4.327048301696777, + "logps/rejected": -5.070046424865723, + "loss": 0.0505, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -4.327048301696777, + "rewards/margins": 0.7429983019828796, + "rewards/rejected": -5.070046424865723, + "sft_loss": 4.062623023986816, + "step": 5065 + }, + { + "epoch": 2.7134972403411943, + "grad_norm": 0.5600521422021959, + "learning_rate": 2.7411675814488277e-08, + "logits/chosen": -0.2908742427825928, + "logits/rejected": -0.13980403542518616, + "logps/chosen": -4.316672325134277, + "logps/rejected": -5.053095817565918, + "loss": 0.05, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -4.316672325134277, + "rewards/margins": 0.7364233732223511, + "rewards/rejected": -5.053095817565918, + "sft_loss": 4.109248161315918, + "step": 5070 + }, + { + "epoch": 2.7161732731225956, + "grad_norm": 0.6693016025580347, + "learning_rate": 2.690538453739216e-08, + "logits/chosen": -0.31663063168525696, + "logits/rejected": -0.26752668619155884, + "logps/chosen": -4.199077606201172, + "logps/rejected": -4.882410049438477, + "loss": 0.0522, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -4.199077606201172, + "rewards/margins": 0.6833322644233704, + "rewards/rejected": -4.882410049438477, + "sft_loss": 4.017780303955078, + "step": 5075 + }, + { + "epoch": 2.7188493059039973, + "grad_norm": 0.5585138005543521, + "learning_rate": 2.6403683226330298e-08, + "logits/chosen": -0.4366474151611328, + "logits/rejected": -0.26478832960128784, + "logps/chosen": -4.335009574890137, + "logps/rejected": -5.138851165771484, + "loss": 0.0512, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -4.335009574890137, + "rewards/margins": 0.8038414716720581, + "rewards/rejected": -5.138851165771484, + "sft_loss": 4.115544319152832, + "step": 5080 + }, + { + "epoch": 2.721525338685399, + "grad_norm": 0.5115652071606784, + "learning_rate": 2.5906576748810804e-08, + "logits/chosen": -0.39152833819389343, + "logits/rejected": -0.25694766640663147, + "logps/chosen": -4.440808296203613, + "logps/rejected": -5.392234802246094, + "loss": 0.0503, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -4.440808296203613, + "rewards/margins": 0.9514263272285461, + "rewards/rejected": -5.392234802246094, + "sft_loss": 4.244625568389893, + "step": 5085 + }, + { + "epoch": 2.7242013714668003, + "grad_norm": 0.4685102979697998, + "learning_rate": 2.5414069927763016e-08, + "logits/chosen": -0.48699942231178284, + "logits/rejected": -0.27683359384536743, + "logps/chosen": -4.237309455871582, + "logps/rejected": -5.1978759765625, + "loss": 0.0488, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -4.237309455871582, + "rewards/margins": 0.9605666995048523, + "rewards/rejected": -5.1978759765625, + "sft_loss": 3.9901108741760254, + "step": 5090 + }, + { + "epoch": 2.726877404248202, + "grad_norm": 0.4037980863154395, + "learning_rate": 2.4926167541490185e-08, + "logits/chosen": -0.5207828283309937, + "logits/rejected": -0.2705624997615814, + "logps/chosen": -4.286375045776367, + "logps/rejected": -5.243197441101074, + "loss": 0.0501, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.286375045776367, + "rewards/margins": 0.9568222761154175, + "rewards/rejected": -5.243197441101074, + "sft_loss": 4.061440944671631, + "step": 5095 + }, + { + "epoch": 2.7295534370296037, + "grad_norm": 0.5770932093871353, + "learning_rate": 2.4442874323623574e-08, + "logits/chosen": -0.30437716841697693, + "logits/rejected": -0.14506526291370392, + "logps/chosen": -4.377502918243408, + "logps/rejected": -5.2365498542785645, + "loss": 0.0513, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -4.377502918243408, + "rewards/margins": 0.859046459197998, + "rewards/rejected": -5.2365498542785645, + "sft_loss": 4.1016740798950195, + "step": 5100 + }, + { + "epoch": 2.7322294698110055, + "grad_norm": 0.5933158360229613, + "learning_rate": 2.396419496307589e-08, + "logits/chosen": -0.3560345768928528, + "logits/rejected": -0.169934943318367, + "logps/chosen": -4.517114162445068, + "logps/rejected": -5.2097673416137695, + "loss": 0.05, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -4.517114162445068, + "rewards/margins": 0.6926525831222534, + "rewards/rejected": -5.2097673416137695, + "sft_loss": 4.1477837562561035, + "step": 5105 + }, + { + "epoch": 2.7349055025924067, + "grad_norm": 0.4786597019537258, + "learning_rate": 2.349013410399653e-08, + "logits/chosen": -0.35335972905158997, + "logits/rejected": -0.27022966742515564, + "logps/chosen": -4.437841892242432, + "logps/rejected": -5.270761013031006, + "loss": 0.05, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -4.437841892242432, + "rewards/margins": 0.8329197764396667, + "rewards/rejected": -5.270761013031006, + "sft_loss": 4.032576084136963, + "step": 5110 + }, + { + "epoch": 2.7375815353738084, + "grad_norm": 0.4840975269990418, + "learning_rate": 2.3020696345725954e-08, + "logits/chosen": -0.48118534684181213, + "logits/rejected": -0.25537365674972534, + "logps/chosen": -4.186007022857666, + "logps/rejected": -5.362186908721924, + "loss": 0.0484, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.186007022857666, + "rewards/margins": 1.1761797666549683, + "rewards/rejected": -5.362186908721924, + "sft_loss": 3.971787929534912, + "step": 5115 + }, + { + "epoch": 2.7402575681552097, + "grad_norm": 0.6293456593288709, + "learning_rate": 2.2555886242751398e-08, + "logits/chosen": -0.35397639870643616, + "logits/rejected": -0.2746841311454773, + "logps/chosen": -4.240942478179932, + "logps/rejected": -5.1576995849609375, + "loss": 0.0501, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.240942478179932, + "rewards/margins": 0.916756808757782, + "rewards/rejected": -5.1576995849609375, + "sft_loss": 4.038487911224365, + "step": 5120 + }, + { + "epoch": 2.7429336009366114, + "grad_norm": 0.6509939909042833, + "learning_rate": 2.2095708304662453e-08, + "logits/chosen": -0.5028539299964905, + "logits/rejected": -0.22538447380065918, + "logps/chosen": -4.42642068862915, + "logps/rejected": -5.166515350341797, + "loss": 0.0505, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -4.42642068862915, + "rewards/margins": 0.7400942444801331, + "rewards/rejected": -5.166515350341797, + "sft_loss": 4.125155448913574, + "step": 5125 + }, + { + "epoch": 2.745609633718013, + "grad_norm": 0.4241187697346418, + "learning_rate": 2.16401669961076e-08, + "logits/chosen": -0.5067978501319885, + "logits/rejected": -0.2537830173969269, + "logps/chosen": -4.308220863342285, + "logps/rejected": -5.137172222137451, + "loss": 0.0507, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -4.308220863342285, + "rewards/margins": 0.828951358795166, + "rewards/rejected": -5.137172222137451, + "sft_loss": 4.0658135414123535, + "step": 5130 + }, + { + "epoch": 2.748285666499415, + "grad_norm": 0.6211864652488323, + "learning_rate": 2.1189266736750532e-08, + "logits/chosen": -0.26569199562072754, + "logits/rejected": -0.2149532586336136, + "logps/chosen": -4.469195365905762, + "logps/rejected": -5.14188289642334, + "loss": 0.0519, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -4.469195365905762, + "rewards/margins": 0.6726875901222229, + "rewards/rejected": -5.14188289642334, + "sft_loss": 4.128327369689941, + "step": 5135 + }, + { + "epoch": 2.750961699280816, + "grad_norm": 0.6295297741777652, + "learning_rate": 2.0743011901227623e-08, + "logits/chosen": -0.30820053815841675, + "logits/rejected": -0.16536325216293335, + "logps/chosen": -4.0469970703125, + "logps/rejected": -5.018959045410156, + "loss": 0.0492, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -4.0469970703125, + "rewards/margins": 0.9719620943069458, + "rewards/rejected": -5.018959045410156, + "sft_loss": 3.805692195892334, + "step": 5140 + }, + { + "epoch": 2.753637732062218, + "grad_norm": 0.6033387892484273, + "learning_rate": 2.030140681910508e-08, + "logits/chosen": -0.2960502803325653, + "logits/rejected": -0.14533351361751556, + "logps/chosen": -4.287169456481934, + "logps/rejected": -5.153046131134033, + "loss": 0.0497, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -4.287169456481934, + "rewards/margins": 0.8658763766288757, + "rewards/rejected": -5.153046131134033, + "sft_loss": 3.994807004928589, + "step": 5145 + }, + { + "epoch": 2.756313764843619, + "grad_norm": 0.3556324310223384, + "learning_rate": 1.986445577483753e-08, + "logits/chosen": -0.41091519594192505, + "logits/rejected": -0.25811105966567993, + "logps/chosen": -4.2101640701293945, + "logps/rejected": -5.0592851638793945, + "loss": 0.05, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -4.2101640701293945, + "rewards/margins": 0.8491213917732239, + "rewards/rejected": -5.0592851638793945, + "sft_loss": 3.965034008026123, + "step": 5150 + }, + { + "epoch": 2.758989797625021, + "grad_norm": 0.45918008796138426, + "learning_rate": 1.9432163007725765e-08, + "logits/chosen": -0.42323416471481323, + "logits/rejected": -0.33379656076431274, + "logps/chosen": -4.424660682678223, + "logps/rejected": -5.175339698791504, + "loss": 0.0511, + "rewards/accuracies": 0.71875, + "rewards/chosen": -4.424660682678223, + "rewards/margins": 0.7506788372993469, + "rewards/rejected": -5.175339698791504, + "sft_loss": 4.200197696685791, + "step": 5155 + }, + { + "epoch": 2.7616658304064226, + "grad_norm": 0.4557503980800935, + "learning_rate": 1.9004532711876297e-08, + "logits/chosen": -0.35138794779777527, + "logits/rejected": -0.34763628244400024, + "logps/chosen": -4.337948322296143, + "logps/rejected": -5.078485012054443, + "loss": 0.0509, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -4.337948322296143, + "rewards/margins": 0.7405366897583008, + "rewards/rejected": -5.078485012054443, + "sft_loss": 4.110854148864746, + "step": 5160 + }, + { + "epoch": 2.7643418631878243, + "grad_norm": 0.4334158893596347, + "learning_rate": 1.8581569036159928e-08, + "logits/chosen": -0.3873664140701294, + "logits/rejected": -0.18847136199474335, + "logps/chosen": -4.274514675140381, + "logps/rejected": -5.120308876037598, + "loss": 0.0503, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -4.274514675140381, + "rewards/margins": 0.8457947969436646, + "rewards/rejected": -5.120308876037598, + "sft_loss": 4.066677570343018, + "step": 5165 + }, + { + "epoch": 2.7670178959692255, + "grad_norm": 0.5085900205081568, + "learning_rate": 1.8163276084172285e-08, + "logits/chosen": -0.3473703861236572, + "logits/rejected": -0.23684850335121155, + "logps/chosen": -4.316211700439453, + "logps/rejected": -5.224501609802246, + "loss": 0.0492, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -4.316211700439453, + "rewards/margins": 0.9082896113395691, + "rewards/rejected": -5.224501609802246, + "sft_loss": 4.066177845001221, + "step": 5170 + }, + { + "epoch": 2.7696939287506273, + "grad_norm": 0.5671962752351475, + "learning_rate": 1.7749657914193194e-08, + "logits/chosen": -0.34605956077575684, + "logits/rejected": -0.28665685653686523, + "logps/chosen": -4.406753063201904, + "logps/rejected": -5.275516510009766, + "loss": 0.0489, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -4.406753063201904, + "rewards/margins": 0.8687634468078613, + "rewards/rejected": -5.275516510009766, + "sft_loss": 4.098984241485596, + "step": 5175 + }, + { + "epoch": 2.7723699615320285, + "grad_norm": 0.6257159164544014, + "learning_rate": 1.7340718539148203e-08, + "logits/chosen": -0.30671659111976624, + "logits/rejected": -0.23999974131584167, + "logps/chosen": -4.360651969909668, + "logps/rejected": -5.106849193572998, + "loss": 0.0516, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -4.360651969909668, + "rewards/margins": 0.7461972832679749, + "rewards/rejected": -5.106849193572998, + "sft_loss": 4.17466402053833, + "step": 5180 + }, + { + "epoch": 2.7750459943134302, + "grad_norm": 0.4299714719732261, + "learning_rate": 1.6936461926568724e-08, + "logits/chosen": -0.30148938298225403, + "logits/rejected": -0.14409419894218445, + "logps/chosen": -4.323808193206787, + "logps/rejected": -5.228428840637207, + "loss": 0.0502, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -4.323808193206787, + "rewards/margins": 0.9046202898025513, + "rewards/rejected": -5.228428840637207, + "sft_loss": 4.030442237854004, + "step": 5185 + }, + { + "epoch": 2.777722027094832, + "grad_norm": 0.5583552209547061, + "learning_rate": 1.6536891998554346e-08, + "logits/chosen": -0.45117372274398804, + "logits/rejected": -0.25930553674697876, + "logps/chosen": -4.293408393859863, + "logps/rejected": -5.134814262390137, + "loss": 0.0495, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.293408393859863, + "rewards/margins": 0.841405987739563, + "rewards/rejected": -5.134814262390137, + "sft_loss": 4.050257682800293, + "step": 5190 + }, + { + "epoch": 2.7803980598762337, + "grad_norm": 0.5503319371421598, + "learning_rate": 1.6142012631734093e-08, + "logits/chosen": -0.30263951420783997, + "logits/rejected": -0.1634255200624466, + "logps/chosen": -4.305006980895996, + "logps/rejected": -5.143341064453125, + "loss": 0.0488, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -4.305006980895996, + "rewards/margins": 0.8383339047431946, + "rewards/rejected": -5.143341064453125, + "sft_loss": 4.050868988037109, + "step": 5195 + }, + { + "epoch": 2.783074092657635, + "grad_norm": 0.44537831132872474, + "learning_rate": 1.575182765722949e-08, + "logits/chosen": -0.4748370051383972, + "logits/rejected": -0.2880566418170929, + "logps/chosen": -4.3707966804504395, + "logps/rejected": -5.182746410369873, + "loss": 0.0498, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -4.3707966804504395, + "rewards/margins": 0.8119505643844604, + "rewards/rejected": -5.182746410369873, + "sft_loss": 3.9548544883728027, + "step": 5200 + }, + { + "epoch": 2.783074092657635, + "eval_logits/chosen": 0.11339718848466873, + "eval_logits/rejected": 0.21380193531513214, + "eval_logps/chosen": -4.331807613372803, + "eval_logps/rejected": -5.096880912780762, + "eval_loss": 0.049965761601924896, + "eval_rewards/accuracies": 0.68916916847229, + "eval_rewards/chosen": -4.331807613372803, + "eval_rewards/margins": 0.7650735974311829, + "eval_rewards/rejected": -5.096880912780762, + "eval_runtime": 44.5977, + "eval_samples_per_second": 30.159, + "eval_sft_loss": 3.937229871749878, + "eval_steps_per_second": 7.556, + "step": 5200 + }, + { + "epoch": 2.7857501254390367, + "grad_norm": 0.63650009255052, + "learning_rate": 1.536634086061672e-08, + "logits/chosen": -0.3355264961719513, + "logits/rejected": -0.3086664378643036, + "logps/chosen": -4.177042007446289, + "logps/rejected": -5.167255401611328, + "loss": 0.0491, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -4.177042007446289, + "rewards/margins": 0.9902137517929077, + "rewards/rejected": -5.167255401611328, + "sft_loss": 3.9284653663635254, + "step": 5205 + }, + { + "epoch": 2.788426158220438, + "grad_norm": 0.6238382815083907, + "learning_rate": 1.4985555981890495e-08, + "logits/chosen": -0.36723607778549194, + "logits/rejected": -0.25406724214553833, + "logps/chosen": -4.518461227416992, + "logps/rejected": -5.312617301940918, + "loss": 0.051, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -4.518461227416992, + "rewards/margins": 0.7941561937332153, + "rewards/rejected": -5.312617301940918, + "sft_loss": 4.1740217208862305, + "step": 5210 + }, + { + "epoch": 2.7911021910018396, + "grad_norm": 0.4804024279136467, + "learning_rate": 1.4609476715427226e-08, + "logits/chosen": -0.3804596960544586, + "logits/rejected": -0.2999555468559265, + "logps/chosen": -4.189560413360596, + "logps/rejected": -5.1501569747924805, + "loss": 0.0492, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -4.189560413360596, + "rewards/margins": 0.9605971574783325, + "rewards/rejected": -5.1501569747924805, + "sft_loss": 4.015095233917236, + "step": 5215 + }, + { + "epoch": 2.7937782237832414, + "grad_norm": 0.532833051713504, + "learning_rate": 1.4238106709949792e-08, + "logits/chosen": -0.3272797763347626, + "logits/rejected": -0.2577647864818573, + "logps/chosen": -4.291745185852051, + "logps/rejected": -5.294327735900879, + "loss": 0.0488, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -4.291745185852051, + "rewards/margins": 1.00258207321167, + "rewards/rejected": -5.294327735900879, + "sft_loss": 4.015843391418457, + "step": 5220 + }, + { + "epoch": 2.796454256564643, + "grad_norm": 0.5326192755761749, + "learning_rate": 1.3871449568491511e-08, + "logits/chosen": -0.3303220868110657, + "logits/rejected": -0.1386190950870514, + "logps/chosen": -4.393420219421387, + "logps/rejected": -5.122389793395996, + "loss": 0.0522, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -4.393420219421387, + "rewards/margins": 0.7289689779281616, + "rewards/rejected": -5.122389793395996, + "sft_loss": 4.1007585525512695, + "step": 5225 + }, + { + "epoch": 2.7991302893460444, + "grad_norm": 0.6058086369493448, + "learning_rate": 1.3509508848361606e-08, + "logits/chosen": -0.43916431069374084, + "logits/rejected": -0.2893034517765045, + "logps/chosen": -4.339142799377441, + "logps/rejected": -5.268829822540283, + "loss": 0.0484, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -4.339142799377441, + "rewards/margins": 0.9296862483024597, + "rewards/rejected": -5.268829822540283, + "sft_loss": 3.947129487991333, + "step": 5230 + }, + { + "epoch": 2.801806322127446, + "grad_norm": 0.43408048665096755, + "learning_rate": 1.3152288061110517e-08, + "logits/chosen": -0.43070071935653687, + "logits/rejected": -0.25799840688705444, + "logps/chosen": -4.269518852233887, + "logps/rejected": -5.2071733474731445, + "loss": 0.0492, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.269518852233887, + "rewards/margins": 0.9376543164253235, + "rewards/rejected": -5.2071733474731445, + "sft_loss": 3.977576494216919, + "step": 5235 + }, + { + "epoch": 2.804482354908848, + "grad_norm": 0.5127739715217935, + "learning_rate": 1.2799790672495814e-08, + "logits/chosen": -0.43267449736595154, + "logits/rejected": -0.20683661103248596, + "logps/chosen": -4.34108304977417, + "logps/rejected": -5.199608325958252, + "loss": 0.0495, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -4.34108304977417, + "rewards/margins": 0.8585250973701477, + "rewards/rejected": -5.199608325958252, + "sft_loss": 4.119553089141846, + "step": 5240 + }, + { + "epoch": 2.807158387690249, + "grad_norm": 0.44168776044394253, + "learning_rate": 1.2452020102448835e-08, + "logits/chosen": -0.35988324880599976, + "logits/rejected": -0.3104974329471588, + "logps/chosen": -4.353398323059082, + "logps/rejected": -5.132560729980469, + "loss": 0.0511, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -4.353398323059082, + "rewards/margins": 0.779162585735321, + "rewards/rejected": -5.132560729980469, + "sft_loss": 4.198001384735107, + "step": 5245 + }, + { + "epoch": 2.8098344204716508, + "grad_norm": 0.5776884999882473, + "learning_rate": 1.2108979725041103e-08, + "logits/chosen": -0.46909505128860474, + "logits/rejected": -0.30127614736557007, + "logps/chosen": -4.262785911560059, + "logps/rejected": -5.127320766448975, + "loss": 0.0505, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -4.262785911560059, + "rewards/margins": 0.8645352125167847, + "rewards/rejected": -5.127320766448975, + "sft_loss": 4.043597221374512, + "step": 5250 + }, + { + "epoch": 2.8125104532530525, + "grad_norm": 0.5770063136674014, + "learning_rate": 1.1770672868451958e-08, + "logits/chosen": -0.4060916006565094, + "logits/rejected": -0.16923363506793976, + "logps/chosen": -4.230871677398682, + "logps/rejected": -5.168854713439941, + "loss": 0.0502, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.230871677398682, + "rewards/margins": 0.9379828572273254, + "rewards/rejected": -5.168854713439941, + "sft_loss": 4.0604095458984375, + "step": 5255 + }, + { + "epoch": 2.8151864860344538, + "grad_norm": 0.48959950508599864, + "learning_rate": 1.1437102814935872e-08, + "logits/chosen": -0.3215634226799011, + "logits/rejected": -0.2579534649848938, + "logps/chosen": -4.367952346801758, + "logps/rejected": -5.0876593589782715, + "loss": 0.053, + "rewards/accuracies": 0.625, + "rewards/chosen": -4.367952346801758, + "rewards/margins": 0.7197073698043823, + "rewards/rejected": -5.0876593589782715, + "sft_loss": 4.103386878967285, + "step": 5260 + }, + { + "epoch": 2.8178625188158555, + "grad_norm": 0.4936544311232592, + "learning_rate": 1.1108272800791018e-08, + "logits/chosen": -0.48994073271751404, + "logits/rejected": -0.23816998302936554, + "logps/chosen": -4.147706031799316, + "logps/rejected": -5.310173988342285, + "loss": 0.0476, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -4.147706031799316, + "rewards/margins": 1.162468671798706, + "rewards/rejected": -5.310173988342285, + "sft_loss": 3.913297176361084, + "step": 5265 + }, + { + "epoch": 2.820538551597257, + "grad_norm": 0.48184078763381777, + "learning_rate": 1.078418601632769e-08, + "logits/chosen": -0.3174903988838196, + "logits/rejected": -0.1744978427886963, + "logps/chosen": -4.236815452575684, + "logps/rejected": -5.054623603820801, + "loss": 0.0503, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -4.236815452575684, + "rewards/margins": 0.8178078532218933, + "rewards/rejected": -5.054623603820801, + "sft_loss": 4.0225510597229, + "step": 5270 + }, + { + "epoch": 2.8232145843786585, + "grad_norm": 0.49873076680439726, + "learning_rate": 1.0464845605837159e-08, + "logits/chosen": -0.3413674831390381, + "logits/rejected": -0.19086118042469025, + "logps/chosen": -4.315398216247559, + "logps/rejected": -5.07651424407959, + "loss": 0.0504, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -4.315398216247559, + "rewards/margins": 0.7611164450645447, + "rewards/rejected": -5.07651424407959, + "sft_loss": 4.1059393882751465, + "step": 5275 + }, + { + "epoch": 2.82589061716006, + "grad_norm": 0.49656743919960683, + "learning_rate": 1.0150254667561642e-08, + "logits/chosen": -0.3963681161403656, + "logits/rejected": -0.16303816437721252, + "logps/chosen": -4.286141395568848, + "logps/rejected": -5.307049751281738, + "loss": 0.0503, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -4.286141395568848, + "rewards/margins": 1.0209077596664429, + "rewards/rejected": -5.307049751281738, + "sft_loss": 3.983715772628784, + "step": 5280 + }, + { + "epoch": 2.828566649941462, + "grad_norm": 0.44754978271656065, + "learning_rate": 9.840416253663719e-09, + "logits/chosen": -0.4166085124015808, + "logits/rejected": -0.284808874130249, + "logps/chosen": -4.283874034881592, + "logps/rejected": -5.237959861755371, + "loss": 0.05, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.283874034881592, + "rewards/margins": 0.954084575176239, + "rewards/rejected": -5.237959861755371, + "sft_loss": 4.053045749664307, + "step": 5285 + }, + { + "epoch": 2.8312426827228636, + "grad_norm": 0.5852176436351291, + "learning_rate": 9.535333370197074e-09, + "logits/chosen": -0.4020010828971863, + "logits/rejected": -0.2330581694841385, + "logps/chosen": -4.204090118408203, + "logps/rejected": -5.062168598175049, + "loss": 0.0493, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -4.204090118408203, + "rewards/margins": 0.8580780029296875, + "rewards/rejected": -5.062168598175049, + "sft_loss": 3.945356845855713, + "step": 5290 + }, + { + "epoch": 2.833918715504265, + "grad_norm": 0.5227330600678359, + "learning_rate": 9.23500897707713e-09, + "logits/chosen": -0.42851123213768005, + "logits/rejected": -0.21556782722473145, + "logps/chosen": -4.467846870422363, + "logps/rejected": -5.213390350341797, + "loss": 0.0505, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -4.467846870422363, + "rewards/margins": 0.745543897151947, + "rewards/rejected": -5.213390350341797, + "sft_loss": 4.137904644012451, + "step": 5295 + }, + { + "epoch": 2.8365947482856666, + "grad_norm": 0.7779699882330686, + "learning_rate": 8.939445988052574e-09, + "logits/chosen": -0.3555363416671753, + "logits/rejected": -0.2877839207649231, + "logps/chosen": -4.3804931640625, + "logps/rejected": -5.2144551277160645, + "loss": 0.0493, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -4.3804931640625, + "rewards/margins": 0.833962082862854, + "rewards/rejected": -5.2144551277160645, + "sft_loss": 4.057433128356934, + "step": 5300 + }, + { + "epoch": 2.839270781067068, + "grad_norm": 0.5454910221823269, + "learning_rate": 8.648647270676656e-09, + "logits/chosen": -0.33761829137802124, + "logits/rejected": -0.22594241797924042, + "logps/chosen": -4.206428527832031, + "logps/rejected": -5.088019847869873, + "loss": 0.0494, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -4.206428527832031, + "rewards/margins": 0.8815921545028687, + "rewards/rejected": -5.088019847869873, + "sft_loss": 3.8602423667907715, + "step": 5305 + }, + { + "epoch": 2.8419468138484696, + "grad_norm": 0.6438981638318502, + "learning_rate": 8.362615646279991e-09, + "logits/chosen": -0.5344552993774414, + "logits/rejected": -0.2571745216846466, + "logps/chosen": -4.3302435874938965, + "logps/rejected": -5.282116889953613, + "loss": 0.05, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -4.3302435874938965, + "rewards/margins": 0.9518739581108093, + "rewards/rejected": -5.282116889953613, + "sft_loss": 4.086483001708984, + "step": 5310 + }, + { + "epoch": 2.8446228466298713, + "grad_norm": 0.5958659951379691, + "learning_rate": 8.081353889942466e-09, + "logits/chosen": -0.29295647144317627, + "logits/rejected": -0.08710186183452606, + "logps/chosen": -4.312572002410889, + "logps/rejected": -5.109742164611816, + "loss": 0.0509, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -4.312572002410889, + "rewards/margins": 0.7971704006195068, + "rewards/rejected": -5.109742164611816, + "sft_loss": 4.068450927734375, + "step": 5315 + }, + { + "epoch": 2.847298879411273, + "grad_norm": 0.4619365533143964, + "learning_rate": 7.804864730467042e-09, + "logits/chosen": -0.33097267150878906, + "logits/rejected": -0.307807981967926, + "logps/chosen": -4.2144365310668945, + "logps/rejected": -5.045806884765625, + "loss": 0.0494, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -4.2144365310668945, + "rewards/margins": 0.8313705325126648, + "rewards/rejected": -5.045806884765625, + "sft_loss": 3.9541831016540527, + "step": 5320 + }, + { + "epoch": 2.8499749121926743, + "grad_norm": 0.5208760764017685, + "learning_rate": 7.533150850352665e-09, + "logits/chosen": -0.291950523853302, + "logits/rejected": -0.10099242627620697, + "logps/chosen": -4.414985179901123, + "logps/rejected": -5.283970832824707, + "loss": 0.0494, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.414985179901123, + "rewards/margins": 0.8689855337142944, + "rewards/rejected": -5.283970832824707, + "sft_loss": 4.126601696014404, + "step": 5325 + }, + { + "epoch": 2.852650944974076, + "grad_norm": 0.6338941827024959, + "learning_rate": 7.2662148857686175e-09, + "logits/chosen": -0.2738792300224304, + "logits/rejected": -0.18372344970703125, + "logps/chosen": -4.438006401062012, + "logps/rejected": -5.203482627868652, + "loss": 0.0505, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -4.438006401062012, + "rewards/margins": 0.7654756903648376, + "rewards/rejected": -5.203482627868652, + "sft_loss": 4.058730602264404, + "step": 5330 + }, + { + "epoch": 2.8553269777554773, + "grad_norm": 0.5443912681849259, + "learning_rate": 7.0040594265287635e-09, + "logits/chosen": -0.23748035728931427, + "logits/rejected": -0.3234420716762543, + "logps/chosen": -4.397795677185059, + "logps/rejected": -4.979608058929443, + "loss": 0.0515, + "rewards/accuracies": 0.625, + "rewards/chosen": -4.397795677185059, + "rewards/margins": 0.5818119049072266, + "rewards/rejected": -4.979608058929443, + "sft_loss": 4.0953369140625, + "step": 5335 + }, + { + "epoch": 2.858003010536879, + "grad_norm": 0.535553105450855, + "learning_rate": 6.746687016066566e-09, + "logits/chosen": -0.3057805299758911, + "logits/rejected": -0.2653143107891083, + "logps/chosen": -4.209263324737549, + "logps/rejected": -4.970522880554199, + "loss": 0.0512, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -4.209263324737549, + "rewards/margins": 0.7612598538398743, + "rewards/rejected": -4.970522880554199, + "sft_loss": 3.9734046459198, + "step": 5340 + }, + { + "epoch": 2.8606790433182807, + "grad_norm": 1.4483798734565578, + "learning_rate": 6.494100151410276e-09, + "logits/chosen": -0.512469470500946, + "logits/rejected": -0.31466788053512573, + "logps/chosen": -4.2295098304748535, + "logps/rejected": -5.069262981414795, + "loss": 0.0502, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -4.2295098304748535, + "rewards/margins": 0.8397535085678101, + "rewards/rejected": -5.069262981414795, + "sft_loss": 3.985532283782959, + "step": 5345 + }, + { + "epoch": 2.8633550760996824, + "grad_norm": 0.9910451717757924, + "learning_rate": 6.246301283158728e-09, + "logits/chosen": -0.28064030408859253, + "logits/rejected": -0.30460792779922485, + "logps/chosen": -4.310183525085449, + "logps/rejected": -5.07125186920166, + "loss": 0.0514, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -4.310183525085449, + "rewards/margins": 0.7610687017440796, + "rewards/rejected": -5.07125186920166, + "sft_loss": 4.0810065269470215, + "step": 5350 + }, + { + "epoch": 2.8660311088810837, + "grad_norm": 0.5874113570648387, + "learning_rate": 6.0032928154576944e-09, + "logits/chosen": -0.38583841919898987, + "logits/rejected": -0.3223554491996765, + "logps/chosen": -4.332071781158447, + "logps/rejected": -5.063580513000488, + "loss": 0.0525, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -4.332071781158447, + "rewards/margins": 0.7315087914466858, + "rewards/rejected": -5.063580513000488, + "sft_loss": 4.141894817352295, + "step": 5355 + }, + { + "epoch": 2.8687071416624854, + "grad_norm": 0.9590204351063107, + "learning_rate": 5.76507710597629e-09, + "logits/chosen": -0.41534432768821716, + "logits/rejected": -0.1952972561120987, + "logps/chosen": -4.223122596740723, + "logps/rejected": -5.143168926239014, + "loss": 0.0497, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -4.223122596740723, + "rewards/margins": 0.9200462102890015, + "rewards/rejected": -5.143168926239014, + "sft_loss": 3.9259402751922607, + "step": 5360 + }, + { + "epoch": 2.8713831744438867, + "grad_norm": 0.5117516358708537, + "learning_rate": 5.531656465884438e-09, + "logits/chosen": -0.3982107639312744, + "logits/rejected": -0.241659015417099, + "logps/chosen": -4.251479625701904, + "logps/rejected": -5.179772853851318, + "loss": 0.0488, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.251479625701904, + "rewards/margins": 0.9282932281494141, + "rewards/rejected": -5.179772853851318, + "sft_loss": 3.968653440475464, + "step": 5365 + }, + { + "epoch": 2.8740592072252884, + "grad_norm": 0.8535394718224574, + "learning_rate": 5.303033159830217e-09, + "logits/chosen": -0.23298446834087372, + "logits/rejected": -0.21911530196666718, + "logps/chosen": -4.3338518142700195, + "logps/rejected": -4.973204612731934, + "loss": 0.051, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -4.3338518142700195, + "rewards/margins": 0.6393526196479797, + "rewards/rejected": -4.973204612731934, + "sft_loss": 3.9991040229797363, + "step": 5370 + }, + { + "epoch": 2.87673524000669, + "grad_norm": 0.6187135581988519, + "learning_rate": 5.079209405917939e-09, + "logits/chosen": -0.32231375575065613, + "logits/rejected": -0.2524074912071228, + "logps/chosen": -4.291468620300293, + "logps/rejected": -5.2907395362854, + "loss": 0.0501, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -4.291468620300293, + "rewards/margins": 0.9992705583572388, + "rewards/rejected": -5.2907395362854, + "sft_loss": 4.086352825164795, + "step": 5375 + }, + { + "epoch": 2.879411272788092, + "grad_norm": 0.5124358191216212, + "learning_rate": 4.860187375686664e-09, + "logits/chosen": -0.43386736512184143, + "logits/rejected": -0.15008467435836792, + "logps/chosen": -4.215126991271973, + "logps/rejected": -5.108205318450928, + "loss": 0.0494, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -4.215126991271973, + "rewards/margins": 0.8930784463882446, + "rewards/rejected": -5.108205318450928, + "sft_loss": 3.9824631214141846, + "step": 5380 + }, + { + "epoch": 2.882087305569493, + "grad_norm": 0.540372134185639, + "learning_rate": 4.64596919408905e-09, + "logits/chosen": -0.26383471488952637, + "logits/rejected": -0.19916556775569916, + "logps/chosen": -4.226814270019531, + "logps/rejected": -4.9440226554870605, + "loss": 0.0515, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -4.226814270019531, + "rewards/margins": 0.7172079086303711, + "rewards/rejected": -4.9440226554870605, + "sft_loss": 3.9626152515411377, + "step": 5385 + }, + { + "epoch": 2.884763338350895, + "grad_norm": 0.6233774203905628, + "learning_rate": 4.436556939470814e-09, + "logits/chosen": -0.382610023021698, + "logits/rejected": -0.19507412612438202, + "logps/chosen": -4.561144828796387, + "logps/rejected": -5.150514125823975, + "loss": 0.0525, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -4.561144828796387, + "rewards/margins": 0.5893692970275879, + "rewards/rejected": -5.150514125823975, + "sft_loss": 4.331704139709473, + "step": 5390 + }, + { + "epoch": 2.887439371132296, + "grad_norm": 0.4977719197784728, + "learning_rate": 4.23195264355064e-09, + "logits/chosen": -0.5408740639686584, + "logits/rejected": -0.2679939866065979, + "logps/chosen": -4.267232418060303, + "logps/rejected": -5.070743083953857, + "loss": 0.0504, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -4.267232418060303, + "rewards/margins": 0.8035109639167786, + "rewards/rejected": -5.070743083953857, + "sft_loss": 4.043534278869629, + "step": 5395 + }, + { + "epoch": 2.890115403913698, + "grad_norm": 0.381660827593946, + "learning_rate": 4.032158291400245e-09, + "logits/chosen": -0.4261409342288971, + "logits/rejected": -0.1743827611207962, + "logps/chosen": -4.142676830291748, + "logps/rejected": -5.355725288391113, + "loss": 0.0469, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -4.142676830291748, + "rewards/margins": 1.2130485773086548, + "rewards/rejected": -5.355725288391113, + "sft_loss": 3.7679145336151123, + "step": 5400 + }, + { + "epoch": 2.8927914366950995, + "grad_norm": 0.43441229618811533, + "learning_rate": 3.837175821425398e-09, + "logits/chosen": -0.3080061376094818, + "logits/rejected": -0.2564330995082855, + "logps/chosen": -4.471229076385498, + "logps/rejected": -5.220850944519043, + "loss": 0.0516, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -4.471229076385498, + "rewards/margins": 0.7496218681335449, + "rewards/rejected": -5.220850944519043, + "sft_loss": 4.176270484924316, + "step": 5405 + }, + { + "epoch": 2.8954674694765012, + "grad_norm": 0.5246889640905302, + "learning_rate": 3.6470071253467683e-09, + "logits/chosen": -0.38305962085723877, + "logits/rejected": -0.2757953405380249, + "logps/chosen": -4.224427700042725, + "logps/rejected": -5.185813903808594, + "loss": 0.0488, + "rewards/accuracies": 0.71875, + "rewards/chosen": -4.224427700042725, + "rewards/margins": 0.9613859057426453, + "rewards/rejected": -5.185813903808594, + "sft_loss": 3.924294948577881, + "step": 5410 + }, + { + "epoch": 2.8981435022579025, + "grad_norm": 0.43745005810743176, + "learning_rate": 3.461654048181939e-09, + "logits/chosen": -0.4080559313297272, + "logits/rejected": -0.18248072266578674, + "logps/chosen": -4.294047832489014, + "logps/rejected": -5.07814884185791, + "loss": 0.0516, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -4.294047832489014, + "rewards/margins": 0.7841013669967651, + "rewards/rejected": -5.07814884185791, + "sft_loss": 4.130050182342529, + "step": 5415 + }, + { + "epoch": 2.9008195350393042, + "grad_norm": 0.5080954271922328, + "learning_rate": 3.281118388227255e-09, + "logits/chosen": -0.3728869557380676, + "logits/rejected": -0.29249298572540283, + "logps/chosen": -4.324324607849121, + "logps/rejected": -4.972744941711426, + "loss": 0.0529, + "rewards/accuracies": 0.65625, + "rewards/chosen": -4.324324607849121, + "rewards/margins": 0.6484203338623047, + "rewards/rejected": -4.972744941711426, + "sft_loss": 4.04088020324707, + "step": 5420 + }, + { + "epoch": 2.903495567820706, + "grad_norm": 0.5684492771021781, + "learning_rate": 3.1054018970405048e-09, + "logits/chosen": -0.39560359716415405, + "logits/rejected": -0.22769984602928162, + "logps/chosen": -4.299488544464111, + "logps/rejected": -5.184499740600586, + "loss": 0.0488, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -4.299488544464111, + "rewards/margins": 0.8850114941596985, + "rewards/rejected": -5.184499740600586, + "sft_loss": 3.9442543983459473, + "step": 5425 + }, + { + "epoch": 2.906171600602107, + "grad_norm": 0.48701394112032037, + "learning_rate": 2.9345062794238207e-09, + "logits/chosen": -0.41835254430770874, + "logits/rejected": -0.2046811580657959, + "logps/chosen": -4.341827392578125, + "logps/rejected": -5.209946632385254, + "loss": 0.0503, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.341827392578125, + "rewards/margins": 0.8681195378303528, + "rewards/rejected": -5.209946632385254, + "sft_loss": 4.115506172180176, + "step": 5430 + }, + { + "epoch": 2.908847633383509, + "grad_norm": 0.49401293076834946, + "learning_rate": 2.7684331934072492e-09, + "logits/chosen": -0.4481055736541748, + "logits/rejected": -0.37085479497909546, + "logps/chosen": -4.246690273284912, + "logps/rejected": -5.07697057723999, + "loss": 0.0508, + "rewards/accuracies": 0.71875, + "rewards/chosen": -4.246690273284912, + "rewards/margins": 0.8302801847457886, + "rewards/rejected": -5.07697057723999, + "sft_loss": 4.007492542266846, + "step": 5435 + }, + { + "epoch": 2.9115236661649107, + "grad_norm": 0.5275705290982255, + "learning_rate": 2.6071842502326526e-09, + "logits/chosen": -0.4367121160030365, + "logits/rejected": -0.2713041305541992, + "logps/chosen": -4.409333229064941, + "logps/rejected": -5.059312343597412, + "loss": 0.0515, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -4.409333229064941, + "rewards/margins": 0.6499795913696289, + "rewards/rejected": -5.059312343597412, + "sft_loss": 4.199063301086426, + "step": 5440 + }, + { + "epoch": 2.9141996989463124, + "grad_norm": 0.6097523673192077, + "learning_rate": 2.450761014337888e-09, + "logits/chosen": -0.17959146201610565, + "logits/rejected": -0.11355265229940414, + "logps/chosen": -4.34035062789917, + "logps/rejected": -5.241170406341553, + "loss": 0.0523, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -4.34035062789917, + "rewards/margins": 0.900820255279541, + "rewards/rejected": -5.241170406341553, + "sft_loss": 4.116297721862793, + "step": 5445 + }, + { + "epoch": 2.9168757317277136, + "grad_norm": 0.8818009140689909, + "learning_rate": 2.299165003341985e-09, + "logits/chosen": -0.17665904760360718, + "logits/rejected": -0.10692217200994492, + "logps/chosen": -4.236802101135254, + "logps/rejected": -5.121260643005371, + "loss": 0.0502, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -4.236802101135254, + "rewards/margins": 0.8844582438468933, + "rewards/rejected": -5.121260643005371, + "sft_loss": 4.044351100921631, + "step": 5450 + }, + { + "epoch": 2.9195517645091154, + "grad_norm": 0.6358504999909266, + "learning_rate": 2.1523976880299945e-09, + "logits/chosen": -0.43429645895957947, + "logits/rejected": -0.229470893740654, + "logps/chosen": -4.325311660766602, + "logps/rejected": -4.965158462524414, + "loss": 0.0526, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -4.325311660766602, + "rewards/margins": 0.6398465633392334, + "rewards/rejected": -4.965158462524414, + "sft_loss": 4.134278774261475, + "step": 5455 + }, + { + "epoch": 2.9222277972905166, + "grad_norm": 0.5806505011410361, + "learning_rate": 2.010460492339161e-09, + "logits/chosen": -0.3749215602874756, + "logits/rejected": -0.21662676334381104, + "logps/chosen": -4.304712295532227, + "logps/rejected": -5.107754707336426, + "loss": 0.0491, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -4.304712295532227, + "rewards/margins": 0.8030425906181335, + "rewards/rejected": -5.107754707336426, + "sft_loss": 3.934671401977539, + "step": 5460 + }, + { + "epoch": 2.9249038300719183, + "grad_norm": 0.557229329842499, + "learning_rate": 1.8733547933446614e-09, + "logits/chosen": -0.4316619038581848, + "logits/rejected": -0.16651205718517303, + "logps/chosen": -4.384096145629883, + "logps/rejected": -5.088347911834717, + "loss": 0.0507, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -4.384096145629883, + "rewards/margins": 0.7042518854141235, + "rewards/rejected": -5.088347911834717, + "sft_loss": 4.104673862457275, + "step": 5465 + }, + { + "epoch": 2.92757986285332, + "grad_norm": 0.6567540054055532, + "learning_rate": 1.7410819212467231e-09, + "logits/chosen": -0.34968018531799316, + "logits/rejected": -0.2774543762207031, + "logps/chosen": -4.437806129455566, + "logps/rejected": -5.0514817237854, + "loss": 0.054, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -4.437806129455566, + "rewards/margins": 0.6136748790740967, + "rewards/rejected": -5.0514817237854, + "sft_loss": 4.183988094329834, + "step": 5470 + }, + { + "epoch": 2.9302558956347218, + "grad_norm": 0.4106224978496865, + "learning_rate": 1.613643159357192e-09, + "logits/chosen": -0.2766024172306061, + "logits/rejected": -0.3795970380306244, + "logps/chosen": -4.4523186683654785, + "logps/rejected": -5.04266881942749, + "loss": 0.0514, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -4.4523186683654785, + "rewards/margins": 0.5903505086898804, + "rewards/rejected": -5.04266881942749, + "sft_loss": 4.153774738311768, + "step": 5475 + }, + { + "epoch": 2.932931928416123, + "grad_norm": 0.5261648848812568, + "learning_rate": 1.4910397440875967e-09, + "logits/chosen": -0.3736271858215332, + "logits/rejected": -0.25079482793807983, + "logps/chosen": -4.285164833068848, + "logps/rejected": -5.054616451263428, + "loss": 0.0504, + "rewards/accuracies": 0.6875, + "rewards/chosen": -4.285164833068848, + "rewards/margins": 0.7694514989852905, + "rewards/rejected": -5.054616451263428, + "sft_loss": 4.0142927169799805, + "step": 5480 + }, + { + "epoch": 2.9356079611975248, + "grad_norm": 0.5227266083540951, + "learning_rate": 1.3732728649368253e-09, + "logits/chosen": -0.3007558286190033, + "logits/rejected": -0.07791855931282043, + "logps/chosen": -4.21551513671875, + "logps/rejected": -5.0369462966918945, + "loss": 0.0503, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -4.21551513671875, + "rewards/margins": 0.8214312791824341, + "rewards/rejected": -5.0369462966918945, + "sft_loss": 3.962756633758545, + "step": 5485 + }, + { + "epoch": 2.938283993978926, + "grad_norm": 0.5479204943216744, + "learning_rate": 1.260343664479524e-09, + "logits/chosen": -0.37504953145980835, + "logits/rejected": -0.34630855917930603, + "logps/chosen": -4.44875431060791, + "logps/rejected": -5.066702365875244, + "loss": 0.0508, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -4.44875431060791, + "rewards/margins": 0.6179476976394653, + "rewards/rejected": -5.066702365875244, + "sft_loss": 4.060311317443848, + "step": 5490 + }, + { + "epoch": 2.9409600267603278, + "grad_norm": 0.6364064477895179, + "learning_rate": 1.1522532383554384e-09, + "logits/chosen": -0.45711517333984375, + "logits/rejected": -0.20373289287090302, + "logps/chosen": -4.335000991821289, + "logps/rejected": -5.186996936798096, + "loss": 0.051, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -4.335000991821289, + "rewards/margins": 0.8519953489303589, + "rewards/rejected": -5.186996936798096, + "sft_loss": 4.1510090827941895, + "step": 5495 + }, + { + "epoch": 2.9436360595417295, + "grad_norm": 0.37950094909926635, + "learning_rate": 1.049002635258256e-09, + "logits/chosen": -0.2739645838737488, + "logits/rejected": -0.19639113545417786, + "logps/chosen": -4.4395341873168945, + "logps/rejected": -5.209003448486328, + "loss": 0.0501, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -4.4395341873168945, + "rewards/margins": 0.7694700956344604, + "rewards/rejected": -5.209003448486328, + "sft_loss": 4.123523235321045, + "step": 5500 + }, + { + "epoch": 2.946312092323131, + "grad_norm": 0.5038478075619528, + "learning_rate": 9.505928569258358e-10, + "logits/chosen": -0.2963625490665436, + "logits/rejected": -0.3096460700035095, + "logps/chosen": -4.370425701141357, + "logps/rejected": -5.104419231414795, + "loss": 0.0509, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -4.370425701141357, + "rewards/margins": 0.7339931130409241, + "rewards/rejected": -5.104419231414795, + "sft_loss": 4.177587509155273, + "step": 5505 + }, + { + "epoch": 2.9489881251045325, + "grad_norm": 0.7755431035459227, + "learning_rate": 8.57024858130273e-10, + "logits/chosen": -0.3818827271461487, + "logits/rejected": -0.21020355820655823, + "logps/chosen": -4.361794948577881, + "logps/rejected": -5.4317145347595215, + "loss": 0.0504, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.361794948577881, + "rewards/margins": 1.0699187517166138, + "rewards/rejected": -5.4317145347595215, + "sft_loss": 4.175992965698242, + "step": 5510 + }, + { + "epoch": 2.951664157885934, + "grad_norm": 0.47191157552693663, + "learning_rate": 7.682995466686826e-10, + "logits/chosen": -0.46422845125198364, + "logits/rejected": -0.30519360303878784, + "logps/chosen": -4.419317722320557, + "logps/rejected": -5.268907070159912, + "loss": 0.0495, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -4.419317722320557, + "rewards/margins": 0.849589467048645, + "rewards/rejected": -5.268907070159912, + "sft_loss": 3.9879608154296875, + "step": 5515 + }, + { + "epoch": 2.9543401906673354, + "grad_norm": 0.6659865489877009, + "learning_rate": 6.844177833543741e-10, + "logits/chosen": -0.35505035519599915, + "logits/rejected": -0.3015265166759491, + "logps/chosen": -4.270551681518555, + "logps/rejected": -5.060952186584473, + "loss": 0.0502, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -4.270551681518555, + "rewards/margins": 0.7904006838798523, + "rewards/rejected": -5.060952186584473, + "sft_loss": 3.9596004486083984, + "step": 5520 + }, + { + "epoch": 2.957016223448737, + "grad_norm": 0.5271862878415422, + "learning_rate": 6.053803820087467e-10, + "logits/chosen": -0.39922767877578735, + "logits/rejected": -0.200235053896904, + "logps/chosen": -4.428971290588379, + "logps/rejected": -5.323958396911621, + "loss": 0.0519, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.428971290588379, + "rewards/margins": 0.8949869871139526, + "rewards/rejected": -5.323958396911621, + "sft_loss": 4.2644548416137695, + "step": 5525 + }, + { + "epoch": 2.959692256230139, + "grad_norm": 0.7184273436158761, + "learning_rate": 5.311881094528514e-10, + "logits/chosen": -0.46444278955459595, + "logits/rejected": -0.20382392406463623, + "logps/chosen": -4.392231464385986, + "logps/rejected": -5.154418468475342, + "loss": 0.0511, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -4.392231464385986, + "rewards/margins": 0.7621868848800659, + "rewards/rejected": -5.154418468475342, + "sft_loss": 4.077957630157471, + "step": 5530 + }, + { + "epoch": 2.9623682890115406, + "grad_norm": 0.7951708455272362, + "learning_rate": 4.6184168550050806e-10, + "logits/chosen": -0.3619312644004822, + "logits/rejected": -0.32184141874313354, + "logps/chosen": -4.419110298156738, + "logps/rejected": -5.143651008605957, + "loss": 0.0528, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -4.419110298156738, + "rewards/margins": 0.7245412468910217, + "rewards/rejected": -5.143651008605957, + "sft_loss": 4.224579334259033, + "step": 5535 + }, + { + "epoch": 2.965044321792942, + "grad_norm": 0.5457381779871645, + "learning_rate": 3.973417829510328e-10, + "logits/chosen": -0.47487345337867737, + "logits/rejected": -0.3351452052593231, + "logps/chosen": -4.251893043518066, + "logps/rejected": -5.026724338531494, + "loss": 0.0503, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -4.251893043518066, + "rewards/margins": 0.7748310565948486, + "rewards/rejected": -5.026724338531494, + "sft_loss": 4.000130653381348, + "step": 5540 + }, + { + "epoch": 2.9677203545743436, + "grad_norm": 0.5333018714647327, + "learning_rate": 3.3768902758274377e-10, + "logits/chosen": -0.3330259919166565, + "logits/rejected": -0.22684387862682343, + "logps/chosen": -4.1902852058410645, + "logps/rejected": -5.100726127624512, + "loss": 0.0498, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -4.1902852058410645, + "rewards/margins": 0.9104412794113159, + "rewards/rejected": -5.100726127624512, + "sft_loss": 3.9527220726013184, + "step": 5545 + }, + { + "epoch": 2.970396387355745, + "grad_norm": 0.5068612902949374, + "learning_rate": 2.8288399814691e-10, + "logits/chosen": -0.23308193683624268, + "logits/rejected": -0.19373756647109985, + "logps/chosen": -4.388981342315674, + "logps/rejected": -5.182524681091309, + "loss": 0.0505, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -4.388981342315674, + "rewards/margins": 0.7935434579849243, + "rewards/rejected": -5.182524681091309, + "sft_loss": 4.066475868225098, + "step": 5550 + }, + { + "epoch": 2.9730724201371466, + "grad_norm": 0.5814757487353343, + "learning_rate": 2.3292722636220066e-10, + "logits/chosen": -0.35087448358535767, + "logits/rejected": -0.12827900052070618, + "logps/chosen": -4.3805389404296875, + "logps/rejected": -5.340671539306641, + "loss": 0.0493, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -4.3805389404296875, + "rewards/margins": 0.9601324796676636, + "rewards/rejected": -5.340671539306641, + "sft_loss": 4.0317511558532715, + "step": 5555 + }, + { + "epoch": 2.9757484529185483, + "grad_norm": 0.550436838494477, + "learning_rate": 1.8781919690946668e-10, + "logits/chosen": -0.2993507981300354, + "logits/rejected": -0.29762619733810425, + "logps/chosen": -4.355318546295166, + "logps/rejected": -5.036417007446289, + "loss": 0.0509, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -4.355318546295166, + "rewards/margins": 0.6810978651046753, + "rewards/rejected": -5.036417007446289, + "sft_loss": 4.080462455749512, + "step": 5560 + }, + { + "epoch": 2.97842448569995, + "grad_norm": 0.5420103145317201, + "learning_rate": 1.4756034742696711e-10, + "logits/chosen": -0.4515206217765808, + "logits/rejected": -0.34055566787719727, + "logps/chosen": -4.29352331161499, + "logps/rejected": -5.112155437469482, + "loss": 0.0513, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -4.29352331161499, + "rewards/margins": 0.818631649017334, + "rewards/rejected": -5.112155437469482, + "sft_loss": 4.067957878112793, + "step": 5565 + }, + { + "epoch": 2.9811005184813513, + "grad_norm": 0.5884906325479862, + "learning_rate": 1.12151068506261e-10, + "logits/chosen": -0.34654372930526733, + "logits/rejected": -0.20942942798137665, + "logps/chosen": -4.317554950714111, + "logps/rejected": -5.352757453918457, + "loss": 0.0479, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -4.317554950714111, + "rewards/margins": 1.0352026224136353, + "rewards/rejected": -5.352757453918457, + "sft_loss": 3.913717269897461, + "step": 5570 + }, + { + "epoch": 2.983776551262753, + "grad_norm": 0.8380934579646794, + "learning_rate": 8.159170368826629e-11, + "logits/chosen": -0.3536500930786133, + "logits/rejected": -0.17102427780628204, + "logps/chosen": -4.269620418548584, + "logps/rejected": -5.160853385925293, + "loss": 0.0498, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -4.269620418548584, + "rewards/margins": 0.8912326693534851, + "rewards/rejected": -5.160853385925293, + "sft_loss": 3.962118148803711, + "step": 5575 + }, + { + "epoch": 2.9864525840441547, + "grad_norm": 0.5255025356739468, + "learning_rate": 5.588254946015114e-11, + "logits/chosen": -0.5044499635696411, + "logits/rejected": -0.19058753550052643, + "logps/chosen": -4.287683963775635, + "logps/rejected": -5.2070136070251465, + "loss": 0.0494, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -4.287683963775635, + "rewards/margins": 0.9193302392959595, + "rewards/rejected": -5.2070136070251465, + "sft_loss": 3.977626323699951, + "step": 5580 + }, + { + "epoch": 2.989128616825556, + "grad_norm": 0.5414871942816999, + "learning_rate": 3.502385525216978e-11, + "logits/chosen": -0.46721917390823364, + "logits/rejected": -0.25959575176239014, + "logps/chosen": -4.274872779846191, + "logps/rejected": -5.1689348220825195, + "loss": 0.0508, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -4.274872779846191, + "rewards/margins": 0.8940622210502625, + "rewards/rejected": -5.1689348220825195, + "sft_loss": 4.059477806091309, + "step": 5585 + }, + { + "epoch": 2.9918046496069577, + "grad_norm": 0.40066954094597396, + "learning_rate": 1.901582343555308e-11, + "logits/chosen": -0.3345833122730255, + "logits/rejected": -0.25943347811698914, + "logps/chosen": -4.5316338539123535, + "logps/rejected": -5.23195219039917, + "loss": 0.052, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -4.5316338539123535, + "rewards/margins": 0.7003186345100403, + "rewards/rejected": -5.23195219039917, + "sft_loss": 4.1835479736328125, + "step": 5590 + }, + { + "epoch": 2.9944806823883594, + "grad_norm": 0.6992304630126898, + "learning_rate": 7.858609320232634e-12, + "logits/chosen": -0.36338135600090027, + "logits/rejected": -0.14956054091453552, + "logps/chosen": -4.3805952072143555, + "logps/rejected": -5.219130039215088, + "loss": 0.0502, + "rewards/accuracies": 0.71875, + "rewards/chosen": -4.3805952072143555, + "rewards/margins": 0.838534951210022, + "rewards/rejected": -5.219130039215088, + "sft_loss": 4.159787178039551, + "step": 5595 + }, + { + "epoch": 2.9971567151697607, + "grad_norm": 0.5406575761049948, + "learning_rate": 1.5523211535639624e-12, + "logits/chosen": -0.3695183992385864, + "logits/rejected": -0.23002979159355164, + "logps/chosen": -4.364432334899902, + "logps/rejected": -5.423884868621826, + "loss": 0.0496, + "rewards/accuracies": 0.71875, + "rewards/chosen": -4.364432334899902, + "rewards/margins": 1.0594522953033447, + "rewards/rejected": -5.423884868621826, + "sft_loss": 4.099183082580566, + "step": 5600 + }, + { + "epoch": 2.9971567151697607, + "eval_logits/chosen": 0.05091719701886177, + "eval_logits/rejected": 0.14437253773212433, + "eval_logps/chosen": -4.325172424316406, + "eval_logps/rejected": -5.104435920715332, + "eval_loss": 0.04997369274497032, + "eval_rewards/accuracies": 0.68916916847229, + "eval_rewards/chosen": -4.325172424316406, + "eval_rewards/margins": 0.7792637348175049, + "eval_rewards/rejected": -5.104435920715332, + "eval_runtime": 45.1351, + "eval_samples_per_second": 29.799, + "eval_sft_loss": 3.922001361846924, + "eval_steps_per_second": 7.466, + "step": 5600 + }, + { + "epoch": 2.999297541394882, + "step": 5604, + "total_flos": 0.0, + "train_loss": 0.05868237358195538, + "train_runtime": 34511.7277, + "train_samples_per_second": 5.197, + "train_steps_per_second": 0.162 + } + ], + "logging_steps": 5, + "max_steps": 5604, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 1000000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}