{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9991356957649092, "eval_steps": 100, "global_step": 578, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 0.322265625, "learning_rate": 8.620689655172414e-08, "logits/chosen": -0.2581043839454651, "logits/rejected": 0.10956327617168427, "logps/chosen": -220.3083953857422, "logps/rejected": -168.47125244140625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/margins_max": 0.0, "rewards/margins_min": 0.0, "rewards/margins_std": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.02, "grad_norm": 0.318359375, "learning_rate": 8.620689655172415e-07, "logits/chosen": -0.16030678153038025, "logits/rejected": 0.07822367548942566, "logps/chosen": -236.54421997070312, "logps/rejected": -225.8965606689453, "loss": 0.6931, "rewards/accuracies": 0.4305555522441864, "rewards/chosen": -0.000611037714406848, "rewards/margins": -0.0004342191095929593, "rewards/margins_max": 0.0017699559684842825, "rewards/margins_min": -0.0026383937802165747, "rewards/margins_std": 0.003117174142971635, "rewards/rejected": -0.00017681866302154958, "step": 10 }, { "epoch": 0.03, "grad_norm": 0.3046875, "learning_rate": 1.724137931034483e-06, "logits/chosen": -0.16772077977657318, "logits/rejected": 0.11436957120895386, "logps/chosen": -217.293701171875, "logps/rejected": -207.2777557373047, "loss": 0.6919, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.00018755244673229754, "rewards/margins": 0.003174250479787588, "rewards/margins_max": 0.006569386459887028, "rewards/margins_min": -0.00022088526748120785, "rewards/margins_std": 0.004801447503268719, "rewards/rejected": -0.003361803013831377, "step": 20 }, { "epoch": 0.05, "grad_norm": 0.31640625, "learning_rate": 2.5862068965517246e-06, "logits/chosen": -0.2165769636631012, "logits/rejected": 0.02331249974668026, "logps/chosen": -210.9033660888672, "logps/rejected": -214.22988891601562, "loss": 0.6893, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.0012746901484206319, "rewards/margins": 0.008387003093957901, "rewards/margins_max": 0.013032525777816772, "rewards/margins_min": 0.0037414811085909605, "rewards/margins_std": 0.006569760385900736, "rewards/rejected": -0.0071123139932751656, "step": 30 }, { "epoch": 0.07, "grad_norm": 0.318359375, "learning_rate": 3.448275862068966e-06, "logits/chosen": -0.1581813097000122, "logits/rejected": 0.08394273370504379, "logps/chosen": -208.3163299560547, "logps/rejected": -187.9713592529297, "loss": 0.6852, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 0.005893433466553688, "rewards/margins": 0.016705039888620377, "rewards/margins_max": 0.02403287962079048, "rewards/margins_min": 0.009377201087772846, "rewards/margins_std": 0.010363129898905754, "rewards/rejected": -0.010811606422066689, "step": 40 }, { "epoch": 0.09, "grad_norm": 0.33203125, "learning_rate": 4.310344827586207e-06, "logits/chosen": -0.18618786334991455, "logits/rejected": 0.1531696617603302, "logps/chosen": -253.7356414794922, "logps/rejected": -227.4288330078125, "loss": 0.6779, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.009903091005980968, "rewards/margins": 0.03495802730321884, "rewards/margins_max": 0.050650179386138916, "rewards/margins_min": 0.019265878945589066, "rewards/margins_std": 0.022192049771547318, "rewards/rejected": -0.0250549353659153, "step": 50 }, { "epoch": 0.1, "grad_norm": 0.3203125, "learning_rate": 4.999817502139027e-06, "logits/chosen": -0.14740853011608124, "logits/rejected": 0.07666012644767761, "logps/chosen": -230.23330688476562, "logps/rejected": -232.44766235351562, "loss": 0.667, "rewards/accuracies": 0.9375, "rewards/chosen": 0.015751253813505173, "rewards/margins": 0.053423039615154266, "rewards/margins_max": 0.07735804468393326, "rewards/margins_min": 0.029488030821084976, "rewards/margins_std": 0.033849213272333145, "rewards/rejected": -0.037671782076358795, "step": 60 }, { "epoch": 0.12, "grad_norm": 0.29296875, "learning_rate": 4.9934328742287285e-06, "logits/chosen": -0.09936733543872833, "logits/rejected": 0.0712391585111618, "logps/chosen": -197.74615478515625, "logps/rejected": -215.8419189453125, "loss": 0.659, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.02438780851662159, "rewards/margins": 0.07444320619106293, "rewards/margins_max": 0.1244855672121048, "rewards/margins_min": 0.02440086379647255, "rewards/margins_std": 0.07077057659626007, "rewards/rejected": -0.050055403262376785, "step": 70 }, { "epoch": 0.14, "grad_norm": 0.40234375, "learning_rate": 4.977949980164773e-06, "logits/chosen": -0.09551429748535156, "logits/rejected": 0.12741951644420624, "logps/chosen": -203.38258361816406, "logps/rejected": -209.9490509033203, "loss": 0.6404, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 0.02303355559706688, "rewards/margins": 0.12564963102340698, "rewards/margins_max": 0.18152736127376556, "rewards/margins_min": 0.0697719007730484, "rewards/margins_std": 0.07902304828166962, "rewards/rejected": -0.1026160717010498, "step": 80 }, { "epoch": 0.16, "grad_norm": 0.345703125, "learning_rate": 4.953425315348534e-06, "logits/chosen": -0.1454397439956665, "logits/rejected": 0.0884198546409607, "logps/chosen": -208.8409423828125, "logps/rejected": -226.86929321289062, "loss": 0.6262, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -0.005688876379281282, "rewards/margins": 0.1296202391386032, "rewards/margins_max": 0.19095788896083832, "rewards/margins_min": 0.0682826116681099, "rewards/margins_std": 0.0867445170879364, "rewards/rejected": -0.1353091299533844, "step": 90 }, { "epoch": 0.17, "grad_norm": 0.40234375, "learning_rate": 4.919948367622307e-06, "logits/chosen": -0.1805122196674347, "logits/rejected": 0.028851622715592384, "logps/chosen": -230.6761016845703, "logps/rejected": -249.4705047607422, "loss": 0.5952, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -0.04513537138700485, "rewards/margins": 0.1765364557504654, "rewards/margins_max": 0.30944007635116577, "rewards/margins_min": 0.043632835149765015, "rewards/margins_std": 0.1879541426897049, "rewards/rejected": -0.22167184948921204, "step": 100 }, { "epoch": 0.19, "grad_norm": 0.4921875, "learning_rate": 4.8776412907378845e-06, "logits/chosen": -0.16438332200050354, "logits/rejected": 0.15017393231391907, "logps/chosen": -231.96347045898438, "logps/rejected": -236.1349639892578, "loss": 0.5795, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -0.09757739305496216, "rewards/margins": 0.2760006785392761, "rewards/margins_max": 0.45641541481018066, "rewards/margins_min": 0.09558592736721039, "rewards/margins_std": 0.25514498353004456, "rewards/rejected": -0.3735780715942383, "step": 110 }, { "epoch": 0.21, "grad_norm": 0.470703125, "learning_rate": 4.8266584586307555e-06, "logits/chosen": -0.13952846825122833, "logits/rejected": 0.22902479767799377, "logps/chosen": -231.74502563476562, "logps/rejected": -224.08328247070312, "loss": 0.5643, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.09859104454517365, "rewards/margins": 0.3389679193496704, "rewards/margins_max": 0.5449327230453491, "rewards/margins_min": 0.1330030858516693, "rewards/margins_std": 0.29127827286720276, "rewards/rejected": -0.43755894899368286, "step": 120 }, { "epoch": 0.22, "grad_norm": 0.49609375, "learning_rate": 4.7671859021263635e-06, "logits/chosen": -0.18884742259979248, "logits/rejected": 0.1492905616760254, "logps/chosen": -238.9985809326172, "logps/rejected": -275.4112243652344, "loss": 0.5538, "rewards/accuracies": 0.875, "rewards/chosen": -0.1985391080379486, "rewards/margins": 0.3882392942905426, "rewards/margins_max": 0.6334519982337952, "rewards/margins_min": 0.14302656054496765, "rewards/margins_std": 0.3467831611633301, "rewards/rejected": -0.5867784023284912, "step": 130 }, { "epoch": 0.24, "grad_norm": 0.59375, "learning_rate": 4.699440630133794e-06, "logits/chosen": -0.21977896988391876, "logits/rejected": 0.055593542754650116, "logps/chosen": -229.7829132080078, "logps/rejected": -276.37225341796875, "loss": 0.4894, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.17763814330101013, "rewards/margins": 0.5651538968086243, "rewards/margins_max": 0.8883601427078247, "rewards/margins_min": 0.2419476956129074, "rewards/margins_std": 0.457082599401474, "rewards/rejected": -0.7427920699119568, "step": 140 }, { "epoch": 0.26, "grad_norm": 0.71875, "learning_rate": 4.623669837803803e-06, "logits/chosen": -0.17800770699977875, "logits/rejected": 0.08600752800703049, "logps/chosen": -241.20962524414062, "logps/rejected": -287.8876037597656, "loss": 0.4956, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.3053871691226959, "rewards/margins": 0.5931288003921509, "rewards/margins_max": 0.9263006448745728, "rewards/margins_min": 0.2599569857120514, "rewards/margins_std": 0.4711760878562927, "rewards/rejected": -0.8985158801078796, "step": 150 }, { "epoch": 0.28, "grad_norm": 0.625, "learning_rate": 4.5401500045405126e-06, "logits/chosen": -0.2114168405532837, "logits/rejected": 0.14958535134792328, "logps/chosen": -272.8676452636719, "logps/rejected": -306.96514892578125, "loss": 0.457, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -0.43579959869384766, "rewards/margins": 0.6532906293869019, "rewards/margins_max": 1.065685510635376, "rewards/margins_min": 0.2408955842256546, "rewards/margins_std": 0.5832146406173706, "rewards/rejected": -1.08909010887146, "step": 160 }, { "epoch": 0.29, "grad_norm": 0.7109375, "learning_rate": 4.449185885158056e-06, "logits/chosen": -0.18476799130439758, "logits/rejected": 0.016872424632310867, "logps/chosen": -261.2188720703125, "logps/rejected": -352.5684509277344, "loss": 0.4364, "rewards/accuracies": 0.875, "rewards/chosen": -0.6136353015899658, "rewards/margins": 0.7640698552131653, "rewards/margins_max": 1.2162981033325195, "rewards/margins_min": 0.31184151768684387, "rewards/margins_std": 0.6395474076271057, "rewards/rejected": -1.3777052164077759, "step": 170 }, { "epoch": 0.31, "grad_norm": 0.92578125, "learning_rate": 4.351109397863309e-06, "logits/chosen": -0.24529743194580078, "logits/rejected": 0.0493427999317646, "logps/chosen": -286.1247863769531, "logps/rejected": -373.47747802734375, "loss": 0.4404, "rewards/accuracies": 0.875, "rewards/chosen": -0.6114776730537415, "rewards/margins": 0.9327934384346008, "rewards/margins_max": 1.5225698947906494, "rewards/margins_min": 0.34301698207855225, "rewards/margins_std": 0.8340697288513184, "rewards/rejected": -1.5442711114883423, "step": 180 }, { "epoch": 0.33, "grad_norm": 0.6796875, "learning_rate": 4.246278413122344e-06, "logits/chosen": -0.1576433926820755, "logits/rejected": 0.23860666155815125, "logps/chosen": -308.6180114746094, "logps/rejected": -392.26727294921875, "loss": 0.4022, "rewards/accuracies": 0.9375, "rewards/chosen": -0.8496677279472351, "rewards/margins": 1.0323445796966553, "rewards/margins_max": 1.623838186264038, "rewards/margins_min": 0.4408511519432068, "rewards/margins_std": 0.8364981412887573, "rewards/rejected": -1.8820123672485352, "step": 190 }, { "epoch": 0.35, "grad_norm": 0.75390625, "learning_rate": 4.135075447829912e-06, "logits/chosen": -0.07231679558753967, "logits/rejected": 0.31568047404289246, "logps/chosen": -356.1817321777344, "logps/rejected": -434.91534423828125, "loss": 0.4095, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.1538748741149902, "rewards/margins": 1.009655475616455, "rewards/margins_max": 1.4875315427780151, "rewards/margins_min": 0.5317794680595398, "rewards/margins_std": 0.6758188009262085, "rewards/rejected": -2.1635308265686035, "step": 200 }, { "epoch": 0.36, "grad_norm": 1.15625, "learning_rate": 4.017906269546778e-06, "logits/chosen": -0.10543593019247055, "logits/rejected": 0.1417907178401947, "logps/chosen": -343.50628662109375, "logps/rejected": -465.8216857910156, "loss": 0.3719, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -1.2488937377929688, "rewards/margins": 1.2493774890899658, "rewards/margins_max": 1.861472725868225, "rewards/margins_min": 0.63728266954422, "rewards/margins_std": 0.8656330108642578, "rewards/rejected": -2.4982714653015137, "step": 210 }, { "epoch": 0.38, "grad_norm": 0.75390625, "learning_rate": 3.895198415897896e-06, "logits/chosen": -0.07700450718402863, "logits/rejected": 0.22373361885547638, "logps/chosen": -359.3388671875, "logps/rejected": -459.7901306152344, "loss": 0.3355, "rewards/accuracies": 0.875, "rewards/chosen": -1.3689178228378296, "rewards/margins": 1.1828525066375732, "rewards/margins_max": 1.8306289911270142, "rewards/margins_min": 0.5350757837295532, "rewards/margins_std": 0.9160944819450378, "rewards/rejected": -2.5517702102661133, "step": 220 }, { "epoch": 0.4, "grad_norm": 0.90625, "learning_rate": 3.767399634533976e-06, "logits/chosen": -0.084971584379673, "logits/rejected": 0.2124381959438324, "logps/chosen": -401.7665710449219, "logps/rejected": -581.4509887695312, "loss": 0.2685, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -1.7419540882110596, "rewards/margins": 1.7960789203643799, "rewards/margins_max": 2.7218480110168457, "rewards/margins_min": 0.8703094720840454, "rewards/margins_std": 1.3092355728149414, "rewards/rejected": -3.5380325317382812, "step": 230 }, { "epoch": 0.41, "grad_norm": 0.94140625, "learning_rate": 3.634976249348867e-06, "logits/chosen": -0.07252025604248047, "logits/rejected": 0.3098675608634949, "logps/chosen": -488.57537841796875, "logps/rejected": -654.6478271484375, "loss": 0.2929, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -2.5602645874023438, "rewards/margins": 1.9851452112197876, "rewards/margins_max": 2.9839093685150146, "rewards/margins_min": 0.9863810539245605, "rewards/margins_std": 1.4124658107757568, "rewards/rejected": -4.545409679412842, "step": 240 }, { "epoch": 0.43, "grad_norm": 0.7734375, "learning_rate": 3.4984114589142388e-06, "logits/chosen": -0.024917516857385635, "logits/rejected": 0.216413214802742, "logps/chosen": -420.374267578125, "logps/rejected": -685.5641479492188, "loss": 0.2643, "rewards/accuracies": 0.9375, "rewards/chosen": -2.252289056777954, "rewards/margins": 2.3612987995147705, "rewards/margins_max": 3.7250447273254395, "rewards/margins_min": 0.9975533485412598, "rewards/margins_std": 1.928627371788025, "rewards/rejected": -4.613587856292725, "step": 250 }, { "epoch": 0.45, "grad_norm": 0.83984375, "learning_rate": 3.3582035733403963e-06, "logits/chosen": -0.08977536857128143, "logits/rejected": 0.19843626022338867, "logps/chosen": -409.473388671875, "logps/rejected": -679.5908203125, "loss": 0.2432, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -1.8145253658294678, "rewards/margins": 2.4806766510009766, "rewards/margins_max": 3.3301730155944824, "rewards/margins_min": 1.6311804056167603, "rewards/margins_std": 1.2013694047927856, "rewards/rejected": -4.295202255249023, "step": 260 }, { "epoch": 0.47, "grad_norm": 1.265625, "learning_rate": 3.214864195996723e-06, "logits/chosen": -0.006361374165862799, "logits/rejected": 0.3491051495075226, "logps/chosen": -522.7186279296875, "logps/rejected": -730.1286010742188, "loss": 0.2293, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -2.8888583183288574, "rewards/margins": 2.2627317905426025, "rewards/margins_max": 3.467148542404175, "rewards/margins_min": 1.0583146810531616, "rewards/margins_std": 1.7033026218414307, "rewards/rejected": -5.151589870452881, "step": 270 }, { "epoch": 0.48, "grad_norm": 1.7109375, "learning_rate": 3.068916356726475e-06, "logits/chosen": -0.02248637191951275, "logits/rejected": 0.3578205108642578, "logps/chosen": -535.5814208984375, "logps/rejected": -763.3458251953125, "loss": 0.2781, "rewards/accuracies": 0.9375, "rewards/chosen": -3.115751028060913, "rewards/margins": 2.397735834121704, "rewards/margins_max": 3.400325059890747, "rewards/margins_min": 1.3951469659805298, "rewards/margins_std": 1.417874813079834, "rewards/rejected": -5.513486385345459, "step": 280 }, { "epoch": 0.5, "grad_norm": 1.3203125, "learning_rate": 2.920892603367596e-06, "logits/chosen": 0.09292706102132797, "logits/rejected": 0.3714202344417572, "logps/chosen": -442.95654296875, "logps/rejected": -720.7830810546875, "loss": 0.2327, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -2.3281803131103516, "rewards/margins": 2.674379348754883, "rewards/margins_max": 3.9349582195281982, "rewards/margins_min": 1.413800597190857, "rewards/margins_std": 1.7827274799346924, "rewards/rejected": -5.002560138702393, "step": 290 }, { "epoch": 0.52, "grad_norm": 1.0859375, "learning_rate": 2.771333058543416e-06, "logits/chosen": 0.055374812334775925, "logits/rejected": 0.4309922158718109, "logps/chosen": -516.1644287109375, "logps/rejected": -729.7134399414062, "loss": 0.256, "rewards/accuracies": 0.875, "rewards/chosen": -2.969205856323242, "rewards/margins": 2.4310965538024902, "rewards/margins_max": 3.7021121978759766, "rewards/margins_min": 1.1600805521011353, "rewards/margins_std": 1.7974878549575806, "rewards/rejected": -5.400301933288574, "step": 300 }, { "epoch": 0.54, "grad_norm": 2.78125, "learning_rate": 2.620783448813768e-06, "logits/chosen": -0.0059727877378463745, "logits/rejected": 0.29597243666648865, "logps/chosen": -526.832763671875, "logps/rejected": -798.2999877929688, "loss": 0.2099, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -3.3908095359802246, "rewards/margins": 2.6575443744659424, "rewards/margins_max": 4.092823028564453, "rewards/margins_min": 1.2222658395767212, "rewards/margins_std": 2.0297904014587402, "rewards/rejected": -6.048354148864746, "step": 310 }, { "epoch": 0.55, "grad_norm": 1.1875, "learning_rate": 2.4697931133779566e-06, "logits/chosen": 0.016089849174022675, "logits/rejected": 0.3572506308555603, "logps/chosen": -560.1741333007812, "logps/rejected": -802.227294921875, "loss": 0.2181, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -3.4205613136291504, "rewards/margins": 2.480717182159424, "rewards/margins_max": 3.7455666065216064, "rewards/margins_min": 1.2158678770065308, "rewards/margins_std": 1.7887670993804932, "rewards/rejected": -5.901278495788574, "step": 320 }, { "epoch": 0.57, "grad_norm": 1.6015625, "learning_rate": 2.3189129995955944e-06, "logits/chosen": 0.045238252729177475, "logits/rejected": 0.3595990538597107, "logps/chosen": -592.646728515625, "logps/rejected": -874.0904541015625, "loss": 0.2125, "rewards/accuracies": 0.9375, "rewards/chosen": -3.7254836559295654, "rewards/margins": 2.973832607269287, "rewards/margins_max": 3.973271131515503, "rewards/margins_min": 1.9743940830230713, "rewards/margins_std": 1.413419485092163, "rewards/rejected": -6.69931697845459, "step": 330 }, { "epoch": 0.59, "grad_norm": 0.9921875, "learning_rate": 2.168693652639432e-06, "logits/chosen": -0.007851422764360905, "logits/rejected": 0.3100680708885193, "logps/chosen": -557.7479248046875, "logps/rejected": -844.9327392578125, "loss": 0.2198, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.4041056632995605, "rewards/margins": 2.787471294403076, "rewards/margins_max": 3.8065686225891113, "rewards/margins_min": 1.7683740854263306, "rewards/margins_std": 1.4412208795547485, "rewards/rejected": -6.191576957702637, "step": 340 }, { "epoch": 0.61, "grad_norm": 1.0078125, "learning_rate": 2.019683206615729e-06, "logits/chosen": 0.05400100350379944, "logits/rejected": 0.3827919661998749, "logps/chosen": -614.8720092773438, "logps/rejected": -885.8074951171875, "loss": 0.2104, "rewards/accuracies": 0.9375, "rewards/chosen": -3.8366971015930176, "rewards/margins": 2.7368714809417725, "rewards/margins_max": 3.922558546066284, "rewards/margins_min": 1.5511847734451294, "rewards/margins_std": 1.676814317703247, "rewards/rejected": -6.573568820953369, "step": 350 }, { "epoch": 0.62, "grad_norm": 2.078125, "learning_rate": 1.872425384482346e-06, "logits/chosen": 0.08189837634563446, "logits/rejected": 0.4619109630584717, "logps/chosen": -662.888916015625, "logps/rejected": -1045.6783447265625, "loss": 0.2141, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.37006950378418, "rewards/margins": 3.7319226264953613, "rewards/margins_max": 5.36785364151001, "rewards/margins_min": 2.095991611480713, "rewards/margins_std": 2.313555955886841, "rewards/rejected": -8.101991653442383, "step": 360 }, { "epoch": 0.64, "grad_norm": 0.8984375, "learning_rate": 1.7274575140626318e-06, "logits/chosen": -0.02938716672360897, "logits/rejected": 0.3284686803817749, "logps/chosen": -633.2041625976562, "logps/rejected": -967.3983154296875, "loss": 0.2133, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.008258819580078, "rewards/margins": 3.592240571975708, "rewards/margins_max": 5.219714641571045, "rewards/margins_min": 1.964766502380371, "rewards/margins_std": 2.30159592628479, "rewards/rejected": -7.600499629974365, "step": 370 }, { "epoch": 0.66, "grad_norm": 0.7109375, "learning_rate": 1.5853085673944695e-06, "logits/chosen": 0.011644460260868073, "logits/rejected": 0.3021058738231659, "logps/chosen": -605.2432250976562, "logps/rejected": -939.3709106445312, "loss": 0.1579, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.026908874511719, "rewards/margins": 3.3612918853759766, "rewards/margins_max": 4.886077880859375, "rewards/margins_min": 1.836505651473999, "rewards/margins_std": 2.1563732624053955, "rewards/rejected": -7.3882012367248535, "step": 380 }, { "epoch": 0.67, "grad_norm": 1.4609375, "learning_rate": 1.4464972305686778e-06, "logits/chosen": 0.08569404482841492, "logits/rejected": 0.29340532422065735, "logps/chosen": -614.2609252929688, "logps/rejected": -995.1650390625, "loss": 0.1824, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -3.9831976890563965, "rewards/margins": 3.4943957328796387, "rewards/margins_max": 5.111188888549805, "rewards/margins_min": 1.8776025772094727, "rewards/margins_std": 2.2864904403686523, "rewards/rejected": -7.477593421936035, "step": 390 }, { "epoch": 0.69, "grad_norm": 2.671875, "learning_rate": 1.3115300110997097e-06, "logits/chosen": 0.07091796398162842, "logits/rejected": 0.4579780697822571, "logps/chosen": -626.0310668945312, "logps/rejected": -901.7600708007812, "loss": 0.2056, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -4.0506792068481445, "rewards/margins": 2.946117877960205, "rewards/margins_max": 4.542931079864502, "rewards/margins_min": 1.3493049144744873, "rewards/margins_std": 2.2582345008850098, "rewards/rejected": -6.99679708480835, "step": 400 }, { "epoch": 0.71, "grad_norm": 1.3125, "learning_rate": 1.1808993897346679e-06, "logits/chosen": -0.008156771771609783, "logits/rejected": 0.3462030589580536, "logps/chosen": -625.7688598632812, "logps/rejected": -990.1158447265625, "loss": 0.1844, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -4.038730144500732, "rewards/margins": 3.698699474334717, "rewards/margins_max": 5.397822856903076, "rewards/margins_min": 1.999576210975647, "rewards/margins_std": 2.402923107147217, "rewards/rejected": -7.737429618835449, "step": 410 }, { "epoch": 0.73, "grad_norm": 0.9375, "learning_rate": 1.0550820234444627e-06, "logits/chosen": 0.09430710971355438, "logits/rejected": 0.3812723755836487, "logps/chosen": -595.5155029296875, "logps/rejected": -927.47802734375, "loss": 0.186, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.960000514984131, "rewards/margins": 3.320066452026367, "rewards/margins_max": 4.767698287963867, "rewards/margins_min": 1.8724348545074463, "rewards/margins_std": 2.0472605228424072, "rewards/rejected": -7.280067443847656, "step": 420 }, { "epoch": 0.74, "grad_norm": 1.2734375, "learning_rate": 9.345370061542158e-07, "logits/chosen": -0.05127614736557007, "logits/rejected": 0.3036525547504425, "logps/chosen": -594.9441528320312, "logps/rejected": -962.1378173828125, "loss": 0.2192, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.80556058883667, "rewards/margins": 3.532916307449341, "rewards/margins_max": 4.6822099685668945, "rewards/margins_min": 2.383622407913208, "rewards/margins_std": 1.6253468990325928, "rewards/rejected": -7.338476657867432, "step": 430 }, { "epoch": 0.76, "grad_norm": 1.515625, "learning_rate": 8.197041935593181e-07, "logits/chosen": 0.00823027454316616, "logits/rejected": 0.3566310703754425, "logps/chosen": -616.1336059570312, "logps/rejected": -1032.385498046875, "loss": 0.1463, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.9791882038116455, "rewards/margins": 4.084582805633545, "rewards/margins_max": 6.057514190673828, "rewards/margins_min": 2.11165189743042, "rewards/margins_std": 2.7901463508605957, "rewards/rejected": -8.06377124786377, "step": 440 }, { "epoch": 0.78, "grad_norm": 3.859375, "learning_rate": 7.110025981396976e-07, "logits/chosen": -0.03782680630683899, "logits/rejected": 0.3085945248603821, "logps/chosen": -653.0308227539062, "logps/rejected": -1006.2077026367188, "loss": 0.2029, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -4.257491111755371, "rewards/margins": 3.661691665649414, "rewards/margins_max": 5.82886266708374, "rewards/margins_min": 1.4945199489593506, "rewards/margins_std": 3.0648434162139893, "rewards/rejected": -7.919183254241943, "step": 450 }, { "epoch": 0.8, "grad_norm": 2.703125, "learning_rate": 6.088288602287159e-07, "logits/chosen": 0.10529260337352753, "logits/rejected": 0.32944411039352417, "logps/chosen": -692.2623901367188, "logps/rejected": -1112.4979248046875, "loss": 0.2451, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -4.745668888092041, "rewards/margins": 3.900517702102661, "rewards/margins_max": 5.501496315002441, "rewards/margins_min": 2.29953932762146, "rewards/margins_std": 2.264125108718872, "rewards/rejected": -8.646186828613281, "step": 460 }, { "epoch": 0.81, "grad_norm": 2.921875, "learning_rate": 5.135558007156146e-07, "logits/chosen": -0.016824286431074142, "logits/rejected": 0.32702240347862244, "logps/chosen": -655.9515991210938, "logps/rejected": -1033.0357666015625, "loss": 0.2107, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -4.3458967208862305, "rewards/margins": 3.7026946544647217, "rewards/margins_max": 5.652235984802246, "rewards/margins_min": 1.7531543970108032, "rewards/margins_std": 2.7570672035217285, "rewards/rejected": -8.048592567443848, "step": 470 }, { "epoch": 0.83, "grad_norm": 2.796875, "learning_rate": 4.255310606625124e-07, "logits/chosen": 0.04231051355600357, "logits/rejected": 0.4128199517726898, "logps/chosen": -678.6295776367188, "logps/rejected": -993.6192626953125, "loss": 0.2425, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -4.49677038192749, "rewards/margins": 3.3295822143554688, "rewards/margins_max": 5.123600006103516, "rewards/margins_min": 1.5355651378631592, "rewards/margins_std": 2.5371241569519043, "rewards/rejected": -7.826352596282959, "step": 480 }, { "epoch": 0.85, "grad_norm": 1.7109375, "learning_rate": 3.450758327998768e-07, "logits/chosen": 0.08123823255300522, "logits/rejected": 0.3833978772163391, "logps/chosen": -670.7314453125, "logps/rejected": -934.2393798828125, "loss": 0.1899, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -4.392338752746582, "rewards/margins": 2.830777883529663, "rewards/margins_max": 4.131167411804199, "rewards/margins_min": 1.5303874015808105, "rewards/margins_std": 1.8390296697616577, "rewards/rejected": -7.223116874694824, "step": 490 }, { "epoch": 0.86, "grad_norm": 1.2421875, "learning_rate": 2.7248368952908055e-07, "logits/chosen": -0.027458935976028442, "logits/rejected": 0.36244386434555054, "logps/chosen": -688.2174682617188, "logps/rejected": -995.5029296875, "loss": 0.2123, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.243028163909912, "rewards/margins": 3.488999843597412, "rewards/margins_max": 5.064884662628174, "rewards/margins_min": 1.9131158590316772, "rewards/margins_std": 2.228637218475342, "rewards/rejected": -7.732027530670166, "step": 500 }, { "epoch": 0.88, "grad_norm": 0.93359375, "learning_rate": 2.0801951170854402e-07, "logits/chosen": 0.035914286971092224, "logits/rejected": 0.3463380038738251, "logps/chosen": -690.3873291015625, "logps/rejected": -947.671875, "loss": 0.2168, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -4.574772834777832, "rewards/margins": 2.7171061038970947, "rewards/margins_max": 4.018531799316406, "rewards/margins_min": 1.415679931640625, "rewards/margins_std": 1.840494155883789, "rewards/rejected": -7.291878700256348, "step": 510 }, { "epoch": 0.9, "grad_norm": 3.90625, "learning_rate": 1.5191852213221198e-07, "logits/chosen": 0.02695058286190033, "logits/rejected": 0.3428993225097656, "logps/chosen": -645.2525634765625, "logps/rejected": -1058.0179443359375, "loss": 0.1993, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.427311420440674, "rewards/margins": 4.059060096740723, "rewards/margins_max": 6.139120578765869, "rewards/margins_min": 1.9789988994598389, "rewards/margins_std": 2.9416496753692627, "rewards/rejected": -8.486371040344238, "step": 520 }, { "epoch": 0.92, "grad_norm": 1.46875, "learning_rate": 1.0438542722708444e-07, "logits/chosen": -0.06257595121860504, "logits/rejected": 0.37371453642845154, "logps/chosen": -663.5725708007812, "logps/rejected": -1019.5596923828125, "loss": 0.2095, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.217853546142578, "rewards/margins": 3.8794631958007812, "rewards/margins_max": 5.407975196838379, "rewards/margins_min": 2.3509514331817627, "rewards/margins_std": 2.16164231300354, "rewards/rejected": -8.09731674194336, "step": 530 }, { "epoch": 0.93, "grad_norm": 1.578125, "learning_rate": 6.559367010166629e-08, "logits/chosen": 0.05638759583234787, "logits/rejected": 0.4552284777164459, "logps/chosen": -629.8983764648438, "logps/rejected": -933.1326293945312, "loss": 0.2206, "rewards/accuracies": 0.9375, "rewards/chosen": -4.098700523376465, "rewards/margins": 3.2016823291778564, "rewards/margins_max": 4.7647705078125, "rewards/margins_min": 1.6385942697525024, "rewards/margins_std": 2.210540294647217, "rewards/rejected": -7.300383567810059, "step": 540 }, { "epoch": 0.95, "grad_norm": 1.71875, "learning_rate": 3.568479767087296e-08, "logits/chosen": -0.02560499869287014, "logits/rejected": 0.38633134961128235, "logps/chosen": -686.8065185546875, "logps/rejected": -1091.582275390625, "loss": 0.166, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -4.320387840270996, "rewards/margins": 4.097320079803467, "rewards/margins_max": 6.098603248596191, "rewards/margins_min": 2.0960371494293213, "rewards/margins_std": 2.8302416801452637, "rewards/rejected": -8.417707443237305, "step": 550 }, { "epoch": 0.97, "grad_norm": 2.734375, "learning_rate": 1.4767944166687032e-08, "logits/chosen": 0.031356751918792725, "logits/rejected": 0.39280715584754944, "logps/chosen": -640.6392822265625, "logps/rejected": -896.2991333007812, "loss": 0.1943, "rewards/accuracies": 0.9375, "rewards/chosen": -4.334517478942871, "rewards/margins": 2.8293161392211914, "rewards/margins_max": 4.200706481933594, "rewards/margins_min": 1.4579263925552368, "rewards/margins_std": 1.9394381046295166, "rewards/rejected": -7.1638336181640625, "step": 560 }, { "epoch": 0.99, "grad_norm": 1.484375, "learning_rate": 2.919432919183396e-09, "logits/chosen": 0.034402523189783096, "logits/rejected": 0.26731401681900024, "logps/chosen": -660.5680541992188, "logps/rejected": -966.1617431640625, "loss": 0.2381, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -4.3432512283325195, "rewards/margins": 2.8476345539093018, "rewards/margins_max": 4.452241897583008, "rewards/margins_min": 1.2430269718170166, "rewards/margins_std": 2.2692577838897705, "rewards/rejected": -7.1908860206604, "step": 570 }, { "epoch": 1.0, "eval_logits/chosen": 0.8392583727836609, "eval_logits/rejected": 0.9920319318771362, "eval_logps/chosen": -363.14178466796875, "eval_logps/rejected": -356.29058837890625, "eval_loss": 0.6883148550987244, "eval_rewards/accuracies": 0.5370000004768372, "eval_rewards/chosen": -0.19742479920387268, "eval_rewards/margins": 0.023633990436792374, "eval_rewards/margins_max": 0.3502767086029053, "eval_rewards/margins_min": -0.25272664427757263, "eval_rewards/margins_std": 0.19813816249370575, "eval_rewards/rejected": -0.22105878591537476, "eval_runtime": 434.3781, "eval_samples_per_second": 9.209, "eval_steps_per_second": 0.288, "step": 578 }, { "epoch": 1.0, "step": 578, "total_flos": 0.0, "train_loss": 0.3441472670198724, "train_runtime": 5904.9783, "train_samples_per_second": 3.134, "train_steps_per_second": 0.098 } ], "logging_steps": 10, "max_steps": 578, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }