{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9982631930527722, "eval_steps": 400, "global_step": 467, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01068804275217101, "grad_norm": 67.68711617084571, "learning_rate": 1.0638297872340425e-07, "logits/chosen": -1.0138040781021118, "logits/rejected": -0.9843803644180298, "logps/chosen": -0.27389779686927795, "logps/rejected": -0.27152979373931885, "loss": 2.8701, "rewards/accuracies": 0.4312500059604645, "rewards/chosen": -2.738978147506714, "rewards/margins": -0.023679737001657486, "rewards/rejected": -2.7152981758117676, "step": 5 }, { "epoch": 0.02137608550434202, "grad_norm": 45.31354871727359, "learning_rate": 2.127659574468085e-07, "logits/chosen": -1.046543836593628, "logits/rejected": -0.9791992902755737, "logps/chosen": -0.2944639325141907, "logps/rejected": -0.29935842752456665, "loss": 3.1335, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -2.944639205932617, "rewards/margins": 0.048944875597953796, "rewards/rejected": -2.993583917617798, "step": 10 }, { "epoch": 0.03206412825651302, "grad_norm": 54.793044595669066, "learning_rate": 3.1914893617021275e-07, "logits/chosen": -0.9610408544540405, "logits/rejected": -0.9804547429084778, "logps/chosen": -0.26405906677246094, "logps/rejected": -0.30090609192848206, "loss": 3.283, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -2.6405904293060303, "rewards/margins": 0.36847010254859924, "rewards/rejected": -3.009061098098755, "step": 15 }, { "epoch": 0.04275217100868404, "grad_norm": 64.75467330162397, "learning_rate": 4.25531914893617e-07, "logits/chosen": -0.9646309018135071, "logits/rejected": -0.9382796287536621, "logps/chosen": -0.27755117416381836, "logps/rejected": -0.2911796569824219, "loss": 3.0358, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -2.7755115032196045, "rewards/margins": 0.13628481328487396, "rewards/rejected": -2.9117963314056396, "step": 20 }, { "epoch": 0.053440213760855046, "grad_norm": 58.135934584159855, "learning_rate": 5.319148936170212e-07, "logits/chosen": -1.0087430477142334, "logits/rejected": -0.979783833026886, "logps/chosen": -0.2717607021331787, "logps/rejected": -0.27844810485839844, "loss": 3.2801, "rewards/accuracies": 0.5, "rewards/chosen": -2.7176074981689453, "rewards/margins": 0.06687381863594055, "rewards/rejected": -2.7844810485839844, "step": 25 }, { "epoch": 0.06412825651302605, "grad_norm": 54.72060830338181, "learning_rate": 6.382978723404255e-07, "logits/chosen": -0.9874817132949829, "logits/rejected": -0.9428399801254272, "logps/chosen": -0.2744089663028717, "logps/rejected": -0.280087947845459, "loss": 3.0622, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -2.7440896034240723, "rewards/margins": 0.05678989738225937, "rewards/rejected": -2.8008792400360107, "step": 30 }, { "epoch": 0.07481629926519706, "grad_norm": 69.4443195378018, "learning_rate": 7.446808510638297e-07, "logits/chosen": -1.0501333475112915, "logits/rejected": -0.9746049046516418, "logps/chosen": -0.29651719331741333, "logps/rejected": -0.32457858324050903, "loss": 2.9921, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -2.9651715755462646, "rewards/margins": 0.28061443567276, "rewards/rejected": -3.24578595161438, "step": 35 }, { "epoch": 0.08550434201736808, "grad_norm": 64.1156151335301, "learning_rate": 8.51063829787234e-07, "logits/chosen": -0.9999458193778992, "logits/rejected": -0.9568389654159546, "logps/chosen": -0.28250178694725037, "logps/rejected": -0.33312201499938965, "loss": 2.8648, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -2.8250176906585693, "rewards/margins": 0.5062021017074585, "rewards/rejected": -3.3312201499938965, "step": 40 }, { "epoch": 0.09619238476953908, "grad_norm": 77.89308260928325, "learning_rate": 9.574468085106384e-07, "logits/chosen": -1.0530364513397217, "logits/rejected": -1.0099899768829346, "logps/chosen": -0.3571055233478546, "logps/rejected": -0.415992796421051, "loss": 2.9077, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -3.5710549354553223, "rewards/margins": 0.5888724327087402, "rewards/rejected": -4.159927845001221, "step": 45 }, { "epoch": 0.10688042752171009, "grad_norm": 166.78757879676576, "learning_rate": 9.998741174712533e-07, "logits/chosen": -1.0349509716033936, "logits/rejected": -0.98637855052948, "logps/chosen": -0.4092700481414795, "logps/rejected": -0.46339258551597595, "loss": 2.9863, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -4.092700481414795, "rewards/margins": 0.541225790977478, "rewards/rejected": -4.633925914764404, "step": 50 }, { "epoch": 0.11756847027388109, "grad_norm": 90.09601070175329, "learning_rate": 9.991050648838675e-07, "logits/chosen": -1.065796136856079, "logits/rejected": -1.0304781198501587, "logps/chosen": -0.3426639139652252, "logps/rejected": -0.41399651765823364, "loss": 2.6366, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -3.4266390800476074, "rewards/margins": 0.7133262753486633, "rewards/rejected": -4.139965057373047, "step": 55 }, { "epoch": 0.1282565130260521, "grad_norm": 66.51063441631874, "learning_rate": 9.97637968732563e-07, "logits/chosen": -1.1138174533843994, "logits/rejected": -1.0807254314422607, "logps/chosen": -0.3372642397880554, "logps/rejected": -0.3515945076942444, "loss": 2.9058, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -3.3726420402526855, "rewards/margins": 0.14330288767814636, "rewards/rejected": -3.5159454345703125, "step": 60 }, { "epoch": 0.13894455577822312, "grad_norm": 68.34206173142158, "learning_rate": 9.954748808839674e-07, "logits/chosen": -1.0417295694351196, "logits/rejected": -1.012995719909668, "logps/chosen": -0.3927769064903259, "logps/rejected": -0.46754512190818787, "loss": 2.8373, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -3.927769422531128, "rewards/margins": 0.7476823329925537, "rewards/rejected": -4.675451755523682, "step": 65 }, { "epoch": 0.14963259853039412, "grad_norm": 51.65747872582199, "learning_rate": 9.926188266120295e-07, "logits/chosen": -1.062339186668396, "logits/rejected": -1.0380957126617432, "logps/chosen": -0.35794615745544434, "logps/rejected": -0.4516576826572418, "loss": 2.7741, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -3.5794613361358643, "rewards/margins": 0.9371155500411987, "rewards/rejected": -4.516576766967773, "step": 70 }, { "epoch": 0.16032064128256512, "grad_norm": 63.74683146803955, "learning_rate": 9.890738003669027e-07, "logits/chosen": -1.0335708856582642, "logits/rejected": -0.9640315175056458, "logps/chosen": -0.374131977558136, "logps/rejected": -0.427705854177475, "loss": 2.8721, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -3.7413196563720703, "rewards/margins": 0.5357389450073242, "rewards/rejected": -4.277059078216553, "step": 75 }, { "epoch": 0.17100868403473615, "grad_norm": 52.50658912619265, "learning_rate": 9.848447601883433e-07, "logits/chosen": -1.02134108543396, "logits/rejected": -1.0074421167373657, "logps/chosen": -0.3640156090259552, "logps/rejected": -0.47974318265914917, "loss": 2.5858, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -3.640155792236328, "rewards/margins": 1.1572763919830322, "rewards/rejected": -4.7974324226379395, "step": 80 }, { "epoch": 0.18169672678690715, "grad_norm": 80.71357875335279, "learning_rate": 9.799376207714444e-07, "logits/chosen": -1.0322595834732056, "logits/rejected": -1.0098541975021362, "logps/chosen": -0.3591312766075134, "logps/rejected": -0.4327390789985657, "loss": 2.5535, "rewards/accuracies": 0.625, "rewards/chosen": -3.591312885284424, "rewards/margins": 0.7360779643058777, "rewards/rejected": -4.327390670776367, "step": 85 }, { "epoch": 0.19238476953907815, "grad_norm": 62.211027942701996, "learning_rate": 9.743592451943998e-07, "logits/chosen": -1.0733072757720947, "logits/rejected": -1.038777470588684, "logps/chosen": -0.4550396502017975, "logps/rejected": -0.5712156891822815, "loss": 2.7375, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -4.550396919250488, "rewards/margins": 1.1617599725723267, "rewards/rejected": -5.712156295776367, "step": 90 }, { "epoch": 0.20307281229124916, "grad_norm": 51.932192514574744, "learning_rate": 9.681174353198686e-07, "logits/chosen": -1.1431421041488647, "logits/rejected": -1.061301827430725, "logps/chosen": -0.4678395390510559, "logps/rejected": -0.5182651281356812, "loss": 2.6348, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -4.678394794464111, "rewards/margins": 0.5042564272880554, "rewards/rejected": -5.182651042938232, "step": 95 }, { "epoch": 0.21376085504342018, "grad_norm": 65.16945385496659, "learning_rate": 9.612209208833646e-07, "logits/chosen": -1.0209256410598755, "logits/rejected": -0.9965961575508118, "logps/chosen": -0.4655056893825531, "logps/rejected": -0.5174664258956909, "loss": 2.8978, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -4.655056953430176, "rewards/margins": 0.5196069478988647, "rewards/rejected": -5.174664497375488, "step": 100 }, { "epoch": 0.22444889779559118, "grad_norm": 73.21892637196613, "learning_rate": 9.536793472839324e-07, "logits/chosen": -1.035591959953308, "logits/rejected": -0.9819633364677429, "logps/chosen": -0.4058953821659088, "logps/rejected": -0.5255031585693359, "loss": 2.8857, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -4.058953762054443, "rewards/margins": 1.1960771083831787, "rewards/rejected": -5.255030632019043, "step": 105 }, { "epoch": 0.23513694054776219, "grad_norm": 66.39827090408234, "learning_rate": 9.455032620941839e-07, "logits/chosen": -0.9949567914009094, "logits/rejected": -0.933055579662323, "logps/chosen": -0.46234995126724243, "logps/rejected": -0.6080808639526367, "loss": 2.629, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -4.623498916625977, "rewards/margins": 1.457309603691101, "rewards/rejected": -6.080808162689209, "step": 110 }, { "epoch": 0.2458249832999332, "grad_norm": 71.34302017387473, "learning_rate": 9.367041003085648e-07, "logits/chosen": -1.044356346130371, "logits/rejected": -0.9833389520645142, "logps/chosen": -0.4943835735321045, "logps/rejected": -0.5495572686195374, "loss": 2.5655, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -4.943835735321045, "rewards/margins": 0.55173659324646, "rewards/rejected": -5.495572090148926, "step": 115 }, { "epoch": 0.2565130260521042, "grad_norm": 101.83657869206615, "learning_rate": 9.272941683504808e-07, "logits/chosen": -1.004738211631775, "logits/rejected": -0.9129018783569336, "logps/chosen": -0.5077718496322632, "logps/rejected": -0.721434473991394, "loss": 2.5626, "rewards/accuracies": 0.71875, "rewards/chosen": -5.077718257904053, "rewards/margins": 2.1366257667541504, "rewards/rejected": -7.214344024658203, "step": 120 }, { "epoch": 0.26720106880427524, "grad_norm": 75.83890050308636, "learning_rate": 9.172866268606513e-07, "logits/chosen": -1.0654194355010986, "logits/rejected": -1.0217397212982178, "logps/chosen": -0.5593295693397522, "logps/rejected": -0.652821958065033, "loss": 2.294, "rewards/accuracies": 0.625, "rewards/chosen": -5.593295097351074, "rewards/margins": 0.9349241256713867, "rewards/rejected": -6.528220176696777, "step": 125 }, { "epoch": 0.27788911155644624, "grad_norm": 199.14128355265714, "learning_rate": 9.066954722907638e-07, "logits/chosen": -1.0831068754196167, "logits/rejected": -1.07265305519104, "logps/chosen": -0.5251038670539856, "logps/rejected": -0.8123334646224976, "loss": 2.3811, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -5.251038551330566, "rewards/margins": 2.8722970485687256, "rewards/rejected": -8.123334884643555, "step": 130 }, { "epoch": 0.28857715430861725, "grad_norm": 128.88813657687064, "learning_rate": 8.955355173281707e-07, "logits/chosen": -1.0704658031463623, "logits/rejected": -1.0186035633087158, "logps/chosen": -0.5541162490844727, "logps/rejected": -0.6640071272850037, "loss": 2.209, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -5.541162967681885, "rewards/margins": 1.09890878200531, "rewards/rejected": -6.640070915222168, "step": 135 }, { "epoch": 0.29926519706078825, "grad_norm": 79.22530063044107, "learning_rate": 8.838223701790055e-07, "logits/chosen": -1.136506199836731, "logits/rejected": -1.111253261566162, "logps/chosen": -0.6426690816879272, "logps/rejected": -0.7467389702796936, "loss": 2.3897, "rewards/accuracies": 0.6875, "rewards/chosen": -6.42669153213501, "rewards/margins": 1.0406982898712158, "rewards/rejected": -7.467389106750488, "step": 140 }, { "epoch": 0.30995323981295925, "grad_norm": 76.08443179154013, "learning_rate": 8.71572412738697e-07, "logits/chosen": -1.0284653902053833, "logits/rejected": -1.0001672506332397, "logps/chosen": -0.6205825805664062, "logps/rejected": -0.8218838572502136, "loss": 2.2269, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -6.2058258056640625, "rewards/margins": 2.0130133628845215, "rewards/rejected": -8.218838691711426, "step": 145 }, { "epoch": 0.32064128256513025, "grad_norm": 70.40672339988168, "learning_rate": 8.588027776804058e-07, "logits/chosen": -1.064420223236084, "logits/rejected": -1.039480447769165, "logps/chosen": -0.662723183631897, "logps/rejected": -0.8177149891853333, "loss": 2.3277, "rewards/accuracies": 0.75, "rewards/chosen": -6.627232551574707, "rewards/margins": 1.5499169826507568, "rewards/rejected": -8.177148818969727, "step": 150 }, { "epoch": 0.33132932531730125, "grad_norm": 64.00338755339276, "learning_rate": 8.455313244934324e-07, "logits/chosen": -1.0651726722717285, "logits/rejected": -1.0403941869735718, "logps/chosen": -0.6938959360122681, "logps/rejected": -0.9076633453369141, "loss": 2.1799, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -6.93895959854126, "rewards/margins": 2.1376729011535645, "rewards/rejected": -9.076631546020508, "step": 155 }, { "epoch": 0.3420173680694723, "grad_norm": 92.6318256385038, "learning_rate": 8.317766145051057e-07, "logits/chosen": -1.069675087928772, "logits/rejected": -1.0496129989624023, "logps/chosen": -0.7591303586959839, "logps/rejected": -1.0853347778320312, "loss": 2.104, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -7.591303825378418, "rewards/margins": 3.2620437145233154, "rewards/rejected": -10.853347778320312, "step": 160 }, { "epoch": 0.3527054108216433, "grad_norm": 70.71500119204794, "learning_rate": 8.175578849210894e-07, "logits/chosen": -1.0753862857818604, "logits/rejected": -1.0458089113235474, "logps/chosen": -0.8190089464187622, "logps/rejected": -1.1394321918487549, "loss": 2.1276, "rewards/accuracies": 0.71875, "rewards/chosen": -8.190088272094727, "rewards/margins": 3.204233169555664, "rewards/rejected": -11.39432144165039, "step": 165 }, { "epoch": 0.3633934535738143, "grad_norm": 69.30072890016888, "learning_rate": 8.028950219204099e-07, "logits/chosen": -1.0737378597259521, "logits/rejected": -1.0490288734436035, "logps/chosen": -0.7565592527389526, "logps/rejected": -1.1054339408874512, "loss": 1.9844, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -7.5655927658081055, "rewards/margins": 3.4887466430664062, "rewards/rejected": -11.054338455200195, "step": 170 }, { "epoch": 0.3740814963259853, "grad_norm": 108.7357161171701, "learning_rate": 7.878085328428368e-07, "logits/chosen": -1.079113483428955, "logits/rejected": -1.0278595685958862, "logps/chosen": -0.8560416102409363, "logps/rejected": -1.0362493991851807, "loss": 1.9368, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -8.560415267944336, "rewards/margins": 1.8020769357681274, "rewards/rejected": -10.362493515014648, "step": 175 }, { "epoch": 0.3847695390781563, "grad_norm": 67.3993996516812, "learning_rate": 7.723195175075135e-07, "logits/chosen": -1.0435800552368164, "logits/rejected": -1.0182024240493774, "logps/chosen": -0.844826877117157, "logps/rejected": -1.13657546043396, "loss": 1.9327, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -8.448267936706543, "rewards/margins": 2.9174869060516357, "rewards/rejected": -11.365755081176758, "step": 180 }, { "epoch": 0.3954575818303273, "grad_norm": 84.2500100948778, "learning_rate": 7.564496387029531e-07, "logits/chosen": -1.0949599742889404, "logits/rejected": -1.035228967666626, "logps/chosen": -0.9122417569160461, "logps/rejected": -1.2345964908599854, "loss": 1.8399, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -9.122416496276855, "rewards/margins": 3.2235474586486816, "rewards/rejected": -12.345964431762695, "step": 185 }, { "epoch": 0.4061456245824983, "grad_norm": 81.5025031331217, "learning_rate": 7.402210918896689e-07, "logits/chosen": -1.0835707187652588, "logits/rejected": -1.0875083208084106, "logps/chosen": -1.0147769451141357, "logps/rejected": -1.480102300643921, "loss": 1.766, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -10.1477689743042, "rewards/margins": 4.653253078460693, "rewards/rejected": -14.80102252960205, "step": 190 }, { "epoch": 0.4168336673346693, "grad_norm": 80.95225741438045, "learning_rate": 7.236565741578162e-07, "logits/chosen": -1.0223807096481323, "logits/rejected": -0.998031735420227, "logps/chosen": -1.0001453161239624, "logps/rejected": -1.3144450187683105, "loss": 1.6197, "rewards/accuracies": 0.78125, "rewards/chosen": -10.001453399658203, "rewards/margins": 3.1429970264434814, "rewards/rejected": -13.144450187683105, "step": 195 }, { "epoch": 0.42752171008684037, "grad_norm": 125.73873839424112, "learning_rate": 7.067792524832603e-07, "logits/chosen": -1.0253870487213135, "logits/rejected": -1.010780692100525, "logps/chosen": -1.038006067276001, "logps/rejected": -1.403381109237671, "loss": 1.9038, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -10.380061149597168, "rewards/margins": 3.6537506580352783, "rewards/rejected": -14.033811569213867, "step": 200 }, { "epoch": 0.43820975283901137, "grad_norm": 81.37994512389571, "learning_rate": 6.896127313264642e-07, "logits/chosen": -1.0707476139068604, "logits/rejected": -1.017699956893921, "logps/chosen": -1.1197742223739624, "logps/rejected": -1.4551405906677246, "loss": 1.8428, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -11.197742462158203, "rewards/margins": 3.353663921356201, "rewards/rejected": -14.551406860351562, "step": 205 }, { "epoch": 0.44889779559118237, "grad_norm": 77.78963459942278, "learning_rate": 6.721810196195174e-07, "logits/chosen": -1.0852477550506592, "logits/rejected": -1.0739668607711792, "logps/chosen": -1.2244949340820312, "logps/rejected": -1.6010786294937134, "loss": 1.81, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -12.244949340820312, "rewards/margins": 3.7658371925354004, "rewards/rejected": -16.010784149169922, "step": 210 }, { "epoch": 0.45958583834335337, "grad_norm": 95.28625999651774, "learning_rate": 6.545084971874736e-07, "logits/chosen": -1.042944312095642, "logits/rejected": -1.0247914791107178, "logps/chosen": -1.287192702293396, "logps/rejected": -1.7063806056976318, "loss": 1.6716, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -12.871930122375488, "rewards/margins": 4.1918792724609375, "rewards/rejected": -17.06380844116211, "step": 215 }, { "epoch": 0.47027388109552437, "grad_norm": 111.60013559293391, "learning_rate": 6.3661988065096e-07, "logits/chosen": -1.1138712167739868, "logits/rejected": -1.0898603200912476, "logps/chosen": -1.323070764541626, "logps/rejected": -1.7772735357284546, "loss": 1.634, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -13.230708122253418, "rewards/margins": 4.542030334472656, "rewards/rejected": -17.772735595703125, "step": 220 }, { "epoch": 0.48096192384769537, "grad_norm": 67.34558751472282, "learning_rate": 6.185401888577487e-07, "logits/chosen": -1.0925663709640503, "logits/rejected": -1.0554755926132202, "logps/chosen": -1.2628586292266846, "logps/rejected": -1.6815145015716553, "loss": 1.5673, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -12.62858772277832, "rewards/margins": 4.186559200286865, "rewards/rejected": -16.81514549255371, "step": 225 }, { "epoch": 0.4916499665998664, "grad_norm": 86.856491905057, "learning_rate": 6.002947078916364e-07, "logits/chosen": -1.1707683801651, "logits/rejected": -1.1189748048782349, "logps/chosen": -1.2487156391143799, "logps/rejected": -1.609958291053772, "loss": 1.6558, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -12.487154960632324, "rewards/margins": 3.612428665161133, "rewards/rejected": -16.09958267211914, "step": 230 }, { "epoch": 0.5023380093520374, "grad_norm": 90.1311022017979, "learning_rate": 5.819089557075688e-07, "logits/chosen": -1.204642653465271, "logits/rejected": -1.1753454208374023, "logps/chosen": -1.2344571352005005, "logps/rejected": -1.7249053716659546, "loss": 1.6159, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -12.344571113586426, "rewards/margins": 4.904484272003174, "rewards/rejected": -17.249053955078125, "step": 235 }, { "epoch": 0.5130260521042084, "grad_norm": 86.31881429615238, "learning_rate": 5.634086464424742e-07, "logits/chosen": -1.18668794631958, "logits/rejected": -1.1855318546295166, "logps/chosen": -1.1789348125457764, "logps/rejected": -1.5857030153274536, "loss": 1.6202, "rewards/accuracies": 0.8125, "rewards/chosen": -11.789347648620605, "rewards/margins": 4.06768274307251, "rewards/rejected": -15.857030868530273, "step": 240 }, { "epoch": 0.5237140948563794, "grad_norm": 114.84363786792451, "learning_rate": 5.448196544517167e-07, "logits/chosen": -1.2951276302337646, "logits/rejected": -1.232521414756775, "logps/chosen": -1.1696799993515015, "logps/rejected": -1.6479179859161377, "loss": 1.5369, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -11.696800231933594, "rewards/margins": 4.782378673553467, "rewards/rejected": -16.479177474975586, "step": 245 }, { "epoch": 0.5344021376085505, "grad_norm": 104.33306568769014, "learning_rate": 5.26167978121472e-07, "logits/chosen": -1.2323393821716309, "logits/rejected": -1.2163569927215576, "logps/chosen": -1.2473630905151367, "logps/rejected": -1.7756588459014893, "loss": 1.4989, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -12.473630905151367, "rewards/margins": 5.282957077026367, "rewards/rejected": -17.756587982177734, "step": 250 }, { "epoch": 0.5450901803607214, "grad_norm": 84.4729167910645, "learning_rate": 5.074797035076318e-07, "logits/chosen": -1.279651403427124, "logits/rejected": -1.2507905960083008, "logps/chosen": -1.3707444667816162, "logps/rejected": -1.817111611366272, "loss": 1.5956, "rewards/accuracies": 0.8125, "rewards/chosen": -13.707443237304688, "rewards/margins": 4.4636712074279785, "rewards/rejected": -18.17111587524414, "step": 255 }, { "epoch": 0.5557782231128925, "grad_norm": 127.60235959093131, "learning_rate": 4.887809678520975e-07, "logits/chosen": -1.2543408870697021, "logits/rejected": -1.2242779731750488, "logps/chosen": -1.255432367324829, "logps/rejected": -1.6961147785186768, "loss": 1.4782, "rewards/accuracies": 0.8125, "rewards/chosen": -12.554323196411133, "rewards/margins": 4.40682315826416, "rewards/rejected": -16.96114730834961, "step": 260 }, { "epoch": 0.5664662658650634, "grad_norm": 78.40726215911216, "learning_rate": 4.700979230274829e-07, "logits/chosen": -1.2240062952041626, "logits/rejected": -1.2036255598068237, "logps/chosen": -1.3716175556182861, "logps/rejected": -1.8241313695907593, "loss": 1.4752, "rewards/accuracies": 0.8125, "rewards/chosen": -13.71617603302002, "rewards/margins": 4.525136470794678, "rewards/rejected": -18.241313934326172, "step": 265 }, { "epoch": 0.5771543086172345, "grad_norm": 184.50515640629632, "learning_rate": 4.514566989613559e-07, "logits/chosen": -1.2216978073120117, "logits/rejected": -1.1910299062728882, "logps/chosen": -1.2710390090942383, "logps/rejected": -1.736193060874939, "loss": 1.6616, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -12.7103910446167, "rewards/margins": 4.651541709899902, "rewards/rejected": -17.36193084716797, "step": 270 }, { "epoch": 0.5878423513694054, "grad_norm": 81.5464791721336, "learning_rate": 4.328833670911724e-07, "logits/chosen": -1.189316749572754, "logits/rejected": -1.148740291595459, "logps/chosen": -1.2588434219360352, "logps/rejected": -1.6372572183609009, "loss": 1.6409, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -12.588434219360352, "rewards/margins": 3.7841391563415527, "rewards/rejected": -16.372573852539062, "step": 275 }, { "epoch": 0.5985303941215765, "grad_norm": 88.70459922308653, "learning_rate": 4.144039039010124e-07, "logits/chosen": -1.2741549015045166, "logits/rejected": -1.2473857402801514, "logps/chosen": -1.2827661037445068, "logps/rejected": -1.7844867706298828, "loss": 1.5171, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -12.827661514282227, "rewards/margins": 5.017209053039551, "rewards/rejected": -17.84486961364746, "step": 280 }, { "epoch": 0.6092184368737475, "grad_norm": 108.14517308298038, "learning_rate": 3.960441545911204e-07, "logits/chosen": -1.2398396730422974, "logits/rejected": -1.2030993700027466, "logps/chosen": -1.3062845468521118, "logps/rejected": -1.8119999170303345, "loss": 1.3104, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -13.062845230102539, "rewards/margins": 5.057154178619385, "rewards/rejected": -18.119998931884766, "step": 285 }, { "epoch": 0.6199064796259185, "grad_norm": 111.38701212744857, "learning_rate": 3.778297969310529e-07, "logits/chosen": -1.2592750787734985, "logits/rejected": -1.2147754430770874, "logps/chosen": -1.2920079231262207, "logps/rejected": -1.6999801397323608, "loss": 1.5855, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -12.920079231262207, "rewards/margins": 4.0797224044799805, "rewards/rejected": -16.999801635742188, "step": 290 }, { "epoch": 0.6305945223780896, "grad_norm": 86.74920495684881, "learning_rate": 3.5978630534699865e-07, "logits/chosen": -1.19474196434021, "logits/rejected": -1.1794744729995728, "logps/chosen": -1.3712317943572998, "logps/rejected": -1.7885491847991943, "loss": 1.4522, "rewards/accuracies": 0.8125, "rewards/chosen": -13.712318420410156, "rewards/margins": 4.173174858093262, "rewards/rejected": -17.885494232177734, "step": 295 }, { "epoch": 0.6412825651302605, "grad_norm": 98.40200674580878, "learning_rate": 3.4193891529348795e-07, "logits/chosen": -1.125414252281189, "logits/rejected": -1.0979323387145996, "logps/chosen": -1.3092833757400513, "logps/rejected": -1.6569970846176147, "loss": 1.9836, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -13.09283447265625, "rewards/margins": 3.477137327194214, "rewards/rejected": -16.56997299194336, "step": 300 }, { "epoch": 0.6519706078824316, "grad_norm": 91.28064757080084, "learning_rate": 3.243125879593286e-07, "logits/chosen": -1.2477811574935913, "logits/rejected": -1.1988236904144287, "logps/chosen": -1.2682318687438965, "logps/rejected": -1.6152031421661377, "loss": 1.5604, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -12.682320594787598, "rewards/margins": 3.4697113037109375, "rewards/rejected": -16.15203094482422, "step": 305 }, { "epoch": 0.6626586506346025, "grad_norm": 127.53302078818454, "learning_rate": 3.069319753571269e-07, "logits/chosen": -1.2703194618225098, "logits/rejected": -1.2484561204910278, "logps/chosen": -1.3710952997207642, "logps/rejected": -1.7914714813232422, "loss": 1.7937, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -13.710952758789062, "rewards/margins": 4.203763008117676, "rewards/rejected": -17.914714813232422, "step": 310 }, { "epoch": 0.6733466933867736, "grad_norm": 90.11357421721681, "learning_rate": 2.898213858452173e-07, "logits/chosen": -1.2548288106918335, "logits/rejected": -1.1975423097610474, "logps/chosen": -1.2582523822784424, "logps/rejected": -1.6725326776504517, "loss": 1.4717, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -12.582524299621582, "rewards/margins": 4.142804145812988, "rewards/rejected": -16.72532844543457, "step": 315 }, { "epoch": 0.6840347361389446, "grad_norm": 110.25787884605205, "learning_rate": 2.730047501302266e-07, "logits/chosen": -1.240769624710083, "logits/rejected": -1.2348911762237549, "logps/chosen": -1.3247087001800537, "logps/rejected": -1.8381633758544922, "loss": 1.5083, "rewards/accuracies": 0.875, "rewards/chosen": -13.247088432312012, "rewards/margins": 5.13454532623291, "rewards/rejected": -18.38163185119629, "step": 320 }, { "epoch": 0.6947227788911156, "grad_norm": 87.86537110684131, "learning_rate": 2.5650558779781635e-07, "logits/chosen": -1.2655290365219116, "logits/rejected": -1.2139763832092285, "logps/chosen": -1.4641401767730713, "logps/rejected": -2.030886173248291, "loss": 1.4781, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -14.641400337219238, "rewards/margins": 5.6674604415893555, "rewards/rejected": -20.308862686157227, "step": 325 }, { "epoch": 0.7054108216432866, "grad_norm": 67.91419002468011, "learning_rate": 2.403469744184154e-07, "logits/chosen": -1.1775472164154053, "logits/rejected": -1.1353617906570435, "logps/chosen": -1.326864242553711, "logps/rejected": -1.714586853981018, "loss": 1.5025, "rewards/accuracies": 0.8125, "rewards/chosen": -13.268641471862793, "rewards/margins": 3.877227783203125, "rewards/rejected": -17.145870208740234, "step": 330 }, { "epoch": 0.7160988643954576, "grad_norm": 115.85251510237708, "learning_rate": 2.2455150927394878e-07, "logits/chosen": -1.2296580076217651, "logits/rejected": -1.2089414596557617, "logps/chosen": -1.273493766784668, "logps/rejected": -1.7592452764511108, "loss": 1.4573, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -12.73493766784668, "rewards/margins": 4.857515335083008, "rewards/rejected": -17.592453002929688, "step": 335 }, { "epoch": 0.7267869071476286, "grad_norm": 103.0968258674206, "learning_rate": 2.0914128375069722e-07, "logits/chosen": -1.232074499130249, "logits/rejected": -1.195978045463562, "logps/chosen": -1.3062849044799805, "logps/rejected": -1.7718702554702759, "loss": 1.6156, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -13.062849044799805, "rewards/margins": 4.655852317810059, "rewards/rejected": -17.71870231628418, "step": 340 }, { "epoch": 0.7374749498997996, "grad_norm": 107.53235395682732, "learning_rate": 1.9413785044249676e-07, "logits/chosen": -1.2648028135299683, "logits/rejected": -1.2383410930633545, "logps/chosen": -1.3434202671051025, "logps/rejected": -1.8917827606201172, "loss": 1.4928, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -13.434202194213867, "rewards/margins": 5.483624458312988, "rewards/rejected": -18.91782569885254, "step": 345 }, { "epoch": 0.7481629926519706, "grad_norm": 129.81915818530751, "learning_rate": 1.7956219300748792e-07, "logits/chosen": -1.2545734643936157, "logits/rejected": -1.2568590641021729, "logps/chosen": -1.3217370510101318, "logps/rejected": -1.7853542566299438, "loss": 1.609, "rewards/accuracies": 0.8125, "rewards/chosen": -13.217371940612793, "rewards/margins": 4.636171340942383, "rewards/rejected": -17.85354232788086, "step": 350 }, { "epoch": 0.7588510354041417, "grad_norm": 98.13171063803792, "learning_rate": 1.6543469682057104e-07, "logits/chosen": -1.172861099243164, "logits/rejected": -1.1855199337005615, "logps/chosen": -1.2397878170013428, "logps/rejected": -1.6898447275161743, "loss": 1.367, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -12.397878646850586, "rewards/margins": 4.500570297241211, "rewards/rejected": -16.898448944091797, "step": 355 }, { "epoch": 0.7695390781563126, "grad_norm": 85.50591463710167, "learning_rate": 1.5177512046261666e-07, "logits/chosen": -1.2363743782043457, "logits/rejected": -1.231896162033081, "logps/chosen": -1.2634873390197754, "logps/rejected": -1.813397765159607, "loss": 1.5488, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -12.634872436523438, "rewards/margins": 5.4991044998168945, "rewards/rejected": -18.133975982666016, "step": 360 }, { "epoch": 0.7802271209084837, "grad_norm": 77.03823618402183, "learning_rate": 1.3860256808630427e-07, "logits/chosen": -1.2720203399658203, "logits/rejected": -1.2016078233718872, "logps/chosen": -1.313655138015747, "logps/rejected": -1.8480865955352783, "loss": 1.494, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -13.136550903320312, "rewards/margins": 5.344315528869629, "rewards/rejected": -18.480867385864258, "step": 365 }, { "epoch": 0.7909151636606546, "grad_norm": 118.91064994219902, "learning_rate": 1.2593546269723647e-07, "logits/chosen": -1.2069206237792969, "logits/rejected": -1.1931571960449219, "logps/chosen": -1.3250806331634521, "logps/rejected": -1.7204444408416748, "loss": 1.564, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -13.250805854797363, "rewards/margins": 3.953639268875122, "rewards/rejected": -17.20444679260254, "step": 370 }, { "epoch": 0.8016032064128257, "grad_norm": 136.29955659690972, "learning_rate": 1.1379152038770029e-07, "logits/chosen": -1.2420449256896973, "logits/rejected": -1.2433878183364868, "logps/chosen": -1.4292832612991333, "logps/rejected": -1.9152038097381592, "loss": 1.7559, "rewards/accuracies": 0.8125, "rewards/chosen": -14.292834281921387, "rewards/margins": 4.8592071533203125, "rewards/rejected": -19.152042388916016, "step": 375 }, { "epoch": 0.8122912491649966, "grad_norm": 133.4026817083598, "learning_rate": 1.0218772555910954e-07, "logits/chosen": -1.2641063928604126, "logits/rejected": -1.240259051322937, "logps/chosen": -1.2963588237762451, "logps/rejected": -1.7182788848876953, "loss": 1.6838, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -12.963586807250977, "rewards/margins": 4.219202518463135, "rewards/rejected": -17.182790756225586, "step": 380 }, { "epoch": 0.8229792919171677, "grad_norm": 95.74162334394272, "learning_rate": 9.114030716778432e-08, "logits/chosen": -1.2561897039413452, "logits/rejected": -1.2308216094970703, "logps/chosen": -1.3019812107086182, "logps/rejected": -1.8643944263458252, "loss": 1.3289, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -13.019811630249023, "rewards/margins": 5.624133586883545, "rewards/rejected": -18.643943786621094, "step": 385 }, { "epoch": 0.8336673346693386, "grad_norm": 82.55781895212651, "learning_rate": 8.066471602728803e-08, "logits/chosen": -1.274784803390503, "logits/rejected": -1.252673625946045, "logps/chosen": -1.3898085355758667, "logps/rejected": -1.9245649576187134, "loss": 1.4908, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -13.898086547851562, "rewards/margins": 5.347563743591309, "rewards/rejected": -19.245651245117188, "step": 390 }, { "epoch": 0.8443553774215097, "grad_norm": 82.99309572753097, "learning_rate": 7.077560319906694e-08, "logits/chosen": -1.272572636604309, "logits/rejected": -1.248715877532959, "logps/chosen": -1.3395044803619385, "logps/rejected": -1.8316198587417603, "loss": 1.627, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -13.395045280456543, "rewards/margins": 4.9211530685424805, "rewards/rejected": -18.316198348999023, "step": 395 }, { "epoch": 0.8550434201736807, "grad_norm": 83.64554667200588, "learning_rate": 6.148679950161672e-08, "logits/chosen": -1.2721501588821411, "logits/rejected": -1.2531596422195435, "logps/chosen": -1.3134849071502686, "logps/rejected": -1.7619584798812866, "loss": 1.3887, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -13.134851455688477, "rewards/margins": 4.484734058380127, "rewards/rejected": -17.619586944580078, "step": 400 }, { "epoch": 0.8550434201736807, "eval_logits/chosen": -1.4803762435913086, "eval_logits/rejected": -1.487606406211853, "eval_logps/chosen": -1.3091278076171875, "eval_logps/rejected": -1.7820134162902832, "eval_loss": 1.3990418910980225, "eval_rewards/accuracies": 0.8130081295967102, "eval_rewards/chosen": -13.091278076171875, "eval_rewards/margins": 4.728854656219482, "eval_rewards/rejected": -17.820133209228516, "eval_runtime": 97.2046, "eval_samples_per_second": 20.174, "eval_steps_per_second": 1.265, "step": 400 }, { "epoch": 0.8657314629258517, "grad_norm": 115.45390071285422, "learning_rate": 5.2811296166831666e-08, "logits/chosen": -1.2442853450775146, "logits/rejected": -1.2601758241653442, "logps/chosen": -1.377743124961853, "logps/rejected": -1.7813876867294312, "loss": 1.3838, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -13.777430534362793, "rewards/margins": 4.036444187164307, "rewards/rejected": -17.81387710571289, "step": 405 }, { "epoch": 0.8764195056780227, "grad_norm": 175.32951661092005, "learning_rate": 4.4761226670592066e-08, "logits/chosen": -1.26015305519104, "logits/rejected": -1.247621774673462, "logps/chosen": -1.3543680906295776, "logps/rejected": -1.785305380821228, "loss": 1.6236, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -13.543681144714355, "rewards/margins": 4.309372425079346, "rewards/rejected": -17.85305404663086, "step": 410 }, { "epoch": 0.8871075484301937, "grad_norm": 128.03583207973946, "learning_rate": 3.734784976300165e-08, "logits/chosen": -1.2588884830474854, "logits/rejected": -1.2014896869659424, "logps/chosen": -1.2802027463912964, "logps/rejected": -1.8253154754638672, "loss": 1.691, "rewards/accuracies": 0.875, "rewards/chosen": -12.802027702331543, "rewards/margins": 5.451126575469971, "rewards/rejected": -18.253154754638672, "step": 415 }, { "epoch": 0.8977955911823647, "grad_norm": 131.09667381189382, "learning_rate": 3.058153372200695e-08, "logits/chosen": -1.2988319396972656, "logits/rejected": -1.2421010732650757, "logps/chosen": -1.2468286752700806, "logps/rejected": -1.740648627281189, "loss": 1.4921, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -12.468287467956543, "rewards/margins": 4.9382004737854, "rewards/rejected": -17.40648651123047, "step": 420 }, { "epoch": 0.9084836339345357, "grad_norm": 119.28024840311178, "learning_rate": 2.4471741852423233e-08, "logits/chosen": -1.2894878387451172, "logits/rejected": -1.2769581079483032, "logps/chosen": -1.3600680828094482, "logps/rejected": -1.768842339515686, "loss": 1.8112, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -13.600680351257324, "rewards/margins": 4.087743759155273, "rewards/rejected": -17.688425064086914, "step": 425 }, { "epoch": 0.9191716766867067, "grad_norm": 115.47761461915916, "learning_rate": 1.9027019250647036e-08, "logits/chosen": -1.2687110900878906, "logits/rejected": -1.2503737211227417, "logps/chosen": -1.4135878086090088, "logps/rejected": -1.8798710107803345, "loss": 1.532, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -14.135879516601562, "rewards/margins": 4.662830829620361, "rewards/rejected": -18.798709869384766, "step": 430 }, { "epoch": 0.9298597194388778, "grad_norm": 92.69552608849006, "learning_rate": 1.4254980853566246e-08, "logits/chosen": -1.2282474040985107, "logits/rejected": -1.1837421655654907, "logps/chosen": -1.2822808027267456, "logps/rejected": -1.7602012157440186, "loss": 1.5297, "rewards/accuracies": 0.875, "rewards/chosen": -12.822809219360352, "rewards/margins": 4.779202461242676, "rewards/rejected": -17.60201072692871, "step": 435 }, { "epoch": 0.9405477621910487, "grad_norm": 94.73208287089179, "learning_rate": 1.016230078838226e-08, "logits/chosen": -1.2425181865692139, "logits/rejected": -1.1809542179107666, "logps/chosen": -1.3228919506072998, "logps/rejected": -1.7335517406463623, "loss": 1.4534, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -13.228915214538574, "rewards/margins": 4.1065993309021, "rewards/rejected": -17.33551597595215, "step": 440 }, { "epoch": 0.9512358049432198, "grad_norm": 87.27412138581354, "learning_rate": 6.754703038239329e-09, "logits/chosen": -1.2042580842971802, "logits/rejected": -1.1834803819656372, "logps/chosen": -1.3398433923721313, "logps/rejected": -1.8506107330322266, "loss": 1.2833, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -13.39843463897705, "rewards/margins": 5.107672691345215, "rewards/rejected": -18.506107330322266, "step": 445 }, { "epoch": 0.9619238476953907, "grad_norm": 98.16271043892891, "learning_rate": 4.036953436716895e-09, "logits/chosen": -1.312468409538269, "logits/rejected": -1.289397954940796, "logps/chosen": -1.3002225160598755, "logps/rejected": -1.7998195886611938, "loss": 1.6962, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -13.002224922180176, "rewards/margins": 4.995969772338867, "rewards/rejected": -17.998193740844727, "step": 450 }, { "epoch": 0.9726118904475618, "grad_norm": 105.4703537032148, "learning_rate": 2.0128530023804656e-09, "logits/chosen": -1.2772085666656494, "logits/rejected": -1.2392375469207764, "logps/chosen": -1.2873907089233398, "logps/rejected": -1.8444169759750366, "loss": 1.1123, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -12.873907089233398, "rewards/margins": 5.570265769958496, "rewards/rejected": -18.444171905517578, "step": 455 }, { "epoch": 0.9832999331997327, "grad_norm": 94.48602531246911, "learning_rate": 6.852326227130833e-10, "logits/chosen": -1.2766510248184204, "logits/rejected": -1.2629082202911377, "logps/chosen": -1.3569005727767944, "logps/rejected": -1.862248420715332, "loss": 1.3351, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -13.569005012512207, "rewards/margins": 5.05348014831543, "rewards/rejected": -18.622486114501953, "step": 460 }, { "epoch": 0.9939879759519038, "grad_norm": 100.25182910178914, "learning_rate": 5.594909486328348e-11, "logits/chosen": -1.2500464916229248, "logits/rejected": -1.251854658126831, "logps/chosen": -1.3458898067474365, "logps/rejected": -1.8206214904785156, "loss": 1.6214, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -13.458898544311523, "rewards/margins": 4.747317314147949, "rewards/rejected": -18.20621681213379, "step": 465 }, { "epoch": 0.9982631930527722, "step": 467, "total_flos": 0.0, "train_loss": 1.9776853305114157, "train_runtime": 11426.5831, "train_samples_per_second": 5.24, "train_steps_per_second": 0.041 } ], "logging_steps": 5, "max_steps": 467, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1000000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }