{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9952904238618525, "eval_steps": 500, "global_step": 1431, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0020931449502878076, "grad_norm": 8.275816917419434, "learning_rate": 0.0, "logits/chosen": 3.5, "logits/rejected": 3.40625, "logps/chosen": -356.0, "logps/rejected": -272.0, "loss": 0.6944, "rewards/accuracies": 0.25, "rewards/chosen": -0.00811767578125, "rewards/margins": -0.0093994140625, "rewards/rejected": 0.001251220703125, "step": 1 }, { "epoch": 0.004186289900575615, "grad_norm": 8.14901351928711, "learning_rate": 8.859191006777895e-08, "logits/chosen": 3.6875, "logits/rejected": 4.1875, "logps/chosen": -472.0, "logps/rejected": -290.0, "loss": 0.6949, "rewards/accuracies": 0.5, "rewards/chosen": 0.010009765625, "rewards/margins": 0.0050048828125, "rewards/rejected": 0.0050048828125, "step": 2 }, { "epoch": 0.006279434850863423, "grad_norm": 7.643013000488281, "learning_rate": 1.404148553246907e-07, "logits/chosen": 3.8125, "logits/rejected": 3.8125, "logps/chosen": -342.0, "logps/rejected": -422.0, "loss": 0.6919, "rewards/accuracies": 0.25, "rewards/chosen": 0.003753662109375, "rewards/margins": 0.0150146484375, "rewards/rejected": -0.01123046875, "step": 3 }, { "epoch": 0.00837257980115123, "grad_norm": 9.3782320022583, "learning_rate": 1.771838201355579e-07, "logits/chosen": 3.90625, "logits/rejected": 3.625, "logps/chosen": -378.0, "logps/rejected": -456.0, "loss": 0.6938, "rewards/accuracies": 0.5, "rewards/chosen": -0.0238037109375, "rewards/margins": 0.001251220703125, "rewards/rejected": -0.0250244140625, "step": 4 }, { "epoch": 0.010465724751439037, "grad_norm": 13.61230182647705, "learning_rate": 2.057040449661105e-07, "logits/chosen": 3.453125, "logits/rejected": 3.890625, "logps/chosen": -262.0, "logps/rejected": -177.0, "loss": 0.6943, "rewards/accuracies": 0.5, "rewards/chosen": -0.00750732421875, "rewards/margins": 0.0087890625, "rewards/rejected": -0.0162353515625, "step": 5 }, { "epoch": 0.012558869701726845, "grad_norm": 8.330711364746094, "learning_rate": 2.2900676539246965e-07, "logits/chosen": 3.765625, "logits/rejected": 4.34375, "logps/chosen": -466.0, "logps/rejected": -286.0, "loss": 0.6939, "rewards/accuracies": 0.75, "rewards/chosen": 0.00250244140625, "rewards/margins": 0.01373291015625, "rewards/rejected": -0.01123046875, "step": 6 }, { "epoch": 0.014652014652014652, "grad_norm": 7.784880638122559, "learning_rate": 2.4870893478326387e-07, "logits/chosen": 4.15625, "logits/rejected": 3.9375, "logps/chosen": -398.0, "logps/rejected": -290.0, "loss": 0.6938, "rewards/accuracies": 1.0, "rewards/chosen": 0.00750732421875, "rewards/margins": 0.0250244140625, "rewards/rejected": -0.017578125, "step": 7 }, { "epoch": 0.01674515960230246, "grad_norm": 8.997271537780762, "learning_rate": 2.6577573020333683e-07, "logits/chosen": 3.71875, "logits/rejected": 4.125, "logps/chosen": -234.0, "logps/rejected": -211.0, "loss": 0.6937, "rewards/accuracies": 0.75, "rewards/chosen": 0.0137939453125, "rewards/margins": 0.0162353515625, "rewards/rejected": -0.00250244140625, "step": 8 }, { "epoch": 0.018838304552590265, "grad_norm": 7.737761974334717, "learning_rate": 2.808297106493814e-07, "logits/chosen": 3.28125, "logits/rejected": 3.453125, "logps/chosen": -300.0, "logps/rejected": -316.0, "loss": 0.6935, "rewards/accuracies": 0.5, "rewards/chosen": 0.01123046875, "rewards/margins": 0.00250244140625, "rewards/rejected": 0.0087890625, "step": 9 }, { "epoch": 0.020931449502878074, "grad_norm": 7.448185443878174, "learning_rate": 2.942959550338895e-07, "logits/chosen": 3.1875, "logits/rejected": 3.71875, "logps/chosen": -284.0, "logps/rejected": -231.0, "loss": 0.6923, "rewards/accuracies": 0.5, "rewards/chosen": -0.0050048828125, "rewards/margins": -0.00439453125, "rewards/rejected": -0.0006256103515625, "step": 10 }, { "epoch": 0.023024594453165882, "grad_norm": 8.635788917541504, "learning_rate": 3.0647765484394645e-07, "logits/chosen": 3.9375, "logits/rejected": 4.78125, "logps/chosen": -804.0, "logps/rejected": -274.0, "loss": 0.6937, "rewards/accuracies": 0.25, "rewards/chosen": 0.0150146484375, "rewards/margins": 0.00506591796875, "rewards/rejected": 0.010009765625, "step": 11 }, { "epoch": 0.02511773940345369, "grad_norm": 8.279386520385742, "learning_rate": 3.175986754602486e-07, "logits/chosen": 3.46875, "logits/rejected": 3.0, "logps/chosen": -536.0, "logps/rejected": -692.0, "loss": 0.6938, "rewards/accuracies": 0.5, "rewards/chosen": 0.01251220703125, "rewards/margins": 0.01190185546875, "rewards/rejected": 0.0006256103515625, "step": 12 }, { "epoch": 0.027210884353741496, "grad_norm": 7.699203014373779, "learning_rate": 3.2782902272079295e-07, "logits/chosen": 3.421875, "logits/rejected": 3.90625, "logps/chosen": -252.0, "logps/rejected": -118.0, "loss": 0.6937, "rewards/accuracies": 0.25, "rewards/chosen": -0.02880859375, "rewards/margins": -0.0306396484375, "rewards/rejected": 0.0018768310546875, "step": 13 }, { "epoch": 0.029304029304029304, "grad_norm": 7.581473350524902, "learning_rate": 3.373008448510428e-07, "logits/chosen": 2.28125, "logits/rejected": 2.625, "logps/chosen": -364.0, "logps/rejected": -151.0, "loss": 0.6929, "rewards/accuracies": 0.5, "rewards/chosen": -0.0050048828125, "rewards/margins": 0.002532958984375, "rewards/rejected": -0.00750732421875, "step": 14 }, { "epoch": 0.03139717425431711, "grad_norm": 8.212568283081055, "learning_rate": 3.461189002908012e-07, "logits/chosen": 3.546875, "logits/rejected": 4.0625, "logps/chosen": -416.0, "logps/rejected": -172.0, "loss": 0.6929, "rewards/accuracies": 0.75, "rewards/chosen": 0.027587890625, "rewards/margins": 0.018798828125, "rewards/rejected": 0.0087890625, "step": 15 }, { "epoch": 0.03349031920460492, "grad_norm": 8.286090850830078, "learning_rate": 3.543676402711158e-07, "logits/chosen": 3.140625, "logits/rejected": 3.765625, "logps/chosen": -688.0, "logps/rejected": -488.0, "loss": 0.6866, "rewards/accuracies": 0.75, "rewards/chosen": 0.01123046875, "rewards/margins": 0.036376953125, "rewards/rejected": -0.0250244140625, "step": 16 }, { "epoch": 0.035583464154892726, "grad_norm": 7.6797285079956055, "learning_rate": 3.621161404374383e-07, "logits/chosen": 3.40625, "logits/rejected": 3.4375, "logps/chosen": -268.0, "logps/rejected": -190.0, "loss": 0.6897, "rewards/accuracies": 0.25, "rewards/chosen": 0.01123046875, "rewards/margins": 0.0137939453125, "rewards/rejected": -0.00250244140625, "step": 17 }, { "epoch": 0.03767660910518053, "grad_norm": 8.51090145111084, "learning_rate": 3.6942162071716033e-07, "logits/chosen": 3.359375, "logits/rejected": 3.796875, "logps/chosen": -548.0, "logps/rejected": -338.0, "loss": 0.6935, "rewards/accuracies": 0.75, "rewards/chosen": 0.010009765625, "rewards/margins": 0.0162353515625, "rewards/rejected": -0.006256103515625, "step": 18 }, { "epoch": 0.03976975405546834, "grad_norm": 7.66601037979126, "learning_rate": 3.76332012245438e-07, "logits/chosen": 4.0, "logits/rejected": 3.765625, "logps/chosen": -140.0, "logps/rejected": -316.0, "loss": 0.6933, "rewards/accuracies": 0.5, "rewards/chosen": -0.0006256103515625, "rewards/margins": 0.003173828125, "rewards/rejected": -0.003753662109375, "step": 19 }, { "epoch": 0.04186289900575615, "grad_norm": 8.81503963470459, "learning_rate": 3.828878651016684e-07, "logits/chosen": 3.96875, "logits/rejected": 4.34375, "logps/chosen": -454.0, "logps/rejected": -324.0, "loss": 0.6945, "rewards/accuracies": 0.75, "rewards/chosen": 0.00970458984375, "rewards/margins": 0.0203857421875, "rewards/rejected": -0.0106201171875, "step": 20 }, { "epoch": 0.04395604395604396, "grad_norm": 7.937436103820801, "learning_rate": 3.891237901079545e-07, "logits/chosen": 4.1875, "logits/rejected": 3.625, "logps/chosen": -268.0, "logps/rejected": -392.0, "loss": 0.6934, "rewards/accuracies": 0.75, "rewards/chosen": 0.0306396484375, "rewards/margins": 0.03564453125, "rewards/rejected": -0.0050048828125, "step": 21 }, { "epoch": 0.046049188906331764, "grad_norm": 7.528475284576416, "learning_rate": 3.9506956491172536e-07, "logits/chosen": 3.625, "logits/rejected": 4.0625, "logps/chosen": -508.0, "logps/rejected": -374.0, "loss": 0.692, "rewards/accuracies": 0.5, "rewards/chosen": 0.0150146484375, "rewards/margins": 0.0, "rewards/rejected": 0.0150146484375, "step": 22 }, { "epoch": 0.04814233385661957, "grad_norm": 7.722219467163086, "learning_rate": 4.007509939970292e-07, "logits/chosen": 3.46875, "logits/rejected": 3.640625, "logps/chosen": -376.0, "logps/rejected": -296.0, "loss": 0.6883, "rewards/accuracies": 0.5, "rewards/chosen": -0.0006256103515625, "rewards/margins": -0.0006256103515625, "rewards/rejected": 0.0, "step": 23 }, { "epoch": 0.05023547880690738, "grad_norm": 8.647167205810547, "learning_rate": 4.061905855280276e-07, "logits/chosen": 3.6875, "logits/rejected": 3.71875, "logps/chosen": -117.0, "logps/rejected": -167.0, "loss": 0.6911, "rewards/accuracies": 0.0, "rewards/chosen": -0.00250244140625, "rewards/margins": -0.014404296875, "rewards/rejected": 0.01190185546875, "step": 24 }, { "epoch": 0.052328623757195186, "grad_norm": 7.382556438446045, "learning_rate": 4.11408089932221e-07, "logits/chosen": 3.28125, "logits/rejected": 3.265625, "logps/chosen": -398.0, "logps/rejected": -548.0, "loss": 0.6914, "rewards/accuracies": 0.75, "rewards/chosen": -0.01251220703125, "rewards/margins": -0.0224609375, "rewards/rejected": 0.010009765625, "step": 25 }, { "epoch": 0.05442176870748299, "grad_norm": 8.106797218322754, "learning_rate": 4.1642093278857186e-07, "logits/chosen": 2.390625, "logits/rejected": 3.5625, "logps/chosen": -680.0, "logps/rejected": -316.0, "loss": 0.6872, "rewards/accuracies": 0.75, "rewards/chosen": 0.021240234375, "rewards/margins": 0.032470703125, "rewards/rejected": -0.01123046875, "step": 26 }, { "epoch": 0.0565149136577708, "grad_norm": 8.886155128479004, "learning_rate": 4.212445659740721e-07, "logits/chosen": 3.578125, "logits/rejected": 4.0, "logps/chosen": -490.0, "logps/rejected": -238.0, "loss": 0.6898, "rewards/accuracies": 0.0, "rewards/chosen": -0.00750732421875, "rewards/margins": -0.01251220703125, "rewards/rejected": 0.0050048828125, "step": 27 }, { "epoch": 0.05860805860805861, "grad_norm": 8.035849571228027, "learning_rate": 4.2589275491882174e-07, "logits/chosen": 3.21875, "logits/rejected": 2.875, "logps/chosen": -212.0, "logps/rejected": -136.0, "loss": 0.6891, "rewards/accuracies": 0.75, "rewards/chosen": 0.00250244140625, "rewards/margins": 0.014404296875, "rewards/rejected": -0.01190185546875, "step": 28 }, { "epoch": 0.06070120355834641, "grad_norm": 8.825554847717285, "learning_rate": 4.303778154313212e-07, "logits/chosen": 2.921875, "logits/rejected": 2.46875, "logps/chosen": -396.0, "logps/rejected": -324.0, "loss": 0.6879, "rewards/accuracies": 1.0, "rewards/chosen": 0.016845703125, "rewards/margins": 0.03125, "rewards/rejected": -0.014404296875, "step": 29 }, { "epoch": 0.06279434850863422, "grad_norm": 7.398782253265381, "learning_rate": 4.347108103585802e-07, "logits/chosen": 3.78125, "logits/rejected": 4.4375, "logps/chosen": -438.0, "logps/rejected": -374.0, "loss": 0.6926, "rewards/accuracies": 0.5, "rewards/chosen": -0.0050048828125, "rewards/margins": 0.001251220703125, "rewards/rejected": -0.006256103515625, "step": 30 }, { "epoch": 0.06488749345892203, "grad_norm": 7.316285133361816, "learning_rate": 4.3890171398791635e-07, "logits/chosen": 3.125, "logits/rejected": 2.796875, "logps/chosen": -110.0, "logps/rejected": -161.0, "loss": 0.6901, "rewards/accuracies": 0.25, "rewards/chosen": -0.0087890625, "rewards/margins": -0.0050048828125, "rewards/rejected": -0.003753662109375, "step": 31 }, { "epoch": 0.06698063840920984, "grad_norm": 8.234468460083008, "learning_rate": 4.4295955033889476e-07, "logits/chosen": 3.6875, "logits/rejected": 4.21875, "logps/chosen": -584.0, "logps/rejected": -354.0, "loss": 0.6857, "rewards/accuracies": 0.75, "rewards/chosen": 0.032470703125, "rewards/margins": 0.0576171875, "rewards/rejected": -0.0250244140625, "step": 32 }, { "epoch": 0.06907378335949764, "grad_norm": 8.064417839050293, "learning_rate": 4.468925101686371e-07, "logits/chosen": 3.625, "logits/rejected": 3.40625, "logps/chosen": -253.0, "logps/rejected": -270.0, "loss": 0.6894, "rewards/accuracies": 0.25, "rewards/chosen": 0.0050048828125, "rewards/margins": -0.00250244140625, "rewards/rejected": 0.00750732421875, "step": 33 }, { "epoch": 0.07116692830978545, "grad_norm": 9.39603042602539, "learning_rate": 4.5070805050521726e-07, "logits/chosen": 3.40625, "logits/rejected": 3.34375, "logps/chosen": -556.0, "logps/rejected": -552.0, "loss": 0.6874, "rewards/accuracies": 0.25, "rewards/chosen": -0.016845703125, "rewards/margins": -0.0194091796875, "rewards/rejected": 0.00250244140625, "step": 34 }, { "epoch": 0.07326007326007326, "grad_norm": 7.400561809539795, "learning_rate": 4.5441297974937435e-07, "logits/chosen": 2.875, "logits/rejected": 3.09375, "logps/chosen": -308.0, "logps/rejected": -560.0, "loss": 0.6936, "rewards/accuracies": 0.75, "rewards/chosen": -0.01312255859375, "rewards/margins": -0.0306396484375, "rewards/rejected": 0.017578125, "step": 35 }, { "epoch": 0.07535321821036106, "grad_norm": 8.39317798614502, "learning_rate": 4.580135307849393e-07, "logits/chosen": 3.84375, "logits/rejected": 3.703125, "logps/chosen": -444.0, "logps/rejected": -464.0, "loss": 0.6848, "rewards/accuracies": 0.5, "rewards/chosen": 0.010009765625, "rewards/margins": 0.0113525390625, "rewards/rejected": -0.001251220703125, "step": 36 }, { "epoch": 0.07744636316064887, "grad_norm": 7.993171215057373, "learning_rate": 4.615154240700883e-07, "logits/chosen": 3.65625, "logits/rejected": 3.140625, "logps/chosen": -384.0, "logps/rejected": -426.0, "loss": 0.6911, "rewards/accuracies": 0.75, "rewards/chosen": 0.02001953125, "rewards/margins": 0.030029296875, "rewards/rejected": -0.010009765625, "step": 37 }, { "epoch": 0.07953950811093669, "grad_norm": 7.946194171905518, "learning_rate": 4.649239223132169e-07, "logits/chosen": 4.6875, "logits/rejected": 4.40625, "logps/chosen": -316.0, "logps/rejected": -506.0, "loss": 0.6868, "rewards/accuracies": 1.0, "rewards/chosen": 0.01251220703125, "rewards/margins": 0.04248046875, "rewards/rejected": -0.030029296875, "step": 38 }, { "epoch": 0.08163265306122448, "grad_norm": 8.186020851135254, "learning_rate": 4.6824387804548366e-07, "logits/chosen": 4.21875, "logits/rejected": 4.3125, "logps/chosen": -296.0, "logps/rejected": -332.0, "loss": 0.687, "rewards/accuracies": 0.75, "rewards/chosen": 0.01373291015625, "rewards/margins": 0.0294189453125, "rewards/rejected": -0.015625, "step": 39 }, { "epoch": 0.0837257980115123, "grad_norm": 7.499300003051758, "learning_rate": 4.7147977516944737e-07, "logits/chosen": 3.5, "logits/rejected": 3.375, "logps/chosen": -78.5, "logps/rejected": -73.5, "loss": 0.6913, "rewards/accuracies": 0.25, "rewards/chosen": -0.0106201171875, "rewards/margins": -0.0118408203125, "rewards/rejected": 0.001251220703125, "step": 40 }, { "epoch": 0.08581894296180011, "grad_norm": 7.722729206085205, "learning_rate": 4.7463576537657413e-07, "logits/chosen": 3.03125, "logits/rejected": 2.9375, "logps/chosen": -310.0, "logps/rejected": -394.0, "loss": 0.6912, "rewards/accuracies": 0.75, "rewards/chosen": -0.006866455078125, "rewards/margins": 0.0181884765625, "rewards/rejected": -0.0250244140625, "step": 41 }, { "epoch": 0.08791208791208792, "grad_norm": 7.086386680603027, "learning_rate": 4.777157001757335e-07, "logits/chosen": 3.875, "logits/rejected": 3.828125, "logps/chosen": -458.0, "logps/rejected": -444.0, "loss": 0.6914, "rewards/accuracies": 1.0, "rewards/chosen": 0.0087890625, "rewards/margins": 0.07666015625, "rewards/rejected": -0.06787109375, "step": 42 }, { "epoch": 0.09000523286237572, "grad_norm": 8.327207565307617, "learning_rate": 4.807231591525269e-07, "logits/chosen": 3.21875, "logits/rejected": 3.4375, "logps/chosen": -202.0, "logps/rejected": -176.0, "loss": 0.6819, "rewards/accuracies": 0.0, "rewards/chosen": -0.02880859375, "rewards/margins": -0.02880859375, "rewards/rejected": 0.0, "step": 43 }, { "epoch": 0.09209837781266353, "grad_norm": 8.074577331542969, "learning_rate": 4.836614749795043e-07, "logits/chosen": 3.65625, "logits/rejected": 3.984375, "logps/chosen": -348.0, "logps/rejected": -218.0, "loss": 0.6867, "rewards/accuracies": 0.75, "rewards/chosen": 0.0087890625, "rewards/margins": 0.027587890625, "rewards/rejected": -0.018798828125, "step": 44 }, { "epoch": 0.09419152276295134, "grad_norm": 8.236300468444824, "learning_rate": 4.865337556154919e-07, "logits/chosen": 3.125, "logits/rejected": 3.0625, "logps/chosen": -276.0, "logps/rejected": -294.0, "loss": 0.6873, "rewards/accuracies": 0.5, "rewards/chosen": 0.0087890625, "rewards/margins": 0.0181884765625, "rewards/rejected": -0.0093994140625, "step": 45 }, { "epoch": 0.09628466771323914, "grad_norm": 8.110278129577637, "learning_rate": 4.893429040648081e-07, "logits/chosen": 2.640625, "logits/rejected": 2.96875, "logps/chosen": -576.0, "logps/rejected": -544.0, "loss": 0.6844, "rewards/accuracies": 0.5, "rewards/chosen": 0.052490234375, "rewards/margins": 0.0150146484375, "rewards/rejected": 0.03759765625, "step": 46 }, { "epoch": 0.09837781266352695, "grad_norm": 7.28758430480957, "learning_rate": 4.920916360113128e-07, "logits/chosen": 4.15625, "logits/rejected": 3.609375, "logps/chosen": -356.0, "logps/rejected": -528.0, "loss": 0.6908, "rewards/accuracies": 0.75, "rewards/chosen": 0.018798828125, "rewards/margins": 0.03369140625, "rewards/rejected": -0.0150146484375, "step": 47 }, { "epoch": 0.10047095761381476, "grad_norm": 7.295600891113281, "learning_rate": 4.947824955958065e-07, "logits/chosen": 3.5625, "logits/rejected": 4.15625, "logps/chosen": -326.0, "logps/rejected": -320.0, "loss": 0.6847, "rewards/accuracies": 0.5, "rewards/chosen": -0.010009765625, "rewards/margins": -0.00250244140625, "rewards/rejected": -0.00750732421875, "step": 48 }, { "epoch": 0.10256410256410256, "grad_norm": 7.849913597106934, "learning_rate": 4.974178695665277e-07, "logits/chosen": 4.21875, "logits/rejected": 3.921875, "logps/chosen": -266.0, "logps/rejected": -348.0, "loss": 0.6857, "rewards/accuracies": 0.75, "rewards/chosen": 0.0150146484375, "rewards/margins": 0.05517578125, "rewards/rejected": -0.0400390625, "step": 49 }, { "epoch": 0.10465724751439037, "grad_norm": 8.052017211914062, "learning_rate": 5e-07, "logits/chosen": 3.1875, "logits/rejected": 3.09375, "logps/chosen": -178.0, "logps/rejected": -358.0, "loss": 0.69, "rewards/accuracies": 0.75, "rewards/chosen": -0.0018768310546875, "rewards/margins": 0.0281982421875, "rewards/rejected": -0.030029296875, "step": 50 }, { "epoch": 0.10675039246467818, "grad_norm": 7.91984748840332, "learning_rate": 4.99999353186937e-07, "logits/chosen": 3.21875, "logits/rejected": 4.3125, "logps/chosen": -540.0, "logps/rejected": -280.0, "loss": 0.6877, "rewards/accuracies": 1.0, "rewards/chosen": 0.0250244140625, "rewards/margins": 0.052490234375, "rewards/rejected": -0.027587890625, "step": 51 }, { "epoch": 0.10884353741496598, "grad_norm": 7.94594144821167, "learning_rate": 4.999974127510951e-07, "logits/chosen": 3.71875, "logits/rejected": 3.96875, "logps/chosen": -241.0, "logps/rejected": -238.0, "loss": 0.6801, "rewards/accuracies": 0.5, "rewards/chosen": -0.0400390625, "rewards/margins": 0.0150146484375, "rewards/rejected": -0.05517578125, "step": 52 }, { "epoch": 0.1109366823652538, "grad_norm": 51.00778579711914, "learning_rate": 4.999941787025163e-07, "logits/chosen": 3.5625, "logits/rejected": 3.78125, "logps/chosen": -540.0, "logps/rejected": -430.0, "loss": 0.685, "rewards/accuracies": 1.0, "rewards/chosen": 0.0751953125, "rewards/margins": 0.115234375, "rewards/rejected": -0.0400390625, "step": 53 }, { "epoch": 0.1130298273155416, "grad_norm": 7.948894500732422, "learning_rate": 4.999896510579369e-07, "logits/chosen": 3.453125, "logits/rejected": 3.84375, "logps/chosen": -528.0, "logps/rejected": -284.0, "loss": 0.6866, "rewards/accuracies": 0.5, "rewards/chosen": 0.01123046875, "rewards/margins": 0.02880859375, "rewards/rejected": -0.017578125, "step": 54 }, { "epoch": 0.1151229722658294, "grad_norm": 6.681576251983643, "learning_rate": 4.999838298407872e-07, "logits/chosen": 3.4375, "logits/rejected": 3.5625, "logps/chosen": -245.0, "logps/rejected": -208.0, "loss": 0.6892, "rewards/accuracies": 0.5, "rewards/chosen": -0.031982421875, "rewards/margins": -0.014404296875, "rewards/rejected": -0.017578125, "step": 55 }, { "epoch": 0.11721611721611722, "grad_norm": 7.2991156578063965, "learning_rate": 4.999767150811926e-07, "logits/chosen": 3.234375, "logits/rejected": 3.515625, "logps/chosen": -204.0, "logps/rejected": -124.0, "loss": 0.6845, "rewards/accuracies": 0.25, "rewards/chosen": -0.03125, "rewards/margins": -0.01507568359375, "rewards/rejected": -0.0162353515625, "step": 56 }, { "epoch": 0.11930926216640503, "grad_norm": 7.868374824523926, "learning_rate": 4.999683068159718e-07, "logits/chosen": 3.03125, "logits/rejected": 3.359375, "logps/chosen": -418.0, "logps/rejected": -344.0, "loss": 0.6786, "rewards/accuracies": 0.5, "rewards/chosen": -0.016845703125, "rewards/margins": -0.0024871826171875, "rewards/rejected": -0.014404296875, "step": 57 }, { "epoch": 0.12140240711669283, "grad_norm": 7.583108901977539, "learning_rate": 4.999586050886378e-07, "logits/chosen": 3.875, "logits/rejected": 4.5625, "logps/chosen": -490.0, "logps/rejected": -264.0, "loss": 0.6882, "rewards/accuracies": 0.75, "rewards/chosen": 0.0225830078125, "rewards/margins": 0.02880859375, "rewards/rejected": -0.006256103515625, "step": 58 }, { "epoch": 0.12349555206698064, "grad_norm": 7.541992664337158, "learning_rate": 4.999476099493974e-07, "logits/chosen": 2.671875, "logits/rejected": 2.625, "logps/chosen": -234.0, "logps/rejected": -214.0, "loss": 0.6893, "rewards/accuracies": 0.75, "rewards/chosen": -0.02001953125, "rewards/margins": 0.0093994140625, "rewards/rejected": -0.0294189453125, "step": 59 }, { "epoch": 0.12558869701726844, "grad_norm": 7.136630058288574, "learning_rate": 4.999353214551507e-07, "logits/chosen": 3.328125, "logits/rejected": 3.40625, "logps/chosen": -362.0, "logps/rejected": -230.0, "loss": 0.6844, "rewards/accuracies": 0.25, "rewards/chosen": -0.0849609375, "rewards/margins": -0.05517578125, "rewards/rejected": -0.030029296875, "step": 60 }, { "epoch": 0.12768184196755625, "grad_norm": 7.714498043060303, "learning_rate": 4.999217396694907e-07, "logits/chosen": 4.21875, "logits/rejected": 3.890625, "logps/chosen": -388.0, "logps/rejected": -596.0, "loss": 0.6866, "rewards/accuracies": 0.25, "rewards/chosen": -0.026611328125, "rewards/margins": -0.03271484375, "rewards/rejected": 0.006256103515625, "step": 61 }, { "epoch": 0.12977498691784406, "grad_norm": 7.706767559051514, "learning_rate": 4.999068646627036e-07, "logits/chosen": 4.40625, "logits/rejected": 4.09375, "logps/chosen": -348.0, "logps/rejected": -508.0, "loss": 0.6833, "rewards/accuracies": 0.75, "rewards/chosen": -0.02490234375, "rewards/margins": 0.04248046875, "rewards/rejected": -0.0673828125, "step": 62 }, { "epoch": 0.13186813186813187, "grad_norm": 7.379638195037842, "learning_rate": 4.998906965117679e-07, "logits/chosen": 3.78125, "logits/rejected": 4.0, "logps/chosen": -540.0, "logps/rejected": -364.0, "loss": 0.6871, "rewards/accuracies": 0.5, "rewards/chosen": 0.0150146484375, "rewards/margins": 0.050048828125, "rewards/rejected": -0.03515625, "step": 63 }, { "epoch": 0.13396127681841968, "grad_norm": 7.627968788146973, "learning_rate": 4.99873235300354e-07, "logits/chosen": 3.671875, "logits/rejected": 4.1875, "logps/chosen": -672.0, "logps/rejected": -360.0, "loss": 0.6811, "rewards/accuracies": 0.75, "rewards/chosen": 0.0712890625, "rewards/margins": 0.10498046875, "rewards/rejected": -0.03369140625, "step": 64 }, { "epoch": 0.1360544217687075, "grad_norm": 7.326155662536621, "learning_rate": 4.998544811188243e-07, "logits/chosen": 3.75, "logits/rejected": 3.65625, "logps/chosen": -177.0, "logps/rejected": -169.0, "loss": 0.6796, "rewards/accuracies": 0.75, "rewards/chosen": -0.03759765625, "rewards/margins": 0.05712890625, "rewards/rejected": -0.09423828125, "step": 65 }, { "epoch": 0.13814756671899528, "grad_norm": 7.775496006011963, "learning_rate": 4.998344340642319e-07, "logits/chosen": 4.03125, "logits/rejected": 3.984375, "logps/chosen": -220.0, "logps/rejected": -262.0, "loss": 0.6869, "rewards/accuracies": 0.5, "rewards/chosen": -0.0262451171875, "rewards/margins": 0.013671875, "rewards/rejected": -0.0400390625, "step": 66 }, { "epoch": 0.1402407116692831, "grad_norm": 7.598969459533691, "learning_rate": 4.998130942403208e-07, "logits/chosen": 3.296875, "logits/rejected": 3.625, "logps/chosen": -207.0, "logps/rejected": -243.0, "loss": 0.6896, "rewards/accuracies": 0.5, "rewards/chosen": -0.06005859375, "rewards/margins": 0.03564453125, "rewards/rejected": -0.095703125, "step": 67 }, { "epoch": 0.1423338566195709, "grad_norm": 7.5228166580200195, "learning_rate": 4.99790461757525e-07, "logits/chosen": 3.828125, "logits/rejected": 4.1875, "logps/chosen": -388.0, "logps/rejected": -156.0, "loss": 0.6775, "rewards/accuracies": 0.75, "rewards/chosen": 0.03515625, "rewards/margins": 0.134765625, "rewards/rejected": -0.10009765625, "step": 68 }, { "epoch": 0.14442700156985872, "grad_norm": 7.170711517333984, "learning_rate": 4.997665367329683e-07, "logits/chosen": 3.671875, "logits/rejected": 3.96875, "logps/chosen": -648.0, "logps/rejected": -498.0, "loss": 0.6784, "rewards/accuracies": 0.75, "rewards/chosen": 0.07763671875, "rewards/margins": 0.10546875, "rewards/rejected": -0.0274658203125, "step": 69 }, { "epoch": 0.14652014652014653, "grad_norm": 7.3945536613464355, "learning_rate": 4.99741319290463e-07, "logits/chosen": 4.40625, "logits/rejected": 3.5625, "logps/chosen": -188.0, "logps/rejected": -568.0, "loss": 0.676, "rewards/accuracies": 0.5, "rewards/chosen": -0.0712890625, "rewards/margins": -0.03125, "rewards/rejected": -0.0400390625, "step": 70 }, { "epoch": 0.14861329147043434, "grad_norm": 7.3103556632995605, "learning_rate": 4.9971480956051e-07, "logits/chosen": 3.0, "logits/rejected": 3.359375, "logps/chosen": -284.0, "logps/rejected": -248.0, "loss": 0.6843, "rewards/accuracies": 0.5, "rewards/chosen": -0.09765625, "rewards/margins": -0.06494140625, "rewards/rejected": -0.032470703125, "step": 71 }, { "epoch": 0.15070643642072212, "grad_norm": 7.618526935577393, "learning_rate": 4.996870076802977e-07, "logits/chosen": 3.921875, "logits/rejected": 4.1875, "logps/chosen": -238.0, "logps/rejected": -218.0, "loss": 0.6871, "rewards/accuracies": 0.5, "rewards/chosen": -0.061279296875, "rewards/margins": -0.006591796875, "rewards/rejected": -0.0546875, "step": 72 }, { "epoch": 0.15279958137100993, "grad_norm": 6.8290791511535645, "learning_rate": 4.996579137937015e-07, "logits/chosen": 3.46875, "logits/rejected": 3.625, "logps/chosen": -356.0, "logps/rejected": -474.0, "loss": 0.6848, "rewards/accuracies": 0.25, "rewards/chosen": -0.1103515625, "rewards/margins": -0.03564453125, "rewards/rejected": -0.07421875, "step": 73 }, { "epoch": 0.15489272632129775, "grad_norm": 8.002132415771484, "learning_rate": 4.99627528051283e-07, "logits/chosen": 4.03125, "logits/rejected": 3.625, "logps/chosen": -498.0, "logps/rejected": -440.0, "loss": 0.6832, "rewards/accuracies": 0.75, "rewards/chosen": -0.00750732421875, "rewards/margins": 0.09765625, "rewards/rejected": -0.10498046875, "step": 74 }, { "epoch": 0.15698587127158556, "grad_norm": 7.541519641876221, "learning_rate": 4.99595850610289e-07, "logits/chosen": 3.265625, "logits/rejected": 3.765625, "logps/chosen": -412.0, "logps/rejected": -154.0, "loss": 0.6793, "rewards/accuracies": 0.25, "rewards/chosen": -0.023681640625, "rewards/margins": 0.026123046875, "rewards/rejected": -0.050048828125, "step": 75 }, { "epoch": 0.15907901622187337, "grad_norm": 7.712185859680176, "learning_rate": 4.995628816346507e-07, "logits/chosen": 3.21875, "logits/rejected": 3.3125, "logps/chosen": -300.0, "logps/rejected": -292.0, "loss": 0.6776, "rewards/accuracies": 0.5, "rewards/chosen": -0.0250244140625, "rewards/margins": 0.030029296875, "rewards/rejected": -0.054931640625, "step": 76 }, { "epoch": 0.16117216117216118, "grad_norm": 7.439749240875244, "learning_rate": 4.995286212949837e-07, "logits/chosen": 3.703125, "logits/rejected": 4.03125, "logps/chosen": -382.0, "logps/rejected": -210.0, "loss": 0.6769, "rewards/accuracies": 0.75, "rewards/chosen": -0.0030517578125, "rewards/margins": 0.11474609375, "rewards/rejected": -0.1171875, "step": 77 }, { "epoch": 0.16326530612244897, "grad_norm": 6.7779693603515625, "learning_rate": 4.994930697685857e-07, "logits/chosen": 3.84375, "logits/rejected": 4.0, "logps/chosen": -174.0, "logps/rejected": -186.0, "loss": 0.674, "rewards/accuracies": 0.75, "rewards/chosen": -0.025634765625, "rewards/margins": 0.0106201171875, "rewards/rejected": -0.036376953125, "step": 78 }, { "epoch": 0.16535845107273678, "grad_norm": 7.856261253356934, "learning_rate": 4.994562272394368e-07, "logits/chosen": 3.703125, "logits/rejected": 4.09375, "logps/chosen": -358.0, "logps/rejected": -374.0, "loss": 0.6768, "rewards/accuracies": 0.0, "rewards/chosen": -0.1396484375, "rewards/margins": -0.0849609375, "rewards/rejected": -0.05517578125, "step": 79 }, { "epoch": 0.1674515960230246, "grad_norm": 7.679439544677734, "learning_rate": 4.994180938981979e-07, "logits/chosen": 3.984375, "logits/rejected": 3.96875, "logps/chosen": -384.0, "logps/rejected": -384.0, "loss": 0.6809, "rewards/accuracies": 0.75, "rewards/chosen": 0.00628662109375, "rewards/margins": 0.05322265625, "rewards/rejected": -0.046875, "step": 80 }, { "epoch": 0.1695447409733124, "grad_norm": 7.106894493103027, "learning_rate": 4.993786699422098e-07, "logits/chosen": 2.703125, "logits/rejected": 3.34375, "logps/chosen": -394.0, "logps/rejected": -274.0, "loss": 0.6748, "rewards/accuracies": 0.75, "rewards/chosen": -0.015625, "rewards/margins": 0.08056640625, "rewards/rejected": -0.09619140625, "step": 81 }, { "epoch": 0.17163788592360021, "grad_norm": 7.259335517883301, "learning_rate": 4.993379555754923e-07, "logits/chosen": 2.875, "logits/rejected": 3.328125, "logps/chosen": -320.0, "logps/rejected": -348.0, "loss": 0.6752, "rewards/accuracies": 1.0, "rewards/chosen": -0.02001953125, "rewards/margins": 0.05859375, "rewards/rejected": -0.07861328125, "step": 82 }, { "epoch": 0.17373103087388803, "grad_norm": 8.156006813049316, "learning_rate": 4.992959510087432e-07, "logits/chosen": 4.40625, "logits/rejected": 4.84375, "logps/chosen": -564.0, "logps/rejected": -620.0, "loss": 0.6873, "rewards/accuracies": 0.25, "rewards/chosen": -0.125, "rewards/margins": -0.0098876953125, "rewards/rejected": -0.115234375, "step": 83 }, { "epoch": 0.17582417582417584, "grad_norm": 8.099892616271973, "learning_rate": 4.992526564593371e-07, "logits/chosen": 3.3125, "logits/rejected": 3.21875, "logps/chosen": -334.0, "logps/rejected": -276.0, "loss": 0.6788, "rewards/accuracies": 0.75, "rewards/chosen": -0.05517578125, "rewards/margins": 0.02490234375, "rewards/rejected": -0.080078125, "step": 84 }, { "epoch": 0.17791732077446362, "grad_norm": 7.183663368225098, "learning_rate": 4.992080721513243e-07, "logits/chosen": 3.40625, "logits/rejected": 3.6875, "logps/chosen": -316.0, "logps/rejected": -284.0, "loss": 0.6828, "rewards/accuracies": 0.75, "rewards/chosen": -0.054931640625, "rewards/margins": 0.0751953125, "rewards/rejected": -0.1298828125, "step": 85 }, { "epoch": 0.18001046572475143, "grad_norm": 7.948578357696533, "learning_rate": 4.991621983154294e-07, "logits/chosen": 2.875, "logits/rejected": 3.0625, "logps/chosen": -656.0, "logps/rejected": -460.0, "loss": 0.6808, "rewards/accuracies": 0.75, "rewards/chosen": -0.0224609375, "rewards/margins": 0.080078125, "rewards/rejected": -0.1025390625, "step": 86 }, { "epoch": 0.18210361067503925, "grad_norm": 7.284086227416992, "learning_rate": 4.991150351890505e-07, "logits/chosen": 2.96875, "logits/rejected": 3.28125, "logps/chosen": -256.0, "logps/rejected": -249.0, "loss": 0.685, "rewards/accuracies": 0.25, "rewards/chosen": -0.10498046875, "rewards/margins": -0.01507568359375, "rewards/rejected": -0.08984375, "step": 87 }, { "epoch": 0.18419675562532706, "grad_norm": 7.705651760101318, "learning_rate": 4.990665830162581e-07, "logits/chosen": 3.109375, "logits/rejected": 3.328125, "logps/chosen": -296.0, "logps/rejected": -223.0, "loss": 0.6761, "rewards/accuracies": 0.5, "rewards/chosen": -0.0908203125, "rewards/margins": 0.0093994140625, "rewards/rejected": -0.10009765625, "step": 88 }, { "epoch": 0.18628990057561487, "grad_norm": 7.44476318359375, "learning_rate": 4.99016842047793e-07, "logits/chosen": 3.65625, "logits/rejected": 3.96875, "logps/chosen": -151.0, "logps/rejected": -123.0, "loss": 0.6743, "rewards/accuracies": 0.75, "rewards/chosen": -0.027587890625, "rewards/margins": -0.002349853515625, "rewards/rejected": -0.0252685546875, "step": 89 }, { "epoch": 0.18838304552590268, "grad_norm": 7.6581950187683105, "learning_rate": 4.989658125410658e-07, "logits/chosen": 4.46875, "logits/rejected": 4.09375, "logps/chosen": -312.0, "logps/rejected": -338.0, "loss": 0.678, "rewards/accuracies": 1.0, "rewards/chosen": -0.054931640625, "rewards/margins": 0.05322265625, "rewards/rejected": -0.10791015625, "step": 90 }, { "epoch": 0.19047619047619047, "grad_norm": 7.140282154083252, "learning_rate": 4.989134947601555e-07, "logits/chosen": 2.765625, "logits/rejected": 2.875, "logps/chosen": -288.0, "logps/rejected": -364.0, "loss": 0.6822, "rewards/accuracies": 0.75, "rewards/chosen": -0.083984375, "rewards/margins": 0.017822265625, "rewards/rejected": -0.10205078125, "step": 91 }, { "epoch": 0.19256933542647828, "grad_norm": 7.376110553741455, "learning_rate": 4.988598889758077e-07, "logits/chosen": 3.765625, "logits/rejected": 4.34375, "logps/chosen": -708.0, "logps/rejected": -520.0, "loss": 0.6778, "rewards/accuracies": 0.5, "rewards/chosen": -0.0400390625, "rewards/margins": 0.0247802734375, "rewards/rejected": -0.06494140625, "step": 92 }, { "epoch": 0.1946624803767661, "grad_norm": 7.1125664710998535, "learning_rate": 4.988049954654334e-07, "logits/chosen": 3.5, "logits/rejected": 3.59375, "logps/chosen": -318.0, "logps/rejected": -422.0, "loss": 0.6808, "rewards/accuracies": 0.5, "rewards/chosen": -0.10498046875, "rewards/margins": -0.017578125, "rewards/rejected": -0.08740234375, "step": 93 }, { "epoch": 0.1967556253270539, "grad_norm": 8.067298889160156, "learning_rate": 4.987488145131078e-07, "logits/chosen": 3.90625, "logits/rejected": 4.1875, "logps/chosen": -480.0, "logps/rejected": -302.0, "loss": 0.6856, "rewards/accuracies": 0.75, "rewards/chosen": -0.0498046875, "rewards/margins": 0.1279296875, "rewards/rejected": -0.177734375, "step": 94 }, { "epoch": 0.1988487702773417, "grad_norm": 7.559632301330566, "learning_rate": 4.986913464095686e-07, "logits/chosen": 3.109375, "logits/rejected": 2.84375, "logps/chosen": -426.0, "logps/rejected": -446.0, "loss": 0.6753, "rewards/accuracies": 0.75, "rewards/chosen": 0.0137939453125, "rewards/margins": 0.2041015625, "rewards/rejected": -0.189453125, "step": 95 }, { "epoch": 0.20094191522762953, "grad_norm": 8.821226119995117, "learning_rate": 4.986325914522145e-07, "logits/chosen": 3.78125, "logits/rejected": 4.125, "logps/chosen": -512.0, "logps/rejected": -412.0, "loss": 0.678, "rewards/accuracies": 0.5, "rewards/chosen": -0.125, "rewards/margins": -0.080078125, "rewards/rejected": -0.044921875, "step": 96 }, { "epoch": 0.2030350601779173, "grad_norm": 7.98090124130249, "learning_rate": 4.985725499451036e-07, "logits/chosen": 3.796875, "logits/rejected": 4.125, "logps/chosen": -540.0, "logps/rejected": -412.0, "loss": 0.6736, "rewards/accuracies": 1.0, "rewards/chosen": -0.042236328125, "rewards/margins": 0.125, "rewards/rejected": -0.1669921875, "step": 97 }, { "epoch": 0.20512820512820512, "grad_norm": 7.935008525848389, "learning_rate": 4.985112221989522e-07, "logits/chosen": 3.375, "logits/rejected": 3.328125, "logps/chosen": -342.0, "logps/rejected": -294.0, "loss": 0.6784, "rewards/accuracies": 0.5, "rewards/chosen": -0.04638671875, "rewards/margins": 0.0162353515625, "rewards/rejected": -0.0625, "step": 98 }, { "epoch": 0.20722135007849293, "grad_norm": 7.756891250610352, "learning_rate": 4.984486085311325e-07, "logits/chosen": 3.71875, "logits/rejected": 3.390625, "logps/chosen": -668.0, "logps/rejected": -624.0, "loss": 0.6766, "rewards/accuracies": 0.75, "rewards/chosen": -0.06494140625, "rewards/margins": 0.279296875, "rewards/rejected": -0.34375, "step": 99 }, { "epoch": 0.20931449502878074, "grad_norm": 7.824303150177002, "learning_rate": 4.983847092656719e-07, "logits/chosen": 3.390625, "logits/rejected": 3.625, "logps/chosen": -436.0, "logps/rejected": -266.0, "loss": 0.678, "rewards/accuracies": 0.5, "rewards/chosen": -0.1064453125, "rewards/margins": -0.015380859375, "rewards/rejected": -0.09130859375, "step": 100 }, { "epoch": 0.21140763997906856, "grad_norm": 7.823368549346924, "learning_rate": 4.983195247332502e-07, "logits/chosen": 2.703125, "logits/rejected": 2.859375, "logps/chosen": -286.0, "logps/rejected": -116.5, "loss": 0.6781, "rewards/accuracies": 0.75, "rewards/chosen": -0.09814453125, "rewards/margins": 0.027587890625, "rewards/rejected": -0.1259765625, "step": 101 }, { "epoch": 0.21350078492935637, "grad_norm": 7.150810718536377, "learning_rate": 4.982530552711989e-07, "logits/chosen": 3.375, "logits/rejected": 3.4375, "logps/chosen": -278.0, "logps/rejected": -366.0, "loss": 0.6755, "rewards/accuracies": 0.75, "rewards/chosen": -0.05859375, "rewards/margins": 0.0986328125, "rewards/rejected": -0.1572265625, "step": 102 }, { "epoch": 0.21559392987964415, "grad_norm": 8.53307819366455, "learning_rate": 4.981853012234991e-07, "logits/chosen": 3.71875, "logits/rejected": 3.5625, "logps/chosen": -480.0, "logps/rejected": -672.0, "loss": 0.6811, "rewards/accuracies": 0.5, "rewards/chosen": -0.06787109375, "rewards/margins": 0.01214599609375, "rewards/rejected": -0.080078125, "step": 103 }, { "epoch": 0.21768707482993196, "grad_norm": 7.274529933929443, "learning_rate": 4.981162629407793e-07, "logits/chosen": 4.0625, "logits/rejected": 4.5, "logps/chosen": -820.0, "logps/rejected": -516.0, "loss": 0.6868, "rewards/accuracies": 0.75, "rewards/chosen": -0.050048828125, "rewards/margins": 0.134765625, "rewards/rejected": -0.185546875, "step": 104 }, { "epoch": 0.21978021978021978, "grad_norm": 8.474202156066895, "learning_rate": 4.980459407803141e-07, "logits/chosen": 2.671875, "logits/rejected": 2.890625, "logps/chosen": -246.0, "logps/rejected": -230.0, "loss": 0.6707, "rewards/accuracies": 1.0, "rewards/chosen": -0.0262451171875, "rewards/margins": 0.12890625, "rewards/rejected": -0.1552734375, "step": 105 }, { "epoch": 0.2218733647305076, "grad_norm": 8.309959411621094, "learning_rate": 4.979743351060225e-07, "logits/chosen": 3.390625, "logits/rejected": 4.1875, "logps/chosen": -438.0, "logps/rejected": -432.0, "loss": 0.687, "rewards/accuracies": 0.5, "rewards/chosen": -0.1201171875, "rewards/margins": 0.0264892578125, "rewards/rejected": -0.146484375, "step": 106 }, { "epoch": 0.2239665096807954, "grad_norm": 8.156225204467773, "learning_rate": 4.97901446288465e-07, "logits/chosen": 3.5625, "logits/rejected": 3.859375, "logps/chosen": -668.0, "logps/rejected": -552.0, "loss": 0.6714, "rewards/accuracies": 0.5, "rewards/chosen": -0.10498046875, "rewards/margins": 0.057373046875, "rewards/rejected": -0.162109375, "step": 107 }, { "epoch": 0.2260596546310832, "grad_norm": 7.299003601074219, "learning_rate": 4.978272747048432e-07, "logits/chosen": 3.234375, "logits/rejected": 3.703125, "logps/chosen": -382.0, "logps/rejected": -191.0, "loss": 0.6648, "rewards/accuracies": 1.0, "rewards/chosen": -0.04248046875, "rewards/margins": 0.12353515625, "rewards/rejected": -0.166015625, "step": 108 }, { "epoch": 0.228152799581371, "grad_norm": 8.61748218536377, "learning_rate": 4.977518207389965e-07, "logits/chosen": 3.203125, "logits/rejected": 3.40625, "logps/chosen": -242.0, "logps/rejected": -181.0, "loss": 0.6707, "rewards/accuracies": 0.0, "rewards/chosen": -0.162109375, "rewards/margins": -0.0771484375, "rewards/rejected": -0.08544921875, "step": 109 }, { "epoch": 0.2302459445316588, "grad_norm": 6.710155010223389, "learning_rate": 4.97675084781401e-07, "logits/chosen": 3.390625, "logits/rejected": 3.6875, "logps/chosen": -500.0, "logps/rejected": -211.0, "loss": 0.6597, "rewards/accuracies": 0.75, "rewards/chosen": -0.06494140625, "rewards/margins": 0.031494140625, "rewards/rejected": -0.09619140625, "step": 110 }, { "epoch": 0.23233908948194662, "grad_norm": 8.158218383789062, "learning_rate": 4.975970672291667e-07, "logits/chosen": 3.328125, "logits/rejected": 3.21875, "logps/chosen": -368.0, "logps/rejected": -268.0, "loss": 0.6567, "rewards/accuracies": 0.75, "rewards/chosen": -0.04248046875, "rewards/margins": 0.10595703125, "rewards/rejected": -0.1484375, "step": 111 }, { "epoch": 0.23443223443223443, "grad_norm": 7.508507251739502, "learning_rate": 4.975177684860365e-07, "logits/chosen": 3.671875, "logits/rejected": 3.734375, "logps/chosen": -366.0, "logps/rejected": -384.0, "loss": 0.6853, "rewards/accuracies": 0.5, "rewards/chosen": -0.1328125, "rewards/margins": -0.060302734375, "rewards/rejected": -0.072265625, "step": 112 }, { "epoch": 0.23652537938252224, "grad_norm": 7.649636268615723, "learning_rate": 4.974371889623828e-07, "logits/chosen": 3.125, "logits/rejected": 3.34375, "logps/chosen": -394.0, "logps/rejected": -272.0, "loss": 0.6573, "rewards/accuracies": 0.75, "rewards/chosen": -0.0037841796875, "rewards/margins": 0.0927734375, "rewards/rejected": -0.09619140625, "step": 113 }, { "epoch": 0.23861852433281006, "grad_norm": 7.413567543029785, "learning_rate": 4.973553290752066e-07, "logits/chosen": 2.671875, "logits/rejected": 2.6875, "logps/chosen": -83.5, "logps/rejected": -131.0, "loss": 0.6798, "rewards/accuracies": 0.5, "rewards/chosen": -0.11962890625, "rewards/margins": 0.01434326171875, "rewards/rejected": -0.1337890625, "step": 114 }, { "epoch": 0.24071166928309787, "grad_norm": 8.337164878845215, "learning_rate": 4.972721892481346e-07, "logits/chosen": 3.046875, "logits/rejected": 3.0625, "logps/chosen": -282.0, "logps/rejected": -318.0, "loss": 0.6923, "rewards/accuracies": 0.5, "rewards/chosen": -0.10546875, "rewards/margins": 0.0791015625, "rewards/rejected": -0.1845703125, "step": 115 }, { "epoch": 0.24280481423338565, "grad_norm": 7.646633148193359, "learning_rate": 4.971877699114173e-07, "logits/chosen": 2.859375, "logits/rejected": 3.234375, "logps/chosen": -390.0, "logps/rejected": -256.0, "loss": 0.6719, "rewards/accuracies": 1.0, "rewards/chosen": -0.044921875, "rewards/margins": 0.08203125, "rewards/rejected": -0.126953125, "step": 116 }, { "epoch": 0.24489795918367346, "grad_norm": 7.244144439697266, "learning_rate": 4.971020715019264e-07, "logits/chosen": 3.640625, "logits/rejected": 3.34375, "logps/chosen": -199.0, "logps/rejected": -396.0, "loss": 0.6749, "rewards/accuracies": 1.0, "rewards/chosen": -0.076171875, "rewards/margins": 0.07763671875, "rewards/rejected": -0.1533203125, "step": 117 }, { "epoch": 0.24699110413396128, "grad_norm": 7.660480976104736, "learning_rate": 4.970150944631533e-07, "logits/chosen": 3.859375, "logits/rejected": 4.09375, "logps/chosen": -380.0, "logps/rejected": -378.0, "loss": 0.6678, "rewards/accuracies": 0.75, "rewards/chosen": -0.103515625, "rewards/margins": 0.1240234375, "rewards/rejected": -0.2275390625, "step": 118 }, { "epoch": 0.2490842490842491, "grad_norm": 8.113422393798828, "learning_rate": 4.96926839245206e-07, "logits/chosen": 3.515625, "logits/rejected": 3.34375, "logps/chosen": -504.0, "logps/rejected": -740.0, "loss": 0.6703, "rewards/accuracies": 0.75, "rewards/chosen": -0.26171875, "rewards/margins": -0.002685546875, "rewards/rejected": -0.259765625, "step": 119 }, { "epoch": 0.25117739403453687, "grad_norm": 8.210384368896484, "learning_rate": 4.96837306304807e-07, "logits/chosen": 3.71875, "logits/rejected": 4.53125, "logps/chosen": -640.0, "logps/rejected": -250.0, "loss": 0.6781, "rewards/accuracies": 0.5, "rewards/chosen": -0.1455078125, "rewards/margins": -0.0225830078125, "rewards/rejected": -0.12255859375, "step": 120 }, { "epoch": 0.2532705389848247, "grad_norm": 6.795380592346191, "learning_rate": 4.967464961052915e-07, "logits/chosen": 4.375, "logits/rejected": 3.4375, "logps/chosen": -278.0, "logps/rejected": -298.0, "loss": 0.6745, "rewards/accuracies": 0.75, "rewards/chosen": -0.158203125, "rewards/margins": 0.02880859375, "rewards/rejected": -0.1875, "step": 121 }, { "epoch": 0.2553636839351125, "grad_norm": 7.234960556030273, "learning_rate": 4.966544091166043e-07, "logits/chosen": 4.0, "logits/rejected": 3.546875, "logps/chosen": -448.0, "logps/rejected": -460.0, "loss": 0.6714, "rewards/accuracies": 0.75, "rewards/chosen": -0.030029296875, "rewards/margins": 0.0751953125, "rewards/rejected": -0.10546875, "step": 122 }, { "epoch": 0.25745682888540034, "grad_norm": 8.24029541015625, "learning_rate": 4.965610458152973e-07, "logits/chosen": 4.1875, "logits/rejected": 4.09375, "logps/chosen": -656.0, "logps/rejected": -506.0, "loss": 0.6613, "rewards/accuracies": 0.5, "rewards/chosen": -0.10498046875, "rewards/margins": 0.1796875, "rewards/rejected": -0.28515625, "step": 123 }, { "epoch": 0.2595499738356881, "grad_norm": 7.639450550079346, "learning_rate": 4.96466406684528e-07, "logits/chosen": 3.515625, "logits/rejected": 4.28125, "logps/chosen": -784.0, "logps/rejected": -400.0, "loss": 0.6781, "rewards/accuracies": 0.75, "rewards/chosen": -0.080078125, "rewards/margins": 0.09521484375, "rewards/rejected": -0.1748046875, "step": 124 }, { "epoch": 0.2616431187859759, "grad_norm": 8.00741195678711, "learning_rate": 4.963704922140558e-07, "logits/chosen": 3.390625, "logits/rejected": 3.59375, "logps/chosen": -440.0, "logps/rejected": -370.0, "loss": 0.6794, "rewards/accuracies": 0.25, "rewards/chosen": -0.216796875, "rewards/margins": -0.09228515625, "rewards/rejected": -0.1240234375, "step": 125 }, { "epoch": 0.26373626373626374, "grad_norm": 8.1722993850708, "learning_rate": 4.962733029002401e-07, "logits/chosen": 3.265625, "logits/rejected": 3.78125, "logps/chosen": -436.0, "logps/rejected": -396.0, "loss": 0.6697, "rewards/accuracies": 0.5, "rewards/chosen": -0.16796875, "rewards/margins": 0.0634765625, "rewards/rejected": -0.232421875, "step": 126 }, { "epoch": 0.2658294086865515, "grad_norm": 7.704063415527344, "learning_rate": 4.961748392460379e-07, "logits/chosen": 3.71875, "logits/rejected": 3.59375, "logps/chosen": -235.0, "logps/rejected": -346.0, "loss": 0.6627, "rewards/accuracies": 0.5, "rewards/chosen": -0.1318359375, "rewards/margins": 0.0029296875, "rewards/rejected": -0.134765625, "step": 127 }, { "epoch": 0.26792255363683937, "grad_norm": 8.196044921875, "learning_rate": 4.960751017610008e-07, "logits/chosen": 3.65625, "logits/rejected": 3.21875, "logps/chosen": -284.0, "logps/rejected": -416.0, "loss": 0.6714, "rewards/accuracies": 0.5, "rewards/chosen": -0.10498046875, "rewards/margins": 0.1337890625, "rewards/rejected": -0.23828125, "step": 128 }, { "epoch": 0.27001569858712715, "grad_norm": 7.8072614669799805, "learning_rate": 4.959740909612723e-07, "logits/chosen": 3.546875, "logits/rejected": 3.78125, "logps/chosen": -308.0, "logps/rejected": -276.0, "loss": 0.6706, "rewards/accuracies": 1.0, "rewards/chosen": -0.0255126953125, "rewards/margins": 0.11328125, "rewards/rejected": -0.138671875, "step": 129 }, { "epoch": 0.272108843537415, "grad_norm": 8.080399513244629, "learning_rate": 4.958718073695857e-07, "logits/chosen": 3.15625, "logits/rejected": 3.625, "logps/chosen": -332.0, "logps/rejected": -482.0, "loss": 0.6673, "rewards/accuracies": 0.75, "rewards/chosen": -0.12255859375, "rewards/margins": -0.0047607421875, "rewards/rejected": -0.11767578125, "step": 130 }, { "epoch": 0.2742019884877028, "grad_norm": 8.27035903930664, "learning_rate": 4.957682515152607e-07, "logits/chosen": 3.1875, "logits/rejected": 3.125, "logps/chosen": -480.0, "logps/rejected": -516.0, "loss": 0.6691, "rewards/accuracies": 0.5, "rewards/chosen": -0.10498046875, "rewards/margins": 0.057861328125, "rewards/rejected": -0.162109375, "step": 131 }, { "epoch": 0.27629513343799056, "grad_norm": 8.402571678161621, "learning_rate": 4.956634239342012e-07, "logits/chosen": 4.125, "logits/rejected": 3.859375, "logps/chosen": -338.0, "logps/rejected": -464.0, "loss": 0.6564, "rewards/accuracies": 0.25, "rewards/chosen": -0.255859375, "rewards/margins": -0.0380859375, "rewards/rejected": -0.216796875, "step": 132 }, { "epoch": 0.2783882783882784, "grad_norm": 7.434090614318848, "learning_rate": 4.955573251688922e-07, "logits/chosen": 3.359375, "logits/rejected": 2.859375, "logps/chosen": -278.0, "logps/rejected": -310.0, "loss": 0.6647, "rewards/accuracies": 0.75, "rewards/chosen": -0.1630859375, "rewards/margins": 0.0771484375, "rewards/rejected": -0.240234375, "step": 133 }, { "epoch": 0.2804814233385662, "grad_norm": 8.35785961151123, "learning_rate": 4.954499557683971e-07, "logits/chosen": 3.21875, "logits/rejected": 3.28125, "logps/chosen": -588.0, "logps/rejected": -448.0, "loss": 0.6737, "rewards/accuracies": 0.75, "rewards/chosen": -0.146484375, "rewards/margins": 0.1171875, "rewards/rejected": -0.263671875, "step": 134 }, { "epoch": 0.282574568288854, "grad_norm": 8.191429138183594, "learning_rate": 4.95341316288355e-07, "logits/chosen": 3.109375, "logits/rejected": 3.40625, "logps/chosen": -238.0, "logps/rejected": -268.0, "loss": 0.6541, "rewards/accuracies": 0.5, "rewards/chosen": -0.177734375, "rewards/margins": -0.0322265625, "rewards/rejected": -0.146484375, "step": 135 }, { "epoch": 0.2846677132391418, "grad_norm": 7.505073547363281, "learning_rate": 4.952314072909776e-07, "logits/chosen": 3.171875, "logits/rejected": 2.90625, "logps/chosen": -199.0, "logps/rejected": -390.0, "loss": 0.6669, "rewards/accuracies": 0.5, "rewards/chosen": -0.1845703125, "rewards/margins": 0.07275390625, "rewards/rejected": -0.2578125, "step": 136 }, { "epoch": 0.2867608581894296, "grad_norm": 9.103921890258789, "learning_rate": 4.951202293450464e-07, "logits/chosen": 3.34375, "logits/rejected": 3.5625, "logps/chosen": -884.0, "logps/rejected": -284.0, "loss": 0.676, "rewards/accuracies": 0.5, "rewards/chosen": -0.3515625, "rewards/margins": -0.1767578125, "rewards/rejected": -0.17578125, "step": 137 }, { "epoch": 0.28885400313971743, "grad_norm": 7.8134307861328125, "learning_rate": 4.950077830259097e-07, "logits/chosen": 4.34375, "logits/rejected": 4.28125, "logps/chosen": -524.0, "logps/rejected": -520.0, "loss": 0.666, "rewards/accuracies": 0.25, "rewards/chosen": -0.1669921875, "rewards/margins": -0.0166015625, "rewards/rejected": -0.150390625, "step": 138 }, { "epoch": 0.2909471480900052, "grad_norm": 7.591533184051514, "learning_rate": 4.948940689154794e-07, "logits/chosen": 4.03125, "logits/rejected": 3.78125, "logps/chosen": -302.0, "logps/rejected": -354.0, "loss": 0.6713, "rewards/accuracies": 0.5, "rewards/chosen": -0.1748046875, "rewards/margins": 0.0771484375, "rewards/rejected": -0.251953125, "step": 139 }, { "epoch": 0.29304029304029305, "grad_norm": 8.549483299255371, "learning_rate": 4.94779087602229e-07, "logits/chosen": 3.046875, "logits/rejected": 3.625, "logps/chosen": -472.0, "logps/rejected": -448.0, "loss": 0.669, "rewards/accuracies": 0.0, "rewards/chosen": -0.30859375, "rewards/margins": -0.1396484375, "rewards/rejected": -0.169921875, "step": 140 }, { "epoch": 0.29513343799058084, "grad_norm": 8.168773651123047, "learning_rate": 4.94662839681189e-07, "logits/chosen": 3.5625, "logits/rejected": 3.3125, "logps/chosen": -406.0, "logps/rejected": -330.0, "loss": 0.658, "rewards/accuracies": 0.75, "rewards/chosen": -0.203125, "rewards/margins": 0.01806640625, "rewards/rejected": -0.220703125, "step": 141 }, { "epoch": 0.2972265829408687, "grad_norm": 7.819066047668457, "learning_rate": 4.945453257539451e-07, "logits/chosen": 3.0625, "logits/rejected": 3.078125, "logps/chosen": -512.0, "logps/rejected": -388.0, "loss": 0.6628, "rewards/accuracies": 0.75, "rewards/chosen": -0.054931640625, "rewards/margins": 0.1259765625, "rewards/rejected": -0.1806640625, "step": 142 }, { "epoch": 0.29931972789115646, "grad_norm": 8.835631370544434, "learning_rate": 4.944265464286343e-07, "logits/chosen": 3.09375, "logits/rejected": 3.890625, "logps/chosen": -466.0, "logps/rejected": -256.0, "loss": 0.6803, "rewards/accuracies": 0.5, "rewards/chosen": -0.228515625, "rewards/margins": -0.049560546875, "rewards/rejected": -0.1787109375, "step": 143 }, { "epoch": 0.30141287284144425, "grad_norm": 7.867812633514404, "learning_rate": 4.943065023199424e-07, "logits/chosen": 3.78125, "logits/rejected": 3.40625, "logps/chosen": -470.0, "logps/rejected": -338.0, "loss": 0.6677, "rewards/accuracies": 1.0, "rewards/chosen": -0.03955078125, "rewards/margins": 0.0673828125, "rewards/rejected": -0.1064453125, "step": 144 }, { "epoch": 0.3035060177917321, "grad_norm": 7.702564239501953, "learning_rate": 4.941851940491002e-07, "logits/chosen": 2.546875, "logits/rejected": 3.125, "logps/chosen": -540.0, "logps/rejected": -428.0, "loss": 0.6558, "rewards/accuracies": 0.25, "rewards/chosen": -0.322265625, "rewards/margins": 0.00537109375, "rewards/rejected": -0.328125, "step": 145 }, { "epoch": 0.30559916274201987, "grad_norm": 9.008699417114258, "learning_rate": 4.940626222438808e-07, "logits/chosen": 2.515625, "logits/rejected": 2.703125, "logps/chosen": -164.0, "logps/rejected": -288.0, "loss": 0.6747, "rewards/accuracies": 0.75, "rewards/chosen": -0.2177734375, "rewards/margins": 0.0234375, "rewards/rejected": -0.2412109375, "step": 146 }, { "epoch": 0.3076923076923077, "grad_norm": 7.8209662437438965, "learning_rate": 4.939387875385958e-07, "logits/chosen": 2.828125, "logits/rejected": 2.53125, "logps/chosen": -209.0, "logps/rejected": -276.0, "loss": 0.6712, "rewards/accuracies": 0.0, "rewards/chosen": -0.33203125, "rewards/margins": -0.1533203125, "rewards/rejected": -0.1796875, "step": 147 }, { "epoch": 0.3097854526425955, "grad_norm": 7.214803218841553, "learning_rate": 4.938136905740926e-07, "logits/chosen": 3.609375, "logits/rejected": 3.734375, "logps/chosen": -880.0, "logps/rejected": -740.0, "loss": 0.6711, "rewards/accuracies": 1.0, "rewards/chosen": -0.1103515625, "rewards/margins": 0.41015625, "rewards/rejected": -0.51953125, "step": 148 }, { "epoch": 0.31187859759288333, "grad_norm": 8.265995979309082, "learning_rate": 4.936873319977508e-07, "logits/chosen": 3.203125, "logits/rejected": 2.953125, "logps/chosen": -756.0, "logps/rejected": -632.0, "loss": 0.6612, "rewards/accuracies": 1.0, "rewards/chosen": -0.123046875, "rewards/margins": 0.2470703125, "rewards/rejected": -0.37109375, "step": 149 }, { "epoch": 0.3139717425431711, "grad_norm": 7.911357402801514, "learning_rate": 4.935597124634788e-07, "logits/chosen": 3.140625, "logits/rejected": 2.984375, "logps/chosen": -318.0, "logps/rejected": -332.0, "loss": 0.6705, "rewards/accuracies": 0.5, "rewards/chosen": -0.32421875, "rewards/margins": -0.2333984375, "rewards/rejected": -0.091796875, "step": 150 }, { "epoch": 0.3160648874934589, "grad_norm": 7.947584629058838, "learning_rate": 4.934308326317104e-07, "logits/chosen": 2.859375, "logits/rejected": 3.296875, "logps/chosen": -362.0, "logps/rejected": -368.0, "loss": 0.6506, "rewards/accuracies": 0.5, "rewards/chosen": -0.197265625, "rewards/margins": 0.0152587890625, "rewards/rejected": -0.212890625, "step": 151 }, { "epoch": 0.31815803244374674, "grad_norm": 8.037640571594238, "learning_rate": 4.933006931694018e-07, "logits/chosen": 3.28125, "logits/rejected": 3.25, "logps/chosen": -370.0, "logps/rejected": -364.0, "loss": 0.6649, "rewards/accuracies": 0.75, "rewards/chosen": -0.140625, "rewards/margins": 0.232421875, "rewards/rejected": -0.373046875, "step": 152 }, { "epoch": 0.3202511773940345, "grad_norm": 8.108712196350098, "learning_rate": 4.931692947500272e-07, "logits/chosen": 3.734375, "logits/rejected": 3.84375, "logps/chosen": -432.0, "logps/rejected": -428.0, "loss": 0.6289, "rewards/accuracies": 0.5, "rewards/chosen": -0.185546875, "rewards/margins": 0.171875, "rewards/rejected": -0.357421875, "step": 153 }, { "epoch": 0.32234432234432236, "grad_norm": 8.41116714477539, "learning_rate": 4.930366380535766e-07, "logits/chosen": 3.125, "logits/rejected": 3.296875, "logps/chosen": -264.0, "logps/rejected": -276.0, "loss": 0.6543, "rewards/accuracies": 0.75, "rewards/chosen": -0.361328125, "rewards/margins": 0.1689453125, "rewards/rejected": -0.53125, "step": 154 }, { "epoch": 0.32443746729461015, "grad_norm": 8.361553192138672, "learning_rate": 4.929027237665514e-07, "logits/chosen": 2.640625, "logits/rejected": 2.828125, "logps/chosen": -548.0, "logps/rejected": -370.0, "loss": 0.6697, "rewards/accuracies": 0.75, "rewards/chosen": -0.14453125, "rewards/margins": 0.12890625, "rewards/rejected": -0.2734375, "step": 155 }, { "epoch": 0.32653061224489793, "grad_norm": 7.492923736572266, "learning_rate": 4.927675525819608e-07, "logits/chosen": 2.640625, "logits/rejected": 2.875, "logps/chosen": -165.0, "logps/rejected": -156.0, "loss": 0.6464, "rewards/accuracies": 0.5, "rewards/chosen": -0.263671875, "rewards/margins": -0.01953125, "rewards/rejected": -0.244140625, "step": 156 }, { "epoch": 0.3286237571951858, "grad_norm": 7.923641681671143, "learning_rate": 4.926311251993185e-07, "logits/chosen": 3.09375, "logits/rejected": 3.734375, "logps/chosen": -394.0, "logps/rejected": -255.0, "loss": 0.6645, "rewards/accuracies": 0.5, "rewards/chosen": -0.25390625, "rewards/margins": 0.126953125, "rewards/rejected": -0.380859375, "step": 157 }, { "epoch": 0.33071690214547356, "grad_norm": 9.376420974731445, "learning_rate": 4.924934423246395e-07, "logits/chosen": 2.671875, "logits/rejected": 3.234375, "logps/chosen": -158.0, "logps/rejected": -109.5, "loss": 0.659, "rewards/accuracies": 0.5, "rewards/chosen": -0.2021484375, "rewards/margins": -0.001953125, "rewards/rejected": -0.2001953125, "step": 158 }, { "epoch": 0.3328100470957614, "grad_norm": 8.906984329223633, "learning_rate": 4.923545046704356e-07, "logits/chosen": 3.671875, "logits/rejected": 3.4375, "logps/chosen": -418.0, "logps/rejected": -430.0, "loss": 0.7016, "rewards/accuracies": 0.5, "rewards/chosen": -0.2373046875, "rewards/margins": -0.0322265625, "rewards/rejected": -0.205078125, "step": 159 }, { "epoch": 0.3349031920460492, "grad_norm": 8.105554580688477, "learning_rate": 4.922143129557123e-07, "logits/chosen": 3.71875, "logits/rejected": 4.28125, "logps/chosen": -446.0, "logps/rejected": -216.0, "loss": 0.6532, "rewards/accuracies": 0.75, "rewards/chosen": -0.1259765625, "rewards/margins": 0.1328125, "rewards/rejected": -0.2578125, "step": 160 }, { "epoch": 0.336996336996337, "grad_norm": 9.821364402770996, "learning_rate": 4.920728679059647e-07, "logits/chosen": 2.78125, "logits/rejected": 3.0625, "logps/chosen": -356.0, "logps/rejected": -276.0, "loss": 0.6464, "rewards/accuracies": 0.5, "rewards/chosen": -0.234375, "rewards/margins": -0.000732421875, "rewards/rejected": -0.2333984375, "step": 161 }, { "epoch": 0.3390894819466248, "grad_norm": 7.859842777252197, "learning_rate": 4.91930170253174e-07, "logits/chosen": 3.234375, "logits/rejected": 3.59375, "logps/chosen": -416.0, "logps/rejected": -332.0, "loss": 0.6639, "rewards/accuracies": 0.25, "rewards/chosen": -0.18359375, "rewards/margins": -0.037353515625, "rewards/rejected": -0.146484375, "step": 162 }, { "epoch": 0.3411826268969126, "grad_norm": 8.701375007629395, "learning_rate": 4.917862207358038e-07, "logits/chosen": 2.8125, "logits/rejected": 2.9375, "logps/chosen": -608.0, "logps/rejected": -484.0, "loss": 0.6665, "rewards/accuracies": 1.0, "rewards/chosen": -0.2470703125, "rewards/margins": 0.205078125, "rewards/rejected": -0.453125, "step": 163 }, { "epoch": 0.34327577184720043, "grad_norm": 7.780459403991699, "learning_rate": 4.91641020098796e-07, "logits/chosen": 3.21875, "logits/rejected": 3.53125, "logps/chosen": -258.0, "logps/rejected": -280.0, "loss": 0.6705, "rewards/accuracies": 0.5, "rewards/chosen": -0.375, "rewards/margins": -0.030517578125, "rewards/rejected": -0.34375, "step": 164 }, { "epoch": 0.3453689167974882, "grad_norm": 8.421460151672363, "learning_rate": 4.914945690935671e-07, "logits/chosen": 3.4375, "logits/rejected": 3.625, "logps/chosen": -532.0, "logps/rejected": -348.0, "loss": 0.6714, "rewards/accuracies": 0.25, "rewards/chosen": -0.359375, "rewards/margins": 0.00341796875, "rewards/rejected": -0.36328125, "step": 165 }, { "epoch": 0.34746206174777605, "grad_norm": 7.451173782348633, "learning_rate": 4.913468684780043e-07, "logits/chosen": 3.953125, "logits/rejected": 3.84375, "logps/chosen": -334.0, "logps/rejected": -356.0, "loss": 0.6533, "rewards/accuracies": 0.5, "rewards/chosen": -0.1162109375, "rewards/margins": 0.1201171875, "rewards/rejected": -0.236328125, "step": 166 }, { "epoch": 0.34955520669806384, "grad_norm": 8.066396713256836, "learning_rate": 4.911979190164615e-07, "logits/chosen": 2.953125, "logits/rejected": 3.15625, "logps/chosen": -334.0, "logps/rejected": -294.0, "loss": 0.6521, "rewards/accuracies": 0.75, "rewards/chosen": -0.287109375, "rewards/margins": 0.1494140625, "rewards/rejected": -0.435546875, "step": 167 }, { "epoch": 0.3516483516483517, "grad_norm": 8.994278907775879, "learning_rate": 4.910477214797554e-07, "logits/chosen": 2.296875, "logits/rejected": 2.84375, "logps/chosen": -438.0, "logps/rejected": -334.0, "loss": 0.6584, "rewards/accuracies": 0.75, "rewards/chosen": -0.140625, "rewards/margins": 0.173828125, "rewards/rejected": -0.314453125, "step": 168 }, { "epoch": 0.35374149659863946, "grad_norm": 7.937893390655518, "learning_rate": 4.908962766451616e-07, "logits/chosen": 3.265625, "logits/rejected": 3.546875, "logps/chosen": -432.0, "logps/rejected": -356.0, "loss": 0.6485, "rewards/accuracies": 0.75, "rewards/chosen": -0.421875, "rewards/margins": 0.04833984375, "rewards/rejected": -0.470703125, "step": 169 }, { "epoch": 0.35583464154892724, "grad_norm": 8.24114990234375, "learning_rate": 4.907435852964103e-07, "logits/chosen": 2.96875, "logits/rejected": 3.4375, "logps/chosen": -410.0, "logps/rejected": -190.0, "loss": 0.641, "rewards/accuracies": 0.25, "rewards/chosen": -0.40625, "rewards/margins": -0.1328125, "rewards/rejected": -0.2734375, "step": 170 }, { "epoch": 0.3579277864992151, "grad_norm": 8.00881576538086, "learning_rate": 4.905896482236829e-07, "logits/chosen": 2.453125, "logits/rejected": 2.609375, "logps/chosen": -219.0, "logps/rejected": -214.0, "loss": 0.647, "rewards/accuracies": 0.5, "rewards/chosen": -0.1962890625, "rewards/margins": 0.134765625, "rewards/rejected": -0.33203125, "step": 171 }, { "epoch": 0.36002093144950287, "grad_norm": 8.792490005493164, "learning_rate": 4.904344662236069e-07, "logits/chosen": 3.375, "logits/rejected": 3.0625, "logps/chosen": -280.0, "logps/rejected": -400.0, "loss": 0.671, "rewards/accuracies": 1.0, "rewards/chosen": -0.259765625, "rewards/margins": 0.32421875, "rewards/rejected": -0.5859375, "step": 172 }, { "epoch": 0.3621140763997907, "grad_norm": 8.430554389953613, "learning_rate": 4.902780400992526e-07, "logits/chosen": 3.078125, "logits/rejected": 3.515625, "logps/chosen": -312.0, "logps/rejected": -366.0, "loss": 0.6734, "rewards/accuracies": 0.5, "rewards/chosen": -0.330078125, "rewards/margins": 0.009765625, "rewards/rejected": -0.33984375, "step": 173 }, { "epoch": 0.3642072213500785, "grad_norm": 8.057472229003906, "learning_rate": 4.901203706601288e-07, "logits/chosen": 3.390625, "logits/rejected": 3.15625, "logps/chosen": -780.0, "logps/rejected": -428.0, "loss": 0.6611, "rewards/accuracies": 0.75, "rewards/chosen": -0.2431640625, "rewards/margins": 0.314453125, "rewards/rejected": -0.55859375, "step": 174 }, { "epoch": 0.3663003663003663, "grad_norm": 8.198448181152344, "learning_rate": 4.899614587221782e-07, "logits/chosen": 2.234375, "logits/rejected": 3.09375, "logps/chosen": -506.0, "logps/rejected": -262.0, "loss": 0.6604, "rewards/accuracies": 0.75, "rewards/chosen": -0.2158203125, "rewards/margins": 0.11767578125, "rewards/rejected": -0.33203125, "step": 175 }, { "epoch": 0.3683935112506541, "grad_norm": 8.351573944091797, "learning_rate": 4.898013051077735e-07, "logits/chosen": 3.375, "logits/rejected": 2.921875, "logps/chosen": -165.0, "logps/rejected": -286.0, "loss": 0.692, "rewards/accuracies": 0.5, "rewards/chosen": -0.21484375, "rewards/margins": 0.0262451171875, "rewards/rejected": -0.2412109375, "step": 176 }, { "epoch": 0.3704866562009419, "grad_norm": 8.730794906616211, "learning_rate": 4.896399106457132e-07, "logits/chosen": 3.46875, "logits/rejected": 3.984375, "logps/chosen": -348.0, "logps/rejected": -330.0, "loss": 0.6684, "rewards/accuracies": 0.75, "rewards/chosen": -0.248046875, "rewards/margins": 0.0380859375, "rewards/rejected": -0.287109375, "step": 177 }, { "epoch": 0.37257980115122974, "grad_norm": 8.036724090576172, "learning_rate": 4.894772761712174e-07, "logits/chosen": 2.921875, "logits/rejected": 3.375, "logps/chosen": -340.0, "logps/rejected": -193.0, "loss": 0.6449, "rewards/accuracies": 0.25, "rewards/chosen": -0.236328125, "rewards/margins": 0.021484375, "rewards/rejected": -0.2578125, "step": 178 }, { "epoch": 0.3746729461015175, "grad_norm": 8.092848777770996, "learning_rate": 4.893134025259228e-07, "logits/chosen": 3.296875, "logits/rejected": 4.0625, "logps/chosen": -544.0, "logps/rejected": -432.0, "loss": 0.6621, "rewards/accuracies": 0.25, "rewards/chosen": -0.484375, "rewards/margins": -0.0849609375, "rewards/rejected": -0.400390625, "step": 179 }, { "epoch": 0.37676609105180536, "grad_norm": 7.546232223510742, "learning_rate": 4.891482905578792e-07, "logits/chosen": 2.953125, "logits/rejected": 2.59375, "logps/chosen": -320.0, "logps/rejected": -364.0, "loss": 0.6624, "rewards/accuracies": 0.25, "rewards/chosen": -0.2890625, "rewards/margins": 0.005859375, "rewards/rejected": -0.294921875, "step": 180 }, { "epoch": 0.37885923600209315, "grad_norm": 8.636161804199219, "learning_rate": 4.889819411215448e-07, "logits/chosen": 2.25, "logits/rejected": 2.09375, "logps/chosen": -106.5, "logps/rejected": -172.0, "loss": 0.6579, "rewards/accuracies": 0.25, "rewards/chosen": -0.310546875, "rewards/margins": -0.08544921875, "rewards/rejected": -0.224609375, "step": 181 }, { "epoch": 0.38095238095238093, "grad_norm": 8.529451370239258, "learning_rate": 4.888143550777814e-07, "logits/chosen": 3.515625, "logits/rejected": 3.703125, "logps/chosen": -340.0, "logps/rejected": -264.0, "loss": 0.6399, "rewards/accuracies": 0.5, "rewards/chosen": -0.337890625, "rewards/margins": -0.083984375, "rewards/rejected": -0.25390625, "step": 182 }, { "epoch": 0.38304552590266877, "grad_norm": 8.135290145874023, "learning_rate": 4.886455332938507e-07, "logits/chosen": 1.6953125, "logits/rejected": 2.078125, "logps/chosen": -306.0, "logps/rejected": -298.0, "loss": 0.667, "rewards/accuracies": 0.5, "rewards/chosen": -0.341796875, "rewards/margins": -0.1474609375, "rewards/rejected": -0.193359375, "step": 183 }, { "epoch": 0.38513867085295656, "grad_norm": 8.299905776977539, "learning_rate": 4.88475476643409e-07, "logits/chosen": 3.171875, "logits/rejected": 3.953125, "logps/chosen": -464.0, "logps/rejected": -444.0, "loss": 0.6306, "rewards/accuracies": 1.0, "rewards/chosen": -0.404296875, "rewards/margins": 0.271484375, "rewards/rejected": -0.67578125, "step": 184 }, { "epoch": 0.3872318158032444, "grad_norm": 8.095890998840332, "learning_rate": 4.883041860065032e-07, "logits/chosen": 3.03125, "logits/rejected": 4.0625, "logps/chosen": -664.0, "logps/rejected": -470.0, "loss": 0.6491, "rewards/accuracies": 0.25, "rewards/chosen": -0.310546875, "rewards/margins": 0.083984375, "rewards/rejected": -0.39453125, "step": 185 }, { "epoch": 0.3893249607535322, "grad_norm": 8.727668762207031, "learning_rate": 4.881316622695661e-07, "logits/chosen": 3.234375, "logits/rejected": 2.71875, "logps/chosen": -161.0, "logps/rejected": -274.0, "loss": 0.643, "rewards/accuracies": 1.0, "rewards/chosen": -0.216796875, "rewards/margins": 0.212890625, "rewards/rejected": -0.4296875, "step": 186 }, { "epoch": 0.39141810570381996, "grad_norm": 8.658443450927734, "learning_rate": 4.87957906325412e-07, "logits/chosen": 2.640625, "logits/rejected": 3.046875, "logps/chosen": -382.0, "logps/rejected": -304.0, "loss": 0.6738, "rewards/accuracies": 0.75, "rewards/chosen": -0.34375, "rewards/margins": 0.15234375, "rewards/rejected": -0.49609375, "step": 187 }, { "epoch": 0.3935112506541078, "grad_norm": 8.59584903717041, "learning_rate": 4.877829190732315e-07, "logits/chosen": 3.375, "logits/rejected": 3.375, "logps/chosen": -344.0, "logps/rejected": -230.0, "loss": 0.6756, "rewards/accuracies": 0.5, "rewards/chosen": -0.2275390625, "rewards/margins": 0.2392578125, "rewards/rejected": -0.466796875, "step": 188 }, { "epoch": 0.3956043956043956, "grad_norm": 8.687397003173828, "learning_rate": 4.876067014185876e-07, "logits/chosen": 3.015625, "logits/rejected": 3.21875, "logps/chosen": -330.0, "logps/rejected": -384.0, "loss": 0.6226, "rewards/accuracies": 1.0, "rewards/chosen": -0.125, "rewards/margins": 0.3515625, "rewards/rejected": -0.4765625, "step": 189 }, { "epoch": 0.3976975405546834, "grad_norm": 7.772812366485596, "learning_rate": 4.874292542734106e-07, "logits/chosen": 2.84375, "logits/rejected": 3.015625, "logps/chosen": -366.0, "logps/rejected": -282.0, "loss": 0.645, "rewards/accuracies": 0.25, "rewards/chosen": -0.515625, "rewards/margins": -0.0244140625, "rewards/rejected": -0.490234375, "step": 190 }, { "epoch": 0.3997906855049712, "grad_norm": 8.127242088317871, "learning_rate": 4.872505785559932e-07, "logits/chosen": 2.765625, "logits/rejected": 2.96875, "logps/chosen": -208.0, "logps/rejected": -158.0, "loss": 0.647, "rewards/accuracies": 1.0, "rewards/chosen": -0.21484375, "rewards/margins": 0.166015625, "rewards/rejected": -0.3828125, "step": 191 }, { "epoch": 0.40188383045525905, "grad_norm": 8.73538589477539, "learning_rate": 4.870706751909864e-07, "logits/chosen": 3.21875, "logits/rejected": 2.96875, "logps/chosen": -177.0, "logps/rejected": -328.0, "loss": 0.6665, "rewards/accuracies": 0.25, "rewards/chosen": -0.3671875, "rewards/margins": 0.0068359375, "rewards/rejected": -0.375, "step": 192 }, { "epoch": 0.40397697540554683, "grad_norm": 9.213397026062012, "learning_rate": 4.868895451093939e-07, "logits/chosen": 2.34375, "logits/rejected": 2.796875, "logps/chosen": -350.0, "logps/rejected": -158.0, "loss": 0.6662, "rewards/accuracies": 0.75, "rewards/chosen": -0.330078125, "rewards/margins": 0.0888671875, "rewards/rejected": -0.41796875, "step": 193 }, { "epoch": 0.4060701203558346, "grad_norm": 8.662314414978027, "learning_rate": 4.867071892485679e-07, "logits/chosen": 3.3125, "logits/rejected": 4.0, "logps/chosen": -474.0, "logps/rejected": -384.0, "loss": 0.6729, "rewards/accuracies": 0.25, "rewards/chosen": -0.458984375, "rewards/margins": -0.0498046875, "rewards/rejected": -0.408203125, "step": 194 }, { "epoch": 0.40816326530612246, "grad_norm": 8.317453384399414, "learning_rate": 4.865236085522042e-07, "logits/chosen": 3.21875, "logits/rejected": 3.09375, "logps/chosen": -544.0, "logps/rejected": -592.0, "loss": 0.6539, "rewards/accuracies": 0.5, "rewards/chosen": -0.287109375, "rewards/margins": -0.002685546875, "rewards/rejected": -0.28515625, "step": 195 }, { "epoch": 0.41025641025641024, "grad_norm": 7.9303789138793945, "learning_rate": 4.863388039703365e-07, "logits/chosen": 2.9375, "logits/rejected": 2.703125, "logps/chosen": -183.0, "logps/rejected": -296.0, "loss": 0.6524, "rewards/accuracies": 0.25, "rewards/chosen": -0.439453125, "rewards/margins": 0.015625, "rewards/rejected": -0.455078125, "step": 196 }, { "epoch": 0.4123495552066981, "grad_norm": 8.1914644241333, "learning_rate": 4.861527764593328e-07, "logits/chosen": 2.4375, "logits/rejected": 2.921875, "logps/chosen": -432.0, "logps/rejected": -208.0, "loss": 0.6423, "rewards/accuracies": 0.5, "rewards/chosen": -0.5703125, "rewards/margins": -0.08642578125, "rewards/rejected": -0.484375, "step": 197 }, { "epoch": 0.41444270015698587, "grad_norm": 8.490318298339844, "learning_rate": 4.859655269818898e-07, "logits/chosen": 3.5625, "logits/rejected": 3.9375, "logps/chosen": -772.0, "logps/rejected": -1056.0, "loss": 0.6417, "rewards/accuracies": 1.0, "rewards/chosen": -0.224609375, "rewards/margins": 0.66796875, "rewards/rejected": -0.89453125, "step": 198 }, { "epoch": 0.4165358451072737, "grad_norm": 7.858203887939453, "learning_rate": 4.857770565070274e-07, "logits/chosen": 3.34375, "logits/rejected": 3.234375, "logps/chosen": -338.0, "logps/rejected": -402.0, "loss": 0.6594, "rewards/accuracies": 1.0, "rewards/chosen": -0.294921875, "rewards/margins": 0.34375, "rewards/rejected": -0.63671875, "step": 199 }, { "epoch": 0.4186289900575615, "grad_norm": 9.21390438079834, "learning_rate": 4.855873660100845e-07, "logits/chosen": 3.34375, "logits/rejected": 3.75, "logps/chosen": -636.0, "logps/rejected": -516.0, "loss": 0.6812, "rewards/accuracies": 0.5, "rewards/chosen": -0.41015625, "rewards/margins": 0.06982421875, "rewards/rejected": -0.478515625, "step": 200 }, { "epoch": 0.4207221350078493, "grad_norm": 8.616729736328125, "learning_rate": 4.853964564727136e-07, "logits/chosen": 2.765625, "logits/rejected": 2.71875, "logps/chosen": -308.0, "logps/rejected": -388.0, "loss": 0.6656, "rewards/accuracies": 0.75, "rewards/chosen": -0.52734375, "rewards/margins": 0.0400390625, "rewards/rejected": -0.5703125, "step": 201 }, { "epoch": 0.4228152799581371, "grad_norm": 8.263435363769531, "learning_rate": 4.852043288828757e-07, "logits/chosen": 3.265625, "logits/rejected": 3.234375, "logps/chosen": -572.0, "logps/rejected": -452.0, "loss": 0.6563, "rewards/accuracies": 0.75, "rewards/chosen": -0.609375, "rewards/margins": 0.16796875, "rewards/rejected": -0.77734375, "step": 202 }, { "epoch": 0.4249084249084249, "grad_norm": 8.495272636413574, "learning_rate": 4.850109842348355e-07, "logits/chosen": 3.0625, "logits/rejected": 3.4375, "logps/chosen": -266.0, "logps/rejected": -210.0, "loss": 0.6447, "rewards/accuracies": 0.25, "rewards/chosen": -0.478515625, "rewards/margins": -0.12890625, "rewards/rejected": -0.349609375, "step": 203 }, { "epoch": 0.42700156985871274, "grad_norm": 8.544230461120605, "learning_rate": 4.848164235291556e-07, "logits/chosen": 2.59375, "logits/rejected": 2.734375, "logps/chosen": -432.0, "logps/rejected": -314.0, "loss": 0.6496, "rewards/accuracies": 1.0, "rewards/chosen": -0.287109375, "rewards/margins": 0.26171875, "rewards/rejected": -0.546875, "step": 204 }, { "epoch": 0.4290947148090005, "grad_norm": 8.632508277893066, "learning_rate": 4.846206477726922e-07, "logits/chosen": 3.84375, "logits/rejected": 3.578125, "logps/chosen": -480.0, "logps/rejected": -450.0, "loss": 0.6327, "rewards/accuracies": 0.5, "rewards/chosen": -0.478515625, "rewards/margins": 0.0908203125, "rewards/rejected": -0.5703125, "step": 205 }, { "epoch": 0.4311878597592883, "grad_norm": 9.020018577575684, "learning_rate": 4.844236579785887e-07, "logits/chosen": 3.53125, "logits/rejected": 4.0, "logps/chosen": -472.0, "logps/rejected": -932.0, "loss": 0.6466, "rewards/accuracies": 1.0, "rewards/chosen": -0.2177734375, "rewards/margins": 0.515625, "rewards/rejected": -0.734375, "step": 206 }, { "epoch": 0.43328100470957615, "grad_norm": 8.227005004882812, "learning_rate": 4.84225455166272e-07, "logits/chosen": 2.046875, "logits/rejected": 2.71875, "logps/chosen": -396.0, "logps/rejected": -360.0, "loss": 0.6483, "rewards/accuracies": 1.0, "rewards/chosen": -0.2021484375, "rewards/margins": 0.1884765625, "rewards/rejected": -0.390625, "step": 207 }, { "epoch": 0.43537414965986393, "grad_norm": 8.853082656860352, "learning_rate": 4.840260403614459e-07, "logits/chosen": 3.4375, "logits/rejected": 3.703125, "logps/chosen": -708.0, "logps/rejected": -502.0, "loss": 0.6511, "rewards/accuracies": 0.75, "rewards/chosen": -0.640625, "rewards/margins": 0.00634765625, "rewards/rejected": -0.6484375, "step": 208 }, { "epoch": 0.43746729461015177, "grad_norm": 9.090826988220215, "learning_rate": 4.838254145960864e-07, "logits/chosen": 3.03125, "logits/rejected": 3.421875, "logps/chosen": -576.0, "logps/rejected": -466.0, "loss": 0.6669, "rewards/accuracies": 0.5, "rewards/chosen": -0.5546875, "rewards/margins": 0.0458984375, "rewards/rejected": -0.6015625, "step": 209 }, { "epoch": 0.43956043956043955, "grad_norm": 8.810733795166016, "learning_rate": 4.836235789084363e-07, "logits/chosen": 3.375, "logits/rejected": 3.515625, "logps/chosen": -338.0, "logps/rejected": -336.0, "loss": 0.6378, "rewards/accuracies": 0.5, "rewards/chosen": -0.265625, "rewards/margins": 0.1591796875, "rewards/rejected": -0.42578125, "step": 210 }, { "epoch": 0.4416535845107274, "grad_norm": 8.438765525817871, "learning_rate": 4.834205343429996e-07, "logits/chosen": 2.9375, "logits/rejected": 3.25, "logps/chosen": -250.0, "logps/rejected": -232.0, "loss": 0.6315, "rewards/accuracies": 0.5, "rewards/chosen": -0.369140625, "rewards/margins": -0.00048828125, "rewards/rejected": -0.369140625, "step": 211 }, { "epoch": 0.4437467294610152, "grad_norm": 8.868012428283691, "learning_rate": 4.832162819505364e-07, "logits/chosen": 1.984375, "logits/rejected": 2.1875, "logps/chosen": -228.0, "logps/rejected": -185.0, "loss": 0.6631, "rewards/accuracies": 0.75, "rewards/chosen": -0.341796875, "rewards/margins": 0.1728515625, "rewards/rejected": -0.515625, "step": 212 }, { "epoch": 0.44583987441130296, "grad_norm": 9.201122283935547, "learning_rate": 4.830108227880576e-07, "logits/chosen": 2.3125, "logits/rejected": 2.734375, "logps/chosen": -552.0, "logps/rejected": -462.0, "loss": 0.6587, "rewards/accuracies": 0.75, "rewards/chosen": -0.376953125, "rewards/margins": 0.326171875, "rewards/rejected": -0.703125, "step": 213 }, { "epoch": 0.4479330193615908, "grad_norm": 9.223275184631348, "learning_rate": 4.828041579188185e-07, "logits/chosen": 2.578125, "logits/rejected": 2.6875, "logps/chosen": -272.0, "logps/rejected": -616.0, "loss": 0.6646, "rewards/accuracies": 0.5, "rewards/chosen": -0.46875, "rewards/margins": 0.177734375, "rewards/rejected": -0.6484375, "step": 214 }, { "epoch": 0.4500261643118786, "grad_norm": 10.037179946899414, "learning_rate": 4.825962884123146e-07, "logits/chosen": 3.25, "logits/rejected": 3.34375, "logps/chosen": -360.0, "logps/rejected": -354.0, "loss": 0.692, "rewards/accuracies": 0.5, "rewards/chosen": -0.546875, "rewards/margins": 0.146484375, "rewards/rejected": -0.69140625, "step": 215 }, { "epoch": 0.4521193092621664, "grad_norm": 9.276552200317383, "learning_rate": 4.823872153442752e-07, "logits/chosen": 2.0, "logits/rejected": 2.21875, "logps/chosen": -183.0, "logps/rejected": -400.0, "loss": 0.66, "rewards/accuracies": 0.75, "rewards/chosen": -0.47265625, "rewards/margins": 0.302734375, "rewards/rejected": -0.77734375, "step": 216 }, { "epoch": 0.4542124542124542, "grad_norm": 8.489870071411133, "learning_rate": 4.821769397966578e-07, "logits/chosen": 2.546875, "logits/rejected": 2.578125, "logps/chosen": -328.0, "logps/rejected": -556.0, "loss": 0.6381, "rewards/accuracies": 1.0, "rewards/chosen": -0.40234375, "rewards/margins": 0.369140625, "rewards/rejected": -0.7734375, "step": 217 }, { "epoch": 0.456305599162742, "grad_norm": 9.456313133239746, "learning_rate": 4.819654628576432e-07, "logits/chosen": 3.234375, "logits/rejected": 3.28125, "logps/chosen": -532.0, "logps/rejected": -498.0, "loss": 0.6358, "rewards/accuracies": 0.75, "rewards/chosen": -0.384765625, "rewards/margins": 0.240234375, "rewards/rejected": -0.625, "step": 218 }, { "epoch": 0.45839874411302983, "grad_norm": 8.657371520996094, "learning_rate": 4.81752785621629e-07, "logits/chosen": 3.5, "logits/rejected": 3.75, "logps/chosen": -448.0, "logps/rejected": -548.0, "loss": 0.6388, "rewards/accuracies": 0.25, "rewards/chosen": -0.71875, "rewards/margins": -0.1513671875, "rewards/rejected": -0.56640625, "step": 219 }, { "epoch": 0.4604918890633176, "grad_norm": 8.963862419128418, "learning_rate": 4.815389091892249e-07, "logits/chosen": 3.28125, "logits/rejected": 3.234375, "logps/chosen": -408.0, "logps/rejected": -378.0, "loss": 0.6558, "rewards/accuracies": 0.75, "rewards/chosen": -0.462890625, "rewards/margins": 0.423828125, "rewards/rejected": -0.88671875, "step": 220 }, { "epoch": 0.46258503401360546, "grad_norm": 9.423280715942383, "learning_rate": 4.813238346672459e-07, "logits/chosen": 4.0625, "logits/rejected": 3.953125, "logps/chosen": -334.0, "logps/rejected": -306.0, "loss": 0.6548, "rewards/accuracies": 0.5, "rewards/chosen": -0.458984375, "rewards/margins": 0.2060546875, "rewards/rejected": -0.6640625, "step": 221 }, { "epoch": 0.46467817896389324, "grad_norm": 9.261499404907227, "learning_rate": 4.811075631687073e-07, "logits/chosen": 3.171875, "logits/rejected": 4.0625, "logps/chosen": -486.0, "logps/rejected": -336.0, "loss": 0.672, "rewards/accuracies": 0.5, "rewards/chosen": -0.45703125, "rewards/margins": -0.0068359375, "rewards/rejected": -0.451171875, "step": 222 }, { "epoch": 0.4667713239141811, "grad_norm": 8.188753128051758, "learning_rate": 4.80890095812819e-07, "logits/chosen": 2.328125, "logits/rejected": 2.796875, "logps/chosen": -200.0, "logps/rejected": -207.0, "loss": 0.6451, "rewards/accuracies": 1.0, "rewards/chosen": -0.400390625, "rewards/margins": 0.2060546875, "rewards/rejected": -0.60546875, "step": 223 }, { "epoch": 0.46886446886446886, "grad_norm": 7.763673305511475, "learning_rate": 4.806714337249796e-07, "logits/chosen": 2.90625, "logits/rejected": 3.453125, "logps/chosen": -804.0, "logps/rejected": -330.0, "loss": 0.6344, "rewards/accuracies": 0.5, "rewards/chosen": -0.6953125, "rewards/margins": -0.0556640625, "rewards/rejected": -0.640625, "step": 224 }, { "epoch": 0.47095761381475665, "grad_norm": 9.266056060791016, "learning_rate": 4.804515780367698e-07, "logits/chosen": 2.5, "logits/rejected": 3.375, "logps/chosen": -608.0, "logps/rejected": -498.0, "loss": 0.6601, "rewards/accuracies": 1.0, "rewards/chosen": -0.50390625, "rewards/margins": 0.5078125, "rewards/rejected": -1.015625, "step": 225 }, { "epoch": 0.4730507587650445, "grad_norm": 8.741589546203613, "learning_rate": 4.802305298859477e-07, "logits/chosen": 3.09375, "logits/rejected": 3.203125, "logps/chosen": -436.0, "logps/rejected": -552.0, "loss": 0.6669, "rewards/accuracies": 0.75, "rewards/chosen": -0.375, "rewards/margins": 0.298828125, "rewards/rejected": -0.67578125, "step": 226 }, { "epoch": 0.47514390371533227, "grad_norm": 8.920707702636719, "learning_rate": 4.800082904164425e-07, "logits/chosen": 3.296875, "logits/rejected": 3.375, "logps/chosen": -178.0, "logps/rejected": -197.0, "loss": 0.6363, "rewards/accuracies": 0.5, "rewards/chosen": -0.30078125, "rewards/margins": 0.1923828125, "rewards/rejected": -0.4921875, "step": 227 }, { "epoch": 0.4772370486656201, "grad_norm": 9.029594421386719, "learning_rate": 4.797848607783484e-07, "logits/chosen": 2.84375, "logits/rejected": 3.21875, "logps/chosen": -388.0, "logps/rejected": -348.0, "loss": 0.6363, "rewards/accuracies": 0.25, "rewards/chosen": -0.6640625, "rewards/margins": -0.12451171875, "rewards/rejected": -0.5390625, "step": 228 }, { "epoch": 0.4793301936159079, "grad_norm": 9.25216293334961, "learning_rate": 4.795602421279185e-07, "logits/chosen": 2.859375, "logits/rejected": 3.109375, "logps/chosen": -756.0, "logps/rejected": -524.0, "loss": 0.6263, "rewards/accuracies": 0.75, "rewards/chosen": -0.470703125, "rewards/margins": 0.39453125, "rewards/rejected": -0.86328125, "step": 229 }, { "epoch": 0.48142333856619574, "grad_norm": 8.571414947509766, "learning_rate": 4.793344356275594e-07, "logits/chosen": 2.078125, "logits/rejected": 2.1875, "logps/chosen": -336.0, "logps/rejected": -462.0, "loss": 0.6558, "rewards/accuracies": 0.75, "rewards/chosen": -0.337890625, "rewards/margins": 0.162109375, "rewards/rejected": -0.5, "step": 230 }, { "epoch": 0.4835164835164835, "grad_norm": 9.288797378540039, "learning_rate": 4.791074424458246e-07, "logits/chosen": 2.625, "logits/rejected": 2.828125, "logps/chosen": -434.0, "logps/rejected": -432.0, "loss": 0.6546, "rewards/accuracies": 0.75, "rewards/chosen": -0.33984375, "rewards/margins": 0.1484375, "rewards/rejected": -0.48828125, "step": 231 }, { "epoch": 0.4856096284667713, "grad_norm": 8.921971321105957, "learning_rate": 4.788792637574087e-07, "logits/chosen": 2.84375, "logits/rejected": 2.8125, "logps/chosen": -420.0, "logps/rejected": -176.0, "loss": 0.6511, "rewards/accuracies": 0.5, "rewards/chosen": -0.5234375, "rewards/margins": 0.03076171875, "rewards/rejected": -0.55078125, "step": 232 }, { "epoch": 0.48770277341705914, "grad_norm": 8.216841697692871, "learning_rate": 4.786499007431418e-07, "logits/chosen": 3.234375, "logits/rejected": 3.40625, "logps/chosen": -250.0, "logps/rejected": -213.0, "loss": 0.6406, "rewards/accuracies": 0.25, "rewards/chosen": -0.61328125, "rewards/margins": -0.0615234375, "rewards/rejected": -0.55078125, "step": 233 }, { "epoch": 0.4897959183673469, "grad_norm": 9.005748748779297, "learning_rate": 4.784193545899823e-07, "logits/chosen": 2.359375, "logits/rejected": 3.0, "logps/chosen": -400.0, "logps/rejected": -346.0, "loss": 0.633, "rewards/accuracies": 0.25, "rewards/chosen": -0.734375, "rewards/margins": -0.03515625, "rewards/rejected": -0.69921875, "step": 234 }, { "epoch": 0.49188906331763477, "grad_norm": 9.359798431396484, "learning_rate": 4.781876264910116e-07, "logits/chosen": 2.5625, "logits/rejected": 3.375, "logps/chosen": -378.0, "logps/rejected": -262.0, "loss": 0.6571, "rewards/accuracies": 1.0, "rewards/chosen": -0.34375, "rewards/margins": 0.345703125, "rewards/rejected": -0.6875, "step": 235 }, { "epoch": 0.49398220826792255, "grad_norm": 8.288033485412598, "learning_rate": 4.779547176454278e-07, "logits/chosen": 1.5859375, "logits/rejected": 1.484375, "logps/chosen": -175.0, "logps/rejected": -183.0, "loss": 0.6334, "rewards/accuracies": 0.5, "rewards/chosen": -0.625, "rewards/margins": -0.007080078125, "rewards/rejected": -0.6171875, "step": 236 }, { "epoch": 0.49607535321821034, "grad_norm": 9.106061935424805, "learning_rate": 4.777206292585393e-07, "logits/chosen": 3.125, "logits/rejected": 2.984375, "logps/chosen": -612.0, "logps/rejected": -784.0, "loss": 0.6595, "rewards/accuracies": 0.75, "rewards/chosen": -0.8125, "rewards/margins": 0.05908203125, "rewards/rejected": -0.87109375, "step": 237 }, { "epoch": 0.4981684981684982, "grad_norm": 9.198912620544434, "learning_rate": 4.774853625417585e-07, "logits/chosen": 3.4375, "logits/rejected": 3.703125, "logps/chosen": -520.0, "logps/rejected": -356.0, "loss": 0.641, "rewards/accuracies": 0.75, "rewards/chosen": -0.37109375, "rewards/margins": 0.296875, "rewards/rejected": -0.66796875, "step": 238 }, { "epoch": 0.500261643118786, "grad_norm": 9.001884460449219, "learning_rate": 4.772489187125961e-07, "logits/chosen": 2.765625, "logits/rejected": 3.40625, "logps/chosen": -258.0, "logps/rejected": -358.0, "loss": 0.6473, "rewards/accuracies": 0.5, "rewards/chosen": -0.61328125, "rewards/margins": 0.1689453125, "rewards/rejected": -0.78125, "step": 239 }, { "epoch": 0.5023547880690737, "grad_norm": 8.567695617675781, "learning_rate": 4.770112989946538e-07, "logits/chosen": 1.3984375, "logits/rejected": 1.4296875, "logps/chosen": -282.0, "logps/rejected": -202.0, "loss": 0.6417, "rewards/accuracies": 0.5, "rewards/chosen": -0.392578125, "rewards/margins": 0.02685546875, "rewards/rejected": -0.41796875, "step": 240 }, { "epoch": 0.5044479330193616, "grad_norm": 9.044564247131348, "learning_rate": 4.767725046176192e-07, "logits/chosen": 2.546875, "logits/rejected": 2.671875, "logps/chosen": -278.0, "logps/rejected": -344.0, "loss": 0.6656, "rewards/accuracies": 0.25, "rewards/chosen": -0.62109375, "rewards/margins": -0.11181640625, "rewards/rejected": -0.5078125, "step": 241 }, { "epoch": 0.5065410779696494, "grad_norm": 8.735559463500977, "learning_rate": 4.765325368172582e-07, "logits/chosen": 3.3125, "logits/rejected": 3.265625, "logps/chosen": -636.0, "logps/rejected": -512.0, "loss": 0.6522, "rewards/accuracies": 0.75, "rewards/chosen": -0.52734375, "rewards/margins": 0.232421875, "rewards/rejected": -0.76171875, "step": 242 }, { "epoch": 0.5086342229199372, "grad_norm": 9.265771865844727, "learning_rate": 4.7629139683540966e-07, "logits/chosen": 2.859375, "logits/rejected": 2.6875, "logps/chosen": -396.0, "logps/rejected": -512.0, "loss": 0.6323, "rewards/accuracies": 0.25, "rewards/chosen": -0.62109375, "rewards/margins": -0.1181640625, "rewards/rejected": -0.5, "step": 243 }, { "epoch": 0.510727367870225, "grad_norm": 9.315106391906738, "learning_rate": 4.760490859199781e-07, "logits/chosen": 2.609375, "logits/rejected": 3.4375, "logps/chosen": -660.0, "logps/rejected": -444.0, "loss": 0.6428, "rewards/accuracies": 0.5, "rewards/chosen": -0.7421875, "rewards/margins": -0.080078125, "rewards/rejected": -0.66015625, "step": 244 }, { "epoch": 0.5128205128205128, "grad_norm": 8.939321517944336, "learning_rate": 4.75805605324928e-07, "logits/chosen": 3.03125, "logits/rejected": 3.5, "logps/chosen": -372.0, "logps/rejected": -414.0, "loss": 0.6199, "rewards/accuracies": 1.0, "rewards/chosen": -0.419921875, "rewards/margins": 0.26171875, "rewards/rejected": -0.6796875, "step": 245 }, { "epoch": 0.5149136577708007, "grad_norm": 8.98937702178955, "learning_rate": 4.7556095631027667e-07, "logits/chosen": 2.453125, "logits/rejected": 2.59375, "logps/chosen": -324.0, "logps/rejected": -226.0, "loss": 0.6713, "rewards/accuracies": 0.5, "rewards/chosen": -0.4453125, "rewards/margins": 0.1171875, "rewards/rejected": -0.5625, "step": 246 }, { "epoch": 0.5170068027210885, "grad_norm": 8.769051551818848, "learning_rate": 4.7531514014208813e-07, "logits/chosen": 2.59375, "logits/rejected": 2.609375, "logps/chosen": -418.0, "logps/rejected": -324.0, "loss": 0.6452, "rewards/accuracies": 0.75, "rewards/chosen": -0.4609375, "rewards/margins": 0.21875, "rewards/rejected": -0.6796875, "step": 247 }, { "epoch": 0.5190999476713762, "grad_norm": 8.939539909362793, "learning_rate": 4.7506815809246653e-07, "logits/chosen": 3.28125, "logits/rejected": 3.5, "logps/chosen": -512.0, "logps/rejected": -632.0, "loss": 0.6657, "rewards/accuracies": 1.0, "rewards/chosen": -0.265625, "rewards/margins": 0.515625, "rewards/rejected": -0.78125, "step": 248 }, { "epoch": 0.521193092621664, "grad_norm": 9.614801406860352, "learning_rate": 4.7482001143954943e-07, "logits/chosen": 2.9375, "logits/rejected": 2.34375, "logps/chosen": -360.0, "logps/rejected": -454.0, "loss": 0.6703, "rewards/accuracies": 0.5, "rewards/chosen": -0.6796875, "rewards/margins": -0.01416015625, "rewards/rejected": -0.6640625, "step": 249 }, { "epoch": 0.5232862375719518, "grad_norm": 8.643899917602539, "learning_rate": 4.745707014675012e-07, "logits/chosen": 2.90625, "logits/rejected": 2.984375, "logps/chosen": -488.0, "logps/rejected": -520.0, "loss": 0.6558, "rewards/accuracies": 0.75, "rewards/chosen": -0.515625, "rewards/margins": 0.2470703125, "rewards/rejected": -0.765625, "step": 250 }, { "epoch": 0.5253793825222397, "grad_norm": 8.415505409240723, "learning_rate": 4.743202294665065e-07, "logits/chosen": 2.109375, "logits/rejected": 3.59375, "logps/chosen": -442.0, "logps/rejected": -262.0, "loss": 0.6401, "rewards/accuracies": 0.25, "rewards/chosen": -0.490234375, "rewards/margins": -0.04345703125, "rewards/rejected": -0.4453125, "step": 251 }, { "epoch": 0.5274725274725275, "grad_norm": 8.936569213867188, "learning_rate": 4.7406859673276333e-07, "logits/chosen": 3.015625, "logits/rejected": 2.84375, "logps/chosen": -294.0, "logps/rejected": -496.0, "loss": 0.6242, "rewards/accuracies": 0.75, "rewards/chosen": -0.58984375, "rewards/margins": 0.248046875, "rewards/rejected": -0.8359375, "step": 252 }, { "epoch": 0.5295656724228153, "grad_norm": 8.643172264099121, "learning_rate": 4.738158045684766e-07, "logits/chosen": 3.0625, "logits/rejected": 3.546875, "logps/chosen": -620.0, "logps/rejected": -584.0, "loss": 0.6572, "rewards/accuracies": 0.75, "rewards/chosen": -0.474609375, "rewards/margins": 0.70703125, "rewards/rejected": -1.1796875, "step": 253 }, { "epoch": 0.531658817373103, "grad_norm": 8.712985038757324, "learning_rate": 4.7356185428185145e-07, "logits/chosen": 3.84375, "logits/rejected": 3.5, "logps/chosen": -512.0, "logps/rejected": -620.0, "loss": 0.652, "rewards/accuracies": 0.75, "rewards/chosen": -0.5078125, "rewards/margins": 0.138671875, "rewards/rejected": -0.6484375, "step": 254 }, { "epoch": 0.533751962323391, "grad_norm": 8.554315567016602, "learning_rate": 4.733067471870862e-07, "logits/chosen": 3.015625, "logits/rejected": 2.8125, "logps/chosen": -436.0, "logps/rejected": -494.0, "loss": 0.6377, "rewards/accuracies": 0.75, "rewards/chosen": -0.30859375, "rewards/margins": 0.11376953125, "rewards/rejected": -0.421875, "step": 255 }, { "epoch": 0.5358451072736787, "grad_norm": 10.03768253326416, "learning_rate": 4.7305048460436555e-07, "logits/chosen": 3.53125, "logits/rejected": 3.15625, "logps/chosen": -688.0, "logps/rejected": -612.0, "loss": 0.6636, "rewards/accuracies": 0.75, "rewards/chosen": -0.4609375, "rewards/margins": 0.37109375, "rewards/rejected": -0.83203125, "step": 256 }, { "epoch": 0.5379382522239665, "grad_norm": 8.926187515258789, "learning_rate": 4.727930678598541e-07, "logits/chosen": 2.96875, "logits/rejected": 2.703125, "logps/chosen": -258.0, "logps/rejected": -358.0, "loss": 0.6494, "rewards/accuracies": 0.75, "rewards/chosen": -0.41015625, "rewards/margins": 0.1064453125, "rewards/rejected": -0.515625, "step": 257 }, { "epoch": 0.5400313971742543, "grad_norm": 9.17234992980957, "learning_rate": 4.725344982856891e-07, "logits/chosen": 2.078125, "logits/rejected": 2.125, "logps/chosen": -360.0, "logps/rejected": -243.0, "loss": 0.6528, "rewards/accuracies": 0.75, "rewards/chosen": -0.412109375, "rewards/margins": 0.48046875, "rewards/rejected": -0.89453125, "step": 258 }, { "epoch": 0.5421245421245421, "grad_norm": 9.169678688049316, "learning_rate": 4.7227477721997387e-07, "logits/chosen": 3.203125, "logits/rejected": 3.75, "logps/chosen": -592.0, "logps/rejected": -402.0, "loss": 0.652, "rewards/accuracies": 0.75, "rewards/chosen": -0.384765625, "rewards/margins": 0.296875, "rewards/rejected": -0.6796875, "step": 259 }, { "epoch": 0.54421768707483, "grad_norm": 9.725273132324219, "learning_rate": 4.720139060067706e-07, "logits/chosen": 3.125, "logits/rejected": 4.0625, "logps/chosen": -572.0, "logps/rejected": -330.0, "loss": 0.6891, "rewards/accuracies": 0.5, "rewards/chosen": -0.6015625, "rewards/margins": 0.08447265625, "rewards/rejected": -0.68359375, "step": 260 }, { "epoch": 0.5463108320251178, "grad_norm": 8.998029708862305, "learning_rate": 4.7175188599609363e-07, "logits/chosen": 2.109375, "logits/rejected": 2.578125, "logps/chosen": -300.0, "logps/rejected": -308.0, "loss": 0.6564, "rewards/accuracies": 0.25, "rewards/chosen": -0.46484375, "rewards/margins": -0.03173828125, "rewards/rejected": -0.43359375, "step": 261 }, { "epoch": 0.5484039769754055, "grad_norm": 9.795524597167969, "learning_rate": 4.7148871854390204e-07, "logits/chosen": 2.0, "logits/rejected": 1.9375, "logps/chosen": -440.0, "logps/rejected": -536.0, "loss": 0.6377, "rewards/accuracies": 0.75, "rewards/chosen": -0.78515625, "rewards/margins": 0.30078125, "rewards/rejected": -1.0859375, "step": 262 }, { "epoch": 0.5504971219256933, "grad_norm": 8.896550178527832, "learning_rate": 4.7122440501209356e-07, "logits/chosen": 2.4375, "logits/rejected": 2.234375, "logps/chosen": -276.0, "logps/rejected": -320.0, "loss": 0.6503, "rewards/accuracies": 1.0, "rewards/chosen": -0.48828125, "rewards/margins": 0.26953125, "rewards/rejected": -0.7578125, "step": 263 }, { "epoch": 0.5525902668759811, "grad_norm": 8.59717082977295, "learning_rate": 4.709589467684962e-07, "logits/chosen": 2.03125, "logits/rejected": 1.7265625, "logps/chosen": -158.0, "logps/rejected": -204.0, "loss": 0.6415, "rewards/accuracies": 0.5, "rewards/chosen": -0.482421875, "rewards/margins": 0.1396484375, "rewards/rejected": -0.62109375, "step": 264 }, { "epoch": 0.554683411826269, "grad_norm": 9.051630973815918, "learning_rate": 4.7069234518686243e-07, "logits/chosen": 2.40625, "logits/rejected": 2.515625, "logps/chosen": -210.0, "logps/rejected": -396.0, "loss": 0.6432, "rewards/accuracies": 1.0, "rewards/chosen": -0.62109375, "rewards/margins": 0.578125, "rewards/rejected": -1.1953125, "step": 265 }, { "epoch": 0.5567765567765568, "grad_norm": 8.217545509338379, "learning_rate": 4.7042460164686113e-07, "logits/chosen": 2.1875, "logits/rejected": 2.09375, "logps/chosen": -536.0, "logps/rejected": -716.0, "loss": 0.6057, "rewards/accuracies": 0.5, "rewards/chosen": -0.49609375, "rewards/margins": 0.1865234375, "rewards/rejected": -0.68359375, "step": 266 }, { "epoch": 0.5588697017268446, "grad_norm": 9.651106834411621, "learning_rate": 4.701557175340711e-07, "logits/chosen": 2.546875, "logits/rejected": 2.609375, "logps/chosen": -191.0, "logps/rejected": -328.0, "loss": 0.663, "rewards/accuracies": 0.5, "rewards/chosen": -0.56640625, "rewards/margins": 0.03466796875, "rewards/rejected": -0.6015625, "step": 267 }, { "epoch": 0.5609628466771324, "grad_norm": 8.923408508300781, "learning_rate": 4.6988569423997357e-07, "logits/chosen": 3.25, "logits/rejected": 2.65625, "logps/chosen": -266.0, "logps/rejected": -696.0, "loss": 0.6271, "rewards/accuracies": 1.0, "rewards/chosen": -0.53125, "rewards/margins": 0.470703125, "rewards/rejected": -1.0, "step": 268 }, { "epoch": 0.5630559916274201, "grad_norm": 8.7314453125, "learning_rate": 4.69614533161945e-07, "logits/chosen": 3.265625, "logits/rejected": 2.828125, "logps/chosen": -464.0, "logps/rejected": -512.0, "loss": 0.6557, "rewards/accuracies": 0.5, "rewards/chosen": -0.765625, "rewards/margins": 0.0390625, "rewards/rejected": -0.8046875, "step": 269 }, { "epoch": 0.565149136577708, "grad_norm": 9.67919921875, "learning_rate": 4.6934223570325e-07, "logits/chosen": 2.53125, "logits/rejected": 2.53125, "logps/chosen": -504.0, "logps/rejected": -520.0, "loss": 0.6701, "rewards/accuracies": 0.5, "rewards/chosen": -1.15625, "rewards/margins": -0.263671875, "rewards/rejected": -0.8984375, "step": 270 }, { "epoch": 0.5672422815279958, "grad_norm": 8.798002243041992, "learning_rate": 4.6906880327303377e-07, "logits/chosen": 2.546875, "logits/rejected": 2.28125, "logps/chosen": -724.0, "logps/rejected": -736.0, "loss": 0.641, "rewards/accuracies": 0.5, "rewards/chosen": -0.41796875, "rewards/margins": 0.16015625, "rewards/rejected": -0.578125, "step": 271 }, { "epoch": 0.5693354264782836, "grad_norm": 9.1842622756958, "learning_rate": 4.6879423728631526e-07, "logits/chosen": 3.34375, "logits/rejected": 3.578125, "logps/chosen": -608.0, "logps/rejected": -376.0, "loss": 0.6578, "rewards/accuracies": 1.0, "rewards/chosen": -0.5546875, "rewards/margins": 0.12890625, "rewards/rejected": -0.68359375, "step": 272 }, { "epoch": 0.5714285714285714, "grad_norm": 9.7493314743042, "learning_rate": 4.685185391639795e-07, "logits/chosen": 2.640625, "logits/rejected": 2.828125, "logps/chosen": -508.0, "logps/rejected": -544.0, "loss": 0.6402, "rewards/accuracies": 1.0, "rewards/chosen": -0.76953125, "rewards/margins": 0.4453125, "rewards/rejected": -1.21875, "step": 273 }, { "epoch": 0.5735217163788592, "grad_norm": 9.176734924316406, "learning_rate": 4.6824171033277026e-07, "logits/chosen": 2.40625, "logits/rejected": 3.15625, "logps/chosen": -434.0, "logps/rejected": -270.0, "loss": 0.6278, "rewards/accuracies": 1.0, "rewards/chosen": -0.56640625, "rewards/margins": 0.47265625, "rewards/rejected": -1.0390625, "step": 274 }, { "epoch": 0.5756148613291471, "grad_norm": 8.987980842590332, "learning_rate": 4.679637522252829e-07, "logits/chosen": 3.359375, "logits/rejected": 3.4375, "logps/chosen": -536.0, "logps/rejected": -402.0, "loss": 0.6304, "rewards/accuracies": 0.5, "rewards/chosen": -0.43359375, "rewards/margins": 0.2001953125, "rewards/rejected": -0.6328125, "step": 275 }, { "epoch": 0.5777080062794349, "grad_norm": 9.4086332321167, "learning_rate": 4.676846662799566e-07, "logits/chosen": 3.484375, "logits/rejected": 4.21875, "logps/chosen": -544.0, "logps/rejected": -416.0, "loss": 0.6252, "rewards/accuracies": 1.0, "rewards/chosen": -0.41015625, "rewards/margins": 0.37890625, "rewards/rejected": -0.7890625, "step": 276 }, { "epoch": 0.5798011512297226, "grad_norm": 9.234297752380371, "learning_rate": 4.6740445394106755e-07, "logits/chosen": 2.390625, "logits/rejected": 2.359375, "logps/chosen": -262.0, "logps/rejected": -274.0, "loss": 0.6749, "rewards/accuracies": 0.75, "rewards/chosen": -0.7109375, "rewards/margins": -0.02099609375, "rewards/rejected": -0.69140625, "step": 277 }, { "epoch": 0.5818942961800104, "grad_norm": 10.775644302368164, "learning_rate": 4.6712311665872057e-07, "logits/chosen": 1.6875, "logits/rejected": 1.7578125, "logps/chosen": -468.0, "logps/rejected": -532.0, "loss": 0.6863, "rewards/accuracies": 0.5, "rewards/chosen": -0.70703125, "rewards/margins": -0.0751953125, "rewards/rejected": -0.6328125, "step": 278 }, { "epoch": 0.5839874411302983, "grad_norm": 9.250503540039062, "learning_rate": 4.6684065588884224e-07, "logits/chosen": 2.265625, "logits/rejected": 2.59375, "logps/chosen": -498.0, "logps/rejected": -478.0, "loss": 0.6085, "rewards/accuracies": 1.0, "rewards/chosen": -0.498046875, "rewards/margins": 0.2333984375, "rewards/rejected": -0.73046875, "step": 279 }, { "epoch": 0.5860805860805861, "grad_norm": 9.45741081237793, "learning_rate": 4.6655707309317345e-07, "logits/chosen": 3.28125, "logits/rejected": 3.53125, "logps/chosen": -600.0, "logps/rejected": -444.0, "loss": 0.6351, "rewards/accuracies": 0.75, "rewards/chosen": -0.6640625, "rewards/margins": 0.1962890625, "rewards/rejected": -0.86328125, "step": 280 }, { "epoch": 0.5881737310308739, "grad_norm": 9.61277961730957, "learning_rate": 4.6627236973926126e-07, "logits/chosen": 2.5625, "logits/rejected": 2.46875, "logps/chosen": -376.0, "logps/rejected": -318.0, "loss": 0.6356, "rewards/accuracies": 1.0, "rewards/chosen": -0.6328125, "rewards/margins": 0.14453125, "rewards/rejected": -0.77734375, "step": 281 }, { "epoch": 0.5902668759811617, "grad_norm": 9.949418067932129, "learning_rate": 4.6598654730045177e-07, "logits/chosen": 2.28125, "logits/rejected": 2.15625, "logps/chosen": -253.0, "logps/rejected": -336.0, "loss": 0.6626, "rewards/accuracies": 1.0, "rewards/chosen": -0.5703125, "rewards/margins": 0.43359375, "rewards/rejected": -1.0, "step": 282 }, { "epoch": 0.5923600209314495, "grad_norm": 8.826484680175781, "learning_rate": 4.6569960725588256e-07, "logits/chosen": 3.609375, "logits/rejected": 2.734375, "logps/chosen": -366.0, "logps/rejected": -584.0, "loss": 0.627, "rewards/accuracies": 0.75, "rewards/chosen": -0.78125, "rewards/margins": 0.37890625, "rewards/rejected": -1.15625, "step": 283 }, { "epoch": 0.5944531658817374, "grad_norm": 8.73790454864502, "learning_rate": 4.654115510904746e-07, "logits/chosen": 3.0, "logits/rejected": 2.859375, "logps/chosen": -296.0, "logps/rejected": -232.0, "loss": 0.6401, "rewards/accuracies": 0.25, "rewards/chosen": -0.55859375, "rewards/margins": -0.0615234375, "rewards/rejected": -0.49609375, "step": 284 }, { "epoch": 0.5965463108320251, "grad_norm": 10.235679626464844, "learning_rate": 4.651223802949247e-07, "logits/chosen": 2.734375, "logits/rejected": 2.875, "logps/chosen": -498.0, "logps/rejected": -376.0, "loss": 0.682, "rewards/accuracies": 1.0, "rewards/chosen": -0.53515625, "rewards/margins": 0.34375, "rewards/rejected": -0.87890625, "step": 285 }, { "epoch": 0.5986394557823129, "grad_norm": 9.037766456604004, "learning_rate": 4.6483209636569837e-07, "logits/chosen": 2.59375, "logits/rejected": 2.46875, "logps/chosen": -652.0, "logps/rejected": -660.0, "loss": 0.6442, "rewards/accuracies": 0.5, "rewards/chosen": -0.6953125, "rewards/margins": 0.044921875, "rewards/rejected": -0.7421875, "step": 286 }, { "epoch": 0.6007326007326007, "grad_norm": 8.641436576843262, "learning_rate": 4.645407008050212e-07, "logits/chosen": 2.609375, "logits/rejected": 2.640625, "logps/chosen": -324.0, "logps/rejected": -340.0, "loss": 0.6445, "rewards/accuracies": 1.0, "rewards/chosen": -0.52734375, "rewards/margins": 0.3125, "rewards/rejected": -0.83984375, "step": 287 }, { "epoch": 0.6028257456828885, "grad_norm": 9.133179664611816, "learning_rate": 4.6424819512087166e-07, "logits/chosen": 2.625, "logits/rejected": 2.265625, "logps/chosen": -161.0, "logps/rejected": -320.0, "loss": 0.6419, "rewards/accuracies": 1.0, "rewards/chosen": -0.4765625, "rewards/margins": 0.32421875, "rewards/rejected": -0.80078125, "step": 288 }, { "epoch": 0.6049188906331764, "grad_norm": 10.273738861083984, "learning_rate": 4.639545808269731e-07, "logits/chosen": 3.296875, "logits/rejected": 2.890625, "logps/chosen": -604.0, "logps/rejected": -668.0, "loss": 0.6861, "rewards/accuracies": 0.5, "rewards/chosen": -0.71875, "rewards/margins": -0.162109375, "rewards/rejected": -0.55859375, "step": 289 }, { "epoch": 0.6070120355834642, "grad_norm": 9.593812942504883, "learning_rate": 4.636598594427858e-07, "logits/chosen": 2.671875, "logits/rejected": 2.9375, "logps/chosen": -672.0, "logps/rejected": -600.0, "loss": 0.6522, "rewards/accuracies": 0.75, "rewards/chosen": -0.578125, "rewards/margins": 0.515625, "rewards/rejected": -1.09375, "step": 290 }, { "epoch": 0.609105180533752, "grad_norm": 10.181145668029785, "learning_rate": 4.6336403249349966e-07, "logits/chosen": 2.375, "logits/rejected": 2.796875, "logps/chosen": -458.0, "logps/rejected": -410.0, "loss": 0.6496, "rewards/accuracies": 0.75, "rewards/chosen": -0.56640625, "rewards/margins": 0.43359375, "rewards/rejected": -1.0, "step": 291 }, { "epoch": 0.6111983254840397, "grad_norm": 9.083106994628906, "learning_rate": 4.630671015100255e-07, "logits/chosen": 3.046875, "logits/rejected": 2.875, "logps/chosen": -272.0, "logps/rejected": -370.0, "loss": 0.6419, "rewards/accuracies": 0.75, "rewards/chosen": -0.734375, "rewards/margins": 0.16796875, "rewards/rejected": -0.90234375, "step": 292 }, { "epoch": 0.6132914704343275, "grad_norm": 8.897668838500977, "learning_rate": 4.6276906802898776e-07, "logits/chosen": 2.890625, "logits/rejected": 2.96875, "logps/chosen": -304.0, "logps/rejected": -270.0, "loss": 0.6371, "rewards/accuracies": 0.5, "rewards/chosen": -0.76953125, "rewards/margins": 0.107421875, "rewards/rejected": -0.875, "step": 293 }, { "epoch": 0.6153846153846154, "grad_norm": 8.895344734191895, "learning_rate": 4.624699335927162e-07, "logits/chosen": 1.90625, "logits/rejected": 2.296875, "logps/chosen": -306.0, "logps/rejected": -252.0, "loss": 0.6739, "rewards/accuracies": 1.0, "rewards/chosen": -0.546875, "rewards/margins": 0.1904296875, "rewards/rejected": -0.73828125, "step": 294 }, { "epoch": 0.6174777603349032, "grad_norm": 9.05825138092041, "learning_rate": 4.6216969974923816e-07, "logits/chosen": 2.90625, "logits/rejected": 2.71875, "logps/chosen": -474.0, "logps/rejected": -376.0, "loss": 0.6607, "rewards/accuracies": 0.25, "rewards/chosen": -0.6875, "rewards/margins": -0.0556640625, "rewards/rejected": -0.6328125, "step": 295 }, { "epoch": 0.619570905285191, "grad_norm": 8.959853172302246, "learning_rate": 4.618683680522703e-07, "logits/chosen": 1.3046875, "logits/rejected": 1.3125, "logps/chosen": -264.0, "logps/rejected": -201.0, "loss": 0.6305, "rewards/accuracies": 1.0, "rewards/chosen": -0.490234375, "rewards/margins": 0.15625, "rewards/rejected": -0.6484375, "step": 296 }, { "epoch": 0.6216640502354788, "grad_norm": 9.268010139465332, "learning_rate": 4.6156594006121095e-07, "logits/chosen": 2.15625, "logits/rejected": 2.109375, "logps/chosen": -410.0, "logps/rejected": -600.0, "loss": 0.6418, "rewards/accuracies": 1.0, "rewards/chosen": -0.6640625, "rewards/margins": 0.6875, "rewards/rejected": -1.3515625, "step": 297 }, { "epoch": 0.6237571951857667, "grad_norm": 9.329010963439941, "learning_rate": 4.612624173411315e-07, "logits/chosen": 2.578125, "logits/rejected": 2.5625, "logps/chosen": -494.0, "logps/rejected": -548.0, "loss": 0.6407, "rewards/accuracies": 0.75, "rewards/chosen": -0.55859375, "rewards/margins": 0.30859375, "rewards/rejected": -0.8671875, "step": 298 }, { "epoch": 0.6258503401360545, "grad_norm": 9.748213768005371, "learning_rate": 4.609578014627687e-07, "logits/chosen": 2.828125, "logits/rejected": 3.25, "logps/chosen": -740.0, "logps/rejected": -464.0, "loss": 0.6651, "rewards/accuracies": 0.75, "rewards/chosen": -0.63671875, "rewards/margins": 0.119140625, "rewards/rejected": -0.7578125, "step": 299 }, { "epoch": 0.6279434850863422, "grad_norm": 9.283663749694824, "learning_rate": 4.6065209400251655e-07, "logits/chosen": 2.5625, "logits/rejected": 2.078125, "logps/chosen": -207.0, "logps/rejected": -326.0, "loss": 0.6435, "rewards/accuracies": 0.75, "rewards/chosen": -0.6328125, "rewards/margins": 0.201171875, "rewards/rejected": -0.83203125, "step": 300 }, { "epoch": 0.63003663003663, "grad_norm": 10.709654808044434, "learning_rate": 4.6034529654241766e-07, "logits/chosen": 3.28125, "logits/rejected": 2.671875, "logps/chosen": -292.0, "logps/rejected": -332.0, "loss": 0.6894, "rewards/accuracies": 0.75, "rewards/chosen": -0.58203125, "rewards/margins": 0.224609375, "rewards/rejected": -0.80859375, "step": 301 }, { "epoch": 0.6321297749869178, "grad_norm": 9.497162818908691, "learning_rate": 4.600374106701558e-07, "logits/chosen": 2.9375, "logits/rejected": 3.375, "logps/chosen": -656.0, "logps/rejected": -458.0, "loss": 0.6371, "rewards/accuracies": 0.5, "rewards/chosen": -0.640625, "rewards/margins": 0.189453125, "rewards/rejected": -0.828125, "step": 302 }, { "epoch": 0.6342229199372057, "grad_norm": 10.339771270751953, "learning_rate": 4.597284379790471e-07, "logits/chosen": 3.203125, "logits/rejected": 2.84375, "logps/chosen": -398.0, "logps/rejected": -490.0, "loss": 0.6441, "rewards/accuracies": 0.75, "rewards/chosen": -0.4375, "rewards/margins": 0.154296875, "rewards/rejected": -0.58984375, "step": 303 }, { "epoch": 0.6363160648874935, "grad_norm": 8.851433753967285, "learning_rate": 4.5941838006803196e-07, "logits/chosen": 2.359375, "logits/rejected": 3.09375, "logps/chosen": -502.0, "logps/rejected": -356.0, "loss": 0.6332, "rewards/accuracies": 0.75, "rewards/chosen": -0.40234375, "rewards/margins": 0.271484375, "rewards/rejected": -0.67578125, "step": 304 }, { "epoch": 0.6384092098377813, "grad_norm": 8.970887184143066, "learning_rate": 4.591072385416671e-07, "logits/chosen": 3.15625, "logits/rejected": 3.09375, "logps/chosen": -290.0, "logps/rejected": -364.0, "loss": 0.5897, "rewards/accuracies": 0.5, "rewards/chosen": -0.578125, "rewards/margins": 0.083984375, "rewards/rejected": -0.6640625, "step": 305 }, { "epoch": 0.640502354788069, "grad_norm": 9.5183744430542, "learning_rate": 4.5879501501011657e-07, "logits/chosen": 2.859375, "logits/rejected": 2.96875, "logps/chosen": -492.0, "logps/rejected": -532.0, "loss": 0.6454, "rewards/accuracies": 0.75, "rewards/chosen": -0.828125, "rewards/margins": 0.0849609375, "rewards/rejected": -0.9140625, "step": 306 }, { "epoch": 0.6425954997383568, "grad_norm": 10.027036666870117, "learning_rate": 4.5848171108914405e-07, "logits/chosen": 2.78125, "logits/rejected": 3.984375, "logps/chosen": -752.0, "logps/rejected": -560.0, "loss": 0.6652, "rewards/accuracies": 0.5, "rewards/chosen": -0.68359375, "rewards/margins": 0.419921875, "rewards/rejected": -1.1015625, "step": 307 }, { "epoch": 0.6446886446886447, "grad_norm": 9.015626907348633, "learning_rate": 4.581673284001044e-07, "logits/chosen": 2.875, "logits/rejected": 2.96875, "logps/chosen": -378.0, "logps/rejected": -356.0, "loss": 0.6544, "rewards/accuracies": 1.0, "rewards/chosen": -0.671875, "rewards/margins": 0.40625, "rewards/rejected": -1.078125, "step": 308 }, { "epoch": 0.6467817896389325, "grad_norm": 8.90971565246582, "learning_rate": 4.578518685699347e-07, "logits/chosen": 2.96875, "logits/rejected": 3.734375, "logps/chosen": -712.0, "logps/rejected": -580.0, "loss": 0.6208, "rewards/accuracies": 0.75, "rewards/chosen": -0.431640625, "rewards/margins": 0.0654296875, "rewards/rejected": -0.498046875, "step": 309 }, { "epoch": 0.6488749345892203, "grad_norm": 9.74847412109375, "learning_rate": 4.575353332311466e-07, "logits/chosen": 2.609375, "logits/rejected": 2.515625, "logps/chosen": -278.0, "logps/rejected": -354.0, "loss": 0.6351, "rewards/accuracies": 1.0, "rewards/chosen": -0.44921875, "rewards/margins": 0.330078125, "rewards/rejected": -0.78125, "step": 310 }, { "epoch": 0.6509680795395081, "grad_norm": 9.1494722366333, "learning_rate": 4.572177240218175e-07, "logits/chosen": 2.8125, "logits/rejected": 3.203125, "logps/chosen": -512.0, "logps/rejected": -336.0, "loss": 0.6362, "rewards/accuracies": 0.75, "rewards/chosen": -0.6015625, "rewards/margins": 0.1484375, "rewards/rejected": -0.75, "step": 311 }, { "epoch": 0.6530612244897959, "grad_norm": 8.894120216369629, "learning_rate": 4.5689904258558203e-07, "logits/chosen": 2.59375, "logits/rejected": 2.5625, "logps/chosen": -183.0, "logps/rejected": -280.0, "loss": 0.6426, "rewards/accuracies": 0.75, "rewards/chosen": -0.57421875, "rewards/margins": 0.380859375, "rewards/rejected": -0.95703125, "step": 312 }, { "epoch": 0.6551543694400838, "grad_norm": 9.807157516479492, "learning_rate": 4.565792905716236e-07, "logits/chosen": 2.21875, "logits/rejected": 2.96875, "logps/chosen": -456.0, "logps/rejected": -556.0, "loss": 0.6645, "rewards/accuracies": 1.0, "rewards/chosen": -0.76171875, "rewards/margins": 0.55859375, "rewards/rejected": -1.3203125, "step": 313 }, { "epoch": 0.6572475143903715, "grad_norm": 9.278183937072754, "learning_rate": 4.562584696346659e-07, "logits/chosen": 2.1875, "logits/rejected": 2.28125, "logps/chosen": -245.0, "logps/rejected": -300.0, "loss": 0.6436, "rewards/accuracies": 0.75, "rewards/chosen": -0.578125, "rewards/margins": 0.08544921875, "rewards/rejected": -0.6640625, "step": 314 }, { "epoch": 0.6593406593406593, "grad_norm": 8.839766502380371, "learning_rate": 4.5593658143496447e-07, "logits/chosen": 3.234375, "logits/rejected": 4.0, "logps/chosen": -624.0, "logps/rejected": -404.0, "loss": 0.6102, "rewards/accuracies": 0.75, "rewards/chosen": -0.75390625, "rewards/margins": 0.2470703125, "rewards/rejected": -1.0, "step": 315 }, { "epoch": 0.6614338042909471, "grad_norm": 10.181482315063477, "learning_rate": 4.5561362763829763e-07, "logits/chosen": 2.78125, "logits/rejected": 2.734375, "logps/chosen": -540.0, "logps/rejected": -310.0, "loss": 0.6332, "rewards/accuracies": 1.0, "rewards/chosen": -0.36328125, "rewards/margins": 0.33984375, "rewards/rejected": -0.703125, "step": 316 }, { "epoch": 0.663526949241235, "grad_norm": 8.602537155151367, "learning_rate": 4.5528960991595857e-07, "logits/chosen": 2.671875, "logits/rejected": 1.984375, "logps/chosen": -260.0, "logps/rejected": -229.0, "loss": 0.6315, "rewards/accuracies": 0.5, "rewards/chosen": -0.65234375, "rewards/margins": -0.12890625, "rewards/rejected": -0.5234375, "step": 317 }, { "epoch": 0.6656200941915228, "grad_norm": 8.840538024902344, "learning_rate": 4.549645299447461e-07, "logits/chosen": 1.8046875, "logits/rejected": 2.46875, "logps/chosen": -352.0, "logps/rejected": -524.0, "loss": 0.6126, "rewards/accuracies": 0.75, "rewards/chosen": -0.6953125, "rewards/margins": 0.4296875, "rewards/rejected": -1.125, "step": 318 }, { "epoch": 0.6677132391418106, "grad_norm": 8.675968170166016, "learning_rate": 4.546383894069561e-07, "logits/chosen": 2.640625, "logits/rejected": 3.234375, "logps/chosen": -692.0, "logps/rejected": -616.0, "loss": 0.6181, "rewards/accuracies": 0.5, "rewards/chosen": -0.79296875, "rewards/margins": 0.44140625, "rewards/rejected": -1.234375, "step": 319 }, { "epoch": 0.6698063840920984, "grad_norm": 9.338340759277344, "learning_rate": 4.54311189990373e-07, "logits/chosen": 2.265625, "logits/rejected": 1.859375, "logps/chosen": -247.0, "logps/rejected": -402.0, "loss": 0.6366, "rewards/accuracies": 1.0, "rewards/chosen": -0.4453125, "rewards/margins": 0.30078125, "rewards/rejected": -0.74609375, "step": 320 }, { "epoch": 0.6718995290423861, "grad_norm": 9.707039833068848, "learning_rate": 4.5398293338826126e-07, "logits/chosen": 2.78125, "logits/rejected": 2.859375, "logps/chosen": -544.0, "logps/rejected": -442.0, "loss": 0.6643, "rewards/accuracies": 0.5, "rewards/chosen": -0.59375, "rewards/margins": 0.10546875, "rewards/rejected": -0.69921875, "step": 321 }, { "epoch": 0.673992673992674, "grad_norm": 9.66869068145752, "learning_rate": 4.5365362129935584e-07, "logits/chosen": 1.78125, "logits/rejected": 1.9140625, "logps/chosen": -392.0, "logps/rejected": -251.0, "loss": 0.6646, "rewards/accuracies": 0.25, "rewards/chosen": -0.921875, "rewards/margins": -0.271484375, "rewards/rejected": -0.65234375, "step": 322 }, { "epoch": 0.6760858189429618, "grad_norm": 9.099617958068848, "learning_rate": 4.5332325542785406e-07, "logits/chosen": 2.875, "logits/rejected": 2.59375, "logps/chosen": -468.0, "logps/rejected": -474.0, "loss": 0.6547, "rewards/accuracies": 1.0, "rewards/chosen": -0.55078125, "rewards/margins": 0.6875, "rewards/rejected": -1.234375, "step": 323 }, { "epoch": 0.6781789638932496, "grad_norm": 8.761299133300781, "learning_rate": 4.5299183748340655e-07, "logits/chosen": 2.34375, "logits/rejected": 2.75, "logps/chosen": -286.0, "logps/rejected": -212.0, "loss": 0.6341, "rewards/accuracies": 0.5, "rewards/chosen": -0.80859375, "rewards/margins": 0.0458984375, "rewards/rejected": -0.85546875, "step": 324 }, { "epoch": 0.6802721088435374, "grad_norm": 8.962592124938965, "learning_rate": 4.526593691811084e-07, "logits/chosen": 2.140625, "logits/rejected": 2.5, "logps/chosen": -472.0, "logps/rejected": -406.0, "loss": 0.6351, "rewards/accuracies": 0.75, "rewards/chosen": -0.56640625, "rewards/margins": 0.486328125, "rewards/rejected": -1.046875, "step": 325 }, { "epoch": 0.6823652537938252, "grad_norm": 9.102997779846191, "learning_rate": 4.5232585224149054e-07, "logits/chosen": 2.0625, "logits/rejected": 1.8515625, "logps/chosen": -334.0, "logps/rejected": -366.0, "loss": 0.6107, "rewards/accuracies": 1.0, "rewards/chosen": -0.609375, "rewards/margins": 0.365234375, "rewards/rejected": -0.97265625, "step": 326 }, { "epoch": 0.6844583987441131, "grad_norm": 9.190810203552246, "learning_rate": 4.519912883905105e-07, "logits/chosen": 2.4375, "logits/rejected": 1.5546875, "logps/chosen": -352.0, "logps/rejected": -442.0, "loss": 0.6275, "rewards/accuracies": 0.5, "rewards/chosen": -0.8359375, "rewards/margins": 0.0498046875, "rewards/rejected": -0.88671875, "step": 327 }, { "epoch": 0.6865515436944009, "grad_norm": 9.286701202392578, "learning_rate": 4.516556793595433e-07, "logits/chosen": 2.28125, "logits/rejected": 2.265625, "logps/chosen": -372.0, "logps/rejected": -620.0, "loss": 0.6347, "rewards/accuracies": 0.75, "rewards/chosen": -0.7421875, "rewards/margins": 0.09375, "rewards/rejected": -0.8359375, "step": 328 }, { "epoch": 0.6886446886446886, "grad_norm": 9.687287330627441, "learning_rate": 4.5131902688537337e-07, "logits/chosen": 2.078125, "logits/rejected": 2.4375, "logps/chosen": -412.0, "logps/rejected": -334.0, "loss": 0.6518, "rewards/accuracies": 0.75, "rewards/chosen": -0.7109375, "rewards/margins": 0.044921875, "rewards/rejected": -0.7578125, "step": 329 }, { "epoch": 0.6907378335949764, "grad_norm": 9.833063125610352, "learning_rate": 4.509813327101845e-07, "logits/chosen": 2.5625, "logits/rejected": 3.140625, "logps/chosen": -346.0, "logps/rejected": -422.0, "loss": 0.6264, "rewards/accuracies": 0.5, "rewards/chosen": -0.84375, "rewards/margins": 0.2470703125, "rewards/rejected": -1.0859375, "step": 330 }, { "epoch": 0.6928309785452642, "grad_norm": 10.08375358581543, "learning_rate": 4.5064259858155156e-07, "logits/chosen": 2.015625, "logits/rejected": 1.921875, "logps/chosen": -390.0, "logps/rejected": -312.0, "loss": 0.65, "rewards/accuracies": 0.5, "rewards/chosen": -0.8828125, "rewards/margins": 0.15234375, "rewards/rejected": -1.03125, "step": 331 }, { "epoch": 0.6949241234955521, "grad_norm": 9.232449531555176, "learning_rate": 4.503028262524311e-07, "logits/chosen": 1.984375, "logits/rejected": 2.640625, "logps/chosen": -498.0, "logps/rejected": -144.0, "loss": 0.6328, "rewards/accuracies": 0.75, "rewards/chosen": -0.494140625, "rewards/margins": 0.07666015625, "rewards/rejected": -0.5703125, "step": 332 }, { "epoch": 0.6970172684458399, "grad_norm": 10.257896423339844, "learning_rate": 4.4996201748115235e-07, "logits/chosen": 1.6484375, "logits/rejected": 1.28125, "logps/chosen": -115.0, "logps/rejected": -264.0, "loss": 0.6495, "rewards/accuracies": 0.75, "rewards/chosen": -0.578125, "rewards/margins": 0.322265625, "rewards/rejected": -0.8984375, "step": 333 }, { "epoch": 0.6991104133961277, "grad_norm": 9.689282417297363, "learning_rate": 4.4962017403140816e-07, "logits/chosen": 1.4921875, "logits/rejected": 1.8046875, "logps/chosen": -386.0, "logps/rejected": -376.0, "loss": 0.6146, "rewards/accuracies": 0.5, "rewards/chosen": -0.82421875, "rewards/margins": 0.14453125, "rewards/rejected": -0.96875, "step": 334 }, { "epoch": 0.7012035583464155, "grad_norm": 9.05044937133789, "learning_rate": 4.4927729767224616e-07, "logits/chosen": 2.390625, "logits/rejected": 2.515625, "logps/chosen": -346.0, "logps/rejected": -322.0, "loss": 0.654, "rewards/accuracies": 0.75, "rewards/chosen": -0.5546875, "rewards/margins": 0.244140625, "rewards/rejected": -0.80078125, "step": 335 }, { "epoch": 0.7032967032967034, "grad_norm": 10.08155632019043, "learning_rate": 4.489333901780587e-07, "logits/chosen": 2.28125, "logits/rejected": 1.9609375, "logps/chosen": -442.0, "logps/rejected": -552.0, "loss": 0.6719, "rewards/accuracies": 0.5, "rewards/chosen": -0.71875, "rewards/margins": 0.125, "rewards/rejected": -0.84375, "step": 336 }, { "epoch": 0.7053898482469911, "grad_norm": 8.750693321228027, "learning_rate": 4.4858845332857485e-07, "logits/chosen": 3.046875, "logits/rejected": 2.8125, "logps/chosen": -576.0, "logps/rejected": -592.0, "loss": 0.6136, "rewards/accuracies": 1.0, "rewards/chosen": -0.73828125, "rewards/margins": 0.5078125, "rewards/rejected": -1.2421875, "step": 337 }, { "epoch": 0.7074829931972789, "grad_norm": 9.24592399597168, "learning_rate": 4.4824248890885044e-07, "logits/chosen": 2.8125, "logits/rejected": 3.15625, "logps/chosen": -544.0, "logps/rejected": -362.0, "loss": 0.6244, "rewards/accuracies": 0.5, "rewards/chosen": -0.5234375, "rewards/margins": 0.12890625, "rewards/rejected": -0.65234375, "step": 338 }, { "epoch": 0.7095761381475667, "grad_norm": 9.083433151245117, "learning_rate": 4.478954987092588e-07, "logits/chosen": 2.25, "logits/rejected": 2.0625, "logps/chosen": -346.0, "logps/rejected": -226.0, "loss": 0.6079, "rewards/accuracies": 0.25, "rewards/chosen": -1.0, "rewards/margins": -0.1474609375, "rewards/rejected": -0.8515625, "step": 339 }, { "epoch": 0.7116692830978545, "grad_norm": 9.71336841583252, "learning_rate": 4.4754748452548186e-07, "logits/chosen": 2.578125, "logits/rejected": 2.9375, "logps/chosen": -576.0, "logps/rejected": -378.0, "loss": 0.6613, "rewards/accuracies": 0.25, "rewards/chosen": -1.03125, "rewards/margins": -0.08203125, "rewards/rejected": -0.953125, "step": 340 }, { "epoch": 0.7137624280481424, "grad_norm": 10.675765037536621, "learning_rate": 4.4719844815850084e-07, "logits/chosen": 2.8125, "logits/rejected": 3.390625, "logps/chosen": -556.0, "logps/rejected": -744.0, "loss": 0.6692, "rewards/accuracies": 0.5, "rewards/chosen": -0.890625, "rewards/margins": 0.30859375, "rewards/rejected": -1.203125, "step": 341 }, { "epoch": 0.7158555729984302, "grad_norm": 9.6324462890625, "learning_rate": 4.468483914145865e-07, "logits/chosen": 1.8359375, "logits/rejected": 2.40625, "logps/chosen": -360.0, "logps/rejected": -306.0, "loss": 0.6413, "rewards/accuracies": 0.25, "rewards/chosen": -1.1015625, "rewards/margins": -0.251953125, "rewards/rejected": -0.8515625, "step": 342 }, { "epoch": 0.717948717948718, "grad_norm": 10.155203819274902, "learning_rate": 4.464973161052901e-07, "logits/chosen": 1.9453125, "logits/rejected": 2.046875, "logps/chosen": -270.0, "logps/rejected": -284.0, "loss": 0.6607, "rewards/accuracies": 1.0, "rewards/chosen": -0.60546875, "rewards/margins": 0.1669921875, "rewards/rejected": -0.7734375, "step": 343 }, { "epoch": 0.7200418628990057, "grad_norm": 9.593433380126953, "learning_rate": 4.461452240474343e-07, "logits/chosen": 2.5, "logits/rejected": 3.0, "logps/chosen": -612.0, "logps/rejected": -580.0, "loss": 0.6474, "rewards/accuracies": 0.75, "rewards/chosen": -0.7890625, "rewards/margins": 0.37109375, "rewards/rejected": -1.1640625, "step": 344 }, { "epoch": 0.7221350078492935, "grad_norm": 9.28181266784668, "learning_rate": 4.457921170631032e-07, "logits/chosen": 2.046875, "logits/rejected": 1.953125, "logps/chosen": -492.0, "logps/rejected": -506.0, "loss": 0.6416, "rewards/accuracies": 0.5, "rewards/chosen": -0.6171875, "rewards/margins": 0.24609375, "rewards/rejected": -0.86328125, "step": 345 }, { "epoch": 0.7242281527995814, "grad_norm": 9.405036926269531, "learning_rate": 4.45437996979633e-07, "logits/chosen": 1.4609375, "logits/rejected": 1.90625, "logps/chosen": -224.0, "logps/rejected": -186.0, "loss": 0.6443, "rewards/accuracies": 0.5, "rewards/chosen": -0.85546875, "rewards/margins": -0.16015625, "rewards/rejected": -0.6953125, "step": 346 }, { "epoch": 0.7263212977498692, "grad_norm": 9.424813270568848, "learning_rate": 4.4508286562960327e-07, "logits/chosen": 2.84375, "logits/rejected": 2.765625, "logps/chosen": -326.0, "logps/rejected": -199.0, "loss": 0.6361, "rewards/accuracies": 0.0, "rewards/chosen": -0.96875, "rewards/margins": -0.390625, "rewards/rejected": -0.578125, "step": 347 }, { "epoch": 0.728414442700157, "grad_norm": 9.514280319213867, "learning_rate": 4.447267248508263e-07, "logits/chosen": 2.859375, "logits/rejected": 3.421875, "logps/chosen": -528.0, "logps/rejected": -458.0, "loss": 0.6744, "rewards/accuracies": 0.5, "rewards/chosen": -0.7109375, "rewards/margins": 0.0849609375, "rewards/rejected": -0.796875, "step": 348 }, { "epoch": 0.7305075876504448, "grad_norm": 9.275189399719238, "learning_rate": 4.4436957648633847e-07, "logits/chosen": 2.921875, "logits/rejected": 3.0625, "logps/chosen": -370.0, "logps/rejected": -406.0, "loss": 0.6114, "rewards/accuracies": 0.75, "rewards/chosen": -0.7421875, "rewards/margins": 0.4375, "rewards/rejected": -1.1796875, "step": 349 }, { "epoch": 0.7326007326007326, "grad_norm": 9.640008926391602, "learning_rate": 4.440114223843906e-07, "logits/chosen": 2.0625, "logits/rejected": 3.015625, "logps/chosen": -398.0, "logps/rejected": -208.0, "loss": 0.6386, "rewards/accuracies": 0.5, "rewards/chosen": -0.7109375, "rewards/margins": 0.07958984375, "rewards/rejected": -0.79296875, "step": 350 }, { "epoch": 0.7346938775510204, "grad_norm": 10.19519329071045, "learning_rate": 4.436522643984378e-07, "logits/chosen": 0.310546875, "logits/rejected": 0.4140625, "logps/chosen": -186.0, "logps/rejected": -286.0, "loss": 0.6782, "rewards/accuracies": 0.5, "rewards/chosen": -0.96875, "rewards/margins": 0.2353515625, "rewards/rejected": -1.203125, "step": 351 }, { "epoch": 0.7367870225013082, "grad_norm": 10.587912559509277, "learning_rate": 4.4329210438713085e-07, "logits/chosen": 3.25, "logits/rejected": 4.0, "logps/chosen": -540.0, "logps/rejected": -712.0, "loss": 0.6384, "rewards/accuracies": 0.25, "rewards/chosen": -0.74609375, "rewards/margins": -0.009765625, "rewards/rejected": -0.734375, "step": 352 }, { "epoch": 0.738880167451596, "grad_norm": 9.61915397644043, "learning_rate": 4.429309442143055e-07, "logits/chosen": 2.171875, "logits/rejected": 2.5, "logps/chosen": -330.0, "logps/rejected": -204.0, "loss": 0.5989, "rewards/accuracies": 0.5, "rewards/chosen": -0.64453125, "rewards/margins": 0.1484375, "rewards/rejected": -0.796875, "step": 353 }, { "epoch": 0.7409733124018838, "grad_norm": 9.3129243850708, "learning_rate": 4.4256878574897375e-07, "logits/chosen": 1.875, "logits/rejected": 2.59375, "logps/chosen": -185.0, "logps/rejected": -155.0, "loss": 0.6421, "rewards/accuracies": 0.5, "rewards/chosen": -0.5546875, "rewards/margins": 0.016357421875, "rewards/rejected": -0.5703125, "step": 354 }, { "epoch": 0.7430664573521716, "grad_norm": 9.82224178314209, "learning_rate": 4.4220563086531347e-07, "logits/chosen": 2.71875, "logits/rejected": 2.5625, "logps/chosen": -452.0, "logps/rejected": -472.0, "loss": 0.6432, "rewards/accuracies": 0.75, "rewards/chosen": -0.64453125, "rewards/margins": 0.298828125, "rewards/rejected": -0.9453125, "step": 355 }, { "epoch": 0.7451596023024595, "grad_norm": 7.942782878875732, "learning_rate": 4.418414814426593e-07, "logits/chosen": 1.5234375, "logits/rejected": 1.890625, "logps/chosen": -236.0, "logps/rejected": -236.0, "loss": 0.6293, "rewards/accuracies": 0.75, "rewards/chosen": -0.7265625, "rewards/margins": 0.154296875, "rewards/rejected": -0.8828125, "step": 356 }, { "epoch": 0.7472527472527473, "grad_norm": 9.060127258300781, "learning_rate": 4.414763393654924e-07, "logits/chosen": 2.796875, "logits/rejected": 3.078125, "logps/chosen": -324.0, "logps/rejected": -340.0, "loss": 0.6477, "rewards/accuracies": 0.5, "rewards/chosen": -0.76171875, "rewards/margins": 0.2099609375, "rewards/rejected": -0.97265625, "step": 357 }, { "epoch": 0.749345892203035, "grad_norm": 9.260727882385254, "learning_rate": 4.4111020652343117e-07, "logits/chosen": 2.71875, "logits/rejected": 3.21875, "logps/chosen": -564.0, "logps/rejected": -370.0, "loss": 0.6309, "rewards/accuracies": 0.5, "rewards/chosen": -0.5546875, "rewards/margins": 0.04541015625, "rewards/rejected": -0.59765625, "step": 358 }, { "epoch": 0.7514390371533228, "grad_norm": 10.070478439331055, "learning_rate": 4.4074308481122106e-07, "logits/chosen": 1.953125, "logits/rejected": 2.625, "logps/chosen": -418.0, "logps/rejected": -452.0, "loss": 0.6358, "rewards/accuracies": 0.25, "rewards/chosen": -0.73828125, "rewards/margins": -0.046875, "rewards/rejected": -0.69140625, "step": 359 }, { "epoch": 0.7535321821036107, "grad_norm": 9.476323127746582, "learning_rate": 4.4037497612872504e-07, "logits/chosen": 2.15625, "logits/rejected": 2.125, "logps/chosen": -174.0, "logps/rejected": -528.0, "loss": 0.6452, "rewards/accuracies": 0.75, "rewards/chosen": -0.6875, "rewards/margins": 0.859375, "rewards/rejected": -1.546875, "step": 360 }, { "epoch": 0.7556253270538985, "grad_norm": 9.520855903625488, "learning_rate": 4.4000588238091365e-07, "logits/chosen": 2.328125, "logits/rejected": 2.1875, "logps/chosen": -184.0, "logps/rejected": -245.0, "loss": 0.6271, "rewards/accuracies": 1.0, "rewards/chosen": -0.484375, "rewards/margins": 0.224609375, "rewards/rejected": -0.7109375, "step": 361 }, { "epoch": 0.7577184720041863, "grad_norm": 9.436513900756836, "learning_rate": 4.3963580547785513e-07, "logits/chosen": 2.515625, "logits/rejected": 2.953125, "logps/chosen": -560.0, "logps/rejected": -330.0, "loss": 0.6483, "rewards/accuracies": 0.75, "rewards/chosen": -0.97265625, "rewards/margins": 0.0458984375, "rewards/rejected": -1.015625, "step": 362 }, { "epoch": 0.7598116169544741, "grad_norm": 9.640151023864746, "learning_rate": 4.3926474733470554e-07, "logits/chosen": 2.796875, "logits/rejected": 3.53125, "logps/chosen": -636.0, "logps/rejected": -494.0, "loss": 0.6107, "rewards/accuracies": 0.75, "rewards/chosen": -0.53515625, "rewards/margins": 0.21875, "rewards/rejected": -0.75390625, "step": 363 }, { "epoch": 0.7619047619047619, "grad_norm": 10.11239242553711, "learning_rate": 4.3889270987169904e-07, "logits/chosen": 2.09375, "logits/rejected": 2.59375, "logps/chosen": -382.0, "logps/rejected": -460.0, "loss": 0.6359, "rewards/accuracies": 0.75, "rewards/chosen": -0.78515625, "rewards/margins": 0.70703125, "rewards/rejected": -1.4921875, "step": 364 }, { "epoch": 0.7639979068550498, "grad_norm": 10.607931137084961, "learning_rate": 4.385196950141377e-07, "logits/chosen": 2.09375, "logits/rejected": 1.953125, "logps/chosen": -348.0, "logps/rejected": -264.0, "loss": 0.6619, "rewards/accuracies": 0.5, "rewards/chosen": -0.54296875, "rewards/margins": 0.171875, "rewards/rejected": -0.71484375, "step": 365 }, { "epoch": 0.7660910518053375, "grad_norm": 10.365743637084961, "learning_rate": 4.381457046923815e-07, "logits/chosen": 2.109375, "logits/rejected": 2.34375, "logps/chosen": -500.0, "logps/rejected": -482.0, "loss": 0.6543, "rewards/accuracies": 0.75, "rewards/chosen": -1.078125, "rewards/margins": 0.134765625, "rewards/rejected": -1.2109375, "step": 366 }, { "epoch": 0.7681841967556253, "grad_norm": 9.481950759887695, "learning_rate": 4.377707408418387e-07, "logits/chosen": 2.09375, "logits/rejected": 2.65625, "logps/chosen": -452.0, "logps/rejected": -312.0, "loss": 0.6312, "rewards/accuracies": 0.5, "rewards/chosen": -0.90234375, "rewards/margins": 0.08203125, "rewards/rejected": -0.984375, "step": 367 }, { "epoch": 0.7702773417059131, "grad_norm": 10.416890144348145, "learning_rate": 4.373948054029554e-07, "logits/chosen": 2.765625, "logits/rejected": 3.4375, "logps/chosen": -820.0, "logps/rejected": -780.0, "loss": 0.5978, "rewards/accuracies": 1.0, "rewards/chosen": -0.349609375, "rewards/margins": 0.51953125, "rewards/rejected": -0.8671875, "step": 368 }, { "epoch": 0.7723704866562009, "grad_norm": 10.182770729064941, "learning_rate": 4.3701790032120584e-07, "logits/chosen": 2.421875, "logits/rejected": 2.484375, "logps/chosen": -452.0, "logps/rejected": -548.0, "loss": 0.6289, "rewards/accuracies": 0.5, "rewards/chosen": -0.9453125, "rewards/margins": 0.1630859375, "rewards/rejected": -1.109375, "step": 369 }, { "epoch": 0.7744636316064888, "grad_norm": 8.755770683288574, "learning_rate": 4.3664002754708203e-07, "logits/chosen": 2.375, "logits/rejected": 2.875, "logps/chosen": -408.0, "logps/rejected": -362.0, "loss": 0.618, "rewards/accuracies": 0.75, "rewards/chosen": -0.9375, "rewards/margins": -0.022216796875, "rewards/rejected": -0.91796875, "step": 370 }, { "epoch": 0.7765567765567766, "grad_norm": 10.320544242858887, "learning_rate": 4.362611890360839e-07, "logits/chosen": 2.6875, "logits/rejected": 3.171875, "logps/chosen": -450.0, "logps/rejected": -408.0, "loss": 0.6703, "rewards/accuracies": 0.75, "rewards/chosen": -0.69140625, "rewards/margins": -0.0234375, "rewards/rejected": -0.66796875, "step": 371 }, { "epoch": 0.7786499215070644, "grad_norm": 9.803793907165527, "learning_rate": 4.358813867487092e-07, "logits/chosen": 2.421875, "logits/rejected": 2.234375, "logps/chosen": -1004.0, "logps/rejected": -540.0, "loss": 0.6332, "rewards/accuracies": 0.75, "rewards/chosen": -0.64453125, "rewards/margins": 0.271484375, "rewards/rejected": -0.9140625, "step": 372 }, { "epoch": 0.7807430664573521, "grad_norm": 9.712671279907227, "learning_rate": 4.3550062265044304e-07, "logits/chosen": 1.9296875, "logits/rejected": 1.6875, "logps/chosen": -660.0, "logps/rejected": -508.0, "loss": 0.6387, "rewards/accuracies": 0.5, "rewards/chosen": -0.5625, "rewards/margins": 0.0849609375, "rewards/rejected": -0.6484375, "step": 373 }, { "epoch": 0.7828362114076399, "grad_norm": 9.95979118347168, "learning_rate": 4.351188987117479e-07, "logits/chosen": 2.953125, "logits/rejected": 3.15625, "logps/chosen": -648.0, "logps/rejected": -536.0, "loss": 0.6454, "rewards/accuracies": 1.0, "rewards/chosen": -0.65234375, "rewards/margins": 0.8828125, "rewards/rejected": -1.53125, "step": 374 }, { "epoch": 0.7849293563579278, "grad_norm": 9.249582290649414, "learning_rate": 4.3473621690805376e-07, "logits/chosen": 2.625, "logits/rejected": 2.96875, "logps/chosen": -328.0, "logps/rejected": -193.0, "loss": 0.6431, "rewards/accuracies": 0.5, "rewards/chosen": -0.77734375, "rewards/margins": 0.07373046875, "rewards/rejected": -0.8515625, "step": 375 }, { "epoch": 0.7870225013082156, "grad_norm": 9.266115188598633, "learning_rate": 4.343525792197472e-07, "logits/chosen": 2.203125, "logits/rejected": 2.109375, "logps/chosen": -266.0, "logps/rejected": -330.0, "loss": 0.6174, "rewards/accuracies": 0.5, "rewards/chosen": -0.84375, "rewards/margins": 0.578125, "rewards/rejected": -1.421875, "step": 376 }, { "epoch": 0.7891156462585034, "grad_norm": 9.573073387145996, "learning_rate": 4.339679876321619e-07, "logits/chosen": 2.8125, "logits/rejected": 3.0625, "logps/chosen": -700.0, "logps/rejected": -494.0, "loss": 0.6442, "rewards/accuracies": 0.75, "rewards/chosen": -0.8359375, "rewards/margins": 0.4453125, "rewards/rejected": -1.28125, "step": 377 }, { "epoch": 0.7912087912087912, "grad_norm": 9.77106761932373, "learning_rate": 4.335824441355677e-07, "logits/chosen": 1.8203125, "logits/rejected": 2.453125, "logps/chosen": -624.0, "logps/rejected": -376.0, "loss": 0.6366, "rewards/accuracies": 0.25, "rewards/chosen": -0.78125, "rewards/margins": -0.1123046875, "rewards/rejected": -0.66796875, "step": 378 }, { "epoch": 0.7933019361590791, "grad_norm": 9.449440002441406, "learning_rate": 4.331959507251606e-07, "logits/chosen": 2.09375, "logits/rejected": 2.328125, "logps/chosen": -162.0, "logps/rejected": -162.0, "loss": 0.6182, "rewards/accuracies": 0.75, "rewards/chosen": -0.5, "rewards/margins": 0.0927734375, "rewards/rejected": -0.59375, "step": 379 }, { "epoch": 0.7953950811093669, "grad_norm": 9.60571575164795, "learning_rate": 4.3280850940105243e-07, "logits/chosen": 3.015625, "logits/rejected": 2.53125, "logps/chosen": -418.0, "logps/rejected": -506.0, "loss": 0.6176, "rewards/accuracies": 1.0, "rewards/chosen": -0.474609375, "rewards/margins": 0.71484375, "rewards/rejected": -1.1875, "step": 380 }, { "epoch": 0.7974882260596546, "grad_norm": 9.75421142578125, "learning_rate": 4.3242012216826084e-07, "logits/chosen": 2.65625, "logits/rejected": 2.578125, "logps/chosen": -418.0, "logps/rejected": -344.0, "loss": 0.6361, "rewards/accuracies": 0.75, "rewards/chosen": -0.4609375, "rewards/margins": 0.234375, "rewards/rejected": -0.6953125, "step": 381 }, { "epoch": 0.7995813710099424, "grad_norm": 9.220489501953125, "learning_rate": 4.3203079103669807e-07, "logits/chosen": 2.125, "logits/rejected": 1.9453125, "logps/chosen": -171.0, "logps/rejected": -264.0, "loss": 0.5963, "rewards/accuracies": 0.75, "rewards/chosen": -0.8125, "rewards/margins": 0.17578125, "rewards/rejected": -0.98828125, "step": 382 }, { "epoch": 0.8016745159602302, "grad_norm": 9.26289176940918, "learning_rate": 4.316405180211615e-07, "logits/chosen": 2.6875, "logits/rejected": 2.296875, "logps/chosen": -334.0, "logps/rejected": -520.0, "loss": 0.6377, "rewards/accuracies": 0.25, "rewards/chosen": -0.7265625, "rewards/margins": 0.0205078125, "rewards/rejected": -0.75, "step": 383 }, { "epoch": 0.8037676609105181, "grad_norm": 10.633591651916504, "learning_rate": 4.312493051413224e-07, "logits/chosen": 2.78125, "logits/rejected": 2.6875, "logps/chosen": -372.0, "logps/rejected": -342.0, "loss": 0.6234, "rewards/accuracies": 1.0, "rewards/chosen": -0.69140625, "rewards/margins": 0.337890625, "rewards/rejected": -1.03125, "step": 384 }, { "epoch": 0.8058608058608059, "grad_norm": 9.378695487976074, "learning_rate": 4.308571544217161e-07, "logits/chosen": 2.8125, "logits/rejected": 3.59375, "logps/chosen": -592.0, "logps/rejected": -592.0, "loss": 0.6395, "rewards/accuracies": 0.0, "rewards/chosen": -1.1796875, "rewards/margins": -0.185546875, "rewards/rejected": -0.99609375, "step": 385 }, { "epoch": 0.8079539508110937, "grad_norm": 10.046592712402344, "learning_rate": 4.3046406789173123e-07, "logits/chosen": 2.3125, "logits/rejected": 2.21875, "logps/chosen": -572.0, "logps/rejected": -560.0, "loss": 0.6069, "rewards/accuracies": 0.5, "rewards/chosen": -1.0078125, "rewards/margins": 0.0029296875, "rewards/rejected": -1.015625, "step": 386 }, { "epoch": 0.8100470957613815, "grad_norm": 9.885677337646484, "learning_rate": 4.300700475855992e-07, "logits/chosen": 2.5625, "logits/rejected": 2.78125, "logps/chosen": -274.0, "logps/rejected": -199.0, "loss": 0.6354, "rewards/accuracies": 0.5, "rewards/chosen": -0.95703125, "rewards/margins": -0.248046875, "rewards/rejected": -0.7109375, "step": 387 }, { "epoch": 0.8121402407116692, "grad_norm": 10.309823036193848, "learning_rate": 4.296750955423837e-07, "logits/chosen": 3.34375, "logits/rejected": 2.859375, "logps/chosen": -580.0, "logps/rejected": -672.0, "loss": 0.6669, "rewards/accuracies": 0.5, "rewards/chosen": -0.80859375, "rewards/margins": -0.0576171875, "rewards/rejected": -0.75, "step": 388 }, { "epoch": 0.8142333856619571, "grad_norm": 9.416850090026855, "learning_rate": 4.2927921380597037e-07, "logits/chosen": 2.84375, "logits/rejected": 2.828125, "logps/chosen": -240.0, "logps/rejected": -256.0, "loss": 0.6156, "rewards/accuracies": 0.5, "rewards/chosen": -0.75, "rewards/margins": -0.09375, "rewards/rejected": -0.65625, "step": 389 }, { "epoch": 0.8163265306122449, "grad_norm": 9.89782428741455, "learning_rate": 4.288824044250558e-07, "logits/chosen": 2.875, "logits/rejected": 2.765625, "logps/chosen": -516.0, "logps/rejected": -716.0, "loss": 0.609, "rewards/accuracies": 0.75, "rewards/chosen": -0.59765625, "rewards/margins": 0.7734375, "rewards/rejected": -1.375, "step": 390 }, { "epoch": 0.8184196755625327, "grad_norm": 8.746621131896973, "learning_rate": 4.284846694531373e-07, "logits/chosen": 1.921875, "logits/rejected": 2.40625, "logps/chosen": -260.0, "logps/rejected": -193.0, "loss": 0.6066, "rewards/accuracies": 0.75, "rewards/chosen": -0.98828125, "rewards/margins": 0.087890625, "rewards/rejected": -1.078125, "step": 391 }, { "epoch": 0.8205128205128205, "grad_norm": 9.77097225189209, "learning_rate": 4.2808601094850214e-07, "logits/chosen": 2.59375, "logits/rejected": 3.53125, "logps/chosen": -792.0, "logps/rejected": -470.0, "loss": 0.575, "rewards/accuracies": 0.75, "rewards/chosen": -1.03125, "rewards/margins": 0.1416015625, "rewards/rejected": -1.171875, "step": 392 }, { "epoch": 0.8226059654631083, "grad_norm": 9.895403861999512, "learning_rate": 4.276864309742169e-07, "logits/chosen": 2.328125, "logits/rejected": 2.21875, "logps/chosen": -572.0, "logps/rejected": -468.0, "loss": 0.658, "rewards/accuracies": 0.5, "rewards/chosen": -1.0390625, "rewards/margins": 0.046875, "rewards/rejected": -1.0859375, "step": 393 }, { "epoch": 0.8246991104133962, "grad_norm": 10.531278610229492, "learning_rate": 4.2728593159811667e-07, "logits/chosen": 2.140625, "logits/rejected": 2.65625, "logps/chosen": -412.0, "logps/rejected": -255.0, "loss": 0.6505, "rewards/accuracies": 0.75, "rewards/chosen": -0.6796875, "rewards/margins": 0.2109375, "rewards/rejected": -0.890625, "step": 394 }, { "epoch": 0.826792255363684, "grad_norm": 9.876930236816406, "learning_rate": 4.268845148927945e-07, "logits/chosen": 1.7890625, "logits/rejected": 1.78125, "logps/chosen": -536.0, "logps/rejected": -672.0, "loss": 0.6202, "rewards/accuracies": 1.0, "rewards/chosen": -0.8515625, "rewards/margins": 0.87109375, "rewards/rejected": -1.7265625, "step": 395 }, { "epoch": 0.8288854003139717, "grad_norm": 9.344927787780762, "learning_rate": 4.264821829355908e-07, "logits/chosen": 2.734375, "logits/rejected": 3.3125, "logps/chosen": -564.0, "logps/rejected": -410.0, "loss": 0.6163, "rewards/accuracies": 0.75, "rewards/chosen": -0.6953125, "rewards/margins": 0.359375, "rewards/rejected": -1.0546875, "step": 396 }, { "epoch": 0.8309785452642595, "grad_norm": 11.048480987548828, "learning_rate": 4.260789378085821e-07, "logits/chosen": 1.890625, "logits/rejected": 1.8671875, "logps/chosen": -238.0, "logps/rejected": -228.0, "loss": 0.6577, "rewards/accuracies": 0.5, "rewards/chosen": -0.890625, "rewards/margins": 0.0693359375, "rewards/rejected": -0.9609375, "step": 397 }, { "epoch": 0.8330716902145474, "grad_norm": 10.133160591125488, "learning_rate": 4.2567478159857087e-07, "logits/chosen": 3.203125, "logits/rejected": 3.390625, "logps/chosen": -640.0, "logps/rejected": -608.0, "loss": 0.6498, "rewards/accuracies": 0.75, "rewards/chosen": -0.80078125, "rewards/margins": 0.349609375, "rewards/rejected": -1.1484375, "step": 398 }, { "epoch": 0.8351648351648352, "grad_norm": 10.660299301147461, "learning_rate": 4.2526971639707456e-07, "logits/chosen": 3.046875, "logits/rejected": 3.8125, "logps/chosen": -664.0, "logps/rejected": -506.0, "loss": 0.6444, "rewards/accuracies": 0.5, "rewards/chosen": -1.046875, "rewards/margins": -0.0458984375, "rewards/rejected": -1.0, "step": 399 }, { "epoch": 0.837257980115123, "grad_norm": 9.645748138427734, "learning_rate": 4.248637443003144e-07, "logits/chosen": 2.203125, "logits/rejected": 2.515625, "logps/chosen": -302.0, "logps/rejected": -219.0, "loss": 0.6131, "rewards/accuracies": 0.5, "rewards/chosen": -0.69140625, "rewards/margins": 0.06103515625, "rewards/rejected": -0.75390625, "step": 400 }, { "epoch": 0.8393511250654108, "grad_norm": 10.709028244018555, "learning_rate": 4.2445686740920484e-07, "logits/chosen": 3.25, "logits/rejected": 2.9375, "logps/chosen": -450.0, "logps/rejected": -480.0, "loss": 0.6438, "rewards/accuracies": 0.5, "rewards/chosen": -1.046875, "rewards/margins": -0.052734375, "rewards/rejected": -0.9921875, "step": 401 }, { "epoch": 0.8414442700156985, "grad_norm": 9.625758171081543, "learning_rate": 4.240490878293428e-07, "logits/chosen": 2.1875, "logits/rejected": 2.734375, "logps/chosen": -296.0, "logps/rejected": -207.0, "loss": 0.625, "rewards/accuracies": 0.0, "rewards/chosen": -1.1328125, "rewards/margins": -0.181640625, "rewards/rejected": -0.953125, "step": 402 }, { "epoch": 0.8435374149659864, "grad_norm": 9.67353630065918, "learning_rate": 4.236404076709967e-07, "logits/chosen": 1.734375, "logits/rejected": 2.25, "logps/chosen": -338.0, "logps/rejected": -430.0, "loss": 0.5896, "rewards/accuracies": 0.75, "rewards/chosen": -0.875, "rewards/margins": 0.40234375, "rewards/rejected": -1.28125, "step": 403 }, { "epoch": 0.8456305599162742, "grad_norm": 10.118279457092285, "learning_rate": 4.232308290490952e-07, "logits/chosen": 2.40625, "logits/rejected": 2.75, "logps/chosen": -808.0, "logps/rejected": -692.0, "loss": 0.6401, "rewards/accuracies": 0.5, "rewards/chosen": -0.77734375, "rewards/margins": 0.42578125, "rewards/rejected": -1.203125, "step": 404 }, { "epoch": 0.847723704866562, "grad_norm": 9.055681228637695, "learning_rate": 4.2282035408321663e-07, "logits/chosen": 2.484375, "logits/rejected": 3.046875, "logps/chosen": -600.0, "logps/rejected": -684.0, "loss": 0.6226, "rewards/accuracies": 0.5, "rewards/chosen": -1.0234375, "rewards/margins": -0.005859375, "rewards/rejected": -1.015625, "step": 405 }, { "epoch": 0.8498168498168498, "grad_norm": 10.447412490844727, "learning_rate": 4.2240898489757816e-07, "logits/chosen": 1.671875, "logits/rejected": 1.6875, "logps/chosen": -394.0, "logps/rejected": -314.0, "loss": 0.6685, "rewards/accuracies": 1.0, "rewards/chosen": -0.703125, "rewards/margins": 0.59375, "rewards/rejected": -1.296875, "step": 406 }, { "epoch": 0.8519099947671376, "grad_norm": 9.07568359375, "learning_rate": 4.2199672362102435e-07, "logits/chosen": 1.828125, "logits/rejected": 2.546875, "logps/chosen": -416.0, "logps/rejected": -328.0, "loss": 0.6175, "rewards/accuracies": 0.25, "rewards/chosen": -1.015625, "rewards/margins": 0.01171875, "rewards/rejected": -1.03125, "step": 407 }, { "epoch": 0.8540031397174255, "grad_norm": 11.152243614196777, "learning_rate": 4.215835723870162e-07, "logits/chosen": 2.421875, "logits/rejected": 2.421875, "logps/chosen": -330.0, "logps/rejected": -608.0, "loss": 0.6663, "rewards/accuracies": 1.0, "rewards/chosen": -0.69140625, "rewards/margins": 0.5, "rewards/rejected": -1.1875, "step": 408 }, { "epoch": 0.8560962846677133, "grad_norm": 9.441628456115723, "learning_rate": 4.211695333336206e-07, "logits/chosen": 3.328125, "logits/rejected": 2.9375, "logps/chosen": -648.0, "logps/rejected": -528.0, "loss": 0.6234, "rewards/accuracies": 0.5, "rewards/chosen": -1.0546875, "rewards/margins": 0.046875, "rewards/rejected": -1.1015625, "step": 409 }, { "epoch": 0.858189429618001, "grad_norm": 9.586783409118652, "learning_rate": 4.207546086034987e-07, "logits/chosen": 2.6875, "logits/rejected": 2.234375, "logps/chosen": -588.0, "logps/rejected": -892.0, "loss": 0.6212, "rewards/accuracies": 1.0, "rewards/chosen": -0.62109375, "rewards/margins": 0.38671875, "rewards/rejected": -1.0078125, "step": 410 }, { "epoch": 0.8602825745682888, "grad_norm": 9.743223190307617, "learning_rate": 4.203388003438951e-07, "logits/chosen": 1.4921875, "logits/rejected": 1.625, "logps/chosen": -264.0, "logps/rejected": -245.0, "loss": 0.6136, "rewards/accuracies": 1.0, "rewards/chosen": -0.7578125, "rewards/margins": 0.255859375, "rewards/rejected": -1.015625, "step": 411 }, { "epoch": 0.8623757195185766, "grad_norm": 10.374858856201172, "learning_rate": 4.1992211070662686e-07, "logits/chosen": 1.3984375, "logits/rejected": 1.1015625, "logps/chosen": -372.0, "logps/rejected": -488.0, "loss": 0.6324, "rewards/accuracies": 0.5, "rewards/chosen": -0.9921875, "rewards/margins": 0.0126953125, "rewards/rejected": -1.0078125, "step": 412 }, { "epoch": 0.8644688644688645, "grad_norm": 9.237102508544922, "learning_rate": 4.195045418480717e-07, "logits/chosen": 3.03125, "logits/rejected": 3.09375, "logps/chosen": -416.0, "logps/rejected": -434.0, "loss": 0.614, "rewards/accuracies": 0.25, "rewards/chosen": -1.0234375, "rewards/margins": -0.2109375, "rewards/rejected": -0.8125, "step": 413 }, { "epoch": 0.8665620094191523, "grad_norm": 10.228826522827148, "learning_rate": 4.19086095929158e-07, "logits/chosen": 2.3125, "logits/rejected": 2.28125, "logps/chosen": -360.0, "logps/rejected": -368.0, "loss": 0.6384, "rewards/accuracies": 1.0, "rewards/chosen": -0.890625, "rewards/margins": 0.1767578125, "rewards/rejected": -1.0625, "step": 414 }, { "epoch": 0.8686551543694401, "grad_norm": 9.450304985046387, "learning_rate": 4.1866677511535237e-07, "logits/chosen": 1.953125, "logits/rejected": 1.4609375, "logps/chosen": -154.0, "logps/rejected": -292.0, "loss": 0.6115, "rewards/accuracies": 0.75, "rewards/chosen": -0.91796875, "rewards/margins": 0.35546875, "rewards/rejected": -1.2734375, "step": 415 }, { "epoch": 0.8707482993197279, "grad_norm": 10.266785621643066, "learning_rate": 4.1824658157664935e-07, "logits/chosen": 2.359375, "logits/rejected": 2.140625, "logps/chosen": -400.0, "logps/rejected": -490.0, "loss": 0.6201, "rewards/accuracies": 0.75, "rewards/chosen": -1.1875, "rewards/margins": 0.1982421875, "rewards/rejected": -1.3828125, "step": 416 }, { "epoch": 0.8728414442700158, "grad_norm": 11.08837890625, "learning_rate": 4.1782551748755954e-07, "logits/chosen": 1.5, "logits/rejected": 2.1875, "logps/chosen": -350.0, "logps/rejected": -278.0, "loss": 0.6378, "rewards/accuracies": 0.5, "rewards/chosen": -0.890625, "rewards/margins": 0.1796875, "rewards/rejected": -1.0703125, "step": 417 }, { "epoch": 0.8749345892203035, "grad_norm": 9.815693855285645, "learning_rate": 4.174035850270993e-07, "logits/chosen": 3.1875, "logits/rejected": 3.109375, "logps/chosen": -792.0, "logps/rejected": -420.0, "loss": 0.6368, "rewards/accuracies": 0.25, "rewards/chosen": -1.28125, "rewards/margins": -0.50390625, "rewards/rejected": -0.77734375, "step": 418 }, { "epoch": 0.8770277341705913, "grad_norm": 9.853411674499512, "learning_rate": 4.1698078637877795e-07, "logits/chosen": 2.78125, "logits/rejected": 2.796875, "logps/chosen": -928.0, "logps/rejected": -800.0, "loss": 0.64, "rewards/accuracies": 0.75, "rewards/chosen": -0.7265625, "rewards/margins": 0.31640625, "rewards/rejected": -1.046875, "step": 419 }, { "epoch": 0.8791208791208791, "grad_norm": 8.987032890319824, "learning_rate": 4.165571237305881e-07, "logits/chosen": 1.7890625, "logits/rejected": 2.046875, "logps/chosen": -312.0, "logps/rejected": -196.0, "loss": 0.618, "rewards/accuracies": 0.5, "rewards/chosen": -0.96484375, "rewards/margins": -0.1591796875, "rewards/rejected": -0.8046875, "step": 420 }, { "epoch": 0.8812140240711669, "grad_norm": 9.354534149169922, "learning_rate": 4.161325992749931e-07, "logits/chosen": 2.34375, "logits/rejected": 2.375, "logps/chosen": -840.0, "logps/rejected": -496.0, "loss": 0.6295, "rewards/accuracies": 0.75, "rewards/chosen": -0.921875, "rewards/margins": 0.30859375, "rewards/rejected": -1.2265625, "step": 421 }, { "epoch": 0.8833071690214548, "grad_norm": 9.199723243713379, "learning_rate": 4.1570721520891646e-07, "logits/chosen": 3.453125, "logits/rejected": 3.078125, "logps/chosen": -688.0, "logps/rejected": -792.0, "loss": 0.6236, "rewards/accuracies": 0.25, "rewards/chosen": -0.9296875, "rewards/margins": -0.1298828125, "rewards/rejected": -0.80078125, "step": 422 }, { "epoch": 0.8854003139717426, "grad_norm": 9.82619571685791, "learning_rate": 4.1528097373373e-07, "logits/chosen": 1.921875, "logits/rejected": 2.125, "logps/chosen": -616.0, "logps/rejected": -524.0, "loss": 0.5998, "rewards/accuracies": 0.5, "rewards/chosen": -0.85546875, "rewards/margins": 0.0830078125, "rewards/rejected": -0.9375, "step": 423 }, { "epoch": 0.8874934589220304, "grad_norm": 9.572153091430664, "learning_rate": 4.1485387705524277e-07, "logits/chosen": 1.8671875, "logits/rejected": 1.65625, "logps/chosen": -476.0, "logps/rejected": -508.0, "loss": 0.6332, "rewards/accuracies": 0.75, "rewards/chosen": -0.83203125, "rewards/margins": 0.328125, "rewards/rejected": -1.15625, "step": 424 }, { "epoch": 0.8895866038723181, "grad_norm": 9.886564254760742, "learning_rate": 4.144259273836896e-07, "logits/chosen": 2.609375, "logits/rejected": 2.921875, "logps/chosen": -442.0, "logps/rejected": -296.0, "loss": 0.6237, "rewards/accuracies": 0.5, "rewards/chosen": -0.78125, "rewards/margins": 0.0859375, "rewards/rejected": -0.8671875, "step": 425 }, { "epoch": 0.8916797488226059, "grad_norm": 9.998958587646484, "learning_rate": 4.139971269337192e-07, "logits/chosen": 2.640625, "logits/rejected": 2.6875, "logps/chosen": -360.0, "logps/rejected": -416.0, "loss": 0.6264, "rewards/accuracies": 0.5, "rewards/chosen": -0.87890625, "rewards/margins": 0.0615234375, "rewards/rejected": -0.94140625, "step": 426 }, { "epoch": 0.8937728937728938, "grad_norm": 10.484004974365234, "learning_rate": 4.135674779243835e-07, "logits/chosen": 2.65625, "logits/rejected": 2.6875, "logps/chosen": -270.0, "logps/rejected": -440.0, "loss": 0.5778, "rewards/accuracies": 1.0, "rewards/chosen": -0.7109375, "rewards/margins": 0.453125, "rewards/rejected": -1.1640625, "step": 427 }, { "epoch": 0.8958660387231816, "grad_norm": 9.723721504211426, "learning_rate": 4.131369825791256e-07, "logits/chosen": 2.515625, "logits/rejected": 2.59375, "logps/chosen": -564.0, "logps/rejected": -488.0, "loss": 0.5828, "rewards/accuracies": 1.0, "rewards/chosen": -0.8671875, "rewards/margins": 0.4765625, "rewards/rejected": -1.34375, "step": 428 }, { "epoch": 0.8979591836734694, "grad_norm": 10.536715507507324, "learning_rate": 4.127056431257683e-07, "logits/chosen": 1.703125, "logits/rejected": 2.125, "logps/chosen": -334.0, "logps/rejected": -241.0, "loss": 0.6463, "rewards/accuracies": 0.25, "rewards/chosen": -0.87109375, "rewards/margins": -0.0751953125, "rewards/rejected": -0.796875, "step": 429 }, { "epoch": 0.9000523286237572, "grad_norm": 10.433703422546387, "learning_rate": 4.1227346179650286e-07, "logits/chosen": 3.0, "logits/rejected": 2.9375, "logps/chosen": -540.0, "logps/rejected": -720.0, "loss": 0.6252, "rewards/accuracies": 0.75, "rewards/chosen": -0.921875, "rewards/margins": 0.169921875, "rewards/rejected": -1.09375, "step": 430 }, { "epoch": 0.902145473574045, "grad_norm": 10.272560119628906, "learning_rate": 4.118404408278771e-07, "logits/chosen": 3.28125, "logits/rejected": 2.03125, "logps/chosen": -442.0, "logps/rejected": -588.0, "loss": 0.6508, "rewards/accuracies": 0.25, "rewards/chosen": -0.96484375, "rewards/margins": -0.119140625, "rewards/rejected": -0.84375, "step": 431 }, { "epoch": 0.9042386185243328, "grad_norm": 9.89719009399414, "learning_rate": 4.11406582460784e-07, "logits/chosen": 2.96875, "logits/rejected": 2.40625, "logps/chosen": -924.0, "logps/rejected": -624.0, "loss": 0.6147, "rewards/accuracies": 0.5, "rewards/chosen": -1.0, "rewards/margins": 0.052734375, "rewards/rejected": -1.0546875, "step": 432 }, { "epoch": 0.9063317634746206, "grad_norm": 9.447464942932129, "learning_rate": 4.109718889404503e-07, "logits/chosen": 1.109375, "logits/rejected": 1.0078125, "logps/chosen": -214.0, "logps/rejected": -262.0, "loss": 0.636, "rewards/accuracies": 0.5, "rewards/chosen": -1.046875, "rewards/margins": -0.0390625, "rewards/rejected": -1.0078125, "step": 433 }, { "epoch": 0.9084249084249084, "grad_norm": 10.362798690795898, "learning_rate": 4.1053636251642456e-07, "logits/chosen": 1.84375, "logits/rejected": 2.703125, "logps/chosen": -552.0, "logps/rejected": -420.0, "loss": 0.6307, "rewards/accuracies": 0.5, "rewards/chosen": -1.53125, "rewards/margins": -0.240234375, "rewards/rejected": -1.296875, "step": 434 }, { "epoch": 0.9105180533751962, "grad_norm": 9.440129280090332, "learning_rate": 4.1010000544256536e-07, "logits/chosen": 1.546875, "logits/rejected": 1.4453125, "logps/chosen": -456.0, "logps/rejected": -390.0, "loss": 0.6167, "rewards/accuracies": 0.5, "rewards/chosen": -1.0078125, "rewards/margins": 0.08984375, "rewards/rejected": -1.1015625, "step": 435 }, { "epoch": 0.912611198325484, "grad_norm": 10.998126983642578, "learning_rate": 4.096628199770304e-07, "logits/chosen": 2.71875, "logits/rejected": 2.671875, "logps/chosen": -696.0, "logps/rejected": -716.0, "loss": 0.6675, "rewards/accuracies": 0.75, "rewards/chosen": -0.7421875, "rewards/margins": 0.390625, "rewards/rejected": -1.1328125, "step": 436 }, { "epoch": 0.9147043432757719, "grad_norm": 9.635455131530762, "learning_rate": 4.0922480838226394e-07, "logits/chosen": 3.328125, "logits/rejected": 3.125, "logps/chosen": -436.0, "logps/rejected": -812.0, "loss": 0.6078, "rewards/accuracies": 0.75, "rewards/chosen": -0.640625, "rewards/margins": 0.50390625, "rewards/rejected": -1.140625, "step": 437 }, { "epoch": 0.9167974882260597, "grad_norm": 10.190224647521973, "learning_rate": 4.0878597292498576e-07, "logits/chosen": 2.828125, "logits/rejected": 2.515625, "logps/chosen": -346.0, "logps/rejected": -524.0, "loss": 0.6652, "rewards/accuracies": 0.75, "rewards/chosen": -0.7421875, "rewards/margins": 0.27734375, "rewards/rejected": -1.015625, "step": 438 }, { "epoch": 0.9188906331763474, "grad_norm": 9.4074125289917, "learning_rate": 4.083463158761789e-07, "logits/chosen": 2.421875, "logits/rejected": 2.1875, "logps/chosen": -452.0, "logps/rejected": -502.0, "loss": 0.6276, "rewards/accuracies": 1.0, "rewards/chosen": -0.7890625, "rewards/margins": 0.328125, "rewards/rejected": -1.1171875, "step": 439 }, { "epoch": 0.9209837781266352, "grad_norm": 10.4319429397583, "learning_rate": 4.079058395110782e-07, "logits/chosen": 2.09375, "logits/rejected": 2.421875, "logps/chosen": -480.0, "logps/rejected": -376.0, "loss": 0.642, "rewards/accuracies": 0.25, "rewards/chosen": -1.0234375, "rewards/margins": 0.06396484375, "rewards/rejected": -1.0859375, "step": 440 }, { "epoch": 0.9230769230769231, "grad_norm": 10.221841812133789, "learning_rate": 4.074645461091587e-07, "logits/chosen": 2.5, "logits/rejected": 2.671875, "logps/chosen": -496.0, "logps/rejected": -440.0, "loss": 0.5748, "rewards/accuracies": 0.75, "rewards/chosen": -0.640625, "rewards/margins": 0.515625, "rewards/rejected": -1.15625, "step": 441 }, { "epoch": 0.9251700680272109, "grad_norm": 10.588394165039062, "learning_rate": 4.0702243795412343e-07, "logits/chosen": 2.734375, "logits/rejected": 3.453125, "logps/chosen": -608.0, "logps/rejected": -456.0, "loss": 0.6535, "rewards/accuracies": 0.75, "rewards/chosen": -1.2421875, "rewards/margins": 0.23046875, "rewards/rejected": -1.46875, "step": 442 }, { "epoch": 0.9272632129774987, "grad_norm": 9.824782371520996, "learning_rate": 4.065795173338918e-07, "logits/chosen": 1.8671875, "logits/rejected": 2.09375, "logps/chosen": -476.0, "logps/rejected": -476.0, "loss": 0.6246, "rewards/accuracies": 0.5, "rewards/chosen": -1.265625, "rewards/margins": -0.26171875, "rewards/rejected": -1.0078125, "step": 443 }, { "epoch": 0.9293563579277865, "grad_norm": 10.064045906066895, "learning_rate": 4.061357865405877e-07, "logits/chosen": 1.9609375, "logits/rejected": 2.171875, "logps/chosen": -400.0, "logps/rejected": -332.0, "loss": 0.6211, "rewards/accuracies": 0.75, "rewards/chosen": -0.86328125, "rewards/margins": 0.23828125, "rewards/rejected": -1.1015625, "step": 444 }, { "epoch": 0.9314495028780743, "grad_norm": 9.577927589416504, "learning_rate": 4.056912478705279e-07, "logits/chosen": 2.21875, "logits/rejected": 2.5625, "logps/chosen": -560.0, "logps/rejected": -342.0, "loss": 0.6292, "rewards/accuracies": 1.0, "rewards/chosen": -0.7734375, "rewards/margins": 0.3828125, "rewards/rejected": -1.15625, "step": 445 }, { "epoch": 0.9335426478283622, "grad_norm": 9.735443115234375, "learning_rate": 4.052459036242096e-07, "logits/chosen": 2.0, "logits/rejected": 2.0625, "logps/chosen": -496.0, "logps/rejected": -552.0, "loss": 0.6157, "rewards/accuracies": 1.0, "rewards/chosen": -0.78515625, "rewards/margins": 0.8359375, "rewards/rejected": -1.6171875, "step": 446 }, { "epoch": 0.9356357927786499, "grad_norm": 9.2816162109375, "learning_rate": 4.047997561062993e-07, "logits/chosen": 2.875, "logits/rejected": 2.78125, "logps/chosen": -494.0, "logps/rejected": -568.0, "loss": 0.6006, "rewards/accuracies": 0.25, "rewards/chosen": -1.0, "rewards/margins": -0.169921875, "rewards/rejected": -0.828125, "step": 447 }, { "epoch": 0.9377289377289377, "grad_norm": 10.253028869628906, "learning_rate": 4.0435280762562e-07, "logits/chosen": 1.7890625, "logits/rejected": 1.8671875, "logps/chosen": -278.0, "logps/rejected": -284.0, "loss": 0.6109, "rewards/accuracies": 0.75, "rewards/chosen": -0.703125, "rewards/margins": 0.2265625, "rewards/rejected": -0.9296875, "step": 448 }, { "epoch": 0.9398220826792255, "grad_norm": 9.290606498718262, "learning_rate": 4.039050604951401e-07, "logits/chosen": 2.40625, "logits/rejected": 2.78125, "logps/chosen": -592.0, "logps/rejected": -440.0, "loss": 0.6194, "rewards/accuracies": 0.75, "rewards/chosen": -0.67578125, "rewards/margins": 0.63671875, "rewards/rejected": -1.3125, "step": 449 }, { "epoch": 0.9419152276295133, "grad_norm": 10.120656967163086, "learning_rate": 4.0345651703196084e-07, "logits/chosen": 2.59375, "logits/rejected": 2.875, "logps/chosen": -430.0, "logps/rejected": -175.0, "loss": 0.6422, "rewards/accuracies": 0.5, "rewards/chosen": -0.59765625, "rewards/margins": 0.2314453125, "rewards/rejected": -0.83203125, "step": 450 }, { "epoch": 0.9440083725798012, "grad_norm": 10.762495994567871, "learning_rate": 4.030071795573044e-07, "logits/chosen": 3.375, "logits/rejected": 3.390625, "logps/chosen": -704.0, "logps/rejected": -584.0, "loss": 0.6472, "rewards/accuracies": 0.75, "rewards/chosen": -0.9453125, "rewards/margins": 0.32421875, "rewards/rejected": -1.2734375, "step": 451 }, { "epoch": 0.946101517530089, "grad_norm": 10.06574535369873, "learning_rate": 4.025570503965021e-07, "logits/chosen": 2.8125, "logits/rejected": 3.25, "logps/chosen": -820.0, "logps/rejected": -440.0, "loss": 0.6119, "rewards/accuracies": 0.75, "rewards/chosen": -1.21875, "rewards/margins": -0.150390625, "rewards/rejected": -1.0625, "step": 452 }, { "epoch": 0.9481946624803768, "grad_norm": 10.84588623046875, "learning_rate": 4.0210613187898243e-07, "logits/chosen": 2.359375, "logits/rejected": 2.484375, "logps/chosen": -528.0, "logps/rejected": -516.0, "loss": 0.6291, "rewards/accuracies": 1.0, "rewards/chosen": -0.9375, "rewards/margins": 0.478515625, "rewards/rejected": -1.4140625, "step": 453 }, { "epoch": 0.9502878074306645, "grad_norm": 10.217286109924316, "learning_rate": 4.016544263382585e-07, "logits/chosen": 3.09375, "logits/rejected": 3.5, "logps/chosen": -880.0, "logps/rejected": -652.0, "loss": 0.6135, "rewards/accuracies": 0.5, "rewards/chosen": -0.7421875, "rewards/margins": -0.02685546875, "rewards/rejected": -0.71484375, "step": 454 }, { "epoch": 0.9523809523809523, "grad_norm": 10.715296745300293, "learning_rate": 4.012019361119164e-07, "logits/chosen": 1.828125, "logits/rejected": 2.625, "logps/chosen": -320.0, "logps/rejected": -458.0, "loss": 0.6359, "rewards/accuracies": 1.0, "rewards/chosen": -0.6953125, "rewards/margins": 1.1796875, "rewards/rejected": -1.8671875, "step": 455 }, { "epoch": 0.9544740973312402, "grad_norm": 10.716880798339844, "learning_rate": 4.0074866354160304e-07, "logits/chosen": 2.15625, "logits/rejected": 3.0, "logps/chosen": -588.0, "logps/rejected": -360.0, "loss": 0.6413, "rewards/accuracies": 0.75, "rewards/chosen": -0.83203125, "rewards/margins": 0.306640625, "rewards/rejected": -1.140625, "step": 456 }, { "epoch": 0.956567242281528, "grad_norm": 9.964006423950195, "learning_rate": 4.00294610973014e-07, "logits/chosen": 2.46875, "logits/rejected": 2.375, "logps/chosen": -444.0, "logps/rejected": -604.0, "loss": 0.5844, "rewards/accuracies": 1.0, "rewards/chosen": -0.83984375, "rewards/margins": 0.515625, "rewards/rejected": -1.359375, "step": 457 }, { "epoch": 0.9586603872318158, "grad_norm": 9.94836711883545, "learning_rate": 3.998397807558813e-07, "logits/chosen": 2.328125, "logits/rejected": 2.5, "logps/chosen": -364.0, "logps/rejected": -388.0, "loss": 0.6311, "rewards/accuracies": 1.0, "rewards/chosen": -0.890625, "rewards/margins": 0.28125, "rewards/rejected": -1.171875, "step": 458 }, { "epoch": 0.9607535321821036, "grad_norm": 10.611427307128906, "learning_rate": 3.9938417524396124e-07, "logits/chosen": 1.84375, "logits/rejected": 2.328125, "logps/chosen": -500.0, "logps/rejected": -378.0, "loss": 0.6265, "rewards/accuracies": 0.75, "rewards/chosen": -1.171875, "rewards/margins": 0.3125, "rewards/rejected": -1.484375, "step": 459 }, { "epoch": 0.9628466771323915, "grad_norm": 10.26854133605957, "learning_rate": 3.9892779679502246e-07, "logits/chosen": 2.203125, "logits/rejected": 2.984375, "logps/chosen": -688.0, "logps/rejected": -672.0, "loss": 0.6392, "rewards/accuracies": 0.5, "rewards/chosen": -1.4140625, "rewards/margins": -0.40234375, "rewards/rejected": -1.015625, "step": 460 }, { "epoch": 0.9649398220826793, "grad_norm": 9.015223503112793, "learning_rate": 3.984706477708335e-07, "logits/chosen": 2.03125, "logits/rejected": 2.265625, "logps/chosen": -344.0, "logps/rejected": -278.0, "loss": 0.6155, "rewards/accuracies": 0.75, "rewards/chosen": -0.71875, "rewards/margins": 0.353515625, "rewards/rejected": -1.0703125, "step": 461 }, { "epoch": 0.967032967032967, "grad_norm": 9.67743968963623, "learning_rate": 3.9801273053715045e-07, "logits/chosen": 1.109375, "logits/rejected": 1.484375, "logps/chosen": -166.0, "logps/rejected": -181.0, "loss": 0.6059, "rewards/accuracies": 0.75, "rewards/chosen": -0.75, "rewards/margins": 0.2119140625, "rewards/rejected": -0.9609375, "step": 462 }, { "epoch": 0.9691261119832548, "grad_norm": 9.850830078125, "learning_rate": 3.975540474637053e-07, "logits/chosen": 2.140625, "logits/rejected": 2.109375, "logps/chosen": -350.0, "logps/rejected": -342.0, "loss": 0.6226, "rewards/accuracies": 0.5, "rewards/chosen": -0.98046875, "rewards/margins": -0.02001953125, "rewards/rejected": -0.9609375, "step": 463 }, { "epoch": 0.9712192569335426, "grad_norm": 10.694343566894531, "learning_rate": 3.970946009241929e-07, "logits/chosen": 2.421875, "logits/rejected": 2.578125, "logps/chosen": -520.0, "logps/rejected": -436.0, "loss": 0.6407, "rewards/accuracies": 0.75, "rewards/chosen": -0.859375, "rewards/margins": 0.4296875, "rewards/rejected": -1.2890625, "step": 464 }, { "epoch": 0.9733124018838305, "grad_norm": 10.816640853881836, "learning_rate": 3.9663439329625917e-07, "logits/chosen": 2.265625, "logits/rejected": 2.515625, "logps/chosen": -664.0, "logps/rejected": -476.0, "loss": 0.6241, "rewards/accuracies": 0.75, "rewards/chosen": -0.98828125, "rewards/margins": 0.578125, "rewards/rejected": -1.5625, "step": 465 }, { "epoch": 0.9754055468341183, "grad_norm": 9.781015396118164, "learning_rate": 3.961734269614889e-07, "logits/chosen": 2.109375, "logits/rejected": 2.28125, "logps/chosen": -412.0, "logps/rejected": -366.0, "loss": 0.6075, "rewards/accuracies": 1.0, "rewards/chosen": -0.9375, "rewards/margins": 0.185546875, "rewards/rejected": -1.125, "step": 466 }, { "epoch": 0.9774986917844061, "grad_norm": 10.579108238220215, "learning_rate": 3.9571170430539283e-07, "logits/chosen": 2.09375, "logits/rejected": 1.8359375, "logps/chosen": -354.0, "logps/rejected": -496.0, "loss": 0.656, "rewards/accuracies": 0.75, "rewards/chosen": -1.09375, "rewards/margins": 0.25390625, "rewards/rejected": -1.34375, "step": 467 }, { "epoch": 0.9795918367346939, "grad_norm": 12.218819618225098, "learning_rate": 3.952492277173959e-07, "logits/chosen": 2.765625, "logits/rejected": 3.484375, "logps/chosen": -528.0, "logps/rejected": -298.0, "loss": 0.6665, "rewards/accuracies": 0.5, "rewards/chosen": -1.015625, "rewards/margins": -0.09375, "rewards/rejected": -0.921875, "step": 468 }, { "epoch": 0.9816849816849816, "grad_norm": 10.382241249084473, "learning_rate": 3.947859995908248e-07, "logits/chosen": 1.9921875, "logits/rejected": 2.390625, "logps/chosen": -288.0, "logps/rejected": -344.0, "loss": 0.6083, "rewards/accuracies": 0.75, "rewards/chosen": -1.125, "rewards/margins": 0.39453125, "rewards/rejected": -1.5234375, "step": 469 }, { "epoch": 0.9837781266352695, "grad_norm": 10.058098793029785, "learning_rate": 3.9432202232289497e-07, "logits/chosen": 2.546875, "logits/rejected": 2.5, "logps/chosen": -688.0, "logps/rejected": -676.0, "loss": 0.5984, "rewards/accuracies": 1.0, "rewards/chosen": -0.859375, "rewards/margins": 0.63671875, "rewards/rejected": -1.5, "step": 470 }, { "epoch": 0.9858712715855573, "grad_norm": 9.432204246520996, "learning_rate": 3.938572983146993e-07, "logits/chosen": 1.328125, "logits/rejected": 1.6953125, "logps/chosen": -346.0, "logps/rejected": -338.0, "loss": 0.5903, "rewards/accuracies": 0.75, "rewards/chosen": -1.0390625, "rewards/margins": 0.2890625, "rewards/rejected": -1.328125, "step": 471 }, { "epoch": 0.9879644165358451, "grad_norm": 8.902270317077637, "learning_rate": 3.9339182997119455e-07, "logits/chosen": 2.421875, "logits/rejected": 2.40625, "logps/chosen": -388.0, "logps/rejected": -616.0, "loss": 0.6047, "rewards/accuracies": 0.5, "rewards/chosen": -0.8984375, "rewards/margins": 0.33984375, "rewards/rejected": -1.234375, "step": 472 }, { "epoch": 0.9900575614861329, "grad_norm": 9.261591911315918, "learning_rate": 3.9292561970118976e-07, "logits/chosen": 3.203125, "logits/rejected": 3.0625, "logps/chosen": -600.0, "logps/rejected": -516.0, "loss": 0.5908, "rewards/accuracies": 0.75, "rewards/chosen": -0.9375, "rewards/margins": 0.21484375, "rewards/rejected": -1.1484375, "step": 473 }, { "epoch": 0.9921507064364207, "grad_norm": 9.265706062316895, "learning_rate": 3.9245866991733324e-07, "logits/chosen": 3.1875, "logits/rejected": 2.640625, "logps/chosen": -290.0, "logps/rejected": -446.0, "loss": 0.599, "rewards/accuracies": 1.0, "rewards/chosen": -0.9921875, "rewards/margins": 0.353515625, "rewards/rejected": -1.34375, "step": 474 }, { "epoch": 0.9942438513867086, "grad_norm": 9.12977123260498, "learning_rate": 3.919909830361004e-07, "logits/chosen": 1.3984375, "logits/rejected": 2.265625, "logps/chosen": -376.0, "logps/rejected": -216.0, "loss": 0.6122, "rewards/accuracies": 0.75, "rewards/chosen": -0.86328125, "rewards/margins": 0.2177734375, "rewards/rejected": -1.078125, "step": 475 }, { "epoch": 0.9963369963369964, "grad_norm": 10.491602897644043, "learning_rate": 3.9152256147778124e-07, "logits/chosen": 2.78125, "logits/rejected": 2.84375, "logps/chosen": -378.0, "logps/rejected": -444.0, "loss": 0.6401, "rewards/accuracies": 0.5, "rewards/chosen": -0.83984375, "rewards/margins": 0.0986328125, "rewards/rejected": -0.9375, "step": 476 }, { "epoch": 0.9984301412872841, "grad_norm": 9.917890548706055, "learning_rate": 3.910534076664676e-07, "logits/chosen": 1.609375, "logits/rejected": 2.0625, "logps/chosen": -528.0, "logps/rejected": -490.0, "loss": 0.6253, "rewards/accuracies": 0.5, "rewards/chosen": -1.1328125, "rewards/margins": 0.349609375, "rewards/rejected": -1.484375, "step": 477 }, { "epoch": 1.000523286237572, "grad_norm": 10.06885051727295, "learning_rate": 3.905835240300407e-07, "logits/chosen": 2.25, "logits/rejected": 1.9375, "logps/chosen": -470.0, "logps/rejected": -488.0, "loss": 0.6139, "rewards/accuracies": 0.75, "rewards/chosen": -0.98046875, "rewards/margins": 0.2578125, "rewards/rejected": -1.234375, "step": 478 }, { "epoch": 1.0026164311878598, "grad_norm": 9.318933486938477, "learning_rate": 3.901129130001588e-07, "logits/chosen": 2.984375, "logits/rejected": 3.25, "logps/chosen": -840.0, "logps/rejected": -470.0, "loss": 0.5939, "rewards/accuracies": 0.75, "rewards/chosen": -1.09375, "rewards/margins": 0.1708984375, "rewards/rejected": -1.265625, "step": 479 }, { "epoch": 1.0047095761381475, "grad_norm": 9.795000076293945, "learning_rate": 3.896415770122443e-07, "logits/chosen": 2.046875, "logits/rejected": 2.296875, "logps/chosen": -442.0, "logps/rejected": -508.0, "loss": 0.6037, "rewards/accuracies": 0.5, "rewards/chosen": -1.015625, "rewards/margins": 0.2109375, "rewards/rejected": -1.2265625, "step": 480 }, { "epoch": 1.0068027210884354, "grad_norm": 9.666790962219238, "learning_rate": 3.891695185054712e-07, "logits/chosen": 1.3125, "logits/rejected": 1.7578125, "logps/chosen": -253.0, "logps/rejected": -264.0, "loss": 0.5925, "rewards/accuracies": 0.75, "rewards/chosen": -0.79296875, "rewards/margins": 0.421875, "rewards/rejected": -1.21875, "step": 481 }, { "epoch": 1.0088958660387233, "grad_norm": 10.20748233795166, "learning_rate": 3.886967399227529e-07, "logits/chosen": 2.578125, "logits/rejected": 2.75, "logps/chosen": -696.0, "logps/rejected": -418.0, "loss": 0.6023, "rewards/accuracies": 0.75, "rewards/chosen": -0.65625, "rewards/margins": 0.671875, "rewards/rejected": -1.328125, "step": 482 }, { "epoch": 1.010989010989011, "grad_norm": 10.576319694519043, "learning_rate": 3.8822324371072865e-07, "logits/chosen": 2.046875, "logits/rejected": 2.203125, "logps/chosen": -344.0, "logps/rejected": -332.0, "loss": 0.6513, "rewards/accuracies": 0.75, "rewards/chosen": -0.81640625, "rewards/margins": 0.314453125, "rewards/rejected": -1.1328125, "step": 483 }, { "epoch": 1.0130821559392988, "grad_norm": 10.309803009033203, "learning_rate": 3.877490323197521e-07, "logits/chosen": 1.96875, "logits/rejected": 1.7578125, "logps/chosen": -426.0, "logps/rejected": -494.0, "loss": 0.6231, "rewards/accuracies": 0.5, "rewards/chosen": -0.8984375, "rewards/margins": 0.12109375, "rewards/rejected": -1.0234375, "step": 484 }, { "epoch": 1.0151753008895865, "grad_norm": 10.046106338500977, "learning_rate": 3.872741082038774e-07, "logits/chosen": 2.171875, "logits/rejected": 2.4375, "logps/chosen": -536.0, "logps/rejected": -556.0, "loss": 0.6388, "rewards/accuracies": 0.5, "rewards/chosen": -1.09375, "rewards/margins": -0.00927734375, "rewards/rejected": -1.0859375, "step": 485 }, { "epoch": 1.0172684458398744, "grad_norm": 10.342788696289062, "learning_rate": 3.8679847382084747e-07, "logits/chosen": 2.234375, "logits/rejected": 2.625, "logps/chosen": -496.0, "logps/rejected": -352.0, "loss": 0.6206, "rewards/accuracies": 0.5, "rewards/chosen": -1.34375, "rewards/margins": -0.0185546875, "rewards/rejected": -1.328125, "step": 486 }, { "epoch": 1.0193615907901623, "grad_norm": 9.817326545715332, "learning_rate": 3.8632213163208053e-07, "logits/chosen": 1.8359375, "logits/rejected": 1.2578125, "logps/chosen": -209.0, "logps/rejected": -378.0, "loss": 0.5834, "rewards/accuracies": 0.75, "rewards/chosen": -0.98046875, "rewards/margins": 0.31640625, "rewards/rejected": -1.296875, "step": 487 }, { "epoch": 1.02145473574045, "grad_norm": 9.901391983032227, "learning_rate": 3.85845084102658e-07, "logits/chosen": 2.609375, "logits/rejected": 2.359375, "logps/chosen": -370.0, "logps/rejected": -596.0, "loss": 0.6091, "rewards/accuracies": 0.5, "rewards/chosen": -1.125, "rewards/margins": 0.197265625, "rewards/rejected": -1.3203125, "step": 488 }, { "epoch": 1.0235478806907379, "grad_norm": 10.532851219177246, "learning_rate": 3.853673337013113e-07, "logits/chosen": 2.96875, "logits/rejected": 2.78125, "logps/chosen": -820.0, "logps/rejected": -848.0, "loss": 0.607, "rewards/accuracies": 1.0, "rewards/chosen": -0.6484375, "rewards/margins": 0.482421875, "rewards/rejected": -1.1328125, "step": 489 }, { "epoch": 1.0256410256410255, "grad_norm": 10.164332389831543, "learning_rate": 3.8488888290040944e-07, "logits/chosen": 2.53125, "logits/rejected": 2.359375, "logps/chosen": -584.0, "logps/rejected": -608.0, "loss": 0.6228, "rewards/accuracies": 0.75, "rewards/chosen": -1.1875, "rewards/margins": 0.29296875, "rewards/rejected": -1.484375, "step": 490 }, { "epoch": 1.0277341705913134, "grad_norm": 10.77008056640625, "learning_rate": 3.844097341759455e-07, "logits/chosen": 2.1875, "logits/rejected": 2.25, "logps/chosen": -294.0, "logps/rejected": -320.0, "loss": 0.6264, "rewards/accuracies": 0.5, "rewards/chosen": -0.89453125, "rewards/margins": 0.22265625, "rewards/rejected": -1.1171875, "step": 491 }, { "epoch": 1.0298273155416013, "grad_norm": 10.060259819030762, "learning_rate": 3.8392989000752504e-07, "logits/chosen": 2.140625, "logits/rejected": 2.40625, "logps/chosen": -394.0, "logps/rejected": -306.0, "loss": 0.617, "rewards/accuracies": 0.75, "rewards/chosen": -1.078125, "rewards/margins": 0.26953125, "rewards/rejected": -1.34375, "step": 492 }, { "epoch": 1.031920460491889, "grad_norm": 10.199516296386719, "learning_rate": 3.834493528783519e-07, "logits/chosen": 2.515625, "logits/rejected": 3.171875, "logps/chosen": -440.0, "logps/rejected": -324.0, "loss": 0.6474, "rewards/accuracies": 0.75, "rewards/chosen": -1.203125, "rewards/margins": 0.1416015625, "rewards/rejected": -1.34375, "step": 493 }, { "epoch": 1.034013605442177, "grad_norm": 9.630121231079102, "learning_rate": 3.829681252752165e-07, "logits/chosen": 1.0625, "logits/rejected": 1.6875, "logps/chosen": -446.0, "logps/rejected": -366.0, "loss": 0.587, "rewards/accuracies": 0.25, "rewards/chosen": -1.3984375, "rewards/margins": -0.08984375, "rewards/rejected": -1.3125, "step": 494 }, { "epoch": 1.0361067503924646, "grad_norm": 9.805791854858398, "learning_rate": 3.824862096884822e-07, "logits/chosen": 2.5625, "logits/rejected": 2.59375, "logps/chosen": -298.0, "logps/rejected": -408.0, "loss": 0.6153, "rewards/accuracies": 0.25, "rewards/chosen": -0.9765625, "rewards/margins": -0.076171875, "rewards/rejected": -0.8984375, "step": 495 }, { "epoch": 1.0381998953427525, "grad_norm": 10.049763679504395, "learning_rate": 3.820036086120726e-07, "logits/chosen": 2.28125, "logits/rejected": 2.875, "logps/chosen": -540.0, "logps/rejected": -350.0, "loss": 0.6468, "rewards/accuracies": 0.75, "rewards/chosen": -0.78515625, "rewards/margins": 0.5390625, "rewards/rejected": -1.328125, "step": 496 }, { "epoch": 1.0402930402930404, "grad_norm": 10.213252067565918, "learning_rate": 3.815203245434593e-07, "logits/chosen": 2.90625, "logits/rejected": 2.546875, "logps/chosen": -528.0, "logps/rejected": -456.0, "loss": 0.6571, "rewards/accuracies": 0.25, "rewards/chosen": -1.421875, "rewards/margins": -0.28515625, "rewards/rejected": -1.1328125, "step": 497 }, { "epoch": 1.042386185243328, "grad_norm": 9.96432113647461, "learning_rate": 3.8103635998364756e-07, "logits/chosen": 2.53125, "logits/rejected": 3.25, "logps/chosen": -736.0, "logps/rejected": -496.0, "loss": 0.5566, "rewards/accuracies": 1.0, "rewards/chosen": -1.3203125, "rewards/margins": 0.3359375, "rewards/rejected": -1.65625, "step": 498 }, { "epoch": 1.044479330193616, "grad_norm": 10.319202423095703, "learning_rate": 3.805517174371649e-07, "logits/chosen": 2.390625, "logits/rejected": 2.703125, "logps/chosen": -308.0, "logps/rejected": -302.0, "loss": 0.6423, "rewards/accuracies": 0.75, "rewards/chosen": -0.83203125, "rewards/margins": 0.140625, "rewards/rejected": -0.97265625, "step": 499 }, { "epoch": 1.0465724751439036, "grad_norm": 10.619379043579102, "learning_rate": 3.8006639941204707e-07, "logits/chosen": 1.84375, "logits/rejected": 1.1328125, "logps/chosen": -456.0, "logps/rejected": -728.0, "loss": 0.593, "rewards/accuracies": 1.0, "rewards/chosen": -1.15625, "rewards/margins": 0.67578125, "rewards/rejected": -1.828125, "step": 500 }, { "epoch": 1.0486656200941915, "grad_norm": 9.219826698303223, "learning_rate": 3.7958040841982554e-07, "logits/chosen": 1.7734375, "logits/rejected": 1.7890625, "logps/chosen": -388.0, "logps/rejected": -468.0, "loss": 0.5655, "rewards/accuracies": 0.75, "rewards/chosen": -1.0625, "rewards/margins": 1.0859375, "rewards/rejected": -2.140625, "step": 501 }, { "epoch": 1.0507587650444794, "grad_norm": 10.068828582763672, "learning_rate": 3.7909374697551437e-07, "logits/chosen": 1.5078125, "logits/rejected": 1.4140625, "logps/chosen": -450.0, "logps/rejected": -440.0, "loss": 0.6082, "rewards/accuracies": 0.75, "rewards/chosen": -1.9140625, "rewards/margins": 0.0166015625, "rewards/rejected": -1.9296875, "step": 502 }, { "epoch": 1.052851909994767, "grad_norm": 10.750775337219238, "learning_rate": 3.786064175975972e-07, "logits/chosen": 2.84375, "logits/rejected": 2.59375, "logps/chosen": -486.0, "logps/rejected": -494.0, "loss": 0.625, "rewards/accuracies": 0.5, "rewards/chosen": -1.109375, "rewards/margins": -0.0263671875, "rewards/rejected": -1.0859375, "step": 503 }, { "epoch": 1.054945054945055, "grad_norm": 10.124489784240723, "learning_rate": 3.781184228080145e-07, "logits/chosen": 2.171875, "logits/rejected": 2.71875, "logps/chosen": -656.0, "logps/rejected": -318.0, "loss": 0.6313, "rewards/accuracies": 0.25, "rewards/chosen": -2.0, "rewards/margins": -0.3359375, "rewards/rejected": -1.671875, "step": 504 }, { "epoch": 1.0570381998953426, "grad_norm": 10.40312385559082, "learning_rate": 3.7762976513214966e-07, "logits/chosen": 2.140625, "logits/rejected": 2.109375, "logps/chosen": -262.0, "logps/rejected": -386.0, "loss": 0.6014, "rewards/accuracies": 0.5, "rewards/chosen": -1.21875, "rewards/margins": 0.37890625, "rewards/rejected": -1.6015625, "step": 505 }, { "epoch": 1.0591313448456305, "grad_norm": 10.114124298095703, "learning_rate": 3.771404470988174e-07, "logits/chosen": 2.390625, "logits/rejected": 2.0, "logps/chosen": -290.0, "logps/rejected": -348.0, "loss": 0.5869, "rewards/accuracies": 0.75, "rewards/chosen": -0.9140625, "rewards/margins": 0.1962890625, "rewards/rejected": -1.109375, "step": 506 }, { "epoch": 1.0612244897959184, "grad_norm": 9.699861526489258, "learning_rate": 3.766504712402488e-07, "logits/chosen": 2.40625, "logits/rejected": 2.75, "logps/chosen": -220.0, "logps/rejected": -210.0, "loss": 0.5878, "rewards/accuracies": 0.0, "rewards/chosen": -0.8125, "rewards/margins": -0.09765625, "rewards/rejected": -0.71875, "step": 507 }, { "epoch": 1.063317634746206, "grad_norm": 10.216354370117188, "learning_rate": 3.7615984009208006e-07, "logits/chosen": 2.75, "logits/rejected": 3.15625, "logps/chosen": -482.0, "logps/rejected": -466.0, "loss": 0.6022, "rewards/accuracies": 0.75, "rewards/chosen": -1.015625, "rewards/margins": 0.02734375, "rewards/rejected": -1.046875, "step": 508 }, { "epoch": 1.065410779696494, "grad_norm": 10.050541877746582, "learning_rate": 3.7566855619333816e-07, "logits/chosen": 2.59375, "logits/rejected": 3.15625, "logps/chosen": -330.0, "logps/rejected": -348.0, "loss": 0.5886, "rewards/accuracies": 0.75, "rewards/chosen": -1.109375, "rewards/margins": 0.3046875, "rewards/rejected": -1.4140625, "step": 509 }, { "epoch": 1.0675039246467817, "grad_norm": 11.169584274291992, "learning_rate": 3.7517662208642783e-07, "logits/chosen": 0.93359375, "logits/rejected": 1.453125, "logps/chosen": -506.0, "logps/rejected": -346.0, "loss": 0.6547, "rewards/accuracies": 0.75, "rewards/chosen": -0.63671875, "rewards/margins": 0.4453125, "rewards/rejected": -1.0859375, "step": 510 }, { "epoch": 1.0695970695970696, "grad_norm": 10.254963874816895, "learning_rate": 3.7468404031711924e-07, "logits/chosen": 1.25, "logits/rejected": 2.03125, "logps/chosen": -324.0, "logps/rejected": -324.0, "loss": 0.5988, "rewards/accuracies": 0.75, "rewards/chosen": -1.1484375, "rewards/margins": 0.609375, "rewards/rejected": -1.7578125, "step": 511 }, { "epoch": 1.0716902145473575, "grad_norm": 10.694738388061523, "learning_rate": 3.741908134345335e-07, "logits/chosen": 1.53125, "logits/rejected": 2.0, "logps/chosen": -354.0, "logps/rejected": -490.0, "loss": 0.618, "rewards/accuracies": 1.0, "rewards/chosen": -1.15625, "rewards/margins": 0.33203125, "rewards/rejected": -1.4921875, "step": 512 }, { "epoch": 1.0737833594976451, "grad_norm": 10.05710506439209, "learning_rate": 3.736969439911309e-07, "logits/chosen": 2.3125, "logits/rejected": 2.359375, "logps/chosen": -470.0, "logps/rejected": -402.0, "loss": 0.6105, "rewards/accuracies": 0.25, "rewards/chosen": -1.203125, "rewards/margins": 0.0185546875, "rewards/rejected": -1.21875, "step": 513 }, { "epoch": 1.075876504447933, "grad_norm": 9.925804138183594, "learning_rate": 3.732024345426966e-07, "logits/chosen": 1.4453125, "logits/rejected": 1.4296875, "logps/chosen": -360.0, "logps/rejected": -450.0, "loss": 0.6363, "rewards/accuracies": 0.5, "rewards/chosen": -1.1875, "rewards/margins": 0.181640625, "rewards/rejected": -1.3671875, "step": 514 }, { "epoch": 1.077969649398221, "grad_norm": 9.118669509887695, "learning_rate": 3.727072876483278e-07, "logits/chosen": 2.578125, "logits/rejected": 2.15625, "logps/chosen": -328.0, "logps/rejected": -458.0, "loss": 0.5867, "rewards/accuracies": 0.75, "rewards/chosen": -0.96484375, "rewards/margins": 0.6640625, "rewards/rejected": -1.625, "step": 515 }, { "epoch": 1.0800627943485086, "grad_norm": 10.814146041870117, "learning_rate": 3.722115058704207e-07, "logits/chosen": 2.828125, "logits/rejected": 3.0625, "logps/chosen": -840.0, "logps/rejected": -608.0, "loss": 0.621, "rewards/accuracies": 0.5, "rewards/chosen": -1.3984375, "rewards/margins": -0.0390625, "rewards/rejected": -1.359375, "step": 516 }, { "epoch": 1.0821559392987965, "grad_norm": 10.450875282287598, "learning_rate": 3.7171509177465676e-07, "logits/chosen": 1.953125, "logits/rejected": 2.015625, "logps/chosen": -480.0, "logps/rejected": -520.0, "loss": 0.6005, "rewards/accuracies": 0.75, "rewards/chosen": -0.88671875, "rewards/margins": 0.5703125, "rewards/rejected": -1.453125, "step": 517 }, { "epoch": 1.0842490842490842, "grad_norm": 10.862897872924805, "learning_rate": 3.7121804792998995e-07, "logits/chosen": 1.9140625, "logits/rejected": 2.015625, "logps/chosen": -414.0, "logps/rejected": -382.0, "loss": 0.6515, "rewards/accuracies": 0.75, "rewards/chosen": -1.2578125, "rewards/margins": -0.111328125, "rewards/rejected": -1.1484375, "step": 518 }, { "epoch": 1.086342229199372, "grad_norm": 9.62572193145752, "learning_rate": 3.7072037690863306e-07, "logits/chosen": 2.65625, "logits/rejected": 2.75, "logps/chosen": -504.0, "logps/rejected": -572.0, "loss": 0.5884, "rewards/accuracies": 0.75, "rewards/chosen": -1.3125, "rewards/margins": 0.0947265625, "rewards/rejected": -1.40625, "step": 519 }, { "epoch": 1.08843537414966, "grad_norm": 10.811123847961426, "learning_rate": 3.7022208128604453e-07, "logits/chosen": 2.03125, "logits/rejected": 2.71875, "logps/chosen": -648.0, "logps/rejected": -408.0, "loss": 0.5946, "rewards/accuracies": 1.0, "rewards/chosen": -0.9296875, "rewards/margins": 0.4921875, "rewards/rejected": -1.421875, "step": 520 }, { "epoch": 1.0905285190999476, "grad_norm": 10.257491111755371, "learning_rate": 3.6972316364091525e-07, "logits/chosen": 2.046875, "logits/rejected": 2.484375, "logps/chosen": -276.0, "logps/rejected": -312.0, "loss": 0.605, "rewards/accuracies": 1.0, "rewards/chosen": -1.09375, "rewards/margins": 0.390625, "rewards/rejected": -1.484375, "step": 521 }, { "epoch": 1.0926216640502355, "grad_norm": 11.21011734008789, "learning_rate": 3.6922362655515507e-07, "logits/chosen": 2.71875, "logits/rejected": 2.75, "logps/chosen": -520.0, "logps/rejected": -540.0, "loss": 0.6377, "rewards/accuracies": 1.0, "rewards/chosen": -0.64453125, "rewards/margins": 0.1865234375, "rewards/rejected": -0.828125, "step": 522 }, { "epoch": 1.0947148090005232, "grad_norm": 11.032417297363281, "learning_rate": 3.687234726138793e-07, "logits/chosen": 1.890625, "logits/rejected": 2.5, "logps/chosen": -434.0, "logps/rejected": -296.0, "loss": 0.6326, "rewards/accuracies": 0.25, "rewards/chosen": -1.1328125, "rewards/margins": -0.1064453125, "rewards/rejected": -1.03125, "step": 523 }, { "epoch": 1.096807953950811, "grad_norm": 9.880134582519531, "learning_rate": 3.682227044053957e-07, "logits/chosen": 2.203125, "logits/rejected": 2.390625, "logps/chosen": -628.0, "logps/rejected": -528.0, "loss": 0.5774, "rewards/accuracies": 0.75, "rewards/chosen": -0.478515625, "rewards/margins": 1.0546875, "rewards/rejected": -1.5390625, "step": 524 }, { "epoch": 1.098901098901099, "grad_norm": 10.90539264678955, "learning_rate": 3.677213245211906e-07, "logits/chosen": 2.109375, "logits/rejected": 2.203125, "logps/chosen": -640.0, "logps/rejected": -660.0, "loss": 0.6084, "rewards/accuracies": 0.5, "rewards/chosen": -1.5546875, "rewards/margins": -0.154296875, "rewards/rejected": -1.40625, "step": 525 }, { "epoch": 1.1009942438513867, "grad_norm": 11.089640617370605, "learning_rate": 3.6721933555591603e-07, "logits/chosen": 1.703125, "logits/rejected": 2.234375, "logps/chosen": -416.0, "logps/rejected": -276.0, "loss": 0.6526, "rewards/accuracies": 0.25, "rewards/chosen": -1.40625, "rewards/margins": -0.177734375, "rewards/rejected": -1.2265625, "step": 526 }, { "epoch": 1.1030873888016746, "grad_norm": 9.621665954589844, "learning_rate": 3.6671674010737596e-07, "logits/chosen": 2.96875, "logits/rejected": 3.328125, "logps/chosen": -424.0, "logps/rejected": -456.0, "loss": 0.6194, "rewards/accuracies": 0.5, "rewards/chosen": -1.1640625, "rewards/margins": 0.2890625, "rewards/rejected": -1.453125, "step": 527 }, { "epoch": 1.1051805337519622, "grad_norm": 10.657153129577637, "learning_rate": 3.6621354077651293e-07, "logits/chosen": 2.125, "logits/rejected": 1.7890625, "logps/chosen": -378.0, "logps/rejected": -416.0, "loss": 0.6154, "rewards/accuracies": 0.75, "rewards/chosen": -0.85546875, "rewards/margins": 0.28515625, "rewards/rejected": -1.140625, "step": 528 }, { "epoch": 1.1072736787022501, "grad_norm": 10.405318260192871, "learning_rate": 3.657097401673944e-07, "logits/chosen": 1.9375, "logits/rejected": 2.984375, "logps/chosen": -816.0, "logps/rejected": -484.0, "loss": 0.5704, "rewards/accuracies": 0.75, "rewards/chosen": -0.671875, "rewards/margins": 0.41015625, "rewards/rejected": -1.078125, "step": 529 }, { "epoch": 1.109366823652538, "grad_norm": 10.833739280700684, "learning_rate": 3.6520534088719963e-07, "logits/chosen": 2.40625, "logits/rejected": 2.421875, "logps/chosen": -402.0, "logps/rejected": -392.0, "loss": 0.6021, "rewards/accuracies": 1.0, "rewards/chosen": -1.15625, "rewards/margins": 0.369140625, "rewards/rejected": -1.5234375, "step": 530 }, { "epoch": 1.1114599686028257, "grad_norm": 10.032989501953125, "learning_rate": 3.6470034554620614e-07, "logits/chosen": 1.9609375, "logits/rejected": 2.046875, "logps/chosen": -344.0, "logps/rejected": -286.0, "loss": 0.6215, "rewards/accuracies": 0.75, "rewards/chosen": -0.90234375, "rewards/margins": 0.072265625, "rewards/rejected": -0.9765625, "step": 531 }, { "epoch": 1.1135531135531136, "grad_norm": 9.768916130065918, "learning_rate": 3.6419475675777587e-07, "logits/chosen": 2.0625, "logits/rejected": 1.7734375, "logps/chosen": -294.0, "logps/rejected": -320.0, "loss": 0.6178, "rewards/accuracies": 1.0, "rewards/chosen": -1.1328125, "rewards/margins": 0.265625, "rewards/rejected": -1.3984375, "step": 532 }, { "epoch": 1.1156462585034013, "grad_norm": 9.991143226623535, "learning_rate": 3.636885771383419e-07, "logits/chosen": 1.6953125, "logits/rejected": 2.203125, "logps/chosen": -296.0, "logps/rejected": -552.0, "loss": 0.6119, "rewards/accuracies": 0.75, "rewards/chosen": -1.1328125, "rewards/margins": 0.6484375, "rewards/rejected": -1.78125, "step": 533 }, { "epoch": 1.1177394034536892, "grad_norm": 9.989991188049316, "learning_rate": 3.631818093073948e-07, "logits/chosen": 2.453125, "logits/rejected": 2.859375, "logps/chosen": -572.0, "logps/rejected": -446.0, "loss": 0.6055, "rewards/accuracies": 1.0, "rewards/chosen": -0.66796875, "rewards/margins": 0.515625, "rewards/rejected": -1.1875, "step": 534 }, { "epoch": 1.119832548403977, "grad_norm": 9.190662384033203, "learning_rate": 3.626744558874696e-07, "logits/chosen": 2.3125, "logits/rejected": 2.65625, "logps/chosen": -344.0, "logps/rejected": -350.0, "loss": 0.6081, "rewards/accuracies": 1.0, "rewards/chosen": -0.78125, "rewards/margins": 0.19140625, "rewards/rejected": -0.97265625, "step": 535 }, { "epoch": 1.1219256933542647, "grad_norm": 10.211576461791992, "learning_rate": 3.6216651950413097e-07, "logits/chosen": 2.0625, "logits/rejected": 2.328125, "logps/chosen": -438.0, "logps/rejected": -350.0, "loss": 0.6157, "rewards/accuracies": 0.75, "rewards/chosen": -0.94140625, "rewards/margins": 0.3515625, "rewards/rejected": -1.2890625, "step": 536 }, { "epoch": 1.1240188383045526, "grad_norm": 11.025035858154297, "learning_rate": 3.6165800278596116e-07, "logits/chosen": 2.265625, "logits/rejected": 2.78125, "logps/chosen": -502.0, "logps/rejected": -448.0, "loss": 0.6248, "rewards/accuracies": 0.75, "rewards/chosen": -1.0625, "rewards/margins": 0.28515625, "rewards/rejected": -1.34375, "step": 537 }, { "epoch": 1.1261119832548405, "grad_norm": 10.022388458251953, "learning_rate": 3.611489083645453e-07, "logits/chosen": 2.578125, "logits/rejected": 2.34375, "logps/chosen": -652.0, "logps/rejected": -764.0, "loss": 0.5888, "rewards/accuracies": 0.75, "rewards/chosen": -0.734375, "rewards/margins": 0.17578125, "rewards/rejected": -0.91015625, "step": 538 }, { "epoch": 1.1282051282051282, "grad_norm": 9.915376663208008, "learning_rate": 3.6063923887445815e-07, "logits/chosen": 1.8046875, "logits/rejected": 1.8203125, "logps/chosen": -314.0, "logps/rejected": -382.0, "loss": 0.5849, "rewards/accuracies": 1.0, "rewards/chosen": -0.71875, "rewards/margins": 0.75390625, "rewards/rejected": -1.4765625, "step": 539 }, { "epoch": 1.130298273155416, "grad_norm": 10.21751880645752, "learning_rate": 3.601289969532506e-07, "logits/chosen": 1.8984375, "logits/rejected": 3.09375, "logps/chosen": -328.0, "logps/rejected": -408.0, "loss": 0.585, "rewards/accuracies": 0.75, "rewards/chosen": -1.09375, "rewards/margins": 0.3203125, "rewards/rejected": -1.4140625, "step": 540 }, { "epoch": 1.1323914181057038, "grad_norm": 9.77088451385498, "learning_rate": 3.596181852414358e-07, "logits/chosen": 2.40625, "logits/rejected": 2.703125, "logps/chosen": -496.0, "logps/rejected": -500.0, "loss": 0.5663, "rewards/accuracies": 1.0, "rewards/chosen": -0.8125, "rewards/margins": 0.671875, "rewards/rejected": -1.484375, "step": 541 }, { "epoch": 1.1344845630559917, "grad_norm": 10.153401374816895, "learning_rate": 3.591068063824757e-07, "logits/chosen": 3.296875, "logits/rejected": 2.578125, "logps/chosen": -342.0, "logps/rejected": -420.0, "loss": 0.596, "rewards/accuracies": 0.75, "rewards/chosen": -0.9375, "rewards/margins": 0.2421875, "rewards/rejected": -1.1796875, "step": 542 }, { "epoch": 1.1365777080062793, "grad_norm": 10.042383193969727, "learning_rate": 3.5859486302276697e-07, "logits/chosen": 2.265625, "logits/rejected": 2.6875, "logps/chosen": -340.0, "logps/rejected": -328.0, "loss": 0.611, "rewards/accuracies": 0.75, "rewards/chosen": -1.28125, "rewards/margins": -0.0625, "rewards/rejected": -1.21875, "step": 543 }, { "epoch": 1.1386708529565672, "grad_norm": 10.027029037475586, "learning_rate": 3.5808235781162794e-07, "logits/chosen": 1.546875, "logits/rejected": 1.6875, "logps/chosen": -244.0, "logps/rejected": -468.0, "loss": 0.6011, "rewards/accuracies": 0.75, "rewards/chosen": -0.875, "rewards/margins": 0.35546875, "rewards/rejected": -1.2265625, "step": 544 }, { "epoch": 1.1407639979068551, "grad_norm": 10.477700233459473, "learning_rate": 3.575692934012843e-07, "logits/chosen": 2.5625, "logits/rejected": 2.609375, "logps/chosen": -308.0, "logps/rejected": -332.0, "loss": 0.6235, "rewards/accuracies": 0.5, "rewards/chosen": -1.140625, "rewards/margins": 0.3515625, "rewards/rejected": -1.4921875, "step": 545 }, { "epoch": 1.1428571428571428, "grad_norm": 10.700268745422363, "learning_rate": 3.570556724468556e-07, "logits/chosen": 1.84375, "logits/rejected": 1.703125, "logps/chosen": -266.0, "logps/rejected": -222.0, "loss": 0.6432, "rewards/accuracies": 0.25, "rewards/chosen": -1.46875, "rewards/margins": -0.025390625, "rewards/rejected": -1.4453125, "step": 546 }, { "epoch": 1.1449502878074307, "grad_norm": 9.420391082763672, "learning_rate": 3.5654149760634167e-07, "logits/chosen": 1.0625, "logits/rejected": 1.4375, "logps/chosen": -302.0, "logps/rejected": -396.0, "loss": 0.5872, "rewards/accuracies": 1.0, "rewards/chosen": -1.015625, "rewards/margins": 0.51953125, "rewards/rejected": -1.53125, "step": 547 }, { "epoch": 1.1470434327577186, "grad_norm": 10.092485427856445, "learning_rate": 3.560267715406085e-07, "logits/chosen": 1.1015625, "logits/rejected": 1.90625, "logps/chosen": -396.0, "logps/rejected": -344.0, "loss": 0.5838, "rewards/accuracies": 0.75, "rewards/chosen": -0.91015625, "rewards/margins": 0.5546875, "rewards/rejected": -1.46875, "step": 548 }, { "epoch": 1.1491365777080063, "grad_norm": 10.177468299865723, "learning_rate": 3.5551149691337496e-07, "logits/chosen": 1.34375, "logits/rejected": 1.5703125, "logps/chosen": -233.0, "logps/rejected": -197.0, "loss": 0.6062, "rewards/accuracies": 0.25, "rewards/chosen": -1.203125, "rewards/margins": -0.095703125, "rewards/rejected": -1.109375, "step": 549 }, { "epoch": 1.1512297226582942, "grad_norm": 9.664475440979004, "learning_rate": 3.549956763911985e-07, "logits/chosen": 2.96875, "logits/rejected": 2.390625, "logps/chosen": -504.0, "logps/rejected": -512.0, "loss": 0.5928, "rewards/accuracies": 0.75, "rewards/chosen": -1.0625, "rewards/margins": 0.177734375, "rewards/rejected": -1.2421875, "step": 550 }, { "epoch": 1.1533228676085818, "grad_norm": 10.4424467086792, "learning_rate": 3.5447931264346163e-07, "logits/chosen": 1.515625, "logits/rejected": 1.921875, "logps/chosen": -332.0, "logps/rejected": -374.0, "loss": 0.5921, "rewards/accuracies": 0.75, "rewards/chosen": -1.203125, "rewards/margins": 0.3671875, "rewards/rejected": -1.5703125, "step": 551 }, { "epoch": 1.1554160125588697, "grad_norm": 10.784751892089844, "learning_rate": 3.539624083423582e-07, "logits/chosen": 2.125, "logits/rejected": 2.71875, "logps/chosen": -624.0, "logps/rejected": -452.0, "loss": 0.6126, "rewards/accuracies": 0.5, "rewards/chosen": -1.453125, "rewards/margins": -0.330078125, "rewards/rejected": -1.125, "step": 552 }, { "epoch": 1.1575091575091574, "grad_norm": 10.372719764709473, "learning_rate": 3.534449661628793e-07, "logits/chosen": 2.859375, "logits/rejected": 3.125, "logps/chosen": -592.0, "logps/rejected": -652.0, "loss": 0.6005, "rewards/accuracies": 0.75, "rewards/chosen": -1.3125, "rewards/margins": 0.63671875, "rewards/rejected": -1.953125, "step": 553 }, { "epoch": 1.1596023024594453, "grad_norm": 10.629199981689453, "learning_rate": 3.5292698878279964e-07, "logits/chosen": 2.0, "logits/rejected": 2.515625, "logps/chosen": -418.0, "logps/rejected": -414.0, "loss": 0.5892, "rewards/accuracies": 0.75, "rewards/chosen": -1.0703125, "rewards/margins": 0.27734375, "rewards/rejected": -1.3515625, "step": 554 }, { "epoch": 1.1616954474097332, "grad_norm": 10.558868408203125, "learning_rate": 3.524084788826635e-07, "logits/chosen": 1.875, "logits/rejected": 1.90625, "logps/chosen": -416.0, "logps/rejected": -496.0, "loss": 0.6155, "rewards/accuracies": 0.75, "rewards/chosen": -0.8203125, "rewards/margins": 0.55859375, "rewards/rejected": -1.375, "step": 555 }, { "epoch": 1.1637885923600209, "grad_norm": 10.18245792388916, "learning_rate": 3.5188943914577097e-07, "logits/chosen": 1.8359375, "logits/rejected": 1.4453125, "logps/chosen": -266.0, "logps/rejected": -320.0, "loss": 0.614, "rewards/accuracies": 0.5, "rewards/chosen": -1.296875, "rewards/margins": 0.1416015625, "rewards/rejected": -1.4375, "step": 556 }, { "epoch": 1.1658817373103088, "grad_norm": 11.512836456298828, "learning_rate": 3.5136987225816433e-07, "logits/chosen": 1.9140625, "logits/rejected": 1.734375, "logps/chosen": -326.0, "logps/rejected": -464.0, "loss": 0.6043, "rewards/accuracies": 1.0, "rewards/chosen": -1.125, "rewards/margins": 0.36328125, "rewards/rejected": -1.484375, "step": 557 }, { "epoch": 1.1679748822605966, "grad_norm": 9.522254943847656, "learning_rate": 3.508497809086134e-07, "logits/chosen": 2.9375, "logits/rejected": 2.921875, "logps/chosen": -568.0, "logps/rejected": -704.0, "loss": 0.5811, "rewards/accuracies": 0.75, "rewards/chosen": -1.0859375, "rewards/margins": 1.2734375, "rewards/rejected": -2.359375, "step": 558 }, { "epoch": 1.1700680272108843, "grad_norm": 10.200825691223145, "learning_rate": 3.5032916778860253e-07, "logits/chosen": 0.99609375, "logits/rejected": 1.421875, "logps/chosen": -189.0, "logps/rejected": -167.0, "loss": 0.5896, "rewards/accuracies": 0.5, "rewards/chosen": -0.82421875, "rewards/margins": 0.0400390625, "rewards/rejected": -0.86328125, "step": 559 }, { "epoch": 1.1721611721611722, "grad_norm": 11.027997970581055, "learning_rate": 3.4980803559231595e-07, "logits/chosen": 1.0546875, "logits/rejected": 1.8828125, "logps/chosen": -300.0, "logps/rejected": -250.0, "loss": 0.6103, "rewards/accuracies": 0.5, "rewards/chosen": -1.125, "rewards/margins": -0.021484375, "rewards/rejected": -1.1015625, "step": 560 }, { "epoch": 1.1742543171114599, "grad_norm": 11.398727416992188, "learning_rate": 3.4928638701662445e-07, "logits/chosen": 0.79296875, "logits/rejected": 0.86328125, "logps/chosen": -201.0, "logps/rejected": -304.0, "loss": 0.5375, "rewards/accuracies": 0.75, "rewards/chosen": -0.94921875, "rewards/margins": 0.7578125, "rewards/rejected": -1.703125, "step": 561 }, { "epoch": 1.1763474620617478, "grad_norm": 10.500263214111328, "learning_rate": 3.4876422476107057e-07, "logits/chosen": 1.1640625, "logits/rejected": 1.3828125, "logps/chosen": -164.0, "logps/rejected": -334.0, "loss": 0.6236, "rewards/accuracies": 1.0, "rewards/chosen": -0.8203125, "rewards/margins": 0.73046875, "rewards/rejected": -1.5546875, "step": 562 }, { "epoch": 1.1784406070120357, "grad_norm": 11.702860832214355, "learning_rate": 3.482415515278558e-07, "logits/chosen": 2.25, "logits/rejected": 2.390625, "logps/chosen": -272.0, "logps/rejected": -370.0, "loss": 0.6412, "rewards/accuracies": 0.75, "rewards/chosen": -1.0546875, "rewards/margins": 0.6328125, "rewards/rejected": -1.6875, "step": 563 }, { "epoch": 1.1805337519623234, "grad_norm": 10.307535171508789, "learning_rate": 3.477183700218254e-07, "logits/chosen": 1.703125, "logits/rejected": 2.375, "logps/chosen": -520.0, "logps/rejected": -580.0, "loss": 0.5652, "rewards/accuracies": 1.0, "rewards/chosen": -0.98828125, "rewards/margins": 1.0703125, "rewards/rejected": -2.0625, "step": 564 }, { "epoch": 1.1826268969126112, "grad_norm": 10.563407897949219, "learning_rate": 3.471946829504553e-07, "logits/chosen": 3.09375, "logits/rejected": 2.765625, "logps/chosen": -420.0, "logps/rejected": -596.0, "loss": 0.6133, "rewards/accuracies": 0.5, "rewards/chosen": -1.15625, "rewards/margins": 0.015625, "rewards/rejected": -1.171875, "step": 565 }, { "epoch": 1.184720041862899, "grad_norm": 9.85606575012207, "learning_rate": 3.4667049302383763e-07, "logits/chosen": 2.53125, "logits/rejected": 3.28125, "logps/chosen": -588.0, "logps/rejected": -476.0, "loss": 0.5743, "rewards/accuracies": 0.75, "rewards/chosen": -1.1953125, "rewards/margins": 0.0908203125, "rewards/rejected": -1.28125, "step": 566 }, { "epoch": 1.1868131868131868, "grad_norm": 10.523385047912598, "learning_rate": 3.461458029546666e-07, "logits/chosen": 1.4296875, "logits/rejected": 2.546875, "logps/chosen": -408.0, "logps/rejected": -300.0, "loss": 0.616, "rewards/accuracies": 0.5, "rewards/chosen": -1.1328125, "rewards/margins": 0.19921875, "rewards/rejected": -1.3359375, "step": 567 }, { "epoch": 1.1889063317634747, "grad_norm": 10.355939865112305, "learning_rate": 3.456206154582251e-07, "logits/chosen": 2.203125, "logits/rejected": 2.90625, "logps/chosen": -636.0, "logps/rejected": -580.0, "loss": 0.5749, "rewards/accuracies": 0.75, "rewards/chosen": -1.09375, "rewards/margins": 0.71875, "rewards/rejected": -1.8125, "step": 568 }, { "epoch": 1.1909994767137624, "grad_norm": 10.845210075378418, "learning_rate": 3.4509493325236984e-07, "logits/chosen": 2.140625, "logits/rejected": 1.8671875, "logps/chosen": -416.0, "logps/rejected": -420.0, "loss": 0.6238, "rewards/accuracies": 0.75, "rewards/chosen": -1.0546875, "rewards/margins": 0.2890625, "rewards/rejected": -1.34375, "step": 569 }, { "epoch": 1.1930926216640503, "grad_norm": 10.860997200012207, "learning_rate": 3.445687590575179e-07, "logits/chosen": 2.296875, "logits/rejected": 2.5, "logps/chosen": -652.0, "logps/rejected": -344.0, "loss": 0.6565, "rewards/accuracies": 1.0, "rewards/chosen": -0.9765625, "rewards/margins": 0.384765625, "rewards/rejected": -1.359375, "step": 570 }, { "epoch": 1.195185766614338, "grad_norm": 10.557795524597168, "learning_rate": 3.440420955966322e-07, "logits/chosen": 2.4375, "logits/rejected": 1.4296875, "logps/chosen": -416.0, "logps/rejected": -560.0, "loss": 0.5388, "rewards/accuracies": 0.75, "rewards/chosen": -1.078125, "rewards/margins": 0.16796875, "rewards/rejected": -1.25, "step": 571 }, { "epoch": 1.1972789115646258, "grad_norm": 10.709152221679688, "learning_rate": 3.435149455952078e-07, "logits/chosen": 1.90625, "logits/rejected": 2.375, "logps/chosen": -370.0, "logps/rejected": -312.0, "loss": 0.5801, "rewards/accuracies": 0.5, "rewards/chosen": -1.171875, "rewards/margins": 0.40234375, "rewards/rejected": -1.578125, "step": 572 }, { "epoch": 1.1993720565149137, "grad_norm": 11.39714241027832, "learning_rate": 3.429873117812576e-07, "logits/chosen": 0.59765625, "logits/rejected": 0.87109375, "logps/chosen": -424.0, "logps/rejected": -286.0, "loss": 0.613, "rewards/accuracies": 0.5, "rewards/chosen": -1.09375, "rewards/margins": 0.04296875, "rewards/rejected": -1.140625, "step": 573 }, { "epoch": 1.2014652014652014, "grad_norm": 10.997570991516113, "learning_rate": 3.4245919688529825e-07, "logits/chosen": 1.609375, "logits/rejected": 2.203125, "logps/chosen": -510.0, "logps/rejected": -432.0, "loss": 0.5696, "rewards/accuracies": 0.75, "rewards/chosen": -1.3203125, "rewards/margins": 0.3046875, "rewards/rejected": -1.625, "step": 574 }, { "epoch": 1.2035583464154893, "grad_norm": 10.565017700195312, "learning_rate": 3.419306036403357e-07, "logits/chosen": 1.828125, "logits/rejected": 2.078125, "logps/chosen": -414.0, "logps/rejected": -596.0, "loss": 0.587, "rewards/accuracies": 0.75, "rewards/chosen": -0.94140625, "rewards/margins": 0.75, "rewards/rejected": -1.6953125, "step": 575 }, { "epoch": 1.205651491365777, "grad_norm": 10.412652969360352, "learning_rate": 3.4140153478185194e-07, "logits/chosen": 0.6953125, "logits/rejected": 0.8515625, "logps/chosen": -184.0, "logps/rejected": -328.0, "loss": 0.6048, "rewards/accuracies": 0.25, "rewards/chosen": -1.265625, "rewards/margins": 0.43359375, "rewards/rejected": -1.6953125, "step": 576 }, { "epoch": 1.2077446363160649, "grad_norm": 11.035755157470703, "learning_rate": 3.408719930477898e-07, "logits/chosen": 2.625, "logits/rejected": 3.203125, "logps/chosen": -680.0, "logps/rejected": -620.0, "loss": 0.6422, "rewards/accuracies": 0.5, "rewards/chosen": -0.9921875, "rewards/margins": 0.291015625, "rewards/rejected": -1.28125, "step": 577 }, { "epoch": 1.2098377812663528, "grad_norm": 11.217260360717773, "learning_rate": 3.4034198117853933e-07, "logits/chosen": 1.578125, "logits/rejected": 1.9765625, "logps/chosen": -410.0, "logps/rejected": -318.0, "loss": 0.6314, "rewards/accuracies": 0.5, "rewards/chosen": -0.94921875, "rewards/margins": 0.08349609375, "rewards/rejected": -1.03125, "step": 578 }, { "epoch": 1.2119309262166404, "grad_norm": 10.444401741027832, "learning_rate": 3.398115019169238e-07, "logits/chosen": 2.203125, "logits/rejected": 1.9765625, "logps/chosen": -404.0, "logps/rejected": -384.0, "loss": 0.6165, "rewards/accuracies": 0.25, "rewards/chosen": -1.5234375, "rewards/margins": -0.55078125, "rewards/rejected": -0.96875, "step": 579 }, { "epoch": 1.2140240711669283, "grad_norm": 11.714608192443848, "learning_rate": 3.3928055800818484e-07, "logits/chosen": 1.4609375, "logits/rejected": 1.84375, "logps/chosen": -438.0, "logps/rejected": -400.0, "loss": 0.6471, "rewards/accuracies": 0.5, "rewards/chosen": -1.40625, "rewards/margins": 0.2041015625, "rewards/rejected": -1.609375, "step": 580 }, { "epoch": 1.2161172161172162, "grad_norm": 12.41568660736084, "learning_rate": 3.387491521999692e-07, "logits/chosen": 1.65625, "logits/rejected": 2.21875, "logps/chosen": -572.0, "logps/rejected": -500.0, "loss": 0.635, "rewards/accuracies": 0.25, "rewards/chosen": -1.7578125, "rewards/margins": 0.029296875, "rewards/rejected": -1.7890625, "step": 581 }, { "epoch": 1.218210361067504, "grad_norm": 10.436721801757812, "learning_rate": 3.382172872423132e-07, "logits/chosen": 2.34375, "logits/rejected": 3.296875, "logps/chosen": -760.0, "logps/rejected": -344.0, "loss": 0.6416, "rewards/accuracies": 0.75, "rewards/chosen": -1.0, "rewards/margins": 0.267578125, "rewards/rejected": -1.2734375, "step": 582 }, { "epoch": 1.2203035060177918, "grad_norm": 9.642477989196777, "learning_rate": 3.3768496588763007e-07, "logits/chosen": 2.28125, "logits/rejected": 2.15625, "logps/chosen": -548.0, "logps/rejected": -724.0, "loss": 0.5784, "rewards/accuracies": 1.0, "rewards/chosen": -0.82421875, "rewards/margins": 0.57421875, "rewards/rejected": -1.3984375, "step": 583 }, { "epoch": 1.2223966509680795, "grad_norm": 9.932283401489258, "learning_rate": 3.371521908906943e-07, "logits/chosen": 2.15625, "logits/rejected": 2.703125, "logps/chosen": -536.0, "logps/rejected": -564.0, "loss": 0.5857, "rewards/accuracies": 0.75, "rewards/chosen": -0.8828125, "rewards/margins": 0.470703125, "rewards/rejected": -1.3515625, "step": 584 }, { "epoch": 1.2244897959183674, "grad_norm": 10.983428001403809, "learning_rate": 3.366189650086284e-07, "logits/chosen": 2.15625, "logits/rejected": 2.4375, "logps/chosen": -444.0, "logps/rejected": -380.0, "loss": 0.6206, "rewards/accuracies": 1.0, "rewards/chosen": -1.046875, "rewards/margins": 0.6640625, "rewards/rejected": -1.703125, "step": 585 }, { "epoch": 1.226582940868655, "grad_norm": 10.217317581176758, "learning_rate": 3.360852910008879e-07, "logits/chosen": 1.21875, "logits/rejected": 1.515625, "logps/chosen": -360.0, "logps/rejected": -432.0, "loss": 0.6135, "rewards/accuracies": 0.75, "rewards/chosen": -0.9921875, "rewards/margins": 0.65234375, "rewards/rejected": -1.640625, "step": 586 }, { "epoch": 1.228676085818943, "grad_norm": 10.667376518249512, "learning_rate": 3.3555117162924756e-07, "logits/chosen": 2.140625, "logits/rejected": 2.109375, "logps/chosen": -290.0, "logps/rejected": -462.0, "loss": 0.6056, "rewards/accuracies": 0.5, "rewards/chosen": -1.15625, "rewards/margins": -0.1552734375, "rewards/rejected": -1.0, "step": 587 }, { "epoch": 1.2307692307692308, "grad_norm": 10.79523754119873, "learning_rate": 3.3501660965778707e-07, "logits/chosen": 2.125, "logits/rejected": 2.703125, "logps/chosen": -592.0, "logps/rejected": -652.0, "loss": 0.5988, "rewards/accuracies": 0.5, "rewards/chosen": -0.9453125, "rewards/margins": 0.53125, "rewards/rejected": -1.4765625, "step": 588 }, { "epoch": 1.2328623757195185, "grad_norm": 11.859354019165039, "learning_rate": 3.34481607852876e-07, "logits/chosen": 2.59375, "logits/rejected": 2.796875, "logps/chosen": -486.0, "logps/rejected": -350.0, "loss": 0.6067, "rewards/accuracies": 0.75, "rewards/chosen": -1.140625, "rewards/margins": 0.1669921875, "rewards/rejected": -1.3125, "step": 589 }, { "epoch": 1.2349555206698064, "grad_norm": 10.522369384765625, "learning_rate": 3.3394616898316085e-07, "logits/chosen": 1.625, "logits/rejected": 2.203125, "logps/chosen": -636.0, "logps/rejected": -528.0, "loss": 0.6135, "rewards/accuracies": 0.75, "rewards/chosen": -0.96484375, "rewards/margins": 0.119140625, "rewards/rejected": -1.0859375, "step": 590 }, { "epoch": 1.2370486656200943, "grad_norm": 10.819913864135742, "learning_rate": 3.3341029581954946e-07, "logits/chosen": 1.6484375, "logits/rejected": 1.140625, "logps/chosen": -270.0, "logps/rejected": -512.0, "loss": 0.5995, "rewards/accuracies": 0.75, "rewards/chosen": -1.296875, "rewards/margins": 1.078125, "rewards/rejected": -2.375, "step": 591 }, { "epoch": 1.239141810570382, "grad_norm": 10.902482986450195, "learning_rate": 3.3287399113519706e-07, "logits/chosen": 2.71875, "logits/rejected": 3.3125, "logps/chosen": -752.0, "logps/rejected": -600.0, "loss": 0.6019, "rewards/accuracies": 0.25, "rewards/chosen": -1.1328125, "rewards/margins": 0.021484375, "rewards/rejected": -1.15625, "step": 592 }, { "epoch": 1.2412349555206699, "grad_norm": 10.952805519104004, "learning_rate": 3.323372577054924e-07, "logits/chosen": 3.375, "logits/rejected": 2.921875, "logps/chosen": -374.0, "logps/rejected": -552.0, "loss": 0.6354, "rewards/accuracies": 0.5, "rewards/chosen": -1.1640625, "rewards/margins": -0.0009765625, "rewards/rejected": -1.1640625, "step": 593 }, { "epoch": 1.2433281004709575, "grad_norm": 11.068962097167969, "learning_rate": 3.318000983080426e-07, "logits/chosen": 2.203125, "logits/rejected": 1.65625, "logps/chosen": -290.0, "logps/rejected": -444.0, "loss": 0.5645, "rewards/accuracies": 0.75, "rewards/chosen": -1.359375, "rewards/margins": 0.66015625, "rewards/rejected": -2.015625, "step": 594 }, { "epoch": 1.2454212454212454, "grad_norm": 10.889310836791992, "learning_rate": 3.312625157226597e-07, "logits/chosen": 2.015625, "logits/rejected": 2.875, "logps/chosen": -524.0, "logps/rejected": -400.0, "loss": 0.6028, "rewards/accuracies": 0.5, "rewards/chosen": -0.9140625, "rewards/margins": 0.32421875, "rewards/rejected": -1.234375, "step": 595 }, { "epoch": 1.247514390371533, "grad_norm": 11.577537536621094, "learning_rate": 3.3072451273134497e-07, "logits/chosen": 2.578125, "logits/rejected": 2.65625, "logps/chosen": -700.0, "logps/rejected": -500.0, "loss": 0.6479, "rewards/accuracies": 1.0, "rewards/chosen": -0.94921875, "rewards/margins": 0.77734375, "rewards/rejected": -1.7265625, "step": 596 }, { "epoch": 1.249607535321821, "grad_norm": 10.762384414672852, "learning_rate": 3.3018609211827606e-07, "logits/chosen": 2.578125, "logits/rejected": 2.296875, "logps/chosen": -440.0, "logps/rejected": -704.0, "loss": 0.5725, "rewards/accuracies": 0.75, "rewards/chosen": -1.203125, "rewards/margins": 0.396484375, "rewards/rejected": -1.6015625, "step": 597 }, { "epoch": 1.251700680272109, "grad_norm": 10.538718223571777, "learning_rate": 3.296472566697914e-07, "logits/chosen": 1.90625, "logits/rejected": 2.65625, "logps/chosen": -454.0, "logps/rejected": -294.0, "loss": 0.6096, "rewards/accuracies": 0.75, "rewards/chosen": -1.1875, "rewards/margins": 0.1826171875, "rewards/rejected": -1.3671875, "step": 598 }, { "epoch": 1.2537938252223966, "grad_norm": 10.4353609085083, "learning_rate": 3.291080091743762e-07, "logits/chosen": 1.703125, "logits/rejected": 3.234375, "logps/chosen": -656.0, "logps/rejected": -426.0, "loss": 0.5777, "rewards/accuracies": 0.75, "rewards/chosen": -1.109375, "rewards/margins": 0.2470703125, "rewards/rejected": -1.359375, "step": 599 }, { "epoch": 1.2558869701726845, "grad_norm": 11.074726104736328, "learning_rate": 3.2856835242264825e-07, "logits/chosen": 2.140625, "logits/rejected": 1.7578125, "logps/chosen": -458.0, "logps/rejected": -416.0, "loss": 0.6433, "rewards/accuracies": 0.5, "rewards/chosen": -0.9140625, "rewards/margins": 0.30078125, "rewards/rejected": -1.2109375, "step": 600 }, { "epoch": 1.2579801151229724, "grad_norm": 11.208061218261719, "learning_rate": 3.2802828920734297e-07, "logits/chosen": 1.7421875, "logits/rejected": 1.953125, "logps/chosen": -450.0, "logps/rejected": -464.0, "loss": 0.6085, "rewards/accuracies": 0.75, "rewards/chosen": -0.9609375, "rewards/margins": 0.494140625, "rewards/rejected": -1.453125, "step": 601 }, { "epoch": 1.26007326007326, "grad_norm": 10.752561569213867, "learning_rate": 3.274878223232996e-07, "logits/chosen": 2.59375, "logits/rejected": 2.671875, "logps/chosen": -364.0, "logps/rejected": -266.0, "loss": 0.6114, "rewards/accuracies": 0.5, "rewards/chosen": -1.25, "rewards/margins": 0.0263671875, "rewards/rejected": -1.28125, "step": 602 }, { "epoch": 1.262166405023548, "grad_norm": 10.955470085144043, "learning_rate": 3.269469545674459e-07, "logits/chosen": 1.359375, "logits/rejected": 2.09375, "logps/chosen": -494.0, "logps/rejected": -372.0, "loss": 0.6107, "rewards/accuracies": 0.75, "rewards/chosen": -1.1171875, "rewards/margins": 0.58203125, "rewards/rejected": -1.703125, "step": 603 }, { "epoch": 1.2642595499738356, "grad_norm": 12.255962371826172, "learning_rate": 3.2640568873878457e-07, "logits/chosen": 1.7578125, "logits/rejected": 2.34375, "logps/chosen": -540.0, "logps/rejected": -412.0, "loss": 0.6545, "rewards/accuracies": 0.25, "rewards/chosen": -1.0234375, "rewards/margins": -0.0986328125, "rewards/rejected": -0.92578125, "step": 604 }, { "epoch": 1.2663526949241235, "grad_norm": 10.650092124938965, "learning_rate": 3.258640276383781e-07, "logits/chosen": 1.515625, "logits/rejected": 1.359375, "logps/chosen": -224.0, "logps/rejected": -280.0, "loss": 0.6096, "rewards/accuracies": 0.75, "rewards/chosen": -1.1015625, "rewards/margins": 0.1435546875, "rewards/rejected": -1.2421875, "step": 605 }, { "epoch": 1.2684458398744112, "grad_norm": 10.870160102844238, "learning_rate": 3.2532197406933475e-07, "logits/chosen": 2.140625, "logits/rejected": 2.90625, "logps/chosen": -560.0, "logps/rejected": -472.0, "loss": 0.5933, "rewards/accuracies": 0.5, "rewards/chosen": -1.046875, "rewards/margins": 0.328125, "rewards/rejected": -1.375, "step": 606 }, { "epoch": 1.270538984824699, "grad_norm": 11.42349910736084, "learning_rate": 3.247795308367936e-07, "logits/chosen": 2.4375, "logits/rejected": 2.59375, "logps/chosen": -376.0, "logps/rejected": -320.0, "loss": 0.6287, "rewards/accuracies": 0.25, "rewards/chosen": -1.5390625, "rewards/margins": -0.2255859375, "rewards/rejected": -1.3125, "step": 607 }, { "epoch": 1.272632129774987, "grad_norm": 11.02907943725586, "learning_rate": 3.242367007479103e-07, "logits/chosen": 3.03125, "logits/rejected": 3.21875, "logps/chosen": -492.0, "logps/rejected": -548.0, "loss": 0.6036, "rewards/accuracies": 1.0, "rewards/chosen": -0.9140625, "rewards/margins": 0.70703125, "rewards/rejected": -1.6171875, "step": 608 }, { "epoch": 1.2747252747252746, "grad_norm": 10.953951835632324, "learning_rate": 3.2369348661184234e-07, "logits/chosen": 1.34375, "logits/rejected": 1.5625, "logps/chosen": -384.0, "logps/rejected": -372.0, "loss": 0.5955, "rewards/accuracies": 0.75, "rewards/chosen": -1.3125, "rewards/margins": 0.6953125, "rewards/rejected": -2.0, "step": 609 }, { "epoch": 1.2768184196755625, "grad_norm": 10.425783157348633, "learning_rate": 3.2314989123973505e-07, "logits/chosen": 1.703125, "logits/rejected": 1.78125, "logps/chosen": -234.0, "logps/rejected": -330.0, "loss": 0.6275, "rewards/accuracies": 0.75, "rewards/chosen": -1.109375, "rewards/margins": 0.0361328125, "rewards/rejected": -1.1484375, "step": 610 }, { "epoch": 1.2789115646258504, "grad_norm": 9.83671760559082, "learning_rate": 3.2260591744470634e-07, "logits/chosen": 2.34375, "logits/rejected": 1.71875, "logps/chosen": -488.0, "logps/rejected": -544.0, "loss": 0.5903, "rewards/accuracies": 1.0, "rewards/chosen": -0.6328125, "rewards/margins": 0.484375, "rewards/rejected": -1.1171875, "step": 611 }, { "epoch": 1.281004709576138, "grad_norm": 11.166946411132812, "learning_rate": 3.2206156804183277e-07, "logits/chosen": 1.5625, "logits/rejected": 1.7734375, "logps/chosen": -308.0, "logps/rejected": -308.0, "loss": 0.5882, "rewards/accuracies": 1.0, "rewards/chosen": -1.140625, "rewards/margins": 0.68359375, "rewards/rejected": -1.828125, "step": 612 }, { "epoch": 1.283097854526426, "grad_norm": 11.616917610168457, "learning_rate": 3.2151684584813417e-07, "logits/chosen": 1.78125, "logits/rejected": 1.3984375, "logps/chosen": -252.0, "logps/rejected": -304.0, "loss": 0.6398, "rewards/accuracies": 0.5, "rewards/chosen": -1.3125, "rewards/margins": -0.017578125, "rewards/rejected": -1.296875, "step": 613 }, { "epoch": 1.285190999476714, "grad_norm": 10.931349754333496, "learning_rate": 3.2097175368256006e-07, "logits/chosen": 2.203125, "logits/rejected": 2.546875, "logps/chosen": -512.0, "logps/rejected": -444.0, "loss": 0.5923, "rewards/accuracies": 0.75, "rewards/chosen": -1.1171875, "rewards/margins": 0.453125, "rewards/rejected": -1.5703125, "step": 614 }, { "epoch": 1.2872841444270016, "grad_norm": 10.645867347717285, "learning_rate": 3.204262943659744e-07, "logits/chosen": 2.46875, "logits/rejected": 3.015625, "logps/chosen": -664.0, "logps/rejected": -576.0, "loss": 0.5853, "rewards/accuracies": 0.5, "rewards/chosen": -1.4609375, "rewards/margins": -0.390625, "rewards/rejected": -1.0703125, "step": 615 }, { "epoch": 1.2893772893772895, "grad_norm": 10.39663028717041, "learning_rate": 3.1988047072114097e-07, "logits/chosen": 2.421875, "logits/rejected": 2.234375, "logps/chosen": -466.0, "logps/rejected": -736.0, "loss": 0.592, "rewards/accuracies": 0.75, "rewards/chosen": -1.0, "rewards/margins": 0.130859375, "rewards/rejected": -1.1328125, "step": 616 }, { "epoch": 1.2914704343275771, "grad_norm": 10.462268829345703, "learning_rate": 3.193342855727095e-07, "logits/chosen": 1.8515625, "logits/rejected": 2.53125, "logps/chosen": -460.0, "logps/rejected": -452.0, "loss": 0.5816, "rewards/accuracies": 0.5, "rewards/chosen": -0.921875, "rewards/margins": 0.1083984375, "rewards/rejected": -1.03125, "step": 617 }, { "epoch": 1.293563579277865, "grad_norm": 10.846890449523926, "learning_rate": 3.187877417471998e-07, "logits/chosen": 2.03125, "logits/rejected": 1.9765625, "logps/chosen": -211.0, "logps/rejected": -308.0, "loss": 0.6014, "rewards/accuracies": 0.5, "rewards/chosen": -0.84375, "rewards/margins": 0.3828125, "rewards/rejected": -1.2265625, "step": 618 }, { "epoch": 1.2956567242281527, "grad_norm": 11.082700729370117, "learning_rate": 3.182408420729884e-07, "logits/chosen": 2.375, "logits/rejected": 2.640625, "logps/chosen": -424.0, "logps/rejected": -436.0, "loss": 0.6238, "rewards/accuracies": 1.0, "rewards/chosen": -1.328125, "rewards/margins": 0.341796875, "rewards/rejected": -1.671875, "step": 619 }, { "epoch": 1.2977498691784406, "grad_norm": 10.534208297729492, "learning_rate": 3.17693589380293e-07, "logits/chosen": 3.5, "logits/rejected": 2.609375, "logps/chosen": -444.0, "logps/rejected": -656.0, "loss": 0.6237, "rewards/accuracies": 0.25, "rewards/chosen": -1.3828125, "rewards/margins": -0.2451171875, "rewards/rejected": -1.1328125, "step": 620 }, { "epoch": 1.2998430141287285, "grad_norm": 12.856078147888184, "learning_rate": 3.1714598650115853e-07, "logits/chosen": 2.46875, "logits/rejected": 2.265625, "logps/chosen": -456.0, "logps/rejected": -604.0, "loss": 0.6824, "rewards/accuracies": 0.5, "rewards/chosen": -1.375, "rewards/margins": 0.26171875, "rewards/rejected": -1.640625, "step": 621 }, { "epoch": 1.3019361590790162, "grad_norm": 10.476717948913574, "learning_rate": 3.1659803626944175e-07, "logits/chosen": 1.3203125, "logits/rejected": 1.2734375, "logps/chosen": -248.0, "logps/rejected": -306.0, "loss": 0.6038, "rewards/accuracies": 0.75, "rewards/chosen": -1.0546875, "rewards/margins": 0.080078125, "rewards/rejected": -1.140625, "step": 622 }, { "epoch": 1.304029304029304, "grad_norm": 10.692028045654297, "learning_rate": 3.1604974152079724e-07, "logits/chosen": 1.0546875, "logits/rejected": 1.234375, "logps/chosen": -328.0, "logps/rejected": -388.0, "loss": 0.6181, "rewards/accuracies": 0.75, "rewards/chosen": -1.09375, "rewards/margins": 0.1826171875, "rewards/rejected": -1.28125, "step": 623 }, { "epoch": 1.306122448979592, "grad_norm": 10.150449752807617, "learning_rate": 3.155011050926624e-07, "logits/chosen": 1.796875, "logits/rejected": 1.9765625, "logps/chosen": -434.0, "logps/rejected": -304.0, "loss": 0.5883, "rewards/accuracies": 0.75, "rewards/chosen": -0.8828125, "rewards/margins": -0.0341796875, "rewards/rejected": -0.84765625, "step": 624 }, { "epoch": 1.3082155939298796, "grad_norm": 11.333098411560059, "learning_rate": 3.1495212982424283e-07, "logits/chosen": 1.359375, "logits/rejected": 2.15625, "logps/chosen": -540.0, "logps/rejected": -342.0, "loss": 0.6208, "rewards/accuracies": 0.75, "rewards/chosen": -2.0625, "rewards/margins": -0.3828125, "rewards/rejected": -1.6875, "step": 625 }, { "epoch": 1.3103087388801675, "grad_norm": 10.467708587646484, "learning_rate": 3.1440281855649764e-07, "logits/chosen": 2.640625, "logits/rejected": 2.296875, "logps/chosen": -520.0, "logps/rejected": -528.0, "loss": 0.5716, "rewards/accuracies": 0.5, "rewards/chosen": -1.3671875, "rewards/margins": -0.056640625, "rewards/rejected": -1.3125, "step": 626 }, { "epoch": 1.3124018838304552, "grad_norm": 10.712902069091797, "learning_rate": 3.138531741321246e-07, "logits/chosen": 2.078125, "logits/rejected": 1.734375, "logps/chosen": -312.0, "logps/rejected": -600.0, "loss": 0.577, "rewards/accuracies": 1.0, "rewards/chosen": -0.91796875, "rewards/margins": 0.2109375, "rewards/rejected": -1.125, "step": 627 }, { "epoch": 1.314495028780743, "grad_norm": 10.024105072021484, "learning_rate": 3.1330319939554585e-07, "logits/chosen": 0.46875, "logits/rejected": 0.92578125, "logps/chosen": -296.0, "logps/rejected": -364.0, "loss": 0.5768, "rewards/accuracies": 0.75, "rewards/chosen": -1.1015625, "rewards/margins": 0.1025390625, "rewards/rejected": -1.203125, "step": 628 }, { "epoch": 1.3165881737310308, "grad_norm": 11.217373847961426, "learning_rate": 3.1275289719289266e-07, "logits/chosen": 2.59375, "logits/rejected": 3.640625, "logps/chosen": -944.0, "logps/rejected": -416.0, "loss": 0.6388, "rewards/accuracies": 0.75, "rewards/chosen": -0.828125, "rewards/margins": 0.453125, "rewards/rejected": -1.28125, "step": 629 }, { "epoch": 1.3186813186813187, "grad_norm": 10.900577545166016, "learning_rate": 3.122022703719912e-07, "logits/chosen": 2.1875, "logits/rejected": 2.484375, "logps/chosen": -476.0, "logps/rejected": -506.0, "loss": 0.6337, "rewards/accuracies": 0.0, "rewards/chosen": -1.6171875, "rewards/margins": -0.7265625, "rewards/rejected": -0.890625, "step": 630 }, { "epoch": 1.3207744636316066, "grad_norm": 10.464300155639648, "learning_rate": 3.116513217823471e-07, "logits/chosen": 2.390625, "logits/rejected": 3.21875, "logps/chosen": -612.0, "logps/rejected": -406.0, "loss": 0.5849, "rewards/accuracies": 1.0, "rewards/chosen": -1.1328125, "rewards/margins": 0.46875, "rewards/rejected": -1.6015625, "step": 631 }, { "epoch": 1.3228676085818942, "grad_norm": 10.725011825561523, "learning_rate": 3.111000542751317e-07, "logits/chosen": 1.0859375, "logits/rejected": 1.25, "logps/chosen": -568.0, "logps/rejected": -500.0, "loss": 0.6162, "rewards/accuracies": 0.75, "rewards/chosen": -1.1875, "rewards/margins": 0.439453125, "rewards/rejected": -1.625, "step": 632 }, { "epoch": 1.3249607535321821, "grad_norm": 10.770613670349121, "learning_rate": 3.105484707031663e-07, "logits/chosen": 1.3125, "logits/rejected": 1.6171875, "logps/chosen": -442.0, "logps/rejected": -392.0, "loss": 0.6139, "rewards/accuracies": 0.25, "rewards/chosen": -1.546875, "rewards/margins": -0.4375, "rewards/rejected": -1.1015625, "step": 633 }, { "epoch": 1.32705389848247, "grad_norm": 11.20041561126709, "learning_rate": 3.0999657392090826e-07, "logits/chosen": 3.21875, "logits/rejected": 2.515625, "logps/chosen": -536.0, "logps/rejected": -688.0, "loss": 0.6004, "rewards/accuracies": 0.5, "rewards/chosen": -0.8203125, "rewards/margins": -0.03271484375, "rewards/rejected": -0.7890625, "step": 634 }, { "epoch": 1.3291470434327577, "grad_norm": 10.969812393188477, "learning_rate": 3.0944436678443526e-07, "logits/chosen": 1.625, "logits/rejected": 2.75, "logps/chosen": -284.0, "logps/rejected": -392.0, "loss": 0.6037, "rewards/accuracies": 0.75, "rewards/chosen": -1.0625, "rewards/margins": 0.078125, "rewards/rejected": -1.140625, "step": 635 }, { "epoch": 1.3312401883830456, "grad_norm": 12.59915828704834, "learning_rate": 3.088918521514317e-07, "logits/chosen": 1.8671875, "logits/rejected": 1.46875, "logps/chosen": -324.0, "logps/rejected": -368.0, "loss": 0.6147, "rewards/accuracies": 0.5, "rewards/chosen": -1.015625, "rewards/margins": -0.044921875, "rewards/rejected": -0.96875, "step": 636 }, { "epoch": 1.3333333333333333, "grad_norm": 12.617871284484863, "learning_rate": 3.083390328811726e-07, "logits/chosen": 2.125, "logits/rejected": 2.84375, "logps/chosen": -398.0, "logps/rejected": -328.0, "loss": 0.6532, "rewards/accuracies": 0.5, "rewards/chosen": -1.34375, "rewards/margins": 0.294921875, "rewards/rejected": -1.640625, "step": 637 }, { "epoch": 1.3354264782836212, "grad_norm": 11.845966339111328, "learning_rate": 3.077859118345102e-07, "logits/chosen": 1.59375, "logits/rejected": 2.5625, "logps/chosen": -388.0, "logps/rejected": -251.0, "loss": 0.6508, "rewards/accuracies": 0.5, "rewards/chosen": -1.28125, "rewards/margins": -0.1728515625, "rewards/rejected": -1.109375, "step": 638 }, { "epoch": 1.3375196232339088, "grad_norm": 11.422259330749512, "learning_rate": 3.072324918738579e-07, "logits/chosen": 1.8359375, "logits/rejected": 2.046875, "logps/chosen": -390.0, "logps/rejected": -414.0, "loss": 0.6063, "rewards/accuracies": 0.75, "rewards/chosen": -0.95703125, "rewards/margins": 0.625, "rewards/rejected": -1.578125, "step": 639 }, { "epoch": 1.3396127681841967, "grad_norm": 9.103655815124512, "learning_rate": 3.066787758631763e-07, "logits/chosen": 1.8984375, "logits/rejected": 2.234375, "logps/chosen": -528.0, "logps/rejected": -428.0, "loss": 0.5832, "rewards/accuracies": 0.25, "rewards/chosen": -1.59375, "rewards/margins": -0.32421875, "rewards/rejected": -1.2734375, "step": 640 }, { "epoch": 1.3417059131344846, "grad_norm": 11.316737174987793, "learning_rate": 3.0612476666795776e-07, "logits/chosen": 1.421875, "logits/rejected": 1.1484375, "logps/chosen": -368.0, "logps/rejected": -556.0, "loss": 0.6204, "rewards/accuracies": 1.0, "rewards/chosen": -1.1328125, "rewards/margins": 0.6171875, "rewards/rejected": -1.75, "step": 641 }, { "epoch": 1.3437990580847723, "grad_norm": 11.12604808807373, "learning_rate": 3.055704671552122e-07, "logits/chosen": 2.0625, "logits/rejected": 2.421875, "logps/chosen": -456.0, "logps/rejected": -362.0, "loss": 0.5931, "rewards/accuracies": 0.25, "rewards/chosen": -1.3671875, "rewards/margins": -0.30859375, "rewards/rejected": -1.0546875, "step": 642 }, { "epoch": 1.3458922030350602, "grad_norm": 11.656798362731934, "learning_rate": 3.0501588019345174e-07, "logits/chosen": 2.25, "logits/rejected": 2.90625, "logps/chosen": -502.0, "logps/rejected": -408.0, "loss": 0.6395, "rewards/accuracies": 0.75, "rewards/chosen": -1.234375, "rewards/margins": 0.1474609375, "rewards/rejected": -1.375, "step": 643 }, { "epoch": 1.347985347985348, "grad_norm": 10.157209396362305, "learning_rate": 3.0446100865267617e-07, "logits/chosen": 2.3125, "logits/rejected": 2.21875, "logps/chosen": -516.0, "logps/rejected": -704.0, "loss": 0.5799, "rewards/accuracies": 0.5, "rewards/chosen": -1.3125, "rewards/margins": 0.337890625, "rewards/rejected": -1.6484375, "step": 644 }, { "epoch": 1.3500784929356358, "grad_norm": 10.407575607299805, "learning_rate": 3.039058554043579e-07, "logits/chosen": 2.046875, "logits/rejected": 2.75, "logps/chosen": -482.0, "logps/rejected": -474.0, "loss": 0.5835, "rewards/accuracies": 1.0, "rewards/chosen": -0.921875, "rewards/margins": 0.6171875, "rewards/rejected": -1.546875, "step": 645 }, { "epoch": 1.3521716378859236, "grad_norm": 10.747139930725098, "learning_rate": 3.0335042332142706e-07, "logits/chosen": 1.609375, "logits/rejected": 1.5390625, "logps/chosen": -372.0, "logps/rejected": -227.0, "loss": 0.6214, "rewards/accuracies": 0.25, "rewards/chosen": -1.515625, "rewards/margins": -0.322265625, "rewards/rejected": -1.1875, "step": 646 }, { "epoch": 1.3542647828362115, "grad_norm": 10.498771667480469, "learning_rate": 3.0279471527825713e-07, "logits/chosen": 2.0625, "logits/rejected": 1.9140625, "logps/chosen": -412.0, "logps/rejected": -512.0, "loss": 0.587, "rewards/accuracies": 1.0, "rewards/chosen": -1.1171875, "rewards/margins": 0.33203125, "rewards/rejected": -1.453125, "step": 647 }, { "epoch": 1.3563579277864992, "grad_norm": 10.430938720703125, "learning_rate": 3.022387341506493e-07, "logits/chosen": 2.53125, "logits/rejected": 2.6875, "logps/chosen": -612.0, "logps/rejected": -704.0, "loss": 0.6009, "rewards/accuracies": 0.5, "rewards/chosen": -1.953125, "rewards/margins": -0.43359375, "rewards/rejected": -1.5234375, "step": 648 }, { "epoch": 1.358451072736787, "grad_norm": 10.411450386047363, "learning_rate": 3.016824828158182e-07, "logits/chosen": 1.90625, "logits/rejected": 2.765625, "logps/chosen": -320.0, "logps/rejected": -362.0, "loss": 0.5785, "rewards/accuracies": 1.0, "rewards/chosen": -0.6953125, "rewards/margins": 0.8515625, "rewards/rejected": -1.546875, "step": 649 }, { "epoch": 1.3605442176870748, "grad_norm": 10.431605339050293, "learning_rate": 3.0112596415237685e-07, "logits/chosen": 1.5, "logits/rejected": 1.5390625, "logps/chosen": -440.0, "logps/rejected": -506.0, "loss": 0.5886, "rewards/accuracies": 0.75, "rewards/chosen": -0.98046875, "rewards/margins": 0.53515625, "rewards/rejected": -1.515625, "step": 650 }, { "epoch": 1.3626373626373627, "grad_norm": 10.654319763183594, "learning_rate": 3.0056918104032135e-07, "logits/chosen": 1.2265625, "logits/rejected": 1.140625, "logps/chosen": -253.0, "logps/rejected": -406.0, "loss": 0.5936, "rewards/accuracies": 0.75, "rewards/chosen": -1.1484375, "rewards/margins": 0.31640625, "rewards/rejected": -1.46875, "step": 651 }, { "epoch": 1.3647305075876504, "grad_norm": 11.069323539733887, "learning_rate": 3.000121363610167e-07, "logits/chosen": 1.640625, "logits/rejected": 2.0625, "logps/chosen": -253.0, "logps/rejected": -231.0, "loss": 0.616, "rewards/accuracies": 0.5, "rewards/chosen": -0.9609375, "rewards/margins": 0.0654296875, "rewards/rejected": -1.0234375, "step": 652 }, { "epoch": 1.3668236525379382, "grad_norm": 11.488618850708008, "learning_rate": 2.994548329971814e-07, "logits/chosen": 1.640625, "logits/rejected": 2.828125, "logps/chosen": -620.0, "logps/rejected": -424.0, "loss": 0.6375, "rewards/accuracies": 0.5, "rewards/chosen": -0.88671875, "rewards/margins": 0.1123046875, "rewards/rejected": -1.0, "step": 653 }, { "epoch": 1.3689167974882261, "grad_norm": 10.386863708496094, "learning_rate": 2.988972738328724e-07, "logits/chosen": 1.828125, "logits/rejected": 2.21875, "logps/chosen": -502.0, "logps/rejected": -322.0, "loss": 0.6062, "rewards/accuracies": 0.5, "rewards/chosen": -1.5, "rewards/margins": -0.052734375, "rewards/rejected": -1.4453125, "step": 654 }, { "epoch": 1.3710099424385138, "grad_norm": 11.457762718200684, "learning_rate": 2.98339461753471e-07, "logits/chosen": 2.984375, "logits/rejected": 2.796875, "logps/chosen": -540.0, "logps/rejected": -438.0, "loss": 0.5958, "rewards/accuracies": 1.0, "rewards/chosen": -0.84375, "rewards/margins": 0.52734375, "rewards/rejected": -1.375, "step": 655 }, { "epoch": 1.3731030873888017, "grad_norm": 10.398492813110352, "learning_rate": 2.9778139964566675e-07, "logits/chosen": 2.546875, "logits/rejected": 2.96875, "logps/chosen": -672.0, "logps/rejected": -684.0, "loss": 0.5755, "rewards/accuracies": 1.0, "rewards/chosen": -1.328125, "rewards/margins": 0.375, "rewards/rejected": -1.703125, "step": 656 }, { "epoch": 1.3751962323390896, "grad_norm": 11.246747970581055, "learning_rate": 2.972230903974433e-07, "logits/chosen": 2.078125, "logits/rejected": 1.984375, "logps/chosen": -394.0, "logps/rejected": -366.0, "loss": 0.6048, "rewards/accuracies": 1.0, "rewards/chosen": -0.76171875, "rewards/margins": 0.2734375, "rewards/rejected": -1.03125, "step": 657 }, { "epoch": 1.3772893772893773, "grad_norm": 10.025164604187012, "learning_rate": 2.9666453689806345e-07, "logits/chosen": 1.6640625, "logits/rejected": 1.78125, "logps/chosen": -438.0, "logps/rejected": -302.0, "loss": 0.6108, "rewards/accuracies": 1.0, "rewards/chosen": -0.76953125, "rewards/margins": 0.455078125, "rewards/rejected": -1.2265625, "step": 658 }, { "epoch": 1.3793825222396652, "grad_norm": 10.379528045654297, "learning_rate": 2.961057420380538e-07, "logits/chosen": 2.546875, "logits/rejected": 2.4375, "logps/chosen": -904.0, "logps/rejected": -712.0, "loss": 0.5591, "rewards/accuracies": 0.75, "rewards/chosen": -1.1328125, "rewards/margins": 0.40625, "rewards/rejected": -1.5390625, "step": 659 }, { "epoch": 1.3814756671899528, "grad_norm": 10.235441207885742, "learning_rate": 2.9554670870919e-07, "logits/chosen": 2.21875, "logits/rejected": 2.765625, "logps/chosen": -354.0, "logps/rejected": -380.0, "loss": 0.569, "rewards/accuracies": 0.25, "rewards/chosen": -1.2109375, "rewards/margins": -0.0458984375, "rewards/rejected": -1.1640625, "step": 660 }, { "epoch": 1.3835688121402407, "grad_norm": 10.565560340881348, "learning_rate": 2.949874398044818e-07, "logits/chosen": 1.75, "logits/rejected": 1.5546875, "logps/chosen": -510.0, "logps/rejected": -556.0, "loss": 0.6289, "rewards/accuracies": 1.0, "rewards/chosen": -1.5234375, "rewards/margins": 0.40234375, "rewards/rejected": -1.9296875, "step": 661 }, { "epoch": 1.3856619570905284, "grad_norm": 9.682659149169922, "learning_rate": 2.944279382181582e-07, "logits/chosen": 2.625, "logits/rejected": 2.6875, "logps/chosen": -532.0, "logps/rejected": -408.0, "loss": 0.5742, "rewards/accuracies": 1.0, "rewards/chosen": -0.97265625, "rewards/margins": 0.396484375, "rewards/rejected": -1.3671875, "step": 662 }, { "epoch": 1.3877551020408163, "grad_norm": 10.06523609161377, "learning_rate": 2.938682068456522e-07, "logits/chosen": 1.859375, "logits/rejected": 2.09375, "logps/chosen": -406.0, "logps/rejected": -432.0, "loss": 0.5693, "rewards/accuracies": 0.75, "rewards/chosen": -1.1328125, "rewards/margins": 0.5234375, "rewards/rejected": -1.65625, "step": 663 }, { "epoch": 1.3898482469911042, "grad_norm": 11.815163612365723, "learning_rate": 2.9330824858358587e-07, "logits/chosen": 2.0, "logits/rejected": 2.28125, "logps/chosen": -376.0, "logps/rejected": -362.0, "loss": 0.6123, "rewards/accuracies": 0.5, "rewards/chosen": -1.3046875, "rewards/margins": 0.14453125, "rewards/rejected": -1.453125, "step": 664 }, { "epoch": 1.3919413919413919, "grad_norm": 11.66609001159668, "learning_rate": 2.9274806632975575e-07, "logits/chosen": 2.390625, "logits/rejected": 2.515625, "logps/chosen": -414.0, "logps/rejected": -496.0, "loss": 0.6305, "rewards/accuracies": 0.5, "rewards/chosen": -0.80859375, "rewards/margins": 0.515625, "rewards/rejected": -1.3203125, "step": 665 }, { "epoch": 1.3940345368916798, "grad_norm": 10.703742027282715, "learning_rate": 2.92187662983117e-07, "logits/chosen": 2.75, "logits/rejected": 3.0, "logps/chosen": -588.0, "logps/rejected": -520.0, "loss": 0.6147, "rewards/accuracies": 1.0, "rewards/chosen": -0.796875, "rewards/margins": 0.59375, "rewards/rejected": -1.390625, "step": 666 }, { "epoch": 1.3961276818419677, "grad_norm": 10.139862060546875, "learning_rate": 2.916270414437696e-07, "logits/chosen": 2.015625, "logits/rejected": 2.15625, "logps/chosen": -458.0, "logps/rejected": -428.0, "loss": 0.586, "rewards/accuracies": 0.0, "rewards/chosen": -1.203125, "rewards/margins": -0.15625, "rewards/rejected": -1.046875, "step": 667 }, { "epoch": 1.3982208267922553, "grad_norm": 10.547820091247559, "learning_rate": 2.9106620461294223e-07, "logits/chosen": 1.53125, "logits/rejected": 1.265625, "logps/chosen": -249.0, "logps/rejected": -484.0, "loss": 0.602, "rewards/accuracies": 0.75, "rewards/chosen": -1.1796875, "rewards/margins": 0.470703125, "rewards/rejected": -1.65625, "step": 668 }, { "epoch": 1.4003139717425432, "grad_norm": 10.163110733032227, "learning_rate": 2.905051553929778e-07, "logits/chosen": 1.828125, "logits/rejected": 2.578125, "logps/chosen": -760.0, "logps/rejected": -424.0, "loss": 0.5665, "rewards/accuracies": 0.75, "rewards/chosen": -0.83984375, "rewards/margins": 0.359375, "rewards/rejected": -1.1953125, "step": 669 }, { "epoch": 1.402407116692831, "grad_norm": 10.655150413513184, "learning_rate": 2.899438966873183e-07, "logits/chosen": 2.375, "logits/rejected": 2.03125, "logps/chosen": -382.0, "logps/rejected": -540.0, "loss": 0.626, "rewards/accuracies": 0.5, "rewards/chosen": -1.40625, "rewards/margins": -0.1181640625, "rewards/rejected": -1.2890625, "step": 670 }, { "epoch": 1.4045002616431188, "grad_norm": 10.15519905090332, "learning_rate": 2.8938243140049003e-07, "logits/chosen": 0.87109375, "logits/rejected": 1.0859375, "logps/chosen": -200.0, "logps/rejected": -216.0, "loss": 0.5746, "rewards/accuracies": 0.5, "rewards/chosen": -0.91796875, "rewards/margins": 0.20703125, "rewards/rejected": -1.125, "step": 671 }, { "epoch": 1.4065934065934065, "grad_norm": 11.310110092163086, "learning_rate": 2.8882076243808817e-07, "logits/chosen": 1.859375, "logits/rejected": 2.59375, "logps/chosen": -652.0, "logps/rejected": -532.0, "loss": 0.5545, "rewards/accuracies": 0.5, "rewards/chosen": -1.4140625, "rewards/margins": 0.46484375, "rewards/rejected": -1.8828125, "step": 672 }, { "epoch": 1.4086865515436944, "grad_norm": 10.634568214416504, "learning_rate": 2.8825889270676193e-07, "logits/chosen": 1.4765625, "logits/rejected": 1.578125, "logps/chosen": -251.0, "logps/rejected": -306.0, "loss": 0.6162, "rewards/accuracies": 0.75, "rewards/chosen": -1.171875, "rewards/margins": 0.234375, "rewards/rejected": -1.40625, "step": 673 }, { "epoch": 1.4107796964939823, "grad_norm": 10.30864429473877, "learning_rate": 2.8769682511419946e-07, "logits/chosen": 2.5625, "logits/rejected": 2.96875, "logps/chosen": -564.0, "logps/rejected": -436.0, "loss": 0.6162, "rewards/accuracies": 1.0, "rewards/chosen": -1.0234375, "rewards/margins": 0.4140625, "rewards/rejected": -1.4375, "step": 674 }, { "epoch": 1.41287284144427, "grad_norm": 10.896045684814453, "learning_rate": 2.8713456256911306e-07, "logits/chosen": 2.84375, "logits/rejected": 1.96875, "logps/chosen": -596.0, "logps/rejected": -748.0, "loss": 0.576, "rewards/accuracies": 1.0, "rewards/chosen": -1.109375, "rewards/margins": 0.462890625, "rewards/rejected": -1.5703125, "step": 675 }, { "epoch": 1.4149659863945578, "grad_norm": 10.742579460144043, "learning_rate": 2.8657210798122374e-07, "logits/chosen": 2.53125, "logits/rejected": 2.4375, "logps/chosen": -752.0, "logps/rejected": -628.0, "loss": 0.5836, "rewards/accuracies": 0.5, "rewards/chosen": -1.234375, "rewards/margins": 0.59375, "rewards/rejected": -1.828125, "step": 676 }, { "epoch": 1.4170591313448457, "grad_norm": 10.400110244750977, "learning_rate": 2.860094642612463e-07, "logits/chosen": 1.875, "logits/rejected": 1.5859375, "logps/chosen": -520.0, "logps/rejected": -482.0, "loss": 0.5986, "rewards/accuracies": 0.25, "rewards/chosen": -1.3984375, "rewards/margins": -0.35546875, "rewards/rejected": -1.046875, "step": 677 }, { "epoch": 1.4191522762951334, "grad_norm": 10.457825660705566, "learning_rate": 2.854466343208745e-07, "logits/chosen": 1.640625, "logits/rejected": 2.5, "logps/chosen": -600.0, "logps/rejected": -384.0, "loss": 0.5562, "rewards/accuracies": 0.75, "rewards/chosen": -1.21875, "rewards/margins": 0.17578125, "rewards/rejected": -1.3984375, "step": 678 }, { "epoch": 1.4212454212454213, "grad_norm": 9.994818687438965, "learning_rate": 2.848836210727655e-07, "logits/chosen": 1.796875, "logits/rejected": 1.4296875, "logps/chosen": -414.0, "logps/rejected": -426.0, "loss": 0.5831, "rewards/accuracies": 0.25, "rewards/chosen": -1.203125, "rewards/margins": 0.1728515625, "rewards/rejected": -1.3828125, "step": 679 }, { "epoch": 1.423338566195709, "grad_norm": 11.893540382385254, "learning_rate": 2.843204274305253e-07, "logits/chosen": 2.328125, "logits/rejected": 2.8125, "logps/chosen": -576.0, "logps/rejected": -450.0, "loss": 0.645, "rewards/accuracies": 0.5, "rewards/chosen": -1.1640625, "rewards/margins": 0.0302734375, "rewards/rejected": -1.1875, "step": 680 }, { "epoch": 1.4254317111459969, "grad_norm": 11.3270902633667, "learning_rate": 2.837570563086935e-07, "logits/chosen": 1.9375, "logits/rejected": 1.5, "logps/chosen": -249.0, "logps/rejected": -394.0, "loss": 0.6374, "rewards/accuracies": 0.75, "rewards/chosen": -1.234375, "rewards/margins": 0.28125, "rewards/rejected": -1.515625, "step": 681 }, { "epoch": 1.4275248560962845, "grad_norm": 11.636581420898438, "learning_rate": 2.8319351062272794e-07, "logits/chosen": 2.59375, "logits/rejected": 2.546875, "logps/chosen": -380.0, "logps/rejected": -520.0, "loss": 0.6472, "rewards/accuracies": 1.0, "rewards/chosen": -0.89453125, "rewards/margins": 0.6171875, "rewards/rejected": -1.515625, "step": 682 }, { "epoch": 1.4296180010465724, "grad_norm": 10.399662971496582, "learning_rate": 2.8262979328899004e-07, "logits/chosen": 1.9140625, "logits/rejected": 1.84375, "logps/chosen": -800.0, "logps/rejected": -716.0, "loss": 0.6063, "rewards/accuracies": 0.75, "rewards/chosen": -0.875, "rewards/margins": 0.328125, "rewards/rejected": -1.203125, "step": 683 }, { "epoch": 1.4317111459968603, "grad_norm": 11.006073951721191, "learning_rate": 2.820659072247294e-07, "logits/chosen": 1.796875, "logits/rejected": 1.9921875, "logps/chosen": -304.0, "logps/rejected": -352.0, "loss": 0.6032, "rewards/accuracies": 1.0, "rewards/chosen": -0.9609375, "rewards/margins": 0.2197265625, "rewards/rejected": -1.1796875, "step": 684 }, { "epoch": 1.433804290947148, "grad_norm": 11.377279281616211, "learning_rate": 2.8150185534806863e-07, "logits/chosen": 2.03125, "logits/rejected": 2.8125, "logps/chosen": -664.0, "logps/rejected": -344.0, "loss": 0.6, "rewards/accuracies": 0.75, "rewards/chosen": -1.3203125, "rewards/margins": 0.109375, "rewards/rejected": -1.4296875, "step": 685 }, { "epoch": 1.435897435897436, "grad_norm": 11.495552062988281, "learning_rate": 2.8093764057798885e-07, "logits/chosen": 2.765625, "logits/rejected": 3.203125, "logps/chosen": -980.0, "logps/rejected": -768.0, "loss": 0.6084, "rewards/accuracies": 0.75, "rewards/chosen": -1.5234375, "rewards/margins": 0.0234375, "rewards/rejected": -1.546875, "step": 686 }, { "epoch": 1.4379905808477238, "grad_norm": 11.461719512939453, "learning_rate": 2.803732658343138e-07, "logits/chosen": 2.34375, "logits/rejected": 2.984375, "logps/chosen": -478.0, "logps/rejected": -452.0, "loss": 0.5996, "rewards/accuracies": 0.75, "rewards/chosen": -1.1171875, "rewards/margins": 0.2109375, "rewards/rejected": -1.328125, "step": 687 }, { "epoch": 1.4400837257980115, "grad_norm": 12.534832954406738, "learning_rate": 2.7980873403769506e-07, "logits/chosen": 3.078125, "logits/rejected": 3.09375, "logps/chosen": -948.0, "logps/rejected": -548.0, "loss": 0.6669, "rewards/accuracies": 1.0, "rewards/chosen": -0.890625, "rewards/margins": 1.0859375, "rewards/rejected": -1.9765625, "step": 688 }, { "epoch": 1.4421768707482994, "grad_norm": 10.652071952819824, "learning_rate": 2.792440481095974e-07, "logits/chosen": 2.21875, "logits/rejected": 2.046875, "logps/chosen": -286.0, "logps/rejected": -532.0, "loss": 0.5648, "rewards/accuracies": 1.0, "rewards/chosen": -0.9609375, "rewards/margins": 0.443359375, "rewards/rejected": -1.40625, "step": 689 }, { "epoch": 1.4442700156985873, "grad_norm": 11.058365821838379, "learning_rate": 2.786792109722827e-07, "logits/chosen": 1.9375, "logits/rejected": 2.40625, "logps/chosen": -540.0, "logps/rejected": -446.0, "loss": 0.5799, "rewards/accuracies": 0.75, "rewards/chosen": -1.03125, "rewards/margins": 0.294921875, "rewards/rejected": -1.328125, "step": 690 }, { "epoch": 1.446363160648875, "grad_norm": 10.38504695892334, "learning_rate": 2.7811422554879563e-07, "logits/chosen": 2.59375, "logits/rejected": 2.984375, "logps/chosen": -1072.0, "logps/rejected": -688.0, "loss": 0.6002, "rewards/accuracies": 1.0, "rewards/chosen": -0.828125, "rewards/margins": 1.1640625, "rewards/rejected": -1.9921875, "step": 691 }, { "epoch": 1.4484563055991626, "grad_norm": 10.090402603149414, "learning_rate": 2.7754909476294824e-07, "logits/chosen": 2.765625, "logits/rejected": 2.78125, "logps/chosen": -592.0, "logps/rejected": -612.0, "loss": 0.6002, "rewards/accuracies": 0.5, "rewards/chosen": -0.984375, "rewards/margins": 0.1787109375, "rewards/rejected": -1.1640625, "step": 692 }, { "epoch": 1.4505494505494505, "grad_norm": 10.569539070129395, "learning_rate": 2.769838215393047e-07, "logits/chosen": 1.8515625, "logits/rejected": 2.625, "logps/chosen": -498.0, "logps/rejected": -552.0, "loss": 0.6024, "rewards/accuracies": 1.0, "rewards/chosen": -1.0625, "rewards/margins": 0.734375, "rewards/rejected": -1.796875, "step": 693 }, { "epoch": 1.4526425954997384, "grad_norm": 11.858864784240723, "learning_rate": 2.7641840880316647e-07, "logits/chosen": 1.5625, "logits/rejected": 1.4140625, "logps/chosen": -239.0, "logps/rejected": -350.0, "loss": 0.6419, "rewards/accuracies": 0.5, "rewards/chosen": -0.890625, "rewards/margins": 0.26953125, "rewards/rejected": -1.15625, "step": 694 }, { "epoch": 1.454735740450026, "grad_norm": 11.208900451660156, "learning_rate": 2.758528594805568e-07, "logits/chosen": 1.6484375, "logits/rejected": 1.8203125, "logps/chosen": -428.0, "logps/rejected": -512.0, "loss": 0.6163, "rewards/accuracies": 0.5, "rewards/chosen": -1.3984375, "rewards/margins": 0.2333984375, "rewards/rejected": -1.6328125, "step": 695 }, { "epoch": 1.456828885400314, "grad_norm": 10.76639461517334, "learning_rate": 2.7528717649820604e-07, "logits/chosen": 1.6171875, "logits/rejected": 2.140625, "logps/chosen": -400.0, "logps/rejected": -280.0, "loss": 0.5738, "rewards/accuracies": 0.75, "rewards/chosen": -1.1328125, "rewards/margins": 0.2265625, "rewards/rejected": -1.359375, "step": 696 }, { "epoch": 1.4589220303506019, "grad_norm": 11.675130844116211, "learning_rate": 2.7472136278353584e-07, "logits/chosen": 2.609375, "logits/rejected": 2.4375, "logps/chosen": -324.0, "logps/rejected": -624.0, "loss": 0.5779, "rewards/accuracies": 0.75, "rewards/chosen": -1.34375, "rewards/margins": 0.466796875, "rewards/rejected": -1.8125, "step": 697 }, { "epoch": 1.4610151753008895, "grad_norm": 11.958902359008789, "learning_rate": 2.741554212646449e-07, "logits/chosen": 2.359375, "logits/rejected": 2.40625, "logps/chosen": -612.0, "logps/rejected": -648.0, "loss": 0.6082, "rewards/accuracies": 0.5, "rewards/chosen": -1.3125, "rewards/margins": 0.00390625, "rewards/rejected": -1.3125, "step": 698 }, { "epoch": 1.4631083202511774, "grad_norm": 11.79665470123291, "learning_rate": 2.735893548702928e-07, "logits/chosen": 2.375, "logits/rejected": 2.0625, "logps/chosen": -344.0, "logps/rejected": -520.0, "loss": 0.6233, "rewards/accuracies": 1.0, "rewards/chosen": -0.99609375, "rewards/margins": 0.431640625, "rewards/rejected": -1.4296875, "step": 699 }, { "epoch": 1.4652014652014653, "grad_norm": 10.444296836853027, "learning_rate": 2.730231665298857e-07, "logits/chosen": 2.328125, "logits/rejected": 2.8125, "logps/chosen": -536.0, "logps/rejected": -390.0, "loss": 0.6263, "rewards/accuracies": 0.75, "rewards/chosen": -1.078125, "rewards/margins": 0.25390625, "rewards/rejected": -1.328125, "step": 700 }, { "epoch": 1.467294610151753, "grad_norm": 10.155138969421387, "learning_rate": 2.724568591734607e-07, "logits/chosen": 3.03125, "logits/rejected": 2.9375, "logps/chosen": -548.0, "logps/rejected": -652.0, "loss": 0.6076, "rewards/accuracies": 1.0, "rewards/chosen": -1.140625, "rewards/margins": 0.9375, "rewards/rejected": -2.078125, "step": 701 }, { "epoch": 1.469387755102041, "grad_norm": 11.027820587158203, "learning_rate": 2.7189043573167084e-07, "logits/chosen": 2.5625, "logits/rejected": 2.265625, "logps/chosen": -588.0, "logps/rejected": -584.0, "loss": 0.5687, "rewards/accuracies": 0.75, "rewards/chosen": -1.046875, "rewards/margins": 0.30859375, "rewards/rejected": -1.3515625, "step": 702 }, { "epoch": 1.4714809000523286, "grad_norm": 9.480152130126953, "learning_rate": 2.7132389913576983e-07, "logits/chosen": 2.015625, "logits/rejected": 2.578125, "logps/chosen": -452.0, "logps/rejected": -362.0, "loss": 0.5668, "rewards/accuracies": 1.0, "rewards/chosen": -1.1328125, "rewards/margins": 0.578125, "rewards/rejected": -1.7109375, "step": 703 }, { "epoch": 1.4735740450026165, "grad_norm": 11.405426025390625, "learning_rate": 2.7075725231759713e-07, "logits/chosen": 2.296875, "logits/rejected": 3.234375, "logps/chosen": -592.0, "logps/rejected": -476.0, "loss": 0.5817, "rewards/accuracies": 1.0, "rewards/chosen": -0.9765625, "rewards/margins": 0.609375, "rewards/rejected": -1.5859375, "step": 704 }, { "epoch": 1.4756671899529041, "grad_norm": 10.888693809509277, "learning_rate": 2.701904982095625e-07, "logits/chosen": 2.203125, "logits/rejected": 2.578125, "logps/chosen": -464.0, "logps/rejected": -438.0, "loss": 0.5896, "rewards/accuracies": 0.75, "rewards/chosen": -0.921875, "rewards/margins": 0.53515625, "rewards/rejected": -1.453125, "step": 705 }, { "epoch": 1.477760334903192, "grad_norm": 10.568636894226074, "learning_rate": 2.696236397446308e-07, "logits/chosen": 1.5078125, "logits/rejected": 1.640625, "logps/chosen": -334.0, "logps/rejected": -332.0, "loss": 0.6034, "rewards/accuracies": 0.5, "rewards/chosen": -1.1484375, "rewards/margins": 0.1923828125, "rewards/rejected": -1.34375, "step": 706 }, { "epoch": 1.47985347985348, "grad_norm": 10.257476806640625, "learning_rate": 2.6905667985630703e-07, "logits/chosen": 2.0, "logits/rejected": 1.4140625, "logps/chosen": -316.0, "logps/rejected": -656.0, "loss": 0.5853, "rewards/accuracies": 0.75, "rewards/chosen": -1.015625, "rewards/margins": 0.671875, "rewards/rejected": -1.6875, "step": 707 }, { "epoch": 1.4819466248037676, "grad_norm": 10.183833122253418, "learning_rate": 2.684896214786214e-07, "logits/chosen": 2.75, "logits/rejected": 2.296875, "logps/chosen": -696.0, "logps/rejected": -468.0, "loss": 0.5695, "rewards/accuracies": 0.5, "rewards/chosen": -1.3515625, "rewards/margins": 0.365234375, "rewards/rejected": -1.71875, "step": 708 }, { "epoch": 1.4840397697540555, "grad_norm": 11.091374397277832, "learning_rate": 2.6792246754611315e-07, "logits/chosen": 1.6640625, "logits/rejected": 1.796875, "logps/chosen": -400.0, "logps/rejected": -434.0, "loss": 0.5943, "rewards/accuracies": 1.0, "rewards/chosen": -1.3203125, "rewards/margins": 0.453125, "rewards/rejected": -1.7734375, "step": 709 }, { "epoch": 1.4861329147043434, "grad_norm": 10.51530647277832, "learning_rate": 2.673552209938165e-07, "logits/chosen": 1.25, "logits/rejected": 2.15625, "logps/chosen": -524.0, "logps/rejected": -376.0, "loss": 0.5623, "rewards/accuracies": 0.75, "rewards/chosen": -1.578125, "rewards/margins": 0.37890625, "rewards/rejected": -1.953125, "step": 710 }, { "epoch": 1.488226059654631, "grad_norm": 10.477109909057617, "learning_rate": 2.667878847572448e-07, "logits/chosen": 2.828125, "logits/rejected": 2.09375, "logps/chosen": -448.0, "logps/rejected": -604.0, "loss": 0.606, "rewards/accuracies": 0.75, "rewards/chosen": -1.40625, "rewards/margins": 0.38671875, "rewards/rejected": -1.796875, "step": 711 }, { "epoch": 1.490319204604919, "grad_norm": 10.01471996307373, "learning_rate": 2.662204617723756e-07, "logits/chosen": 1.9140625, "logits/rejected": 2.109375, "logps/chosen": -452.0, "logps/rejected": -432.0, "loss": 0.5975, "rewards/accuracies": 0.25, "rewards/chosen": -1.2265625, "rewards/margins": -0.0439453125, "rewards/rejected": -1.1875, "step": 712 }, { "epoch": 1.4924123495552066, "grad_norm": 10.801339149475098, "learning_rate": 2.656529549756354e-07, "logits/chosen": 1.1953125, "logits/rejected": 1.0625, "logps/chosen": -231.0, "logps/rejected": -278.0, "loss": 0.5738, "rewards/accuracies": 0.25, "rewards/chosen": -1.359375, "rewards/margins": 0.0078125, "rewards/rejected": -1.3671875, "step": 713 }, { "epoch": 1.4945054945054945, "grad_norm": 10.13692569732666, "learning_rate": 2.6508536730388416e-07, "logits/chosen": 2.0, "logits/rejected": 2.09375, "logps/chosen": -380.0, "logps/rejected": -346.0, "loss": 0.5886, "rewards/accuracies": 0.5, "rewards/chosen": -1.1171875, "rewards/margins": 0.0009765625, "rewards/rejected": -1.1171875, "step": 714 }, { "epoch": 1.4965986394557822, "grad_norm": 11.655537605285645, "learning_rate": 2.6451770169440085e-07, "logits/chosen": 2.078125, "logits/rejected": 2.125, "logps/chosen": -472.0, "logps/rejected": -544.0, "loss": 0.6434, "rewards/accuracies": 0.5, "rewards/chosen": -1.1796875, "rewards/margins": 0.380859375, "rewards/rejected": -1.5625, "step": 715 }, { "epoch": 1.49869178440607, "grad_norm": 10.766107559204102, "learning_rate": 2.639499610848673e-07, "logits/chosen": 1.203125, "logits/rejected": 2.03125, "logps/chosen": -388.0, "logps/rejected": -286.0, "loss": 0.5704, "rewards/accuracies": 1.0, "rewards/chosen": -0.94921875, "rewards/margins": 0.71484375, "rewards/rejected": -1.6640625, "step": 716 }, { "epoch": 1.500784929356358, "grad_norm": 12.128816604614258, "learning_rate": 2.6338214841335364e-07, "logits/chosen": 2.15625, "logits/rejected": 2.65625, "logps/chosen": -348.0, "logps/rejected": -504.0, "loss": 0.6176, "rewards/accuracies": 0.75, "rewards/chosen": -0.8828125, "rewards/margins": -0.0078125, "rewards/rejected": -0.875, "step": 717 }, { "epoch": 1.5028780743066457, "grad_norm": 9.875317573547363, "learning_rate": 2.6281426661830295e-07, "logits/chosen": 1.9140625, "logits/rejected": 2.375, "logps/chosen": -424.0, "logps/rejected": -330.0, "loss": 0.6105, "rewards/accuracies": 0.5, "rewards/chosen": -1.03125, "rewards/margins": 0.208984375, "rewards/rejected": -1.2421875, "step": 718 }, { "epoch": 1.5049712192569336, "grad_norm": 11.082642555236816, "learning_rate": 2.622463186385161e-07, "logits/chosen": 2.25, "logits/rejected": 2.671875, "logps/chosen": -572.0, "logps/rejected": -548.0, "loss": 0.6359, "rewards/accuracies": 0.25, "rewards/chosen": -1.3515625, "rewards/margins": 0.013671875, "rewards/rejected": -1.3671875, "step": 719 }, { "epoch": 1.5070643642072215, "grad_norm": 11.48105525970459, "learning_rate": 2.616783074131364e-07, "logits/chosen": 1.765625, "logits/rejected": 1.1953125, "logps/chosen": -186.0, "logps/rejected": -360.0, "loss": 0.5563, "rewards/accuracies": 0.75, "rewards/chosen": -1.25, "rewards/margins": 0.375, "rewards/rejected": -1.625, "step": 720 }, { "epoch": 1.5091575091575091, "grad_norm": 11.445796012878418, "learning_rate": 2.6111023588163445e-07, "logits/chosen": 1.9765625, "logits/rejected": 2.453125, "logps/chosen": -444.0, "logps/rejected": -376.0, "loss": 0.5152, "rewards/accuracies": 0.75, "rewards/chosen": -1.046875, "rewards/margins": 0.33203125, "rewards/rejected": -1.3828125, "step": 721 }, { "epoch": 1.511250654107797, "grad_norm": 12.939420700073242, "learning_rate": 2.6054210698379276e-07, "logits/chosen": 2.03125, "logits/rejected": 2.0, "logps/chosen": -460.0, "logps/rejected": -340.0, "loss": 0.6704, "rewards/accuracies": 0.75, "rewards/chosen": -1.125, "rewards/margins": 0.017578125, "rewards/rejected": -1.1484375, "step": 722 }, { "epoch": 1.513343799058085, "grad_norm": 11.441888809204102, "learning_rate": 2.5997392365969097e-07, "logits/chosen": 2.375, "logits/rejected": 1.90625, "logps/chosen": -302.0, "logps/rejected": -420.0, "loss": 0.608, "rewards/accuracies": 0.5, "rewards/chosen": -1.4296875, "rewards/margins": 0.12890625, "rewards/rejected": -1.5625, "step": 723 }, { "epoch": 1.5154369440083726, "grad_norm": 10.66491985321045, "learning_rate": 2.5940568884969035e-07, "logits/chosen": 1.140625, "logits/rejected": 1.4609375, "logps/chosen": -438.0, "logps/rejected": -386.0, "loss": 0.5736, "rewards/accuracies": 1.0, "rewards/chosen": -1.3203125, "rewards/margins": 0.3828125, "rewards/rejected": -1.703125, "step": 724 }, { "epoch": 1.5175300889586603, "grad_norm": 10.134530067443848, "learning_rate": 2.5883740549441844e-07, "logits/chosen": 2.109375, "logits/rejected": 1.671875, "logps/chosen": -294.0, "logps/rejected": -322.0, "loss": 0.5838, "rewards/accuracies": 0.5, "rewards/chosen": -1.2578125, "rewards/margins": 0.029296875, "rewards/rejected": -1.2890625, "step": 725 }, { "epoch": 1.5196232339089482, "grad_norm": 10.396265983581543, "learning_rate": 2.582690765347542e-07, "logits/chosen": 2.4375, "logits/rejected": 2.96875, "logps/chosen": -808.0, "logps/rejected": -564.0, "loss": 0.5766, "rewards/accuracies": 0.5, "rewards/chosen": -1.6328125, "rewards/margins": 0.205078125, "rewards/rejected": -1.8359375, "step": 726 }, { "epoch": 1.521716378859236, "grad_norm": 11.76471996307373, "learning_rate": 2.577007049118125e-07, "logits/chosen": 2.1875, "logits/rejected": 2.296875, "logps/chosen": -276.0, "logps/rejected": -1012.0, "loss": 0.6159, "rewards/accuracies": 0.75, "rewards/chosen": -1.3671875, "rewards/margins": 0.53125, "rewards/rejected": -1.8984375, "step": 727 }, { "epoch": 1.5238095238095237, "grad_norm": 11.774922370910645, "learning_rate": 2.57132293566929e-07, "logits/chosen": 2.71875, "logits/rejected": 2.859375, "logps/chosen": -712.0, "logps/rejected": -712.0, "loss": 0.6371, "rewards/accuracies": 0.75, "rewards/chosen": -1.25, "rewards/margins": 0.87109375, "rewards/rejected": -2.125, "step": 728 }, { "epoch": 1.5259026687598116, "grad_norm": 11.291149139404297, "learning_rate": 2.565638454416448e-07, "logits/chosen": 1.9296875, "logits/rejected": 2.40625, "logps/chosen": -680.0, "logps/rejected": -616.0, "loss": 0.5991, "rewards/accuracies": 1.0, "rewards/chosen": -1.0390625, "rewards/margins": 0.8515625, "rewards/rejected": -1.890625, "step": 729 }, { "epoch": 1.5279958137100995, "grad_norm": 10.837662696838379, "learning_rate": 2.5599536347769157e-07, "logits/chosen": 1.71875, "logits/rejected": 1.390625, "logps/chosen": -616.0, "logps/rejected": -616.0, "loss": 0.6112, "rewards/accuracies": 0.5, "rewards/chosen": -1.15625, "rewards/margins": 0.0986328125, "rewards/rejected": -1.2578125, "step": 730 }, { "epoch": 1.5300889586603872, "grad_norm": 10.20396614074707, "learning_rate": 2.5542685061697595e-07, "logits/chosen": 2.078125, "logits/rejected": 2.15625, "logps/chosen": -680.0, "logps/rejected": -568.0, "loss": 0.5881, "rewards/accuracies": 0.75, "rewards/chosen": -0.9765625, "rewards/margins": 0.0205078125, "rewards/rejected": -0.99609375, "step": 731 }, { "epoch": 1.532182103610675, "grad_norm": 11.000545501708984, "learning_rate": 2.548583098015646e-07, "logits/chosen": 1.96875, "logits/rejected": 2.078125, "logps/chosen": -408.0, "logps/rejected": -600.0, "loss": 0.582, "rewards/accuracies": 0.5, "rewards/chosen": -1.3984375, "rewards/margins": 0.515625, "rewards/rejected": -1.9140625, "step": 732 }, { "epoch": 1.534275248560963, "grad_norm": 11.277151107788086, "learning_rate": 2.5428974397366856e-07, "logits/chosen": 1.5703125, "logits/rejected": 2.15625, "logps/chosen": -532.0, "logps/rejected": -494.0, "loss": 0.6567, "rewards/accuracies": 1.0, "rewards/chosen": -1.296875, "rewards/margins": 0.58984375, "rewards/rejected": -1.890625, "step": 733 }, { "epoch": 1.5363683935112507, "grad_norm": 11.331924438476562, "learning_rate": 2.537211560756286e-07, "logits/chosen": 2.75, "logits/rejected": 2.3125, "logps/chosen": -430.0, "logps/rejected": -476.0, "loss": 0.6137, "rewards/accuracies": 0.75, "rewards/chosen": -1.1171875, "rewards/margins": 0.50390625, "rewards/rejected": -1.625, "step": 734 }, { "epoch": 1.5384615384615383, "grad_norm": 12.324798583984375, "learning_rate": 2.531525490498997e-07, "logits/chosen": 2.25, "logits/rejected": 3.015625, "logps/chosen": -720.0, "logps/rejected": -442.0, "loss": 0.631, "rewards/accuracies": 0.75, "rewards/chosen": -1.1328125, "rewards/margins": 0.6171875, "rewards/rejected": -1.75, "step": 735 }, { "epoch": 1.5405546834118262, "grad_norm": 10.492572784423828, "learning_rate": 2.525839258390355e-07, "logits/chosen": 2.46875, "logits/rejected": 3.21875, "logps/chosen": -768.0, "logps/rejected": -608.0, "loss": 0.5506, "rewards/accuracies": 1.0, "rewards/chosen": -0.75, "rewards/margins": 1.3671875, "rewards/rejected": -2.125, "step": 736 }, { "epoch": 1.5426478283621141, "grad_norm": 10.195070266723633, "learning_rate": 2.520152893856739e-07, "logits/chosen": 1.1953125, "logits/rejected": 1.0546875, "logps/chosen": -298.0, "logps/rejected": -378.0, "loss": 0.577, "rewards/accuracies": 0.75, "rewards/chosen": -1.1796875, "rewards/margins": 0.33203125, "rewards/rejected": -1.5078125, "step": 737 }, { "epoch": 1.5447409733124018, "grad_norm": 10.100964546203613, "learning_rate": 2.514466426325209e-07, "logits/chosen": 1.4765625, "logits/rejected": 1.953125, "logps/chosen": -368.0, "logps/rejected": -368.0, "loss": 0.5793, "rewards/accuracies": 0.75, "rewards/chosen": -1.21875, "rewards/margins": 0.498046875, "rewards/rejected": -1.7109375, "step": 738 }, { "epoch": 1.5468341182626897, "grad_norm": 10.938796043395996, "learning_rate": 2.5087798852233593e-07, "logits/chosen": 1.359375, "logits/rejected": 1.6171875, "logps/chosen": -436.0, "logps/rejected": -358.0, "loss": 0.6109, "rewards/accuracies": 0.5, "rewards/chosen": -1.1796875, "rewards/margins": 0.46484375, "rewards/rejected": -1.640625, "step": 739 }, { "epoch": 1.5489272632129776, "grad_norm": 11.056641578674316, "learning_rate": 2.503093299979166e-07, "logits/chosen": 2.65625, "logits/rejected": 2.875, "logps/chosen": -552.0, "logps/rejected": -720.0, "loss": 0.5651, "rewards/accuracies": 0.75, "rewards/chosen": -1.34375, "rewards/margins": 0.2392578125, "rewards/rejected": -1.5859375, "step": 740 }, { "epoch": 1.5510204081632653, "grad_norm": 10.961803436279297, "learning_rate": 2.4974067000208334e-07, "logits/chosen": 2.203125, "logits/rejected": 2.15625, "logps/chosen": -468.0, "logps/rejected": -510.0, "loss": 0.595, "rewards/accuracies": 0.5, "rewards/chosen": -1.34375, "rewards/margins": 0.03515625, "rewards/rejected": -1.375, "step": 741 }, { "epoch": 1.5531135531135531, "grad_norm": 11.027541160583496, "learning_rate": 2.491720114776641e-07, "logits/chosen": 1.15625, "logits/rejected": 1.3125, "logps/chosen": -276.0, "logps/rejected": -324.0, "loss": 0.6082, "rewards/accuracies": 0.5, "rewards/chosen": -0.9609375, "rewards/margins": 0.3125, "rewards/rejected": -1.2734375, "step": 742 }, { "epoch": 1.555206698063841, "grad_norm": 11.360031127929688, "learning_rate": 2.4860335736747915e-07, "logits/chosen": 1.9296875, "logits/rejected": 2.1875, "logps/chosen": -336.0, "logps/rejected": -468.0, "loss": 0.6007, "rewards/accuracies": 0.75, "rewards/chosen": -1.015625, "rewards/margins": 0.365234375, "rewards/rejected": -1.375, "step": 743 }, { "epoch": 1.5572998430141287, "grad_norm": 10.475598335266113, "learning_rate": 2.480347106143261e-07, "logits/chosen": 2.140625, "logits/rejected": 2.953125, "logps/chosen": -400.0, "logps/rejected": -412.0, "loss": 0.5641, "rewards/accuracies": 1.0, "rewards/chosen": -1.3828125, "rewards/margins": 0.5078125, "rewards/rejected": -1.890625, "step": 744 }, { "epoch": 1.5593929879644164, "grad_norm": 10.709178924560547, "learning_rate": 2.474660741609645e-07, "logits/chosen": 1.1171875, "logits/rejected": 1.0234375, "logps/chosen": -234.0, "logps/rejected": -248.0, "loss": 0.5916, "rewards/accuracies": 0.25, "rewards/chosen": -1.7265625, "rewards/margins": -0.01953125, "rewards/rejected": -1.703125, "step": 745 }, { "epoch": 1.5614861329147045, "grad_norm": 11.80731201171875, "learning_rate": 2.468974509501004e-07, "logits/chosen": 2.0, "logits/rejected": 1.5703125, "logps/chosen": -458.0, "logps/rejected": -412.0, "loss": 0.6428, "rewards/accuracies": 0.25, "rewards/chosen": -1.3125, "rewards/margins": -0.224609375, "rewards/rejected": -1.0859375, "step": 746 }, { "epoch": 1.5635792778649922, "grad_norm": 11.310405731201172, "learning_rate": 2.463288439243714e-07, "logits/chosen": 1.859375, "logits/rejected": 2.5, "logps/chosen": -444.0, "logps/rejected": -324.0, "loss": 0.617, "rewards/accuracies": 0.75, "rewards/chosen": -1.265625, "rewards/margins": 0.2109375, "rewards/rejected": -1.4765625, "step": 747 }, { "epoch": 1.5656724228152799, "grad_norm": 9.95641040802002, "learning_rate": 2.457602560263314e-07, "logits/chosen": 2.1875, "logits/rejected": 2.21875, "logps/chosen": -448.0, "logps/rejected": -552.0, "loss": 0.5878, "rewards/accuracies": 0.75, "rewards/chosen": -1.0859375, "rewards/margins": 0.3203125, "rewards/rejected": -1.40625, "step": 748 }, { "epoch": 1.5677655677655677, "grad_norm": 12.05051040649414, "learning_rate": 2.451916901984355e-07, "logits/chosen": 1.2578125, "logits/rejected": 1.8125, "logps/chosen": -444.0, "logps/rejected": -302.0, "loss": 0.6069, "rewards/accuracies": 0.75, "rewards/chosen": -0.92578125, "rewards/margins": 0.37109375, "rewards/rejected": -1.296875, "step": 749 }, { "epoch": 1.5698587127158556, "grad_norm": 11.73843002319336, "learning_rate": 2.446231493830241e-07, "logits/chosen": 2.15625, "logits/rejected": 2.0, "logps/chosen": -482.0, "logps/rejected": -512.0, "loss": 0.611, "rewards/accuracies": 0.75, "rewards/chosen": -1.4609375, "rewards/margins": 0.1826171875, "rewards/rejected": -1.640625, "step": 750 }, { "epoch": 1.5719518576661433, "grad_norm": 11.1854829788208, "learning_rate": 2.440546365223084e-07, "logits/chosen": 1.3828125, "logits/rejected": 1.7109375, "logps/chosen": -288.0, "logps/rejected": -318.0, "loss": 0.5909, "rewards/accuracies": 0.5, "rewards/chosen": -1.296875, "rewards/margins": 0.013671875, "rewards/rejected": -1.3125, "step": 751 }, { "epoch": 1.5740450026164312, "grad_norm": 10.895852088928223, "learning_rate": 2.4348615455835516e-07, "logits/chosen": 2.828125, "logits/rejected": 2.765625, "logps/chosen": -604.0, "logps/rejected": -632.0, "loss": 0.6138, "rewards/accuracies": 0.5, "rewards/chosen": -1.046875, "rewards/margins": 0.1845703125, "rewards/rejected": -1.234375, "step": 752 }, { "epoch": 1.576138147566719, "grad_norm": 11.637216567993164, "learning_rate": 2.42917706433071e-07, "logits/chosen": 1.6484375, "logits/rejected": 1.859375, "logps/chosen": -498.0, "logps/rejected": -536.0, "loss": 0.6268, "rewards/accuracies": 0.75, "rewards/chosen": -0.97265625, "rewards/margins": 0.5, "rewards/rejected": -1.46875, "step": 753 }, { "epoch": 1.5782312925170068, "grad_norm": 11.550223350524902, "learning_rate": 2.423492950881875e-07, "logits/chosen": 2.09375, "logits/rejected": 1.7109375, "logps/chosen": -460.0, "logps/rejected": -664.0, "loss": 0.6323, "rewards/accuracies": 0.5, "rewards/chosen": -1.03125, "rewards/margins": 0.49609375, "rewards/rejected": -1.53125, "step": 754 }, { "epoch": 1.5803244374672945, "grad_norm": 12.201874732971191, "learning_rate": 2.417809234652457e-07, "logits/chosen": 3.1875, "logits/rejected": 3.625, "logps/chosen": -872.0, "logps/rejected": -540.0, "loss": 0.5869, "rewards/accuracies": 0.5, "rewards/chosen": -1.28125, "rewards/margins": 0.38671875, "rewards/rejected": -1.671875, "step": 755 }, { "epoch": 1.5824175824175826, "grad_norm": 11.802955627441406, "learning_rate": 2.412125945055816e-07, "logits/chosen": 1.9765625, "logits/rejected": 2.984375, "logps/chosen": -652.0, "logps/rejected": -408.0, "loss": 0.6082, "rewards/accuracies": 1.0, "rewards/chosen": -1.0625, "rewards/margins": 0.43359375, "rewards/rejected": -1.5, "step": 756 }, { "epoch": 1.5845107273678702, "grad_norm": 12.770798683166504, "learning_rate": 2.406443111503097e-07, "logits/chosen": 2.15625, "logits/rejected": 3.140625, "logps/chosen": -548.0, "logps/rejected": -500.0, "loss": 0.6227, "rewards/accuracies": 1.0, "rewards/chosen": -1.03125, "rewards/margins": 0.6171875, "rewards/rejected": -1.65625, "step": 757 }, { "epoch": 1.586603872318158, "grad_norm": 11.293933868408203, "learning_rate": 2.40076076340309e-07, "logits/chosen": 2.484375, "logits/rejected": 2.6875, "logps/chosen": -776.0, "logps/rejected": -584.0, "loss": 0.5771, "rewards/accuracies": 0.5, "rewards/chosen": -1.296875, "rewards/margins": 0.5234375, "rewards/rejected": -1.8203125, "step": 758 }, { "epoch": 1.5886970172684458, "grad_norm": 11.184715270996094, "learning_rate": 2.3950789301620727e-07, "logits/chosen": 2.40625, "logits/rejected": 2.609375, "logps/chosen": -744.0, "logps/rejected": -560.0, "loss": 0.6186, "rewards/accuracies": 0.5, "rewards/chosen": -1.7734375, "rewards/margins": 0.03125, "rewards/rejected": -1.8046875, "step": 759 }, { "epoch": 1.5907901622187337, "grad_norm": 10.282952308654785, "learning_rate": 2.389397641183656e-07, "logits/chosen": 1.265625, "logits/rejected": 2.125, "logps/chosen": -390.0, "logps/rejected": -388.0, "loss": 0.5607, "rewards/accuracies": 1.0, "rewards/chosen": -1.015625, "rewards/margins": 0.578125, "rewards/rejected": -1.59375, "step": 760 }, { "epoch": 1.5928833071690214, "grad_norm": 11.480621337890625, "learning_rate": 2.383716925868636e-07, "logits/chosen": 2.1875, "logits/rejected": 2.234375, "logps/chosen": -440.0, "logps/rejected": -498.0, "loss": 0.6184, "rewards/accuracies": 0.5, "rewards/chosen": -1.6953125, "rewards/margins": -0.072265625, "rewards/rejected": -1.625, "step": 761 }, { "epoch": 1.5949764521193093, "grad_norm": 11.712589263916016, "learning_rate": 2.3780368136148381e-07, "logits/chosen": 1.9296875, "logits/rejected": 2.515625, "logps/chosen": -302.0, "logps/rejected": -228.0, "loss": 0.6187, "rewards/accuracies": 0.5, "rewards/chosen": -1.21875, "rewards/margins": 0.083984375, "rewards/rejected": -1.296875, "step": 762 }, { "epoch": 1.5970695970695972, "grad_norm": 10.707878112792969, "learning_rate": 2.37235733381697e-07, "logits/chosen": 2.265625, "logits/rejected": 2.0, "logps/chosen": -272.0, "logps/rejected": -340.0, "loss": 0.5598, "rewards/accuracies": 0.25, "rewards/chosen": -1.453125, "rewards/margins": -0.3046875, "rewards/rejected": -1.15625, "step": 763 }, { "epoch": 1.5991627420198848, "grad_norm": 11.1841402053833, "learning_rate": 2.3666785158664644e-07, "logits/chosen": 1.265625, "logits/rejected": 1.1171875, "logps/chosen": -346.0, "logps/rejected": -380.0, "loss": 0.6387, "rewards/accuracies": 0.5, "rewards/chosen": -1.671875, "rewards/margins": 0.142578125, "rewards/rejected": -1.8125, "step": 764 }, { "epoch": 1.6012558869701727, "grad_norm": 11.163543701171875, "learning_rate": 2.3610003891513274e-07, "logits/chosen": 2.203125, "logits/rejected": 2.421875, "logps/chosen": -640.0, "logps/rejected": -628.0, "loss": 0.5559, "rewards/accuracies": 1.0, "rewards/chosen": -0.9140625, "rewards/margins": 0.6953125, "rewards/rejected": -1.609375, "step": 765 }, { "epoch": 1.6033490319204606, "grad_norm": 11.096171379089355, "learning_rate": 2.3553229830559918e-07, "logits/chosen": 2.078125, "logits/rejected": 2.375, "logps/chosen": -580.0, "logps/rejected": -474.0, "loss": 0.6042, "rewards/accuracies": 0.75, "rewards/chosen": -1.4296875, "rewards/margins": 0.376953125, "rewards/rejected": -1.8046875, "step": 766 }, { "epoch": 1.6054421768707483, "grad_norm": 11.36347770690918, "learning_rate": 2.3496463269611577e-07, "logits/chosen": 2.484375, "logits/rejected": 3.140625, "logps/chosen": -784.0, "logps/rejected": -536.0, "loss": 0.5814, "rewards/accuracies": 0.25, "rewards/chosen": -0.9375, "rewards/margins": 0.07958984375, "rewards/rejected": -1.015625, "step": 767 }, { "epoch": 1.607535321821036, "grad_norm": 10.495102882385254, "learning_rate": 2.3439704502436462e-07, "logits/chosen": 1.6796875, "logits/rejected": 1.796875, "logps/chosen": -376.0, "logps/rejected": -552.0, "loss": 0.5767, "rewards/accuracies": 0.75, "rewards/chosen": -1.03125, "rewards/margins": 0.609375, "rewards/rejected": -1.640625, "step": 768 }, { "epoch": 1.6096284667713239, "grad_norm": 11.483415603637695, "learning_rate": 2.3382953822762432e-07, "logits/chosen": 1.78125, "logits/rejected": 1.140625, "logps/chosen": -334.0, "logps/rejected": -592.0, "loss": 0.6309, "rewards/accuracies": 0.75, "rewards/chosen": -1.390625, "rewards/margins": 0.15234375, "rewards/rejected": -1.5390625, "step": 769 }, { "epoch": 1.6117216117216118, "grad_norm": 10.574986457824707, "learning_rate": 2.3326211524275515e-07, "logits/chosen": 2.34375, "logits/rejected": 1.84375, "logps/chosen": -462.0, "logps/rejected": -548.0, "loss": 0.599, "rewards/accuracies": 0.75, "rewards/chosen": -1.453125, "rewards/margins": 0.0673828125, "rewards/rejected": -1.5234375, "step": 770 }, { "epoch": 1.6138147566718994, "grad_norm": 10.306511878967285, "learning_rate": 2.3269477900618355e-07, "logits/chosen": 1.28125, "logits/rejected": 1.75, "logps/chosen": -342.0, "logps/rejected": -412.0, "loss": 0.5745, "rewards/accuracies": 0.75, "rewards/chosen": -1.28125, "rewards/margins": 0.765625, "rewards/rejected": -2.046875, "step": 771 }, { "epoch": 1.6159079016221873, "grad_norm": 10.39566707611084, "learning_rate": 2.3212753245388691e-07, "logits/chosen": 2.0625, "logits/rejected": 2.375, "logps/chosen": -640.0, "logps/rejected": -476.0, "loss": 0.5766, "rewards/accuracies": 0.75, "rewards/chosen": -1.34375, "rewards/margins": 0.30859375, "rewards/rejected": -1.65625, "step": 772 }, { "epoch": 1.6180010465724752, "grad_norm": 11.270380020141602, "learning_rate": 2.3156037852137865e-07, "logits/chosen": 1.5, "logits/rejected": 1.46875, "logps/chosen": -510.0, "logps/rejected": -492.0, "loss": 0.589, "rewards/accuracies": 0.25, "rewards/chosen": -1.6015625, "rewards/margins": -0.119140625, "rewards/rejected": -1.484375, "step": 773 }, { "epoch": 1.620094191522763, "grad_norm": 10.048487663269043, "learning_rate": 2.3099332014369287e-07, "logits/chosen": 2.71875, "logits/rejected": 2.703125, "logps/chosen": -500.0, "logps/rejected": -468.0, "loss": 0.5616, "rewards/accuracies": 1.0, "rewards/chosen": -1.109375, "rewards/margins": 0.8359375, "rewards/rejected": -1.9453125, "step": 774 }, { "epoch": 1.6221873364730508, "grad_norm": 11.019427299499512, "learning_rate": 2.3042636025536925e-07, "logits/chosen": 1.1640625, "logits/rejected": 0.6953125, "logps/chosen": -244.0, "logps/rejected": -402.0, "loss": 0.5983, "rewards/accuracies": 0.75, "rewards/chosen": -1.3125, "rewards/margins": 0.58203125, "rewards/rejected": -1.890625, "step": 775 }, { "epoch": 1.6242804814233387, "grad_norm": 11.41428279876709, "learning_rate": 2.298595017904375e-07, "logits/chosen": 2.3125, "logits/rejected": 1.78125, "logps/chosen": -452.0, "logps/rejected": -448.0, "loss": 0.6019, "rewards/accuracies": 0.75, "rewards/chosen": -1.296875, "rewards/margins": 0.212890625, "rewards/rejected": -1.5078125, "step": 776 }, { "epoch": 1.6263736263736264, "grad_norm": 10.68587875366211, "learning_rate": 2.292927476824028e-07, "logits/chosen": 1.6796875, "logits/rejected": 1.546875, "logps/chosen": -362.0, "logps/rejected": -264.0, "loss": 0.5849, "rewards/accuracies": 0.75, "rewards/chosen": -0.8671875, "rewards/margins": 0.5859375, "rewards/rejected": -1.453125, "step": 777 }, { "epoch": 1.628466771323914, "grad_norm": 11.420637130737305, "learning_rate": 2.287261008642302e-07, "logits/chosen": 2.078125, "logits/rejected": 2.875, "logps/chosen": -476.0, "logps/rejected": -362.0, "loss": 0.5739, "rewards/accuracies": 0.75, "rewards/chosen": -1.2109375, "rewards/margins": 0.25, "rewards/rejected": -1.4609375, "step": 778 }, { "epoch": 1.630559916274202, "grad_norm": 11.029525756835938, "learning_rate": 2.2815956426832922e-07, "logits/chosen": 2.28125, "logits/rejected": 2.359375, "logps/chosen": -446.0, "logps/rejected": -460.0, "loss": 0.6079, "rewards/accuracies": 0.25, "rewards/chosen": -1.484375, "rewards/margins": -0.34375, "rewards/rejected": -1.140625, "step": 779 }, { "epoch": 1.6326530612244898, "grad_norm": 11.36279010772705, "learning_rate": 2.275931408265393e-07, "logits/chosen": 2.46875, "logits/rejected": 2.03125, "logps/chosen": -270.0, "logps/rejected": -510.0, "loss": 0.5937, "rewards/accuracies": 0.5, "rewards/chosen": -1.015625, "rewards/margins": 0.048828125, "rewards/rejected": -1.0625, "step": 780 }, { "epoch": 1.6347462061747775, "grad_norm": 10.862942695617676, "learning_rate": 2.270268334701143e-07, "logits/chosen": 2.5625, "logits/rejected": 2.8125, "logps/chosen": -784.0, "logps/rejected": -584.0, "loss": 0.6022, "rewards/accuracies": 1.0, "rewards/chosen": -0.84375, "rewards/margins": 0.86328125, "rewards/rejected": -1.7109375, "step": 781 }, { "epoch": 1.6368393511250654, "grad_norm": 10.789078712463379, "learning_rate": 2.264606451297072e-07, "logits/chosen": 2.203125, "logits/rejected": 3.125, "logps/chosen": -464.0, "logps/rejected": -251.0, "loss": 0.5859, "rewards/accuracies": 0.5, "rewards/chosen": -1.203125, "rewards/margins": 0.13671875, "rewards/rejected": -1.34375, "step": 782 }, { "epoch": 1.6389324960753533, "grad_norm": 10.766769409179688, "learning_rate": 2.258945787353552e-07, "logits/chosen": 1.140625, "logits/rejected": 1.5703125, "logps/chosen": -492.0, "logps/rejected": -310.0, "loss": 0.5794, "rewards/accuracies": 0.25, "rewards/chosen": -1.328125, "rewards/margins": -0.0068359375, "rewards/rejected": -1.328125, "step": 783 }, { "epoch": 1.641025641025641, "grad_norm": 10.560734748840332, "learning_rate": 2.2532863721646409e-07, "logits/chosen": 1.7890625, "logits/rejected": 1.7578125, "logps/chosen": -448.0, "logps/rejected": -592.0, "loss": 0.6047, "rewards/accuracies": 0.75, "rewards/chosen": -1.2265625, "rewards/margins": 0.287109375, "rewards/rejected": -1.515625, "step": 784 }, { "epoch": 1.6431187859759289, "grad_norm": 11.903189659118652, "learning_rate": 2.2476282350179402e-07, "logits/chosen": 1.5546875, "logits/rejected": 3.125, "logps/chosen": -516.0, "logps/rejected": -296.0, "loss": 0.6025, "rewards/accuracies": 0.75, "rewards/chosen": -1.265625, "rewards/margins": 0.283203125, "rewards/rejected": -1.5546875, "step": 785 }, { "epoch": 1.6452119309262168, "grad_norm": 11.229724884033203, "learning_rate": 2.2419714051944323e-07, "logits/chosen": 1.359375, "logits/rejected": 1.7890625, "logps/chosen": -318.0, "logps/rejected": -370.0, "loss": 0.6236, "rewards/accuracies": 0.75, "rewards/chosen": -1.125, "rewards/margins": 0.53125, "rewards/rejected": -1.65625, "step": 786 }, { "epoch": 1.6473050758765044, "grad_norm": 11.278830528259277, "learning_rate": 2.2363159119683352e-07, "logits/chosen": 1.0859375, "logits/rejected": 1.8671875, "logps/chosen": -270.0, "logps/rejected": -286.0, "loss": 0.5618, "rewards/accuracies": 0.75, "rewards/chosen": -0.984375, "rewards/margins": 0.1904296875, "rewards/rejected": -1.1796875, "step": 787 }, { "epoch": 1.649398220826792, "grad_norm": 11.758581161499023, "learning_rate": 2.2306617846069524e-07, "logits/chosen": 2.40625, "logits/rejected": 2.96875, "logps/chosen": -576.0, "logps/rejected": -408.0, "loss": 0.6119, "rewards/accuracies": 1.0, "rewards/chosen": -0.94921875, "rewards/margins": 0.37890625, "rewards/rejected": -1.328125, "step": 788 }, { "epoch": 1.6514913657770802, "grad_norm": 10.333982467651367, "learning_rate": 2.2250090523705177e-07, "logits/chosen": 1.765625, "logits/rejected": 2.234375, "logps/chosen": -472.0, "logps/rejected": -464.0, "loss": 0.6051, "rewards/accuracies": 0.5, "rewards/chosen": -0.96484375, "rewards/margins": 0.41796875, "rewards/rejected": -1.3828125, "step": 789 }, { "epoch": 1.653584510727368, "grad_norm": 11.33622932434082, "learning_rate": 2.2193577445120443e-07, "logits/chosen": 1.8203125, "logits/rejected": 3.28125, "logps/chosen": -664.0, "logps/rejected": -426.0, "loss": 0.6247, "rewards/accuracies": 0.75, "rewards/chosen": -1.421875, "rewards/margins": 0.099609375, "rewards/rejected": -1.5234375, "step": 790 }, { "epoch": 1.6556776556776556, "grad_norm": 10.776433944702148, "learning_rate": 2.2137078902771728e-07, "logits/chosen": 2.25, "logits/rejected": 2.34375, "logps/chosen": -284.0, "logps/rejected": -304.0, "loss": 0.621, "rewards/accuracies": 0.5, "rewards/chosen": -1.34375, "rewards/margins": -0.06640625, "rewards/rejected": -1.2734375, "step": 791 }, { "epoch": 1.6577708006279435, "grad_norm": 11.183871269226074, "learning_rate": 2.2080595189040263e-07, "logits/chosen": 1.0390625, "logits/rejected": 1.0625, "logps/chosen": -406.0, "logps/rejected": -632.0, "loss": 0.6021, "rewards/accuracies": 0.5, "rewards/chosen": -1.4375, "rewards/margins": 1.1015625, "rewards/rejected": -2.546875, "step": 792 }, { "epoch": 1.6598639455782314, "grad_norm": 11.59293270111084, "learning_rate": 2.2024126596230492e-07, "logits/chosen": 1.9453125, "logits/rejected": 1.3203125, "logps/chosen": -368.0, "logps/rejected": -490.0, "loss": 0.6159, "rewards/accuracies": 0.75, "rewards/chosen": -1.3125, "rewards/margins": 0.5, "rewards/rejected": -1.8203125, "step": 793 }, { "epoch": 1.661957090528519, "grad_norm": 11.104989051818848, "learning_rate": 2.196767341656863e-07, "logits/chosen": 2.578125, "logits/rejected": 2.4375, "logps/chosen": -508.0, "logps/rejected": -824.0, "loss": 0.566, "rewards/accuracies": 0.75, "rewards/chosen": -1.28125, "rewards/margins": 0.251953125, "rewards/rejected": -1.53125, "step": 794 }, { "epoch": 1.664050235478807, "grad_norm": 11.907608032226562, "learning_rate": 2.1911235942201115e-07, "logits/chosen": 1.4375, "logits/rejected": 1.453125, "logps/chosen": -368.0, "logps/rejected": -332.0, "loss": 0.6334, "rewards/accuracies": 1.0, "rewards/chosen": -1.2109375, "rewards/margins": 0.625, "rewards/rejected": -1.8359375, "step": 795 }, { "epoch": 1.6661433804290948, "grad_norm": 10.751801490783691, "learning_rate": 2.1854814465193132e-07, "logits/chosen": 2.46875, "logits/rejected": 2.28125, "logps/chosen": -362.0, "logps/rejected": -374.0, "loss": 0.5655, "rewards/accuracies": 0.75, "rewards/chosen": -0.94140625, "rewards/margins": 0.33984375, "rewards/rejected": -1.28125, "step": 796 }, { "epoch": 1.6682365253793825, "grad_norm": 10.60224437713623, "learning_rate": 2.1798409277527064e-07, "logits/chosen": 1.2421875, "logits/rejected": 1.296875, "logps/chosen": -588.0, "logps/rejected": -552.0, "loss": 0.5637, "rewards/accuracies": 1.0, "rewards/chosen": -0.98046875, "rewards/margins": 0.58984375, "rewards/rejected": -1.5703125, "step": 797 }, { "epoch": 1.6703296703296702, "grad_norm": 10.94421672821045, "learning_rate": 2.174202067110099e-07, "logits/chosen": 2.5625, "logits/rejected": 2.75, "logps/chosen": -648.0, "logps/rejected": -624.0, "loss": 0.595, "rewards/accuracies": 0.5, "rewards/chosen": -1.796875, "rewards/margins": 0.013671875, "rewards/rejected": -1.8125, "step": 798 }, { "epoch": 1.6724228152799583, "grad_norm": 13.429807662963867, "learning_rate": 2.1685648937727202e-07, "logits/chosen": 2.0625, "logits/rejected": 1.6171875, "logps/chosen": -350.0, "logps/rejected": -510.0, "loss": 0.6668, "rewards/accuracies": 0.5, "rewards/chosen": -1.1953125, "rewards/margins": 0.1279296875, "rewards/rejected": -1.328125, "step": 799 }, { "epoch": 1.674515960230246, "grad_norm": 10.412588119506836, "learning_rate": 2.162929436913065e-07, "logits/chosen": 2.125, "logits/rejected": 2.078125, "logps/chosen": -584.0, "logps/rejected": -498.0, "loss": 0.5531, "rewards/accuracies": 1.0, "rewards/chosen": -1.2109375, "rewards/margins": 0.796875, "rewards/rejected": -2.0, "step": 800 }, { "epoch": 1.6766091051805336, "grad_norm": 10.976048469543457, "learning_rate": 2.157295725694747e-07, "logits/chosen": 1.5625, "logits/rejected": 1.890625, "logps/chosen": -241.0, "logps/rejected": -296.0, "loss": 0.6109, "rewards/accuracies": 1.0, "rewards/chosen": -0.96875, "rewards/margins": 0.6171875, "rewards/rejected": -1.5859375, "step": 801 }, { "epoch": 1.6787022501308215, "grad_norm": 11.032082557678223, "learning_rate": 2.1516637892723453e-07, "logits/chosen": 1.78125, "logits/rejected": 2.453125, "logps/chosen": -362.0, "logps/rejected": -378.0, "loss": 0.6015, "rewards/accuracies": 0.75, "rewards/chosen": -1.1875, "rewards/margins": 0.44921875, "rewards/rejected": -1.640625, "step": 802 }, { "epoch": 1.6807953950811094, "grad_norm": 10.239130973815918, "learning_rate": 2.1460336567912553e-07, "logits/chosen": 2.5625, "logits/rejected": 3.21875, "logps/chosen": -492.0, "logps/rejected": -532.0, "loss": 0.5695, "rewards/accuracies": 1.0, "rewards/chosen": -1.1484375, "rewards/margins": 0.8671875, "rewards/rejected": -2.015625, "step": 803 }, { "epoch": 1.682888540031397, "grad_norm": 11.244982719421387, "learning_rate": 2.140405357387537e-07, "logits/chosen": 1.8828125, "logits/rejected": 2.375, "logps/chosen": -500.0, "logps/rejected": -458.0, "loss": 0.5852, "rewards/accuracies": 0.75, "rewards/chosen": -1.171875, "rewards/margins": 0.296875, "rewards/rejected": -1.46875, "step": 804 }, { "epoch": 1.684981684981685, "grad_norm": 10.452801704406738, "learning_rate": 2.1347789201877634e-07, "logits/chosen": 3.078125, "logits/rejected": 3.5, "logps/chosen": -536.0, "logps/rejected": -494.0, "loss": 0.5971, "rewards/accuracies": 0.5, "rewards/chosen": -0.96484375, "rewards/margins": 0.400390625, "rewards/rejected": -1.3671875, "step": 805 }, { "epoch": 1.6870748299319729, "grad_norm": 12.595290184020996, "learning_rate": 2.1291543743088687e-07, "logits/chosen": 2.09375, "logits/rejected": 2.265625, "logps/chosen": -668.0, "logps/rejected": -482.0, "loss": 0.653, "rewards/accuracies": 0.75, "rewards/chosen": -1.296875, "rewards/margins": 0.380859375, "rewards/rejected": -1.671875, "step": 806 }, { "epoch": 1.6891679748822606, "grad_norm": 10.873169898986816, "learning_rate": 2.1235317488580055e-07, "logits/chosen": 2.515625, "logits/rejected": 3.34375, "logps/chosen": -712.0, "logps/rejected": -552.0, "loss": 0.5862, "rewards/accuracies": 0.25, "rewards/chosen": -1.265625, "rewards/margins": -0.224609375, "rewards/rejected": -1.046875, "step": 807 }, { "epoch": 1.6912611198325485, "grad_norm": 12.01298713684082, "learning_rate": 2.1179110729323816e-07, "logits/chosen": 0.89453125, "logits/rejected": 1.3359375, "logps/chosen": -400.0, "logps/rejected": -298.0, "loss": 0.6112, "rewards/accuracies": 0.5, "rewards/chosen": -1.6796875, "rewards/margins": -0.1689453125, "rewards/rejected": -1.515625, "step": 808 }, { "epoch": 1.6933542647828363, "grad_norm": 10.95003604888916, "learning_rate": 2.1122923756191181e-07, "logits/chosen": 1.4765625, "logits/rejected": 1.984375, "logps/chosen": -708.0, "logps/rejected": -486.0, "loss": 0.6002, "rewards/accuracies": 0.5, "rewards/chosen": -1.46875, "rewards/margins": 0.30078125, "rewards/rejected": -1.765625, "step": 809 }, { "epoch": 1.695447409733124, "grad_norm": 11.224799156188965, "learning_rate": 2.1066756859950995e-07, "logits/chosen": 2.046875, "logits/rejected": 2.453125, "logps/chosen": -548.0, "logps/rejected": -510.0, "loss": 0.6023, "rewards/accuracies": 1.0, "rewards/chosen": -0.84765625, "rewards/margins": 0.63671875, "rewards/rejected": -1.484375, "step": 810 }, { "epoch": 1.6975405546834117, "grad_norm": 13.100383758544922, "learning_rate": 2.1010610331268168e-07, "logits/chosen": 2.21875, "logits/rejected": 2.96875, "logps/chosen": -520.0, "logps/rejected": -524.0, "loss": 0.6265, "rewards/accuracies": 0.75, "rewards/chosen": -1.328125, "rewards/margins": 0.390625, "rewards/rejected": -1.71875, "step": 811 }, { "epoch": 1.6996336996336996, "grad_norm": 10.363519668579102, "learning_rate": 2.0954484460702233e-07, "logits/chosen": 1.6171875, "logits/rejected": 2.015625, "logps/chosen": -552.0, "logps/rejected": -480.0, "loss": 0.6076, "rewards/accuracies": 0.75, "rewards/chosen": -1.125, "rewards/margins": 0.0859375, "rewards/rejected": -1.2109375, "step": 812 }, { "epoch": 1.7017268445839875, "grad_norm": 11.107074737548828, "learning_rate": 2.0898379538705773e-07, "logits/chosen": 3.125, "logits/rejected": 2.484375, "logps/chosen": -668.0, "logps/rejected": -960.0, "loss": 0.5918, "rewards/accuracies": 0.75, "rewards/chosen": -1.484375, "rewards/margins": 0.36328125, "rewards/rejected": -1.84375, "step": 813 }, { "epoch": 1.7038199895342752, "grad_norm": 10.36357307434082, "learning_rate": 2.0842295855623038e-07, "logits/chosen": 1.40625, "logits/rejected": 0.96484375, "logps/chosen": -308.0, "logps/rejected": -334.0, "loss": 0.5855, "rewards/accuracies": 0.5, "rewards/chosen": -1.0546875, "rewards/margins": 0.138671875, "rewards/rejected": -1.1953125, "step": 814 }, { "epoch": 1.705913134484563, "grad_norm": 10.832947731018066, "learning_rate": 2.0786233701688295e-07, "logits/chosen": 2.03125, "logits/rejected": 2.046875, "logps/chosen": -616.0, "logps/rejected": -548.0, "loss": 0.5809, "rewards/accuracies": 0.75, "rewards/chosen": -1.4921875, "rewards/margins": 0.384765625, "rewards/rejected": -1.875, "step": 815 }, { "epoch": 1.708006279434851, "grad_norm": 11.328465461730957, "learning_rate": 2.073019336702443e-07, "logits/chosen": 1.6015625, "logits/rejected": 1.1953125, "logps/chosen": -310.0, "logps/rejected": -334.0, "loss": 0.6189, "rewards/accuracies": 0.5, "rewards/chosen": -0.984375, "rewards/margins": 0.271484375, "rewards/rejected": -1.2578125, "step": 816 }, { "epoch": 1.7100994243851386, "grad_norm": 10.60431957244873, "learning_rate": 2.0674175141641406e-07, "logits/chosen": 2.359375, "logits/rejected": 2.453125, "logps/chosen": -446.0, "logps/rejected": -312.0, "loss": 0.6431, "rewards/accuracies": 0.5, "rewards/chosen": -1.1171875, "rewards/margins": 0.0634765625, "rewards/rejected": -1.1796875, "step": 817 }, { "epoch": 1.7121925693354265, "grad_norm": 10.303520202636719, "learning_rate": 2.0618179315434778e-07, "logits/chosen": 2.21875, "logits/rejected": 2.96875, "logps/chosen": -660.0, "logps/rejected": -372.0, "loss": 0.5334, "rewards/accuracies": 0.75, "rewards/chosen": -1.03125, "rewards/margins": 0.61328125, "rewards/rejected": -1.640625, "step": 818 }, { "epoch": 1.7142857142857144, "grad_norm": 10.645200729370117, "learning_rate": 2.056220617818418e-07, "logits/chosen": 1.5546875, "logits/rejected": 2.296875, "logps/chosen": -380.0, "logps/rejected": -398.0, "loss": 0.5867, "rewards/accuracies": 0.5, "rewards/chosen": -1.2109375, "rewards/margins": 0.7734375, "rewards/rejected": -1.984375, "step": 819 }, { "epoch": 1.716378859236002, "grad_norm": 10.647467613220215, "learning_rate": 2.0506256019551813e-07, "logits/chosen": 1.0078125, "logits/rejected": 1.484375, "logps/chosen": -450.0, "logps/rejected": -416.0, "loss": 0.5738, "rewards/accuracies": 0.5, "rewards/chosen": -1.671875, "rewards/margins": 0.2265625, "rewards/rejected": -1.890625, "step": 820 }, { "epoch": 1.7184720041862898, "grad_norm": 11.647187232971191, "learning_rate": 2.0450329129081003e-07, "logits/chosen": 2.890625, "logits/rejected": 2.828125, "logps/chosen": -604.0, "logps/rejected": -504.0, "loss": 0.642, "rewards/accuracies": 0.25, "rewards/chosen": -2.03125, "rewards/margins": -0.57421875, "rewards/rejected": -1.453125, "step": 821 }, { "epoch": 1.7205651491365777, "grad_norm": 10.969862937927246, "learning_rate": 2.0394425796194625e-07, "logits/chosen": 2.046875, "logits/rejected": 2.6875, "logps/chosen": -560.0, "logps/rejected": -446.0, "loss": 0.5625, "rewards/accuracies": 0.5, "rewards/chosen": -1.234375, "rewards/margins": 0.50390625, "rewards/rejected": -1.734375, "step": 822 }, { "epoch": 1.7226582940868655, "grad_norm": 10.765937805175781, "learning_rate": 2.0338546310193655e-07, "logits/chosen": 1.8984375, "logits/rejected": 1.75, "logps/chosen": -468.0, "logps/rejected": -572.0, "loss": 0.588, "rewards/accuracies": 1.0, "rewards/chosen": -1.109375, "rewards/margins": 0.54296875, "rewards/rejected": -1.65625, "step": 823 }, { "epoch": 1.7247514390371532, "grad_norm": 11.399531364440918, "learning_rate": 2.0282690960255667e-07, "logits/chosen": 1.765625, "logits/rejected": 2.75, "logps/chosen": -452.0, "logps/rejected": -452.0, "loss": 0.6237, "rewards/accuracies": 0.75, "rewards/chosen": -1.203125, "rewards/margins": 0.36328125, "rewards/rejected": -1.5703125, "step": 824 }, { "epoch": 1.7268445839874411, "grad_norm": 11.497965812683105, "learning_rate": 2.0226860035433326e-07, "logits/chosen": 2.46875, "logits/rejected": 2.421875, "logps/chosen": -556.0, "logps/rejected": -420.0, "loss": 0.6331, "rewards/accuracies": 0.75, "rewards/chosen": -1.2578125, "rewards/margins": 0.2119140625, "rewards/rejected": -1.46875, "step": 825 }, { "epoch": 1.728937728937729, "grad_norm": 10.845474243164062, "learning_rate": 2.0171053824652906e-07, "logits/chosen": 1.984375, "logits/rejected": 2.1875, "logps/chosen": -330.0, "logps/rejected": -490.0, "loss": 0.5722, "rewards/accuracies": 0.5, "rewards/chosen": -1.21875, "rewards/margins": 0.3359375, "rewards/rejected": -1.5546875, "step": 826 }, { "epoch": 1.7310308738880167, "grad_norm": 11.209230422973633, "learning_rate": 2.0115272616712755e-07, "logits/chosen": 2.40625, "logits/rejected": 3.46875, "logps/chosen": -824.0, "logps/rejected": -580.0, "loss": 0.5757, "rewards/accuracies": 1.0, "rewards/chosen": -1.3125, "rewards/margins": 0.431640625, "rewards/rejected": -1.7421875, "step": 827 }, { "epoch": 1.7331240188383046, "grad_norm": 14.437784194946289, "learning_rate": 2.0059516700281864e-07, "logits/chosen": 2.921875, "logits/rejected": 2.859375, "logps/chosen": -856.0, "logps/rejected": -856.0, "loss": 0.6504, "rewards/accuracies": 1.0, "rewards/chosen": -1.109375, "rewards/margins": 1.3359375, "rewards/rejected": -2.4375, "step": 828 }, { "epoch": 1.7352171637885925, "grad_norm": 11.006444931030273, "learning_rate": 2.0003786363898327e-07, "logits/chosen": 1.6953125, "logits/rejected": 2.59375, "logps/chosen": -506.0, "logps/rejected": -406.0, "loss": 0.5937, "rewards/accuracies": 1.0, "rewards/chosen": -1.03125, "rewards/margins": 0.5625, "rewards/rejected": -1.59375, "step": 829 }, { "epoch": 1.7373103087388801, "grad_norm": 12.13193130493164, "learning_rate": 1.9948081895967863e-07, "logits/chosen": 1.9453125, "logits/rejected": 2.4375, "logps/chosen": -548.0, "logps/rejected": -600.0, "loss": 0.6022, "rewards/accuracies": 0.75, "rewards/chosen": -1.4765625, "rewards/margins": 0.392578125, "rewards/rejected": -1.875, "step": 830 }, { "epoch": 1.7394034536891678, "grad_norm": 11.159494400024414, "learning_rate": 1.9892403584762313e-07, "logits/chosen": 1.90625, "logits/rejected": 1.6328125, "logps/chosen": -728.0, "logps/rejected": -588.0, "loss": 0.6099, "rewards/accuracies": 1.0, "rewards/chosen": -1.234375, "rewards/margins": 0.70703125, "rewards/rejected": -1.9375, "step": 831 }, { "epoch": 1.741496598639456, "grad_norm": 11.207942008972168, "learning_rate": 1.9836751718418172e-07, "logits/chosen": 1.8046875, "logits/rejected": 2.046875, "logps/chosen": -360.0, "logps/rejected": -196.0, "loss": 0.6046, "rewards/accuracies": 0.75, "rewards/chosen": -1.1796875, "rewards/margins": -0.1640625, "rewards/rejected": -1.015625, "step": 832 }, { "epoch": 1.7435897435897436, "grad_norm": 11.720377922058105, "learning_rate": 1.978112658493507e-07, "logits/chosen": 1.71875, "logits/rejected": 2.25, "logps/chosen": -728.0, "logps/rejected": -568.0, "loss": 0.6135, "rewards/accuracies": 0.5, "rewards/chosen": -2.4375, "rewards/margins": 0.19140625, "rewards/rejected": -2.625, "step": 833 }, { "epoch": 1.7456828885400313, "grad_norm": 11.53357219696045, "learning_rate": 1.972552847217429e-07, "logits/chosen": 2.046875, "logits/rejected": 2.375, "logps/chosen": -428.0, "logps/rejected": -386.0, "loss": 0.6111, "rewards/accuracies": 0.25, "rewards/chosen": -1.28125, "rewards/margins": -0.138671875, "rewards/rejected": -1.1484375, "step": 834 }, { "epoch": 1.7477760334903192, "grad_norm": 11.31391716003418, "learning_rate": 1.9669957667857292e-07, "logits/chosen": 1.1484375, "logits/rejected": 1.40625, "logps/chosen": -240.0, "logps/rejected": -224.0, "loss": 0.6174, "rewards/accuracies": 0.25, "rewards/chosen": -1.359375, "rewards/margins": 0.0400390625, "rewards/rejected": -1.40625, "step": 835 }, { "epoch": 1.749869178440607, "grad_norm": 12.320887565612793, "learning_rate": 1.9614414459564215e-07, "logits/chosen": 1.53125, "logits/rejected": 1.3984375, "logps/chosen": -350.0, "logps/rejected": -308.0, "loss": 0.6136, "rewards/accuracies": 0.5, "rewards/chosen": -1.4375, "rewards/margins": 0.14453125, "rewards/rejected": -1.5859375, "step": 836 }, { "epoch": 1.7519623233908947, "grad_norm": 13.618435859680176, "learning_rate": 1.955889913473238e-07, "logits/chosen": 1.875, "logits/rejected": 1.8671875, "logps/chosen": -294.0, "logps/rejected": -402.0, "loss": 0.6388, "rewards/accuracies": 0.75, "rewards/chosen": -1.4140625, "rewards/margins": 0.40625, "rewards/rejected": -1.8203125, "step": 837 }, { "epoch": 1.7540554683411826, "grad_norm": 10.987975120544434, "learning_rate": 1.9503411980654825e-07, "logits/chosen": 2.125, "logits/rejected": 1.8046875, "logps/chosen": -524.0, "logps/rejected": -486.0, "loss": 0.6343, "rewards/accuracies": 0.5, "rewards/chosen": -1.0078125, "rewards/margins": 0.5234375, "rewards/rejected": -1.53125, "step": 838 }, { "epoch": 1.7561486132914705, "grad_norm": 10.229272842407227, "learning_rate": 1.9447953284478773e-07, "logits/chosen": 1.6171875, "logits/rejected": 2.546875, "logps/chosen": -446.0, "logps/rejected": -368.0, "loss": 0.5864, "rewards/accuracies": 0.75, "rewards/chosen": -0.953125, "rewards/margins": 0.2333984375, "rewards/rejected": -1.1875, "step": 839 }, { "epoch": 1.7582417582417582, "grad_norm": 11.316136360168457, "learning_rate": 1.939252333320422e-07, "logits/chosen": 1.25, "logits/rejected": 1.1796875, "logps/chosen": -272.0, "logps/rejected": -468.0, "loss": 0.5621, "rewards/accuracies": 0.75, "rewards/chosen": -1.1171875, "rewards/margins": 1.1953125, "rewards/rejected": -2.3125, "step": 840 }, { "epoch": 1.7603349031920459, "grad_norm": 11.072029113769531, "learning_rate": 1.9337122413682376e-07, "logits/chosen": 2.5625, "logits/rejected": 3.140625, "logps/chosen": -1168.0, "logps/rejected": -656.0, "loss": 0.5701, "rewards/accuracies": 1.0, "rewards/chosen": -1.296875, "rewards/margins": 0.26953125, "rewards/rejected": -1.5703125, "step": 841 }, { "epoch": 1.762428048142334, "grad_norm": 11.808143615722656, "learning_rate": 1.9281750812614204e-07, "logits/chosen": 3.125, "logits/rejected": 2.484375, "logps/chosen": -572.0, "logps/rejected": -776.0, "loss": 0.6283, "rewards/accuracies": 0.5, "rewards/chosen": -1.3125, "rewards/margins": 0.32421875, "rewards/rejected": -1.640625, "step": 842 }, { "epoch": 1.7645211930926217, "grad_norm": 11.55233383178711, "learning_rate": 1.9226408816548979e-07, "logits/chosen": 2.8125, "logits/rejected": 2.5, "logps/chosen": -760.0, "logps/rejected": -704.0, "loss": 0.5926, "rewards/accuracies": 0.75, "rewards/chosen": -1.4296875, "rewards/margins": 0.44140625, "rewards/rejected": -1.875, "step": 843 }, { "epoch": 1.7666143380429093, "grad_norm": 10.958243370056152, "learning_rate": 1.9171096711882734e-07, "logits/chosen": 2.0625, "logits/rejected": 2.296875, "logps/chosen": -470.0, "logps/rejected": -426.0, "loss": 0.5719, "rewards/accuracies": 0.75, "rewards/chosen": -0.9296875, "rewards/margins": 0.81640625, "rewards/rejected": -1.75, "step": 844 }, { "epoch": 1.7687074829931972, "grad_norm": 10.45136833190918, "learning_rate": 1.9115814784856838e-07, "logits/chosen": 2.265625, "logits/rejected": 2.828125, "logps/chosen": -494.0, "logps/rejected": -460.0, "loss": 0.6025, "rewards/accuracies": 0.75, "rewards/chosen": -1.0390625, "rewards/margins": 0.9375, "rewards/rejected": -1.9765625, "step": 845 }, { "epoch": 1.7708006279434851, "grad_norm": 10.515970230102539, "learning_rate": 1.9060563321556467e-07, "logits/chosen": 3.03125, "logits/rejected": 2.6875, "logps/chosen": -700.0, "logps/rejected": -684.0, "loss": 0.5836, "rewards/accuracies": 1.0, "rewards/chosen": -0.9609375, "rewards/margins": 1.015625, "rewards/rejected": -1.96875, "step": 846 }, { "epoch": 1.7728937728937728, "grad_norm": 12.13499927520752, "learning_rate": 1.9005342607909175e-07, "logits/chosen": 1.8671875, "logits/rejected": 1.3203125, "logps/chosen": -244.0, "logps/rejected": -354.0, "loss": 0.6331, "rewards/accuracies": 0.5, "rewards/chosen": -1.1953125, "rewards/margins": 0.0859375, "rewards/rejected": -1.28125, "step": 847 }, { "epoch": 1.7749869178440607, "grad_norm": 11.108990669250488, "learning_rate": 1.8950152929683365e-07, "logits/chosen": 1.453125, "logits/rejected": 1.828125, "logps/chosen": -306.0, "logps/rejected": -262.0, "loss": 0.6347, "rewards/accuracies": 0.5, "rewards/chosen": -1.234375, "rewards/margins": 0.0, "rewards/rejected": -1.234375, "step": 848 }, { "epoch": 1.7770800627943486, "grad_norm": 10.588078498840332, "learning_rate": 1.8894994572486834e-07, "logits/chosen": 1.5546875, "logits/rejected": 2.046875, "logps/chosen": -414.0, "logps/rejected": -576.0, "loss": 0.5927, "rewards/accuracies": 0.5, "rewards/chosen": -1.0234375, "rewards/margins": 0.53515625, "rewards/rejected": -1.5546875, "step": 849 }, { "epoch": 1.7791732077446363, "grad_norm": 10.220610618591309, "learning_rate": 1.8839867821765289e-07, "logits/chosen": 2.625, "logits/rejected": 2.71875, "logps/chosen": -1128.0, "logps/rejected": -656.0, "loss": 0.5764, "rewards/accuracies": 0.5, "rewards/chosen": -1.1953125, "rewards/margins": 0.279296875, "rewards/rejected": -1.46875, "step": 850 }, { "epoch": 1.7812663526949242, "grad_norm": 10.536320686340332, "learning_rate": 1.8784772962800886e-07, "logits/chosen": 2.6875, "logits/rejected": 2.265625, "logps/chosen": -298.0, "logps/rejected": -576.0, "loss": 0.6031, "rewards/accuracies": 0.75, "rewards/chosen": -1.1015625, "rewards/margins": 1.0, "rewards/rejected": -2.09375, "step": 851 }, { "epoch": 1.783359497645212, "grad_norm": 10.931295394897461, "learning_rate": 1.8729710280710732e-07, "logits/chosen": 1.96875, "logits/rejected": 2.15625, "logps/chosen": -474.0, "logps/rejected": -418.0, "loss": 0.6327, "rewards/accuracies": 0.75, "rewards/chosen": -1.15625, "rewards/margins": 0.060546875, "rewards/rejected": -1.2109375, "step": 852 }, { "epoch": 1.7854526425954997, "grad_norm": 10.23117446899414, "learning_rate": 1.867468006044541e-07, "logits/chosen": 2.796875, "logits/rejected": 3.125, "logps/chosen": -948.0, "logps/rejected": -948.0, "loss": 0.5796, "rewards/accuracies": 1.0, "rewards/chosen": -1.2890625, "rewards/margins": 0.625, "rewards/rejected": -1.90625, "step": 853 }, { "epoch": 1.7875457875457874, "grad_norm": 10.816021919250488, "learning_rate": 1.8619682586787537e-07, "logits/chosen": 1.4140625, "logits/rejected": 1.8828125, "logps/chosen": -628.0, "logps/rejected": -544.0, "loss": 0.5762, "rewards/accuracies": 1.0, "rewards/chosen": -1.3125, "rewards/margins": 0.546875, "rewards/rejected": -1.859375, "step": 854 }, { "epoch": 1.7896389324960753, "grad_norm": 11.199675559997559, "learning_rate": 1.8564718144350244e-07, "logits/chosen": 2.25, "logits/rejected": 3.671875, "logps/chosen": -760.0, "logps/rejected": -480.0, "loss": 0.5731, "rewards/accuracies": 0.5, "rewards/chosen": -1.3046875, "rewards/margins": 0.21484375, "rewards/rejected": -1.515625, "step": 855 }, { "epoch": 1.7917320774463632, "grad_norm": 10.781134605407715, "learning_rate": 1.850978701757572e-07, "logits/chosen": 2.390625, "logits/rejected": 2.953125, "logps/chosen": -732.0, "logps/rejected": -332.0, "loss": 0.5674, "rewards/accuracies": 0.5, "rewards/chosen": -1.390625, "rewards/margins": -0.0244140625, "rewards/rejected": -1.3671875, "step": 856 }, { "epoch": 1.7938252223966509, "grad_norm": 10.734904289245605, "learning_rate": 1.8454889490733757e-07, "logits/chosen": 1.8203125, "logits/rejected": 1.9453125, "logps/chosen": -596.0, "logps/rejected": -440.0, "loss": 0.5771, "rewards/accuracies": 0.5, "rewards/chosen": -1.2890625, "rewards/margins": -0.1591796875, "rewards/rejected": -1.1328125, "step": 857 }, { "epoch": 1.7959183673469388, "grad_norm": 10.833751678466797, "learning_rate": 1.840002584792027e-07, "logits/chosen": 1.3046875, "logits/rejected": 2.1875, "logps/chosen": -418.0, "logps/rejected": -436.0, "loss": 0.5985, "rewards/accuracies": 0.5, "rewards/chosen": -1.578125, "rewards/margins": 0.2890625, "rewards/rejected": -1.8671875, "step": 858 }, { "epoch": 1.7980115122972267, "grad_norm": 10.765353202819824, "learning_rate": 1.8345196373055826e-07, "logits/chosen": 1.375, "logits/rejected": 1.4296875, "logps/chosen": -612.0, "logps/rejected": -342.0, "loss": 0.5849, "rewards/accuracies": 0.25, "rewards/chosen": -1.9296875, "rewards/margins": -0.6796875, "rewards/rejected": -1.2421875, "step": 859 }, { "epoch": 1.8001046572475143, "grad_norm": 10.382110595703125, "learning_rate": 1.8290401349884158e-07, "logits/chosen": 2.109375, "logits/rejected": 2.671875, "logps/chosen": -492.0, "logps/rejected": -326.0, "loss": 0.5628, "rewards/accuracies": 0.75, "rewards/chosen": -1.2265625, "rewards/margins": 0.375, "rewards/rejected": -1.6015625, "step": 860 }, { "epoch": 1.8021978021978022, "grad_norm": 10.998440742492676, "learning_rate": 1.8235641061970693e-07, "logits/chosen": 2.5, "logits/rejected": 1.71875, "logps/chosen": -320.0, "logps/rejected": -536.0, "loss": 0.585, "rewards/accuracies": 1.0, "rewards/chosen": -1.4296875, "rewards/margins": 0.7734375, "rewards/rejected": -2.203125, "step": 861 }, { "epoch": 1.8042909471480901, "grad_norm": 10.22006607055664, "learning_rate": 1.8180915792701165e-07, "logits/chosen": 1.5390625, "logits/rejected": 1.7734375, "logps/chosen": -616.0, "logps/rejected": -280.0, "loss": 0.6156, "rewards/accuracies": 1.0, "rewards/chosen": -0.7578125, "rewards/margins": 0.30078125, "rewards/rejected": -1.0625, "step": 862 }, { "epoch": 1.8063840920983778, "grad_norm": 11.575730323791504, "learning_rate": 1.8126225825280022e-07, "logits/chosen": 1.9609375, "logits/rejected": 2.28125, "logps/chosen": -544.0, "logps/rejected": -438.0, "loss": 0.6018, "rewards/accuracies": 0.75, "rewards/chosen": -1.21875, "rewards/margins": 0.2890625, "rewards/rejected": -1.5, "step": 863 }, { "epoch": 1.8084772370486655, "grad_norm": 10.908926010131836, "learning_rate": 1.807157144272905e-07, "logits/chosen": 1.78125, "logits/rejected": 2.21875, "logps/chosen": -386.0, "logps/rejected": -402.0, "loss": 0.5911, "rewards/accuracies": 0.75, "rewards/chosen": -1.0625, "rewards/margins": 0.2060546875, "rewards/rejected": -1.265625, "step": 864 }, { "epoch": 1.8105703819989536, "grad_norm": 11.957222938537598, "learning_rate": 1.8016952927885893e-07, "logits/chosen": 2.28125, "logits/rejected": 2.125, "logps/chosen": -568.0, "logps/rejected": -676.0, "loss": 0.6458, "rewards/accuracies": 0.75, "rewards/chosen": -1.578125, "rewards/margins": 0.31640625, "rewards/rejected": -1.890625, "step": 865 }, { "epoch": 1.8126635269492413, "grad_norm": 10.530887603759766, "learning_rate": 1.7962370563402566e-07, "logits/chosen": 1.3359375, "logits/rejected": 2.109375, "logps/chosen": -390.0, "logps/rejected": -246.0, "loss": 0.5985, "rewards/accuracies": 0.5, "rewards/chosen": -0.96875, "rewards/margins": 0.09375, "rewards/rejected": -1.0625, "step": 866 }, { "epoch": 1.814756671899529, "grad_norm": 10.46704387664795, "learning_rate": 1.7907824631744e-07, "logits/chosen": 2.375, "logits/rejected": 1.953125, "logps/chosen": -544.0, "logps/rejected": -450.0, "loss": 0.5895, "rewards/accuracies": 0.75, "rewards/chosen": -1.03125, "rewards/margins": 0.6328125, "rewards/rejected": -1.65625, "step": 867 }, { "epoch": 1.8168498168498168, "grad_norm": 11.886927604675293, "learning_rate": 1.7853315415186579e-07, "logits/chosen": 1.7734375, "logits/rejected": 1.78125, "logps/chosen": -508.0, "logps/rejected": -394.0, "loss": 0.6206, "rewards/accuracies": 0.5, "rewards/chosen": -1.1796875, "rewards/margins": 0.482421875, "rewards/rejected": -1.6640625, "step": 868 }, { "epoch": 1.8189429618001047, "grad_norm": 10.469581604003906, "learning_rate": 1.779884319581673e-07, "logits/chosen": 1.90625, "logits/rejected": 1.875, "logps/chosen": -440.0, "logps/rejected": -456.0, "loss": 0.5681, "rewards/accuracies": 0.75, "rewards/chosen": -0.63671875, "rewards/margins": 0.55078125, "rewards/rejected": -1.1875, "step": 869 }, { "epoch": 1.8210361067503924, "grad_norm": 10.928384780883789, "learning_rate": 1.7744408255529361e-07, "logits/chosen": 1.34375, "logits/rejected": 2.265625, "logps/chosen": -580.0, "logps/rejected": -478.0, "loss": 0.5957, "rewards/accuracies": 0.75, "rewards/chosen": -1.4921875, "rewards/margins": 0.310546875, "rewards/rejected": -1.8046875, "step": 870 }, { "epoch": 1.8231292517006803, "grad_norm": 11.822400093078613, "learning_rate": 1.7690010876026495e-07, "logits/chosen": 2.28125, "logits/rejected": 2.21875, "logps/chosen": -552.0, "logps/rejected": -442.0, "loss": 0.642, "rewards/accuracies": 0.5, "rewards/chosen": -1.515625, "rewards/margins": -0.140625, "rewards/rejected": -1.375, "step": 871 }, { "epoch": 1.8252223966509682, "grad_norm": 11.517511367797852, "learning_rate": 1.7635651338815767e-07, "logits/chosen": 1.4921875, "logits/rejected": 1.53125, "logps/chosen": -350.0, "logps/rejected": -320.0, "loss": 0.5991, "rewards/accuracies": 0.75, "rewards/chosen": -1.328125, "rewards/margins": 0.29296875, "rewards/rejected": -1.6171875, "step": 872 }, { "epoch": 1.8273155416012559, "grad_norm": 11.321386337280273, "learning_rate": 1.758132992520898e-07, "logits/chosen": 0.82421875, "logits/rejected": 0.80078125, "logps/chosen": -312.0, "logps/rejected": -296.0, "loss": 0.5651, "rewards/accuracies": 0.75, "rewards/chosen": -1.1796875, "rewards/margins": -0.001953125, "rewards/rejected": -1.171875, "step": 873 }, { "epoch": 1.8294086865515435, "grad_norm": 10.976082801818848, "learning_rate": 1.7527046916320643e-07, "logits/chosen": 1.4296875, "logits/rejected": 2.296875, "logps/chosen": -620.0, "logps/rejected": -492.0, "loss": 0.6224, "rewards/accuracies": 0.5, "rewards/chosen": -1.4140625, "rewards/margins": 0.09033203125, "rewards/rejected": -1.5078125, "step": 874 }, { "epoch": 1.8315018315018317, "grad_norm": 10.818897247314453, "learning_rate": 1.7472802593066518e-07, "logits/chosen": 1.6796875, "logits/rejected": 1.5859375, "logps/chosen": -446.0, "logps/rejected": -418.0, "loss": 0.625, "rewards/accuracies": 0.75, "rewards/chosen": -1.453125, "rewards/margins": 0.3046875, "rewards/rejected": -1.765625, "step": 875 }, { "epoch": 1.8335949764521193, "grad_norm": 10.937468528747559, "learning_rate": 1.7418597236162187e-07, "logits/chosen": 1.5859375, "logits/rejected": 2.0625, "logps/chosen": -448.0, "logps/rejected": -988.0, "loss": 0.6065, "rewards/accuracies": 0.75, "rewards/chosen": -1.8671875, "rewards/margins": 1.140625, "rewards/rejected": -3.0, "step": 876 }, { "epoch": 1.835688121402407, "grad_norm": 11.407981872558594, "learning_rate": 1.7364431126121546e-07, "logits/chosen": 1.21875, "logits/rejected": 1.921875, "logps/chosen": -292.0, "logps/rejected": -201.0, "loss": 0.6084, "rewards/accuracies": 0.25, "rewards/chosen": -1.09375, "rewards/margins": 0.09375, "rewards/rejected": -1.1875, "step": 877 }, { "epoch": 1.837781266352695, "grad_norm": 9.613057136535645, "learning_rate": 1.7310304543255417e-07, "logits/chosen": 2.140625, "logits/rejected": 2.21875, "logps/chosen": -584.0, "logps/rejected": -384.0, "loss": 0.5748, "rewards/accuracies": 1.0, "rewards/chosen": -1.359375, "rewards/margins": 0.40625, "rewards/rejected": -1.765625, "step": 878 }, { "epoch": 1.8398744113029828, "grad_norm": 11.72396469116211, "learning_rate": 1.7256217767670046e-07, "logits/chosen": 1.9609375, "logits/rejected": 2.078125, "logps/chosen": -498.0, "logps/rejected": -576.0, "loss": 0.6049, "rewards/accuracies": 1.0, "rewards/chosen": -1.03125, "rewards/margins": 1.03125, "rewards/rejected": -2.0625, "step": 879 }, { "epoch": 1.8419675562532705, "grad_norm": 11.825212478637695, "learning_rate": 1.7202171079265702e-07, "logits/chosen": 2.25, "logits/rejected": 1.640625, "logps/chosen": -396.0, "logps/rejected": -408.0, "loss": 0.6028, "rewards/accuracies": 0.75, "rewards/chosen": -1.1875, "rewards/margins": 0.20703125, "rewards/rejected": -1.390625, "step": 880 }, { "epoch": 1.8440607012035584, "grad_norm": 13.535147666931152, "learning_rate": 1.7148164757735178e-07, "logits/chosen": 1.4296875, "logits/rejected": 1.9140625, "logps/chosen": -492.0, "logps/rejected": -450.0, "loss": 0.6377, "rewards/accuracies": 1.0, "rewards/chosen": -0.765625, "rewards/margins": 0.515625, "rewards/rejected": -1.28125, "step": 881 }, { "epoch": 1.8461538461538463, "grad_norm": 10.785517692565918, "learning_rate": 1.7094199082562378e-07, "logits/chosen": 1.3125, "logits/rejected": 2.203125, "logps/chosen": -374.0, "logps/rejected": -320.0, "loss": 0.6003, "rewards/accuracies": 0.75, "rewards/chosen": -0.91796875, "rewards/margins": 0.244140625, "rewards/rejected": -1.1640625, "step": 882 }, { "epoch": 1.848246991104134, "grad_norm": 10.35688591003418, "learning_rate": 1.7040274333020858e-07, "logits/chosen": 1.4296875, "logits/rejected": 1.8203125, "logps/chosen": -616.0, "logps/rejected": -468.0, "loss": 0.5577, "rewards/accuracies": 0.75, "rewards/chosen": -1.1328125, "rewards/margins": 0.29296875, "rewards/rejected": -1.4296875, "step": 883 }, { "epoch": 1.8503401360544216, "grad_norm": 10.81713581085205, "learning_rate": 1.6986390788172395e-07, "logits/chosen": 1.5234375, "logits/rejected": 1.8984375, "logps/chosen": -322.0, "logps/rejected": -320.0, "loss": 0.5617, "rewards/accuracies": 0.5, "rewards/chosen": -1.234375, "rewards/margins": 0.2490234375, "rewards/rejected": -1.484375, "step": 884 }, { "epoch": 1.8524332810047097, "grad_norm": 12.084059715270996, "learning_rate": 1.6932548726865504e-07, "logits/chosen": 2.90625, "logits/rejected": 2.828125, "logps/chosen": -756.0, "logps/rejected": -904.0, "loss": 0.6348, "rewards/accuracies": 0.75, "rewards/chosen": -1.5390625, "rewards/margins": 0.78125, "rewards/rejected": -2.3125, "step": 885 }, { "epoch": 1.8545264259549974, "grad_norm": 11.9383544921875, "learning_rate": 1.687874842773403e-07, "logits/chosen": 1.8359375, "logits/rejected": 2.875, "logps/chosen": -528.0, "logps/rejected": -392.0, "loss": 0.6028, "rewards/accuracies": 1.0, "rewards/chosen": -0.75, "rewards/margins": 0.6875, "rewards/rejected": -1.4375, "step": 886 }, { "epoch": 1.856619570905285, "grad_norm": 11.799210548400879, "learning_rate": 1.682499016919573e-07, "logits/chosen": 1.6484375, "logits/rejected": 1.5703125, "logps/chosen": -340.0, "logps/rejected": -364.0, "loss": 0.6263, "rewards/accuracies": 0.5, "rewards/chosen": -1.296875, "rewards/margins": 0.3671875, "rewards/rejected": -1.6640625, "step": 887 }, { "epoch": 1.858712715855573, "grad_norm": 10.321686744689941, "learning_rate": 1.6771274229450764e-07, "logits/chosen": 2.171875, "logits/rejected": 2.09375, "logps/chosen": -624.0, "logps/rejected": -600.0, "loss": 0.5805, "rewards/accuracies": 0.75, "rewards/chosen": -1.0546875, "rewards/margins": 0.46484375, "rewards/rejected": -1.5234375, "step": 888 }, { "epoch": 1.8608058608058609, "grad_norm": 11.320591926574707, "learning_rate": 1.6717600886480297e-07, "logits/chosen": 1.7734375, "logits/rejected": 2.546875, "logps/chosen": -612.0, "logps/rejected": -792.0, "loss": 0.6008, "rewards/accuracies": 0.25, "rewards/chosen": -1.5, "rewards/margins": -0.0703125, "rewards/rejected": -1.421875, "step": 889 }, { "epoch": 1.8628990057561485, "grad_norm": 11.110066413879395, "learning_rate": 1.6663970418045052e-07, "logits/chosen": 1.6953125, "logits/rejected": 2.375, "logps/chosen": -552.0, "logps/rejected": -414.0, "loss": 0.5938, "rewards/accuracies": 0.5, "rewards/chosen": -1.5, "rewards/margins": 0.140625, "rewards/rejected": -1.640625, "step": 890 }, { "epoch": 1.8649921507064364, "grad_norm": 12.742599487304688, "learning_rate": 1.6610383101683913e-07, "logits/chosen": 1.9140625, "logits/rejected": 1.6328125, "logps/chosen": -314.0, "logps/rejected": -608.0, "loss": 0.6081, "rewards/accuracies": 0.5, "rewards/chosen": -1.234375, "rewards/margins": 0.47265625, "rewards/rejected": -1.7109375, "step": 891 }, { "epoch": 1.8670852956567243, "grad_norm": 10.694188117980957, "learning_rate": 1.6556839214712397e-07, "logits/chosen": 1.6875, "logits/rejected": 2.109375, "logps/chosen": -444.0, "logps/rejected": -458.0, "loss": 0.5673, "rewards/accuracies": 0.75, "rewards/chosen": -1.140625, "rewards/margins": 0.80078125, "rewards/rejected": -1.9375, "step": 892 }, { "epoch": 1.869178440607012, "grad_norm": 11.569079399108887, "learning_rate": 1.6503339034221296e-07, "logits/chosen": 1.703125, "logits/rejected": 1.5234375, "logps/chosen": -592.0, "logps/rejected": -744.0, "loss": 0.6228, "rewards/accuracies": 1.0, "rewards/chosen": -1.3203125, "rewards/margins": 0.33203125, "rewards/rejected": -1.65625, "step": 893 }, { "epoch": 1.8712715855572999, "grad_norm": 11.102907180786133, "learning_rate": 1.644988283707524e-07, "logits/chosen": 2.5625, "logits/rejected": 2.203125, "logps/chosen": -504.0, "logps/rejected": -656.0, "loss": 0.5926, "rewards/accuracies": 1.0, "rewards/chosen": -1.21875, "rewards/margins": 0.87109375, "rewards/rejected": -2.09375, "step": 894 }, { "epoch": 1.8733647305075878, "grad_norm": 10.342639923095703, "learning_rate": 1.639647089991121e-07, "logits/chosen": 2.21875, "logits/rejected": 2.59375, "logps/chosen": -348.0, "logps/rejected": -376.0, "loss": 0.5621, "rewards/accuracies": 0.75, "rewards/chosen": -1.1953125, "rewards/margins": 0.236328125, "rewards/rejected": -1.4296875, "step": 895 }, { "epoch": 1.8754578754578755, "grad_norm": 11.535297393798828, "learning_rate": 1.6343103499137167e-07, "logits/chosen": 1.671875, "logits/rejected": 1.90625, "logps/chosen": -336.0, "logps/rejected": -390.0, "loss": 0.6047, "rewards/accuracies": 0.75, "rewards/chosen": -1.703125, "rewards/margins": 0.48828125, "rewards/rejected": -2.1875, "step": 896 }, { "epoch": 1.8775510204081631, "grad_norm": 10.489069938659668, "learning_rate": 1.628978091093056e-07, "logits/chosen": 2.09375, "logits/rejected": 2.609375, "logps/chosen": -788.0, "logps/rejected": -536.0, "loss": 0.5512, "rewards/accuracies": 0.75, "rewards/chosen": -1.359375, "rewards/margins": 0.1796875, "rewards/rejected": -1.5390625, "step": 897 }, { "epoch": 1.879644165358451, "grad_norm": 11.661781311035156, "learning_rate": 1.6236503411236996e-07, "logits/chosen": 2.15625, "logits/rejected": 2.46875, "logps/chosen": -358.0, "logps/rejected": -376.0, "loss": 0.5957, "rewards/accuracies": 0.5, "rewards/chosen": -1.2109375, "rewards/margins": 0.158203125, "rewards/rejected": -1.3671875, "step": 898 }, { "epoch": 1.881737310308739, "grad_norm": 11.3607816696167, "learning_rate": 1.6183271275768678e-07, "logits/chosen": 1.9765625, "logits/rejected": 1.953125, "logps/chosen": -330.0, "logps/rejected": -342.0, "loss": 0.6138, "rewards/accuracies": 0.75, "rewards/chosen": -1.234375, "rewards/margins": 0.30859375, "rewards/rejected": -1.546875, "step": 899 }, { "epoch": 1.8838304552590266, "grad_norm": 10.10186767578125, "learning_rate": 1.6130084780003093e-07, "logits/chosen": 3.375, "logits/rejected": 3.09375, "logps/chosen": -960.0, "logps/rejected": -948.0, "loss": 0.5809, "rewards/accuracies": 0.5, "rewards/chosen": -1.5, "rewards/margins": 0.056640625, "rewards/rejected": -1.5625, "step": 900 }, { "epoch": 1.8859236002093145, "grad_norm": 11.025842666625977, "learning_rate": 1.607694419918151e-07, "logits/chosen": 1.7890625, "logits/rejected": 1.96875, "logps/chosen": -652.0, "logps/rejected": -684.0, "loss": 0.5718, "rewards/accuracies": 0.75, "rewards/chosen": -1.8203125, "rewards/margins": 0.68359375, "rewards/rejected": -2.5, "step": 901 }, { "epoch": 1.8880167451596024, "grad_norm": 10.691457748413086, "learning_rate": 1.602384980830762e-07, "logits/chosen": 2.34375, "logits/rejected": 1.5390625, "logps/chosen": -460.0, "logps/rejected": -402.0, "loss": 0.5914, "rewards/accuracies": 0.5, "rewards/chosen": -1.015625, "rewards/margins": 0.7578125, "rewards/rejected": -1.7734375, "step": 902 }, { "epoch": 1.89010989010989, "grad_norm": 10.016210556030273, "learning_rate": 1.597080188214607e-07, "logits/chosen": 1.390625, "logits/rejected": 2.40625, "logps/chosen": -380.0, "logps/rejected": -376.0, "loss": 0.559, "rewards/accuracies": 0.75, "rewards/chosen": -0.98046875, "rewards/margins": 0.173828125, "rewards/rejected": -1.15625, "step": 903 }, { "epoch": 1.892203035060178, "grad_norm": 12.242632865905762, "learning_rate": 1.5917800695221019e-07, "logits/chosen": 2.203125, "logits/rejected": 2.65625, "logps/chosen": -516.0, "logps/rejected": -368.0, "loss": 0.605, "rewards/accuracies": 1.0, "rewards/chosen": -0.9765625, "rewards/margins": 0.359375, "rewards/rejected": -1.3359375, "step": 904 }, { "epoch": 1.8942961800104658, "grad_norm": 11.27692699432373, "learning_rate": 1.5864846521814807e-07, "logits/chosen": 1.671875, "logits/rejected": 1.5859375, "logps/chosen": -286.0, "logps/rejected": -584.0, "loss": 0.6068, "rewards/accuracies": 1.0, "rewards/chosen": -1.15625, "rewards/margins": 0.578125, "rewards/rejected": -1.734375, "step": 905 }, { "epoch": 1.8963893249607535, "grad_norm": 10.626609802246094, "learning_rate": 1.5811939635966424e-07, "logits/chosen": 1.8671875, "logits/rejected": 2.4375, "logps/chosen": -436.0, "logps/rejected": -272.0, "loss": 0.5666, "rewards/accuracies": 1.0, "rewards/chosen": -0.99609375, "rewards/margins": 0.53125, "rewards/rejected": -1.5234375, "step": 906 }, { "epoch": 1.8984824699110412, "grad_norm": 10.588828086853027, "learning_rate": 1.5759080311470184e-07, "logits/chosen": 1.921875, "logits/rejected": 1.3671875, "logps/chosen": -470.0, "logps/rejected": -510.0, "loss": 0.6039, "rewards/accuracies": 1.0, "rewards/chosen": -1.1953125, "rewards/margins": 0.39453125, "rewards/rejected": -1.59375, "step": 907 }, { "epoch": 1.9005756148613293, "grad_norm": 10.58249568939209, "learning_rate": 1.570626882187423e-07, "logits/chosen": 1.671875, "logits/rejected": 1.75, "logps/chosen": -230.0, "logps/rejected": -360.0, "loss": 0.5565, "rewards/accuracies": 0.5, "rewards/chosen": -1.3515625, "rewards/margins": 0.578125, "rewards/rejected": -1.921875, "step": 908 }, { "epoch": 1.902668759811617, "grad_norm": 11.328306198120117, "learning_rate": 1.5653505440479215e-07, "logits/chosen": 2.703125, "logits/rejected": 2.71875, "logps/chosen": -832.0, "logps/rejected": -584.0, "loss": 0.6241, "rewards/accuracies": 1.0, "rewards/chosen": -0.9453125, "rewards/margins": 0.41015625, "rewards/rejected": -1.359375, "step": 909 }, { "epoch": 1.9047619047619047, "grad_norm": 12.08849811553955, "learning_rate": 1.5600790440336784e-07, "logits/chosen": 2.140625, "logits/rejected": 2.296875, "logps/chosen": -596.0, "logps/rejected": -576.0, "loss": 0.6246, "rewards/accuracies": 0.75, "rewards/chosen": -1.0390625, "rewards/margins": 0.44921875, "rewards/rejected": -1.4921875, "step": 910 }, { "epoch": 1.9068550497121926, "grad_norm": 10.502798080444336, "learning_rate": 1.554812409424822e-07, "logits/chosen": 2.21875, "logits/rejected": 3.71875, "logps/chosen": -736.0, "logps/rejected": -632.0, "loss": 0.5988, "rewards/accuracies": 0.75, "rewards/chosen": -1.6171875, "rewards/margins": 0.416015625, "rewards/rejected": -2.03125, "step": 911 }, { "epoch": 1.9089481946624804, "grad_norm": 11.264755249023438, "learning_rate": 1.5495506674763014e-07, "logits/chosen": 1.359375, "logits/rejected": 1.8125, "logps/chosen": -228.0, "logps/rejected": -388.0, "loss": 0.5653, "rewards/accuracies": 1.0, "rewards/chosen": -1.1875, "rewards/margins": 0.75, "rewards/rejected": -1.9375, "step": 912 }, { "epoch": 1.9110413396127681, "grad_norm": 14.091341972351074, "learning_rate": 1.544293845417749e-07, "logits/chosen": 1.6171875, "logits/rejected": 1.953125, "logps/chosen": -592.0, "logps/rejected": -284.0, "loss": 0.6636, "rewards/accuracies": 0.75, "rewards/chosen": -1.7109375, "rewards/margins": -0.28125, "rewards/rejected": -1.4375, "step": 913 }, { "epoch": 1.913134484563056, "grad_norm": 10.890253067016602, "learning_rate": 1.5390419704533341e-07, "logits/chosen": 2.671875, "logits/rejected": 3.359375, "logps/chosen": -800.0, "logps/rejected": -776.0, "loss": 0.5613, "rewards/accuracies": 0.75, "rewards/chosen": -1.796875, "rewards/margins": 0.484375, "rewards/rejected": -2.28125, "step": 914 }, { "epoch": 1.915227629513344, "grad_norm": 11.80103588104248, "learning_rate": 1.5337950697616237e-07, "logits/chosen": 1.53125, "logits/rejected": 2.921875, "logps/chosen": -552.0, "logps/rejected": -580.0, "loss": 0.5861, "rewards/accuracies": 1.0, "rewards/chosen": -1.0, "rewards/margins": 0.5234375, "rewards/rejected": -1.5234375, "step": 915 }, { "epoch": 1.9173207744636316, "grad_norm": 11.781697273254395, "learning_rate": 1.5285531704954466e-07, "logits/chosen": 1.96875, "logits/rejected": 2.265625, "logps/chosen": -308.0, "logps/rejected": -296.0, "loss": 0.6292, "rewards/accuracies": 1.0, "rewards/chosen": -1.03125, "rewards/margins": 0.4296875, "rewards/rejected": -1.4609375, "step": 916 }, { "epoch": 1.9194139194139193, "grad_norm": 11.060654640197754, "learning_rate": 1.5233162997817455e-07, "logits/chosen": 2.4375, "logits/rejected": 2.125, "logps/chosen": -302.0, "logps/rejected": -460.0, "loss": 0.5788, "rewards/accuracies": 1.0, "rewards/chosen": -1.453125, "rewards/margins": 0.23046875, "rewards/rejected": -1.6875, "step": 917 }, { "epoch": 1.9215070643642074, "grad_norm": 10.4055814743042, "learning_rate": 1.5180844847214423e-07, "logits/chosen": 3.09375, "logits/rejected": 3.25, "logps/chosen": -816.0, "logps/rejected": -486.0, "loss": 0.5728, "rewards/accuracies": 1.0, "rewards/chosen": -0.9765625, "rewards/margins": 0.68359375, "rewards/rejected": -1.65625, "step": 918 }, { "epoch": 1.923600209314495, "grad_norm": 10.363648414611816, "learning_rate": 1.5128577523892936e-07, "logits/chosen": 1.84375, "logits/rejected": 1.7734375, "logps/chosen": -302.0, "logps/rejected": -264.0, "loss": 0.5987, "rewards/accuracies": 0.5, "rewards/chosen": -1.3671875, "rewards/margins": -0.1220703125, "rewards/rejected": -1.2421875, "step": 919 }, { "epoch": 1.9256933542647827, "grad_norm": 11.018433570861816, "learning_rate": 1.5076361298337561e-07, "logits/chosen": 2.40625, "logits/rejected": 2.421875, "logps/chosen": -544.0, "logps/rejected": -452.0, "loss": 0.5838, "rewards/accuracies": 0.75, "rewards/chosen": -1.09375, "rewards/margins": 0.09765625, "rewards/rejected": -1.1875, "step": 920 }, { "epoch": 1.9277864992150706, "grad_norm": 11.115631103515625, "learning_rate": 1.50241964407684e-07, "logits/chosen": 1.5078125, "logits/rejected": 1.640625, "logps/chosen": -388.0, "logps/rejected": -524.0, "loss": 0.5934, "rewards/accuracies": 0.75, "rewards/chosen": -1.5625, "rewards/margins": 0.345703125, "rewards/rejected": -1.90625, "step": 921 }, { "epoch": 1.9298796441653585, "grad_norm": 11.797347068786621, "learning_rate": 1.4972083221139747e-07, "logits/chosen": 2.46875, "logits/rejected": 2.234375, "logps/chosen": -616.0, "logps/rejected": -512.0, "loss": 0.6202, "rewards/accuracies": 1.0, "rewards/chosen": -0.9609375, "rewards/margins": 0.5546875, "rewards/rejected": -1.515625, "step": 922 }, { "epoch": 1.9319727891156462, "grad_norm": 11.789385795593262, "learning_rate": 1.4920021909138656e-07, "logits/chosen": 2.1875, "logits/rejected": 2.6875, "logps/chosen": -404.0, "logps/rejected": -318.0, "loss": 0.5951, "rewards/accuracies": 0.75, "rewards/chosen": -1.2109375, "rewards/margins": 0.17578125, "rewards/rejected": -1.390625, "step": 923 }, { "epoch": 1.934065934065934, "grad_norm": 11.329683303833008, "learning_rate": 1.4868012774183568e-07, "logits/chosen": 1.6484375, "logits/rejected": 1.4140625, "logps/chosen": -324.0, "logps/rejected": -548.0, "loss": 0.611, "rewards/accuracies": 0.75, "rewards/chosen": -1.28125, "rewards/margins": 0.890625, "rewards/rejected": -2.171875, "step": 924 }, { "epoch": 1.936159079016222, "grad_norm": 11.30130672454834, "learning_rate": 1.4816056085422904e-07, "logits/chosen": 2.21875, "logits/rejected": 2.75, "logps/chosen": -438.0, "logps/rejected": -496.0, "loss": 0.5717, "rewards/accuracies": 0.75, "rewards/chosen": -1.3125, "rewards/margins": 0.291015625, "rewards/rejected": -1.609375, "step": 925 }, { "epoch": 1.9382522239665096, "grad_norm": 11.121234893798828, "learning_rate": 1.4764152111733649e-07, "logits/chosen": 1.9296875, "logits/rejected": 2.296875, "logps/chosen": -380.0, "logps/rejected": -382.0, "loss": 0.5575, "rewards/accuracies": 0.75, "rewards/chosen": -1.265625, "rewards/margins": 0.6328125, "rewards/rejected": -1.890625, "step": 926 }, { "epoch": 1.9403453689167975, "grad_norm": 11.710637092590332, "learning_rate": 1.471230112172004e-07, "logits/chosen": 2.640625, "logits/rejected": 3.015625, "logps/chosen": -744.0, "logps/rejected": -580.0, "loss": 0.5562, "rewards/accuracies": 1.0, "rewards/chosen": -0.890625, "rewards/margins": 1.6875, "rewards/rejected": -2.578125, "step": 927 }, { "epoch": 1.9424385138670854, "grad_norm": 11.492278099060059, "learning_rate": 1.466050338371207e-07, "logits/chosen": 2.59375, "logits/rejected": 3.15625, "logps/chosen": -524.0, "logps/rejected": -380.0, "loss": 0.5853, "rewards/accuracies": 0.5, "rewards/chosen": -1.75, "rewards/margins": 0.07421875, "rewards/rejected": -1.8203125, "step": 928 }, { "epoch": 1.944531658817373, "grad_norm": 11.663466453552246, "learning_rate": 1.460875916576418e-07, "logits/chosen": 2.0625, "logits/rejected": 3.46875, "logps/chosen": -756.0, "logps/rejected": -580.0, "loss": 0.59, "rewards/accuracies": 0.75, "rewards/chosen": -1.328125, "rewards/margins": 0.06640625, "rewards/rejected": -1.3984375, "step": 929 }, { "epoch": 1.9466248037676608, "grad_norm": 11.041068077087402, "learning_rate": 1.4557068735653835e-07, "logits/chosen": 1.59375, "logits/rejected": 1.5703125, "logps/chosen": -466.0, "logps/rejected": -382.0, "loss": 0.6007, "rewards/accuracies": 1.0, "rewards/chosen": -1.0625, "rewards/margins": 0.41015625, "rewards/rejected": -1.4765625, "step": 930 }, { "epoch": 1.9487179487179487, "grad_norm": 11.396171569824219, "learning_rate": 1.4505432360880155e-07, "logits/chosen": 2.59375, "logits/rejected": 2.640625, "logps/chosen": -664.0, "logps/rejected": -568.0, "loss": 0.5673, "rewards/accuracies": 1.0, "rewards/chosen": -1.4296875, "rewards/margins": 0.625, "rewards/rejected": -2.0625, "step": 931 }, { "epoch": 1.9508110936682366, "grad_norm": 11.42209243774414, "learning_rate": 1.4453850308662502e-07, "logits/chosen": 2.65625, "logits/rejected": 2.421875, "logps/chosen": -406.0, "logps/rejected": -418.0, "loss": 0.5992, "rewards/accuracies": 0.5, "rewards/chosen": -1.25, "rewards/margins": 0.0673828125, "rewards/rejected": -1.3203125, "step": 932 }, { "epoch": 1.9529042386185242, "grad_norm": 11.87649154663086, "learning_rate": 1.4402322845939152e-07, "logits/chosen": 1.171875, "logits/rejected": 1.328125, "logps/chosen": -310.0, "logps/rejected": -552.0, "loss": 0.6093, "rewards/accuracies": 1.0, "rewards/chosen": -1.4140625, "rewards/margins": 0.4296875, "rewards/rejected": -1.84375, "step": 933 }, { "epoch": 1.9549973835688121, "grad_norm": 12.434530258178711, "learning_rate": 1.4350850239365836e-07, "logits/chosen": 1.484375, "logits/rejected": 1.7421875, "logps/chosen": -488.0, "logps/rejected": -484.0, "loss": 0.5743, "rewards/accuracies": 1.0, "rewards/chosen": -1.6875, "rewards/margins": 0.5703125, "rewards/rejected": -2.25, "step": 934 }, { "epoch": 1.9570905285191, "grad_norm": 11.32255744934082, "learning_rate": 1.4299432755314434e-07, "logits/chosen": 1.59375, "logits/rejected": 1.7265625, "logps/chosen": -298.0, "logps/rejected": -268.0, "loss": 0.5831, "rewards/accuracies": 0.5, "rewards/chosen": -0.9296875, "rewards/margins": 0.345703125, "rewards/rejected": -1.2734375, "step": 935 }, { "epoch": 1.9591836734693877, "grad_norm": 10.894527435302734, "learning_rate": 1.424807065987157e-07, "logits/chosen": 1.265625, "logits/rejected": 1.7109375, "logps/chosen": -326.0, "logps/rejected": -544.0, "loss": 0.5713, "rewards/accuracies": 1.0, "rewards/chosen": -1.0234375, "rewards/margins": 0.6328125, "rewards/rejected": -1.65625, "step": 936 }, { "epoch": 1.9612768184196756, "grad_norm": 12.204416275024414, "learning_rate": 1.41967642188372e-07, "logits/chosen": 2.1875, "logits/rejected": 3.375, "logps/chosen": -556.0, "logps/rejected": -412.0, "loss": 0.6099, "rewards/accuracies": 0.25, "rewards/chosen": -1.6171875, "rewards/margins": 0.09765625, "rewards/rejected": -1.71875, "step": 937 }, { "epoch": 1.9633699633699635, "grad_norm": 11.9826078414917, "learning_rate": 1.4145513697723298e-07, "logits/chosen": 1.0078125, "logits/rejected": 1.2265625, "logps/chosen": -532.0, "logps/rejected": -370.0, "loss": 0.5968, "rewards/accuracies": 0.5, "rewards/chosen": -2.015625, "rewards/margins": -0.4140625, "rewards/rejected": -1.6015625, "step": 938 }, { "epoch": 1.9654631083202512, "grad_norm": 11.560630798339844, "learning_rate": 1.409431936175243e-07, "logits/chosen": 1.90625, "logits/rejected": 2.140625, "logps/chosen": -532.0, "logps/rejected": -540.0, "loss": 0.5679, "rewards/accuracies": 0.75, "rewards/chosen": -1.4921875, "rewards/margins": 0.5859375, "rewards/rejected": -2.078125, "step": 939 }, { "epoch": 1.9675562532705388, "grad_norm": 12.335697174072266, "learning_rate": 1.404318147585642e-07, "logits/chosen": 2.140625, "logits/rejected": 3.125, "logps/chosen": -580.0, "logps/rejected": -552.0, "loss": 0.6404, "rewards/accuracies": 1.0, "rewards/chosen": -1.515625, "rewards/margins": 0.8671875, "rewards/rejected": -2.375, "step": 940 }, { "epoch": 1.9696493982208267, "grad_norm": 11.62963581085205, "learning_rate": 1.399210030467494e-07, "logits/chosen": 2.21875, "logits/rejected": 2.40625, "logps/chosen": -744.0, "logps/rejected": -352.0, "loss": 0.6233, "rewards/accuracies": 1.0, "rewards/chosen": -0.8984375, "rewards/margins": 0.5703125, "rewards/rejected": -1.46875, "step": 941 }, { "epoch": 1.9717425431711146, "grad_norm": 11.435396194458008, "learning_rate": 1.3941076112554183e-07, "logits/chosen": 2.203125, "logits/rejected": 2.484375, "logps/chosen": -788.0, "logps/rejected": -448.0, "loss": 0.5569, "rewards/accuracies": 1.0, "rewards/chosen": -1.2734375, "rewards/margins": 0.42578125, "rewards/rejected": -1.703125, "step": 942 }, { "epoch": 1.9738356881214023, "grad_norm": 11.883343696594238, "learning_rate": 1.3890109163545475e-07, "logits/chosen": 2.03125, "logits/rejected": 1.8046875, "logps/chosen": -460.0, "logps/rejected": -512.0, "loss": 0.6044, "rewards/accuracies": 1.0, "rewards/chosen": -1.0078125, "rewards/margins": 0.65234375, "rewards/rejected": -1.6640625, "step": 943 }, { "epoch": 1.9759288330716902, "grad_norm": 10.637882232666016, "learning_rate": 1.3839199721403893e-07, "logits/chosen": 2.5625, "logits/rejected": 2.53125, "logps/chosen": -406.0, "logps/rejected": -412.0, "loss": 0.5791, "rewards/accuracies": 0.75, "rewards/chosen": -1.1875, "rewards/margins": 0.90234375, "rewards/rejected": -2.09375, "step": 944 }, { "epoch": 1.978021978021978, "grad_norm": 13.070167541503906, "learning_rate": 1.37883480495869e-07, "logits/chosen": 1.421875, "logits/rejected": 1.109375, "logps/chosen": -402.0, "logps/rejected": -460.0, "loss": 0.638, "rewards/accuracies": 0.5, "rewards/chosen": -1.84375, "rewards/margins": 0.150390625, "rewards/rejected": -1.9921875, "step": 945 }, { "epoch": 1.9801151229722658, "grad_norm": 11.330702781677246, "learning_rate": 1.373755441125304e-07, "logits/chosen": 2.015625, "logits/rejected": 2.578125, "logps/chosen": -528.0, "logps/rejected": -358.0, "loss": 0.597, "rewards/accuracies": 0.5, "rewards/chosen": -1.5546875, "rewards/margins": 0.189453125, "rewards/rejected": -1.75, "step": 946 }, { "epoch": 1.9822082679225537, "grad_norm": 12.942609786987305, "learning_rate": 1.368681906926051e-07, "logits/chosen": 2.53125, "logits/rejected": 3.125, "logps/chosen": -516.0, "logps/rejected": -316.0, "loss": 0.6289, "rewards/accuracies": 0.25, "rewards/chosen": -1.75, "rewards/margins": -0.248046875, "rewards/rejected": -1.5078125, "step": 947 }, { "epoch": 1.9843014128728416, "grad_norm": 10.92066478729248, "learning_rate": 1.363614228616581e-07, "logits/chosen": 2.515625, "logits/rejected": 2.125, "logps/chosen": -378.0, "logps/rejected": -512.0, "loss": 0.5687, "rewards/accuracies": 0.75, "rewards/chosen": -1.25, "rewards/margins": 0.208984375, "rewards/rejected": -1.453125, "step": 948 }, { "epoch": 1.9863945578231292, "grad_norm": 12.061305046081543, "learning_rate": 1.3585524324222406e-07, "logits/chosen": 1.8359375, "logits/rejected": 1.5078125, "logps/chosen": -496.0, "logps/rejected": -396.0, "loss": 0.6059, "rewards/accuracies": 1.0, "rewards/chosen": -1.53125, "rewards/margins": 0.494140625, "rewards/rejected": -2.03125, "step": 949 }, { "epoch": 1.988487702773417, "grad_norm": 11.782267570495605, "learning_rate": 1.3534965445379382e-07, "logits/chosen": 2.125, "logits/rejected": 2.5, "logps/chosen": -768.0, "logps/rejected": -568.0, "loss": 0.5928, "rewards/accuracies": 0.5, "rewards/chosen": -1.875, "rewards/margins": -0.25390625, "rewards/rejected": -1.6171875, "step": 950 }, { "epoch": 1.990580847723705, "grad_norm": 11.26547908782959, "learning_rate": 1.3484465911280038e-07, "logits/chosen": 1.34375, "logits/rejected": 1.875, "logps/chosen": -544.0, "logps/rejected": -572.0, "loss": 0.5709, "rewards/accuracies": 1.0, "rewards/chosen": -1.0546875, "rewards/margins": 1.3125, "rewards/rejected": -2.359375, "step": 951 }, { "epoch": 1.9926739926739927, "grad_norm": 11.085479736328125, "learning_rate": 1.3434025983260566e-07, "logits/chosen": 1.578125, "logits/rejected": 1.6015625, "logps/chosen": -406.0, "logps/rejected": -564.0, "loss": 0.595, "rewards/accuracies": 0.75, "rewards/chosen": -1.15625, "rewards/margins": 0.5859375, "rewards/rejected": -1.7421875, "step": 952 }, { "epoch": 1.9947671376242804, "grad_norm": 11.633567810058594, "learning_rate": 1.338364592234871e-07, "logits/chosen": 3.171875, "logits/rejected": 3.34375, "logps/chosen": -748.0, "logps/rejected": -600.0, "loss": 0.6095, "rewards/accuracies": 1.0, "rewards/chosen": -1.265625, "rewards/margins": 0.392578125, "rewards/rejected": -1.65625, "step": 953 }, { "epoch": 1.9968602825745683, "grad_norm": 11.245035171508789, "learning_rate": 1.3333325989262405e-07, "logits/chosen": 2.65625, "logits/rejected": 3.3125, "logps/chosen": -644.0, "logps/rejected": -672.0, "loss": 0.5893, "rewards/accuracies": 1.0, "rewards/chosen": -1.21875, "rewards/margins": 0.78125, "rewards/rejected": -2.0, "step": 954 }, { "epoch": 1.9989534275248562, "grad_norm": 11.77606201171875, "learning_rate": 1.3283066444408403e-07, "logits/chosen": 1.5625, "logits/rejected": 1.4140625, "logps/chosen": -238.0, "logps/rejected": -316.0, "loss": 0.6104, "rewards/accuracies": 0.75, "rewards/chosen": -1.265625, "rewards/margins": 0.462890625, "rewards/rejected": -1.734375, "step": 955 }, { "epoch": 2.001046572475144, "grad_norm": 11.85208511352539, "learning_rate": 1.3232867547880933e-07, "logits/chosen": 2.0625, "logits/rejected": 3.078125, "logps/chosen": -556.0, "logps/rejected": -342.0, "loss": 0.581, "rewards/accuracies": 0.5, "rewards/chosen": -1.7109375, "rewards/margins": -0.08203125, "rewards/rejected": -1.6328125, "step": 956 }, { "epoch": 2.0031397174254315, "grad_norm": 12.282042503356934, "learning_rate": 1.318272955946043e-07, "logits/chosen": 0.625, "logits/rejected": 0.41015625, "logps/chosen": -222.0, "logps/rejected": -294.0, "loss": 0.6083, "rewards/accuracies": 0.5, "rewards/chosen": -1.453125, "rewards/margins": 0.357421875, "rewards/rejected": -1.8125, "step": 957 }, { "epoch": 2.0052328623757196, "grad_norm": 10.775773048400879, "learning_rate": 1.3132652738612068e-07, "logits/chosen": 2.421875, "logits/rejected": 2.46875, "logps/chosen": -442.0, "logps/rejected": -390.0, "loss": 0.6026, "rewards/accuracies": 0.5, "rewards/chosen": -1.46875, "rewards/margins": 0.44921875, "rewards/rejected": -1.921875, "step": 958 }, { "epoch": 2.0073260073260073, "grad_norm": 11.614090919494629, "learning_rate": 1.308263734448449e-07, "logits/chosen": 2.765625, "logits/rejected": 2.65625, "logps/chosen": -676.0, "logps/rejected": -812.0, "loss": 0.6351, "rewards/accuracies": 0.75, "rewards/chosen": -1.125, "rewards/margins": 0.1513671875, "rewards/rejected": -1.2734375, "step": 959 }, { "epoch": 2.009419152276295, "grad_norm": 10.593873977661133, "learning_rate": 1.3032683635908465e-07, "logits/chosen": 1.2109375, "logits/rejected": 0.734375, "logps/chosen": -252.0, "logps/rejected": -426.0, "loss": 0.5732, "rewards/accuracies": 0.75, "rewards/chosen": -1.484375, "rewards/margins": -0.033203125, "rewards/rejected": -1.453125, "step": 960 }, { "epoch": 2.011512297226583, "grad_norm": 11.892288208007812, "learning_rate": 1.2982791871395545e-07, "logits/chosen": 2.515625, "logits/rejected": 2.5, "logps/chosen": -692.0, "logps/rejected": -788.0, "loss": 0.5866, "rewards/accuracies": 1.0, "rewards/chosen": -1.375, "rewards/margins": 0.314453125, "rewards/rejected": -1.6875, "step": 961 }, { "epoch": 2.0136054421768708, "grad_norm": 10.537877082824707, "learning_rate": 1.2932962309136702e-07, "logits/chosen": 1.734375, "logits/rejected": 2.03125, "logps/chosen": -584.0, "logps/rejected": -476.0, "loss": 0.568, "rewards/accuracies": 0.5, "rewards/chosen": -1.34375, "rewards/margins": -0.2109375, "rewards/rejected": -1.1328125, "step": 962 }, { "epoch": 2.0156985871271584, "grad_norm": 10.745752334594727, "learning_rate": 1.2883195207001e-07, "logits/chosen": 0.98828125, "logits/rejected": 1.15625, "logps/chosen": -338.0, "logps/rejected": -260.0, "loss": 0.595, "rewards/accuracies": 0.5, "rewards/chosen": -1.2890625, "rewards/margins": 0.0810546875, "rewards/rejected": -1.375, "step": 963 }, { "epoch": 2.0177917320774466, "grad_norm": 10.625443458557129, "learning_rate": 1.2833490822534327e-07, "logits/chosen": 2.453125, "logits/rejected": 2.640625, "logps/chosen": -458.0, "logps/rejected": -326.0, "loss": 0.5472, "rewards/accuracies": 0.5, "rewards/chosen": -1.515625, "rewards/margins": 0.158203125, "rewards/rejected": -1.671875, "step": 964 }, { "epoch": 2.0198848770277342, "grad_norm": 10.723146438598633, "learning_rate": 1.2783849412957937e-07, "logits/chosen": 2.609375, "logits/rejected": 2.359375, "logps/chosen": -380.0, "logps/rejected": -506.0, "loss": 0.5894, "rewards/accuracies": 1.0, "rewards/chosen": -1.21875, "rewards/margins": 0.435546875, "rewards/rejected": -1.6484375, "step": 965 }, { "epoch": 2.021978021978022, "grad_norm": 11.235821723937988, "learning_rate": 1.2734271235167214e-07, "logits/chosen": 1.53125, "logits/rejected": 1.4765625, "logps/chosen": -414.0, "logps/rejected": -588.0, "loss": 0.5805, "rewards/accuracies": 0.75, "rewards/chosen": -1.390625, "rewards/margins": 0.208984375, "rewards/rejected": -1.59375, "step": 966 }, { "epoch": 2.0240711669283096, "grad_norm": 11.411491394042969, "learning_rate": 1.2684756545730336e-07, "logits/chosen": 0.5390625, "logits/rejected": 1.0859375, "logps/chosen": -204.0, "logps/rejected": -188.0, "loss": 0.5965, "rewards/accuracies": 0.5, "rewards/chosen": -1.234375, "rewards/margins": 0.125, "rewards/rejected": -1.359375, "step": 967 }, { "epoch": 2.0261643118785977, "grad_norm": 11.736352920532227, "learning_rate": 1.2635305600886905e-07, "logits/chosen": 1.796875, "logits/rejected": 2.03125, "logps/chosen": -604.0, "logps/rejected": -462.0, "loss": 0.5857, "rewards/accuracies": 0.5, "rewards/chosen": -1.15625, "rewards/margins": 0.1669921875, "rewards/rejected": -1.328125, "step": 968 }, { "epoch": 2.0282574568288854, "grad_norm": 11.645047187805176, "learning_rate": 1.2585918656546644e-07, "logits/chosen": 2.40625, "logits/rejected": 3.21875, "logps/chosen": -708.0, "logps/rejected": -460.0, "loss": 0.5444, "rewards/accuracies": 1.0, "rewards/chosen": -1.2421875, "rewards/margins": 0.6015625, "rewards/rejected": -1.84375, "step": 969 }, { "epoch": 2.030350601779173, "grad_norm": 12.109193801879883, "learning_rate": 1.2536595968288074e-07, "logits/chosen": 1.046875, "logits/rejected": 0.828125, "logps/chosen": -308.0, "logps/rejected": -364.0, "loss": 0.6242, "rewards/accuracies": 0.75, "rewards/chosen": -1.3125, "rewards/margins": 0.1259765625, "rewards/rejected": -1.4375, "step": 970 }, { "epoch": 2.032443746729461, "grad_norm": 10.74459171295166, "learning_rate": 1.248733779135721e-07, "logits/chosen": 1.53125, "logits/rejected": 1.6015625, "logps/chosen": -258.0, "logps/rejected": -588.0, "loss": 0.5712, "rewards/accuracies": 1.0, "rewards/chosen": -1.4375, "rewards/margins": 1.015625, "rewards/rejected": -2.453125, "step": 971 }, { "epoch": 2.034536891679749, "grad_norm": 12.447172164916992, "learning_rate": 1.243814438066619e-07, "logits/chosen": 1.6328125, "logits/rejected": 2.765625, "logps/chosen": -608.0, "logps/rejected": -382.0, "loss": 0.5826, "rewards/accuracies": 0.75, "rewards/chosen": -1.3046875, "rewards/margins": 0.275390625, "rewards/rejected": -1.578125, "step": 972 }, { "epoch": 2.0366300366300365, "grad_norm": 12.21883773803711, "learning_rate": 1.2389015990791987e-07, "logits/chosen": 1.9609375, "logits/rejected": 1.203125, "logps/chosen": -412.0, "logps/rejected": -976.0, "loss": 0.5723, "rewards/accuracies": 0.75, "rewards/chosen": -1.390625, "rewards/margins": 0.6328125, "rewards/rejected": -2.03125, "step": 973 }, { "epoch": 2.0387231815803246, "grad_norm": 11.15044116973877, "learning_rate": 1.2339952875975111e-07, "logits/chosen": 1.3359375, "logits/rejected": 1.234375, "logps/chosen": -548.0, "logps/rejected": -460.0, "loss": 0.5791, "rewards/accuracies": 0.75, "rewards/chosen": -1.46875, "rewards/margins": 0.6484375, "rewards/rejected": -2.125, "step": 974 }, { "epoch": 2.0408163265306123, "grad_norm": 11.227256774902344, "learning_rate": 1.229095529011827e-07, "logits/chosen": 1.796875, "logits/rejected": 1.7890625, "logps/chosen": -348.0, "logps/rejected": -416.0, "loss": 0.6088, "rewards/accuracies": 0.5, "rewards/chosen": -1.4765625, "rewards/margins": -0.12890625, "rewards/rejected": -1.3515625, "step": 975 }, { "epoch": 2.0429094714809, "grad_norm": 11.456323623657227, "learning_rate": 1.2242023486785027e-07, "logits/chosen": 1.5546875, "logits/rejected": 1.7890625, "logps/chosen": -684.0, "logps/rejected": -498.0, "loss": 0.5806, "rewards/accuracies": 0.5, "rewards/chosen": -1.546875, "rewards/margins": 0.35546875, "rewards/rejected": -1.8984375, "step": 976 }, { "epoch": 2.045002616431188, "grad_norm": 11.750733375549316, "learning_rate": 1.219315771919856e-07, "logits/chosen": 1.3515625, "logits/rejected": 1.640625, "logps/chosen": -434.0, "logps/rejected": -450.0, "loss": 0.5408, "rewards/accuracies": 1.0, "rewards/chosen": -1.2421875, "rewards/margins": 0.75390625, "rewards/rejected": -2.0, "step": 977 }, { "epoch": 2.0470957613814758, "grad_norm": 11.425230979919434, "learning_rate": 1.2144358240240275e-07, "logits/chosen": 2.578125, "logits/rejected": 2.4375, "logps/chosen": -510.0, "logps/rejected": -680.0, "loss": 0.5966, "rewards/accuracies": 1.0, "rewards/chosen": -1.359375, "rewards/margins": 1.015625, "rewards/rejected": -2.375, "step": 978 }, { "epoch": 2.0491889063317634, "grad_norm": 10.762430191040039, "learning_rate": 1.209562530244857e-07, "logits/chosen": 2.46875, "logits/rejected": 2.46875, "logps/chosen": -644.0, "logps/rejected": -720.0, "loss": 0.567, "rewards/accuracies": 0.75, "rewards/chosen": -1.3671875, "rewards/margins": 0.515625, "rewards/rejected": -1.8828125, "step": 979 }, { "epoch": 2.051282051282051, "grad_norm": 11.131372451782227, "learning_rate": 1.2046959158017447e-07, "logits/chosen": 2.015625, "logits/rejected": 2.3125, "logps/chosen": -478.0, "logps/rejected": -502.0, "loss": 0.5882, "rewards/accuracies": 0.75, "rewards/chosen": -1.5703125, "rewards/margins": 0.26953125, "rewards/rejected": -1.84375, "step": 980 }, { "epoch": 2.053375196232339, "grad_norm": 10.981115341186523, "learning_rate": 1.199836005879529e-07, "logits/chosen": 2.140625, "logits/rejected": 2.5625, "logps/chosen": -544.0, "logps/rejected": -440.0, "loss": 0.6054, "rewards/accuracies": 0.75, "rewards/chosen": -1.25, "rewards/margins": 0.2080078125, "rewards/rejected": -1.4609375, "step": 981 }, { "epoch": 2.055468341182627, "grad_norm": 10.351678848266602, "learning_rate": 1.194982825628351e-07, "logits/chosen": 2.015625, "logits/rejected": 2.015625, "logps/chosen": -358.0, "logps/rejected": -227.0, "loss": 0.585, "rewards/accuracies": 0.5, "rewards/chosen": -1.34375, "rewards/margins": 0.0400390625, "rewards/rejected": -1.3828125, "step": 982 }, { "epoch": 2.0575614861329146, "grad_norm": 11.870306968688965, "learning_rate": 1.1901364001635238e-07, "logits/chosen": 1.15625, "logits/rejected": 1.796875, "logps/chosen": -422.0, "logps/rejected": -324.0, "loss": 0.6144, "rewards/accuracies": 0.5, "rewards/chosen": -1.7421875, "rewards/margins": -0.3984375, "rewards/rejected": -1.34375, "step": 983 }, { "epoch": 2.0596546310832027, "grad_norm": 12.057429313659668, "learning_rate": 1.1852967545654076e-07, "logits/chosen": 2.65625, "logits/rejected": 3.171875, "logps/chosen": -600.0, "logps/rejected": -490.0, "loss": 0.641, "rewards/accuracies": 0.75, "rewards/chosen": -1.0625, "rewards/margins": 0.396484375, "rewards/rejected": -1.453125, "step": 984 }, { "epoch": 2.0617477760334904, "grad_norm": 11.112940788269043, "learning_rate": 1.1804639138792731e-07, "logits/chosen": 2.109375, "logits/rejected": 2.59375, "logps/chosen": -466.0, "logps/rejected": -392.0, "loss": 0.5666, "rewards/accuracies": 0.75, "rewards/chosen": -0.859375, "rewards/margins": 0.44140625, "rewards/rejected": -1.296875, "step": 985 }, { "epoch": 2.063840920983778, "grad_norm": 10.575416564941406, "learning_rate": 1.1756379031151787e-07, "logits/chosen": 2.59375, "logits/rejected": 1.8359375, "logps/chosen": -440.0, "logps/rejected": -520.0, "loss": 0.5904, "rewards/accuracies": 0.5, "rewards/chosen": -1.71875, "rewards/margins": -0.080078125, "rewards/rejected": -1.640625, "step": 986 }, { "epoch": 2.065934065934066, "grad_norm": 10.981159210205078, "learning_rate": 1.170818747247835e-07, "logits/chosen": 2.796875, "logits/rejected": 2.578125, "logps/chosen": -524.0, "logps/rejected": -688.0, "loss": 0.6016, "rewards/accuracies": 0.75, "rewards/chosen": -1.484375, "rewards/margins": 0.123046875, "rewards/rejected": -1.609375, "step": 987 }, { "epoch": 2.068027210884354, "grad_norm": 11.04464340209961, "learning_rate": 1.1660064712164814e-07, "logits/chosen": 1.7421875, "logits/rejected": 1.8359375, "logps/chosen": -620.0, "logps/rejected": -548.0, "loss": 0.5942, "rewards/accuracies": 0.75, "rewards/chosen": -1.5625, "rewards/margins": 0.271484375, "rewards/rejected": -1.8359375, "step": 988 }, { "epoch": 2.0701203558346415, "grad_norm": 12.590449333190918, "learning_rate": 1.16120109992475e-07, "logits/chosen": 1.7109375, "logits/rejected": 2.03125, "logps/chosen": -784.0, "logps/rejected": -640.0, "loss": 0.5942, "rewards/accuracies": 0.25, "rewards/chosen": -2.359375, "rewards/margins": -0.70703125, "rewards/rejected": -1.65625, "step": 989 }, { "epoch": 2.072213500784929, "grad_norm": 10.410261154174805, "learning_rate": 1.156402658240544e-07, "logits/chosen": 1.71875, "logits/rejected": 1.625, "logps/chosen": -364.0, "logps/rejected": -352.0, "loss": 0.5884, "rewards/accuracies": 0.5, "rewards/chosen": -1.28125, "rewards/margins": -0.0927734375, "rewards/rejected": -1.1875, "step": 990 }, { "epoch": 2.0743066457352173, "grad_norm": 10.231704711914062, "learning_rate": 1.1516111709959061e-07, "logits/chosen": 2.0625, "logits/rejected": 1.9921875, "logps/chosen": -660.0, "logps/rejected": -434.0, "loss": 0.5557, "rewards/accuracies": 0.75, "rewards/chosen": -1.7578125, "rewards/margins": -0.0791015625, "rewards/rejected": -1.6796875, "step": 991 }, { "epoch": 2.076399790685505, "grad_norm": 10.845260620117188, "learning_rate": 1.1468266629868861e-07, "logits/chosen": 1.34375, "logits/rejected": 1.5390625, "logps/chosen": -428.0, "logps/rejected": -386.0, "loss": 0.5847, "rewards/accuracies": 1.0, "rewards/chosen": -1.0859375, "rewards/margins": 0.4765625, "rewards/rejected": -1.5625, "step": 992 }, { "epoch": 2.0784929356357926, "grad_norm": 11.413115501403809, "learning_rate": 1.1420491589734201e-07, "logits/chosen": 1.8984375, "logits/rejected": 2.5, "logps/chosen": -480.0, "logps/rejected": -352.0, "loss": 0.6148, "rewards/accuracies": 0.5, "rewards/chosen": -1.1875, "rewards/margins": 0.1171875, "rewards/rejected": -1.296875, "step": 993 }, { "epoch": 2.0805860805860807, "grad_norm": 11.16401195526123, "learning_rate": 1.1372786836791945e-07, "logits/chosen": 1.875, "logits/rejected": 2.203125, "logps/chosen": -836.0, "logps/rejected": -382.0, "loss": 0.5992, "rewards/accuracies": 0.5, "rewards/chosen": -0.984375, "rewards/margins": 0.2734375, "rewards/rejected": -1.2578125, "step": 994 }, { "epoch": 2.0826792255363684, "grad_norm": 11.828091621398926, "learning_rate": 1.132515261791526e-07, "logits/chosen": 2.5, "logits/rejected": 2.5, "logps/chosen": -720.0, "logps/rejected": -572.0, "loss": 0.5973, "rewards/accuracies": 1.0, "rewards/chosen": -0.9375, "rewards/margins": 0.8046875, "rewards/rejected": -1.7421875, "step": 995 }, { "epoch": 2.084772370486656, "grad_norm": 11.524788856506348, "learning_rate": 1.1277589179612257e-07, "logits/chosen": 1.9453125, "logits/rejected": 1.8125, "logps/chosen": -356.0, "logps/rejected": -462.0, "loss": 0.5597, "rewards/accuracies": 0.25, "rewards/chosen": -1.3984375, "rewards/margins": 0.01171875, "rewards/rejected": -1.4140625, "step": 996 }, { "epoch": 2.086865515436944, "grad_norm": 11.08915901184082, "learning_rate": 1.1230096768024787e-07, "logits/chosen": 1.9140625, "logits/rejected": 1.8359375, "logps/chosen": -434.0, "logps/rejected": -656.0, "loss": 0.5984, "rewards/accuracies": 0.5, "rewards/chosen": -1.546875, "rewards/margins": -0.099609375, "rewards/rejected": -1.4453125, "step": 997 }, { "epoch": 2.088958660387232, "grad_norm": 11.421136856079102, "learning_rate": 1.1182675628927133e-07, "logits/chosen": 1.7421875, "logits/rejected": 2.5, "logps/chosen": -472.0, "logps/rejected": -472.0, "loss": 0.5609, "rewards/accuracies": 1.0, "rewards/chosen": -0.984375, "rewards/margins": 1.1328125, "rewards/rejected": -2.125, "step": 998 }, { "epoch": 2.0910518053375196, "grad_norm": 10.709765434265137, "learning_rate": 1.1135326007724723e-07, "logits/chosen": 2.234375, "logits/rejected": 1.8828125, "logps/chosen": -342.0, "logps/rejected": -524.0, "loss": 0.5907, "rewards/accuracies": 0.75, "rewards/chosen": -1.1171875, "rewards/margins": 0.59765625, "rewards/rejected": -1.7109375, "step": 999 }, { "epoch": 2.0931449502878072, "grad_norm": 11.891133308410645, "learning_rate": 1.1088048149452881e-07, "logits/chosen": 1.71875, "logits/rejected": 2.4375, "logps/chosen": -490.0, "logps/rejected": -504.0, "loss": 0.6031, "rewards/accuracies": 0.5, "rewards/chosen": -1.4609375, "rewards/margins": -0.1337890625, "rewards/rejected": -1.328125, "step": 1000 }, { "epoch": 2.0952380952380953, "grad_norm": 12.135580062866211, "learning_rate": 1.1040842298775572e-07, "logits/chosen": 2.109375, "logits/rejected": 1.78125, "logps/chosen": -300.0, "logps/rejected": -496.0, "loss": 0.5992, "rewards/accuracies": 0.75, "rewards/chosen": -1.28125, "rewards/margins": 0.41015625, "rewards/rejected": -1.6875, "step": 1001 }, { "epoch": 2.097331240188383, "grad_norm": 11.127384185791016, "learning_rate": 1.0993708699984125e-07, "logits/chosen": 0.69921875, "logits/rejected": 1.03125, "logps/chosen": -388.0, "logps/rejected": -564.0, "loss": 0.5675, "rewards/accuracies": 0.75, "rewards/chosen": -1.5703125, "rewards/margins": 0.478515625, "rewards/rejected": -2.046875, "step": 1002 }, { "epoch": 2.0994243851386707, "grad_norm": 12.559244155883789, "learning_rate": 1.0946647596995929e-07, "logits/chosen": 2.015625, "logits/rejected": 1.7265625, "logps/chosen": -328.0, "logps/rejected": -342.0, "loss": 0.6289, "rewards/accuracies": 0.25, "rewards/chosen": -1.265625, "rewards/margins": 0.0029296875, "rewards/rejected": -1.2734375, "step": 1003 }, { "epoch": 2.101517530088959, "grad_norm": 11.606091499328613, "learning_rate": 1.0899659233353235e-07, "logits/chosen": 2.515625, "logits/rejected": 2.796875, "logps/chosen": -752.0, "logps/rejected": -532.0, "loss": 0.5846, "rewards/accuracies": 0.75, "rewards/chosen": -1.34375, "rewards/margins": 0.1796875, "rewards/rejected": -1.5234375, "step": 1004 }, { "epoch": 2.1036106750392465, "grad_norm": 11.270895004272461, "learning_rate": 1.0852743852221874e-07, "logits/chosen": 2.03125, "logits/rejected": 2.78125, "logps/chosen": -600.0, "logps/rejected": -304.0, "loss": 0.5836, "rewards/accuracies": 0.75, "rewards/chosen": -1.1875, "rewards/margins": 0.34375, "rewards/rejected": -1.53125, "step": 1005 }, { "epoch": 2.105703819989534, "grad_norm": 11.021132469177246, "learning_rate": 1.0805901696389961e-07, "logits/chosen": 1.765625, "logits/rejected": 2.375, "logps/chosen": -294.0, "logps/rejected": -340.0, "loss": 0.5985, "rewards/accuracies": 1.0, "rewards/chosen": -1.2109375, "rewards/margins": 0.56640625, "rewards/rejected": -1.78125, "step": 1006 }, { "epoch": 2.1077969649398223, "grad_norm": 12.249751091003418, "learning_rate": 1.075913300826668e-07, "logits/chosen": 2.8125, "logits/rejected": 2.75, "logps/chosen": -692.0, "logps/rejected": -908.0, "loss": 0.6222, "rewards/accuracies": 0.75, "rewards/chosen": -1.4375, "rewards/margins": 0.30078125, "rewards/rejected": -1.734375, "step": 1007 }, { "epoch": 2.10989010989011, "grad_norm": 11.67434310913086, "learning_rate": 1.0712438029881024e-07, "logits/chosen": 2.578125, "logits/rejected": 2.984375, "logps/chosen": -692.0, "logps/rejected": -592.0, "loss": 0.5725, "rewards/accuracies": 0.75, "rewards/chosen": -1.3515625, "rewards/margins": 0.296875, "rewards/rejected": -1.6484375, "step": 1008 }, { "epoch": 2.1119832548403976, "grad_norm": 11.299335479736328, "learning_rate": 1.0665817002880547e-07, "logits/chosen": 2.109375, "logits/rejected": 2.09375, "logps/chosen": -378.0, "logps/rejected": -302.0, "loss": 0.5981, "rewards/accuracies": 0.75, "rewards/chosen": -1.1875, "rewards/margins": 0.0654296875, "rewards/rejected": -1.25, "step": 1009 }, { "epoch": 2.1140763997906853, "grad_norm": 11.45093822479248, "learning_rate": 1.0619270168530069e-07, "logits/chosen": 2.890625, "logits/rejected": 2.796875, "logps/chosen": -808.0, "logps/rejected": -1016.0, "loss": 0.6147, "rewards/accuracies": 0.75, "rewards/chosen": -1.4765625, "rewards/margins": 0.69921875, "rewards/rejected": -2.171875, "step": 1010 }, { "epoch": 2.1161695447409734, "grad_norm": 10.814870834350586, "learning_rate": 1.0572797767710492e-07, "logits/chosen": 1.6796875, "logits/rejected": 2.1875, "logps/chosen": -392.0, "logps/rejected": -302.0, "loss": 0.5401, "rewards/accuracies": 0.5, "rewards/chosen": -1.640625, "rewards/margins": -0.15234375, "rewards/rejected": -1.484375, "step": 1011 }, { "epoch": 2.118262689691261, "grad_norm": 10.025163650512695, "learning_rate": 1.0526400040917522e-07, "logits/chosen": 2.8125, "logits/rejected": 2.140625, "logps/chosen": -468.0, "logps/rejected": -560.0, "loss": 0.5737, "rewards/accuracies": 1.0, "rewards/chosen": -1.0390625, "rewards/margins": 0.34765625, "rewards/rejected": -1.3828125, "step": 1012 }, { "epoch": 2.1203558346415488, "grad_norm": 11.721614837646484, "learning_rate": 1.048007722826041e-07, "logits/chosen": 2.21875, "logits/rejected": 3.125, "logps/chosen": -644.0, "logps/rejected": -420.0, "loss": 0.5554, "rewards/accuracies": 1.0, "rewards/chosen": -0.9765625, "rewards/margins": 1.0625, "rewards/rejected": -2.03125, "step": 1013 }, { "epoch": 2.122448979591837, "grad_norm": 10.800631523132324, "learning_rate": 1.0433829569460719e-07, "logits/chosen": 2.421875, "logits/rejected": 2.8125, "logps/chosen": -500.0, "logps/rejected": -388.0, "loss": 0.6006, "rewards/accuracies": 0.5, "rewards/chosen": -0.82421875, "rewards/margins": 0.1044921875, "rewards/rejected": -0.9296875, "step": 1014 }, { "epoch": 2.1245421245421245, "grad_norm": 12.868189811706543, "learning_rate": 1.038765730385111e-07, "logits/chosen": 1.875, "logits/rejected": 2.03125, "logps/chosen": -500.0, "logps/rejected": -300.0, "loss": 0.6121, "rewards/accuracies": 0.5, "rewards/chosen": -1.6640625, "rewards/margins": -0.0888671875, "rewards/rejected": -1.578125, "step": 1015 }, { "epoch": 2.126635269492412, "grad_norm": 10.62038803100586, "learning_rate": 1.0341560670374084e-07, "logits/chosen": 1.75, "logits/rejected": 1.7421875, "logps/chosen": -376.0, "logps/rejected": -504.0, "loss": 0.5804, "rewards/accuracies": 0.75, "rewards/chosen": -1.4296875, "rewards/margins": 0.205078125, "rewards/rejected": -1.6328125, "step": 1016 }, { "epoch": 2.1287284144427003, "grad_norm": 11.46533489227295, "learning_rate": 1.0295539907580711e-07, "logits/chosen": 2.3125, "logits/rejected": 2.6875, "logps/chosen": -704.0, "logps/rejected": -588.0, "loss": 0.596, "rewards/accuracies": 1.0, "rewards/chosen": -1.2890625, "rewards/margins": 1.234375, "rewards/rejected": -2.53125, "step": 1017 }, { "epoch": 2.130821559392988, "grad_norm": 12.483968734741211, "learning_rate": 1.0249595253629467e-07, "logits/chosen": 2.0, "logits/rejected": 2.234375, "logps/chosen": -340.0, "logps/rejected": -474.0, "loss": 0.6082, "rewards/accuracies": 0.75, "rewards/chosen": -1.46875, "rewards/margins": 0.8828125, "rewards/rejected": -2.34375, "step": 1018 }, { "epoch": 2.1329147043432757, "grad_norm": 12.161870002746582, "learning_rate": 1.0203726946284953e-07, "logits/chosen": 2.15625, "logits/rejected": 3.046875, "logps/chosen": -728.0, "logps/rejected": -592.0, "loss": 0.5685, "rewards/accuracies": 1.0, "rewards/chosen": -0.91015625, "rewards/margins": 0.8984375, "rewards/rejected": -1.8125, "step": 1019 }, { "epoch": 2.1350078492935634, "grad_norm": 11.978885650634766, "learning_rate": 1.015793522291666e-07, "logits/chosen": 2.1875, "logits/rejected": 2.71875, "logps/chosen": -596.0, "logps/rejected": -612.0, "loss": 0.6069, "rewards/accuracies": 0.5, "rewards/chosen": -2.125, "rewards/margins": -0.2734375, "rewards/rejected": -1.8515625, "step": 1020 }, { "epoch": 2.1371009942438515, "grad_norm": 10.298712730407715, "learning_rate": 1.0112220320497752e-07, "logits/chosen": 1.21875, "logits/rejected": 0.703125, "logps/chosen": -160.0, "logps/rejected": -278.0, "loss": 0.5887, "rewards/accuracies": 1.0, "rewards/chosen": -1.1015625, "rewards/margins": 0.390625, "rewards/rejected": -1.484375, "step": 1021 }, { "epoch": 2.139194139194139, "grad_norm": 11.2387113571167, "learning_rate": 1.0066582475603872e-07, "logits/chosen": 2.453125, "logits/rejected": 1.9375, "logps/chosen": -428.0, "logps/rejected": -458.0, "loss": 0.5564, "rewards/accuracies": 0.75, "rewards/chosen": -1.3515625, "rewards/margins": 0.294921875, "rewards/rejected": -1.640625, "step": 1022 }, { "epoch": 2.141287284144427, "grad_norm": 10.46700668334961, "learning_rate": 1.0021021924411874e-07, "logits/chosen": 2.015625, "logits/rejected": 1.6640625, "logps/chosen": -430.0, "logps/rejected": -644.0, "loss": 0.586, "rewards/accuracies": 0.5, "rewards/chosen": -1.3828125, "rewards/margins": 0.1650390625, "rewards/rejected": -1.546875, "step": 1023 }, { "epoch": 2.143380429094715, "grad_norm": 10.900199890136719, "learning_rate": 9.975538902698597e-08, "logits/chosen": 1.65625, "logits/rejected": 2.28125, "logps/chosen": -510.0, "logps/rejected": -462.0, "loss": 0.597, "rewards/accuracies": 0.75, "rewards/chosen": -1.4140625, "rewards/margins": 0.232421875, "rewards/rejected": -1.6484375, "step": 1024 }, { "epoch": 2.1454735740450026, "grad_norm": 11.70801830291748, "learning_rate": 9.930133645839689e-08, "logits/chosen": 1.9453125, "logits/rejected": 1.6796875, "logps/chosen": -568.0, "logps/rejected": -608.0, "loss": 0.6152, "rewards/accuracies": 0.5, "rewards/chosen": -1.3359375, "rewards/margins": 0.150390625, "rewards/rejected": -1.484375, "step": 1025 }, { "epoch": 2.1475667189952903, "grad_norm": 10.580524444580078, "learning_rate": 9.884806388808362e-08, "logits/chosen": 2.484375, "logits/rejected": 2.953125, "logps/chosen": -488.0, "logps/rejected": -508.0, "loss": 0.5511, "rewards/accuracies": 0.5, "rewards/chosen": -1.40625, "rewards/margins": 0.126953125, "rewards/rejected": -1.5390625, "step": 1026 }, { "epoch": 2.1496598639455784, "grad_norm": 10.129754066467285, "learning_rate": 9.83955736617416e-08, "logits/chosen": 2.421875, "logits/rejected": 3.1875, "logps/chosen": -612.0, "logps/rejected": -482.0, "loss": 0.5524, "rewards/accuracies": 1.0, "rewards/chosen": -1.375, "rewards/margins": 0.8046875, "rewards/rejected": -2.171875, "step": 1027 }, { "epoch": 2.151753008895866, "grad_norm": 10.40230941772461, "learning_rate": 9.794386812101759e-08, "logits/chosen": 2.59375, "logits/rejected": 2.3125, "logps/chosen": -390.0, "logps/rejected": -716.0, "loss": 0.5755, "rewards/accuracies": 1.0, "rewards/chosen": -1.15625, "rewards/margins": 0.796875, "rewards/rejected": -1.953125, "step": 1028 }, { "epoch": 2.1538461538461537, "grad_norm": 12.058321952819824, "learning_rate": 9.749294960349783e-08, "logits/chosen": 1.390625, "logits/rejected": 2.546875, "logps/chosen": -388.0, "logps/rejected": -344.0, "loss": 0.5656, "rewards/accuracies": 0.75, "rewards/chosen": -1.03125, "rewards/margins": 0.1201171875, "rewards/rejected": -1.1484375, "step": 1029 }, { "epoch": 2.155939298796442, "grad_norm": 11.147902488708496, "learning_rate": 9.704282044269563e-08, "logits/chosen": 1.4765625, "logits/rejected": 1.671875, "logps/chosen": -412.0, "logps/rejected": -576.0, "loss": 0.6192, "rewards/accuracies": 0.75, "rewards/chosen": -1.6640625, "rewards/margins": 1.0234375, "rewards/rejected": -2.6875, "step": 1030 }, { "epoch": 2.1580324437467295, "grad_norm": 10.608804702758789, "learning_rate": 9.659348296803916e-08, "logits/chosen": 1.859375, "logits/rejected": 1.71875, "logps/chosen": -394.0, "logps/rejected": -468.0, "loss": 0.5777, "rewards/accuracies": 0.75, "rewards/chosen": -1.3125, "rewards/margins": 0.65625, "rewards/rejected": -1.96875, "step": 1031 }, { "epoch": 2.160125588697017, "grad_norm": 11.858631134033203, "learning_rate": 9.61449395048598e-08, "logits/chosen": 1.8984375, "logits/rejected": 2.5625, "logps/chosen": -832.0, "logps/rejected": -760.0, "loss": 0.5936, "rewards/accuracies": 0.5, "rewards/chosen": -2.03125, "rewards/margins": 0.259765625, "rewards/rejected": -2.28125, "step": 1032 }, { "epoch": 2.162218733647305, "grad_norm": 12.69530200958252, "learning_rate": 9.569719237437995e-08, "logits/chosen": 0.765625, "logits/rejected": 1.1171875, "logps/chosen": -294.0, "logps/rejected": -278.0, "loss": 0.6072, "rewards/accuracies": 0.25, "rewards/chosen": -1.734375, "rewards/margins": 0.0712890625, "rewards/rejected": -1.8046875, "step": 1033 }, { "epoch": 2.164311878597593, "grad_norm": 11.752124786376953, "learning_rate": 9.525024389370076e-08, "logits/chosen": 2.46875, "logits/rejected": 2.59375, "logps/chosen": -708.0, "logps/rejected": -592.0, "loss": 0.5949, "rewards/accuracies": 0.25, "rewards/chosen": -1.5625, "rewards/margins": -0.0126953125, "rewards/rejected": -1.546875, "step": 1034 }, { "epoch": 2.1664050235478807, "grad_norm": 11.493417739868164, "learning_rate": 9.480409637579037e-08, "logits/chosen": 2.015625, "logits/rejected": 1.78125, "logps/chosen": -552.0, "logps/rejected": -668.0, "loss": 0.5822, "rewards/accuracies": 0.75, "rewards/chosen": -1.3515625, "rewards/margins": 0.8671875, "rewards/rejected": -2.21875, "step": 1035 }, { "epoch": 2.1684981684981683, "grad_norm": 10.749217987060547, "learning_rate": 9.43587521294721e-08, "logits/chosen": 2.796875, "logits/rejected": 2.90625, "logps/chosen": -736.0, "logps/rejected": -576.0, "loss": 0.5537, "rewards/accuracies": 0.75, "rewards/chosen": -1.359375, "rewards/margins": 0.56640625, "rewards/rejected": -1.9296875, "step": 1036 }, { "epoch": 2.1705913134484565, "grad_norm": 11.612716674804688, "learning_rate": 9.39142134594123e-08, "logits/chosen": 2.375, "logits/rejected": 2.40625, "logps/chosen": -588.0, "logps/rejected": -364.0, "loss": 0.6297, "rewards/accuracies": 0.5, "rewards/chosen": -1.375, "rewards/margins": 0.107421875, "rewards/rejected": -1.484375, "step": 1037 }, { "epoch": 2.172684458398744, "grad_norm": 11.497761726379395, "learning_rate": 9.34704826661082e-08, "logits/chosen": 1.28125, "logits/rejected": 1.8203125, "logps/chosen": -414.0, "logps/rejected": -496.0, "loss": 0.5917, "rewards/accuracies": 0.75, "rewards/chosen": -1.421875, "rewards/margins": 0.515625, "rewards/rejected": -1.9375, "step": 1038 }, { "epoch": 2.174777603349032, "grad_norm": 11.313504219055176, "learning_rate": 9.302756204587662e-08, "logits/chosen": 0.98046875, "logits/rejected": 1.3984375, "logps/chosen": -240.0, "logps/rejected": -246.0, "loss": 0.5883, "rewards/accuracies": 0.75, "rewards/chosen": -1.109375, "rewards/margins": 0.189453125, "rewards/rejected": -1.3046875, "step": 1039 }, { "epoch": 2.17687074829932, "grad_norm": 12.031342506408691, "learning_rate": 9.25854538908413e-08, "logits/chosen": 2.015625, "logits/rejected": 2.0625, "logps/chosen": -616.0, "logps/rejected": -576.0, "loss": 0.5908, "rewards/accuracies": 0.75, "rewards/chosen": -1.28125, "rewards/margins": 0.212890625, "rewards/rejected": -1.5, "step": 1040 }, { "epoch": 2.1789638932496076, "grad_norm": 11.418907165527344, "learning_rate": 9.214416048892185e-08, "logits/chosen": 1.4921875, "logits/rejected": 1.2734375, "logps/chosen": -336.0, "logps/rejected": -382.0, "loss": 0.6313, "rewards/accuracies": 0.5, "rewards/chosen": -1.203125, "rewards/margins": 0.125, "rewards/rejected": -1.328125, "step": 1041 }, { "epoch": 2.1810570381998953, "grad_norm": 11.404754638671875, "learning_rate": 9.170368412382117e-08, "logits/chosen": 2.171875, "logits/rejected": 1.71875, "logps/chosen": -384.0, "logps/rejected": -624.0, "loss": 0.5772, "rewards/accuracies": 0.75, "rewards/chosen": -1.625, "rewards/margins": 0.234375, "rewards/rejected": -1.859375, "step": 1042 }, { "epoch": 2.183150183150183, "grad_norm": 12.32582950592041, "learning_rate": 9.126402707501426e-08, "logits/chosen": 2.46875, "logits/rejected": 3.390625, "logps/chosen": -576.0, "logps/rejected": -360.0, "loss": 0.5829, "rewards/accuracies": 0.5, "rewards/chosen": -1.375, "rewards/margins": 0.146484375, "rewards/rejected": -1.5234375, "step": 1043 }, { "epoch": 2.185243328100471, "grad_norm": 11.554696083068848, "learning_rate": 9.08251916177361e-08, "logits/chosen": 1.796875, "logits/rejected": 2.0625, "logps/chosen": -239.0, "logps/rejected": -284.0, "loss": 0.6263, "rewards/accuracies": 0.75, "rewards/chosen": -1.1328125, "rewards/margins": 0.423828125, "rewards/rejected": -1.5546875, "step": 1044 }, { "epoch": 2.1873364730507587, "grad_norm": 13.371793746948242, "learning_rate": 9.038718002296962e-08, "logits/chosen": 2.8125, "logits/rejected": 2.671875, "logps/chosen": -408.0, "logps/rejected": -456.0, "loss": 0.575, "rewards/accuracies": 0.75, "rewards/chosen": -1.1015625, "rewards/margins": 0.36328125, "rewards/rejected": -1.4609375, "step": 1045 }, { "epoch": 2.1894296180010464, "grad_norm": 10.569348335266113, "learning_rate": 8.994999455743467e-08, "logits/chosen": 1.71875, "logits/rejected": 1.671875, "logps/chosen": -406.0, "logps/rejected": -440.0, "loss": 0.5697, "rewards/accuracies": 1.0, "rewards/chosen": -1.4375, "rewards/margins": 0.328125, "rewards/rejected": -1.765625, "step": 1046 }, { "epoch": 2.1915227629513345, "grad_norm": 10.752891540527344, "learning_rate": 8.951363748357547e-08, "logits/chosen": 0.55859375, "logits/rejected": 1.203125, "logps/chosen": -205.0, "logps/rejected": -202.0, "loss": 0.5561, "rewards/accuracies": 0.5, "rewards/chosen": -1.171875, "rewards/margins": 0.224609375, "rewards/rejected": -1.3984375, "step": 1047 }, { "epoch": 2.193615907901622, "grad_norm": 10.378030776977539, "learning_rate": 8.907811105954968e-08, "logits/chosen": 1.640625, "logits/rejected": 1.8046875, "logps/chosen": -486.0, "logps/rejected": -636.0, "loss": 0.5671, "rewards/accuracies": 1.0, "rewards/chosen": -1.140625, "rewards/margins": 0.703125, "rewards/rejected": -1.84375, "step": 1048 }, { "epoch": 2.19570905285191, "grad_norm": 11.485549926757812, "learning_rate": 8.864341753921596e-08, "logits/chosen": 1.1875, "logits/rejected": 1.90625, "logps/chosen": -360.0, "logps/rejected": -376.0, "loss": 0.5675, "rewards/accuracies": 0.75, "rewards/chosen": -1.421875, "rewards/margins": 0.35546875, "rewards/rejected": -1.78125, "step": 1049 }, { "epoch": 2.197802197802198, "grad_norm": 12.059419631958008, "learning_rate": 8.820955917212295e-08, "logits/chosen": 1.6796875, "logits/rejected": 1.9296875, "logps/chosen": -508.0, "logps/rejected": -580.0, "loss": 0.6126, "rewards/accuracies": 0.75, "rewards/chosen": -0.8515625, "rewards/margins": 0.50390625, "rewards/rejected": -1.359375, "step": 1050 }, { "epoch": 2.1998953427524857, "grad_norm": 10.468564987182617, "learning_rate": 8.777653820349714e-08, "logits/chosen": 1.8125, "logits/rejected": 1.6875, "logps/chosen": -368.0, "logps/rejected": -544.0, "loss": 0.5885, "rewards/accuracies": 1.0, "rewards/chosen": -0.9921875, "rewards/margins": 0.9296875, "rewards/rejected": -1.921875, "step": 1051 }, { "epoch": 2.2019884877027733, "grad_norm": 11.059528350830078, "learning_rate": 8.734435687423162e-08, "logits/chosen": 1.96875, "logits/rejected": 0.75, "logps/chosen": -235.0, "logps/rejected": -372.0, "loss": 0.6141, "rewards/accuracies": 0.25, "rewards/chosen": -1.546875, "rewards/margins": -0.056640625, "rewards/rejected": -1.484375, "step": 1052 }, { "epoch": 2.204081632653061, "grad_norm": 11.069193840026855, "learning_rate": 8.691301742087442e-08, "logits/chosen": 2.359375, "logits/rejected": 2.125, "logps/chosen": -426.0, "logps/rejected": -460.0, "loss": 0.6007, "rewards/accuracies": 0.75, "rewards/chosen": -1.3203125, "rewards/margins": 0.27734375, "rewards/rejected": -1.6015625, "step": 1053 }, { "epoch": 2.206174777603349, "grad_norm": 11.647639274597168, "learning_rate": 8.648252207561646e-08, "logits/chosen": 2.34375, "logits/rejected": 2.421875, "logps/chosen": -438.0, "logps/rejected": -548.0, "loss": 0.591, "rewards/accuracies": 0.75, "rewards/chosen": -1.6484375, "rewards/margins": 0.1826171875, "rewards/rejected": -1.8359375, "step": 1054 }, { "epoch": 2.208267922553637, "grad_norm": 12.953960418701172, "learning_rate": 8.605287306628074e-08, "logits/chosen": 2.5, "logits/rejected": 1.8984375, "logps/chosen": -404.0, "logps/rejected": -572.0, "loss": 0.6395, "rewards/accuracies": 0.5, "rewards/chosen": -1.375, "rewards/margins": 0.0546875, "rewards/rejected": -1.4296875, "step": 1055 }, { "epoch": 2.2103610675039245, "grad_norm": 12.250329971313477, "learning_rate": 8.562407261631043e-08, "logits/chosen": 2.203125, "logits/rejected": 1.359375, "logps/chosen": -384.0, "logps/rejected": -468.0, "loss": 0.5657, "rewards/accuracies": 0.75, "rewards/chosen": -1.1015625, "rewards/margins": 0.453125, "rewards/rejected": -1.5546875, "step": 1056 }, { "epoch": 2.2124542124542126, "grad_norm": 11.850432395935059, "learning_rate": 8.519612294475724e-08, "logits/chosen": 2.046875, "logits/rejected": 1.9375, "logps/chosen": -336.0, "logps/rejected": -456.0, "loss": 0.617, "rewards/accuracies": 0.25, "rewards/chosen": -1.609375, "rewards/margins": -0.0546875, "rewards/rejected": -1.5546875, "step": 1057 }, { "epoch": 2.2145473574045003, "grad_norm": 11.546448707580566, "learning_rate": 8.476902626626997e-08, "logits/chosen": 1.859375, "logits/rejected": 1.5703125, "logps/chosen": -388.0, "logps/rejected": -400.0, "loss": 0.5888, "rewards/accuracies": 0.5, "rewards/chosen": -1.359375, "rewards/margins": 0.177734375, "rewards/rejected": -1.53125, "step": 1058 }, { "epoch": 2.216640502354788, "grad_norm": 11.809260368347168, "learning_rate": 8.434278479108352e-08, "logits/chosen": 1.46875, "logits/rejected": 1.859375, "logps/chosen": -416.0, "logps/rejected": -440.0, "loss": 0.6061, "rewards/accuracies": 0.75, "rewards/chosen": -1.1953125, "rewards/margins": 0.099609375, "rewards/rejected": -1.296875, "step": 1059 }, { "epoch": 2.218733647305076, "grad_norm": 10.582293510437012, "learning_rate": 8.39174007250069e-08, "logits/chosen": 2.09375, "logits/rejected": 2.421875, "logps/chosen": -616.0, "logps/rejected": -438.0, "loss": 0.549, "rewards/accuracies": 0.5, "rewards/chosen": -1.390625, "rewards/margins": 0.16015625, "rewards/rejected": -1.546875, "step": 1060 }, { "epoch": 2.2208267922553637, "grad_norm": 11.779565811157227, "learning_rate": 8.349287626941198e-08, "logits/chosen": 2.375, "logits/rejected": 2.78125, "logps/chosen": -624.0, "logps/rejected": -480.0, "loss": 0.6026, "rewards/accuracies": 0.5, "rewards/chosen": -1.4609375, "rewards/margins": 0.126953125, "rewards/rejected": -1.5859375, "step": 1061 }, { "epoch": 2.2229199372056514, "grad_norm": 11.7490873336792, "learning_rate": 8.306921362122195e-08, "logits/chosen": 2.25, "logits/rejected": 2.34375, "logps/chosen": -544.0, "logps/rejected": -660.0, "loss": 0.5549, "rewards/accuracies": 0.75, "rewards/chosen": -1.5234375, "rewards/margins": 0.5546875, "rewards/rejected": -2.078125, "step": 1062 }, { "epoch": 2.2250130821559395, "grad_norm": 11.424396514892578, "learning_rate": 8.264641497290072e-08, "logits/chosen": 1.828125, "logits/rejected": 2.296875, "logps/chosen": -468.0, "logps/rejected": -476.0, "loss": 0.5615, "rewards/accuracies": 0.75, "rewards/chosen": -1.1484375, "rewards/margins": 0.2314453125, "rewards/rejected": -1.375, "step": 1063 }, { "epoch": 2.227106227106227, "grad_norm": 10.509209632873535, "learning_rate": 8.22244825124404e-08, "logits/chosen": 2.375, "logits/rejected": 2.265625, "logps/chosen": -430.0, "logps/rejected": -426.0, "loss": 0.5448, "rewards/accuracies": 0.25, "rewards/chosen": -1.3203125, "rewards/margins": -0.0625, "rewards/rejected": -1.2578125, "step": 1064 }, { "epoch": 2.229199372056515, "grad_norm": 10.109917640686035, "learning_rate": 8.18034184233507e-08, "logits/chosen": 1.8046875, "logits/rejected": 2.03125, "logps/chosen": -442.0, "logps/rejected": -434.0, "loss": 0.5692, "rewards/accuracies": 0.5, "rewards/chosen": -1.5234375, "rewards/margins": -0.03125, "rewards/rejected": -1.4921875, "step": 1065 }, { "epoch": 2.2312925170068025, "grad_norm": 11.283658027648926, "learning_rate": 8.13832248846476e-08, "logits/chosen": 2.03125, "logits/rejected": 2.515625, "logps/chosen": -474.0, "logps/rejected": -612.0, "loss": 0.6121, "rewards/accuracies": 0.75, "rewards/chosen": -1.453125, "rewards/margins": 1.1015625, "rewards/rejected": -2.5625, "step": 1066 }, { "epoch": 2.2333856619570907, "grad_norm": 12.197108268737793, "learning_rate": 8.0963904070842e-08, "logits/chosen": 1.84375, "logits/rejected": 2.71875, "logps/chosen": -648.0, "logps/rejected": -478.0, "loss": 0.6114, "rewards/accuracies": 1.0, "rewards/chosen": -1.4765625, "rewards/margins": 0.75, "rewards/rejected": -2.21875, "step": 1067 }, { "epoch": 2.2354788069073783, "grad_norm": 11.025632858276367, "learning_rate": 8.054545815192828e-08, "logits/chosen": 0.71484375, "logits/rejected": 0.984375, "logps/chosen": -274.0, "logps/rejected": -215.0, "loss": 0.5513, "rewards/accuracies": 0.25, "rewards/chosen": -1.375, "rewards/margins": -0.26171875, "rewards/rejected": -1.109375, "step": 1068 }, { "epoch": 2.237571951857666, "grad_norm": 12.587658882141113, "learning_rate": 8.01278892933731e-08, "logits/chosen": 1.8203125, "logits/rejected": 2.4375, "logps/chosen": -484.0, "logps/rejected": -410.0, "loss": 0.6006, "rewards/accuracies": 0.5, "rewards/chosen": -1.15625, "rewards/margins": 0.341796875, "rewards/rejected": -1.5, "step": 1069 }, { "epoch": 2.239665096807954, "grad_norm": 11.071043968200684, "learning_rate": 7.971119965610481e-08, "logits/chosen": 1.765625, "logits/rejected": 1.4296875, "logps/chosen": -410.0, "logps/rejected": -688.0, "loss": 0.5699, "rewards/accuracies": 0.25, "rewards/chosen": -1.6640625, "rewards/margins": 0.73046875, "rewards/rejected": -2.390625, "step": 1070 }, { "epoch": 2.241758241758242, "grad_norm": 11.76019287109375, "learning_rate": 7.929539139650132e-08, "logits/chosen": 1.90625, "logits/rejected": 2.75, "logps/chosen": -460.0, "logps/rejected": -516.0, "loss": 0.6034, "rewards/accuracies": 0.75, "rewards/chosen": -1.375, "rewards/margins": 0.56640625, "rewards/rejected": -1.9375, "step": 1071 }, { "epoch": 2.2438513867085295, "grad_norm": 10.587785720825195, "learning_rate": 7.888046666637941e-08, "logits/chosen": 2.234375, "logits/rejected": 2.3125, "logps/chosen": -616.0, "logps/rejected": -536.0, "loss": 0.5602, "rewards/accuracies": 0.75, "rewards/chosen": -1.515625, "rewards/margins": 0.34375, "rewards/rejected": -1.859375, "step": 1072 }, { "epoch": 2.2459445316588176, "grad_norm": 10.504775047302246, "learning_rate": 7.846642761298378e-08, "logits/chosen": 2.296875, "logits/rejected": 1.6640625, "logps/chosen": -334.0, "logps/rejected": -656.0, "loss": 0.5856, "rewards/accuracies": 1.0, "rewards/chosen": -1.578125, "rewards/margins": 0.5390625, "rewards/rejected": -2.125, "step": 1073 }, { "epoch": 2.2480376766091053, "grad_norm": 11.749526023864746, "learning_rate": 7.805327637897571e-08, "logits/chosen": 2.1875, "logits/rejected": 3.03125, "logps/chosen": -596.0, "logps/rejected": -430.0, "loss": 0.5889, "rewards/accuracies": 0.75, "rewards/chosen": -1.234375, "rewards/margins": 0.138671875, "rewards/rejected": -1.3671875, "step": 1074 }, { "epoch": 2.250130821559393, "grad_norm": 11.638836860656738, "learning_rate": 7.764101510242188e-08, "logits/chosen": 1.53125, "logits/rejected": 1.7578125, "logps/chosen": -252.0, "logps/rejected": -296.0, "loss": 0.5808, "rewards/accuracies": 0.75, "rewards/chosen": -1.65625, "rewards/margins": 0.123046875, "rewards/rejected": -1.78125, "step": 1075 }, { "epoch": 2.252223966509681, "grad_norm": 11.232507705688477, "learning_rate": 7.722964591678327e-08, "logits/chosen": 2.828125, "logits/rejected": 2.453125, "logps/chosen": -428.0, "logps/rejected": -504.0, "loss": 0.5817, "rewards/accuracies": 0.5, "rewards/chosen": -1.4453125, "rewards/margins": -0.0771484375, "rewards/rejected": -1.375, "step": 1076 }, { "epoch": 2.2543171114599687, "grad_norm": 10.541213035583496, "learning_rate": 7.681917095090483e-08, "logits/chosen": 1.65625, "logits/rejected": 1.6484375, "logps/chosen": -390.0, "logps/rejected": -350.0, "loss": 0.5898, "rewards/accuracies": 0.25, "rewards/chosen": -1.3359375, "rewards/margins": 0.0009765625, "rewards/rejected": -1.3359375, "step": 1077 }, { "epoch": 2.2564102564102564, "grad_norm": 10.962115287780762, "learning_rate": 7.640959232900337e-08, "logits/chosen": 2.234375, "logits/rejected": 2.625, "logps/chosen": -628.0, "logps/rejected": -338.0, "loss": 0.5776, "rewards/accuracies": 0.75, "rewards/chosen": -1.3984375, "rewards/margins": 0.25390625, "rewards/rejected": -1.6484375, "step": 1078 }, { "epoch": 2.258503401360544, "grad_norm": 11.30371379852295, "learning_rate": 7.600091217065716e-08, "logits/chosen": 1.2265625, "logits/rejected": 0.98046875, "logps/chosen": -360.0, "logps/rejected": -460.0, "loss": 0.5604, "rewards/accuracies": 0.5, "rewards/chosen": -1.265625, "rewards/margins": 0.173828125, "rewards/rejected": -1.4375, "step": 1079 }, { "epoch": 2.260596546310832, "grad_norm": 12.675722122192383, "learning_rate": 7.559313259079511e-08, "logits/chosen": 1.546875, "logits/rejected": 2.875, "logps/chosen": -556.0, "logps/rejected": -332.0, "loss": 0.621, "rewards/accuracies": 0.75, "rewards/chosen": -1.1796875, "rewards/margins": 0.60546875, "rewards/rejected": -1.78125, "step": 1080 }, { "epoch": 2.26268969126112, "grad_norm": 11.507694244384766, "learning_rate": 7.518625569968563e-08, "logits/chosen": 0.78515625, "logits/rejected": 1.7578125, "logps/chosen": -298.0, "logps/rejected": -284.0, "loss": 0.6126, "rewards/accuracies": 1.0, "rewards/chosen": -1.046875, "rewards/margins": 0.54296875, "rewards/rejected": -1.59375, "step": 1081 }, { "epoch": 2.2647828362114075, "grad_norm": 10.317126274108887, "learning_rate": 7.478028360292546e-08, "logits/chosen": 0.9296875, "logits/rejected": 0.94140625, "logps/chosen": -364.0, "logps/rejected": -332.0, "loss": 0.5658, "rewards/accuracies": 0.5, "rewards/chosen": -1.6015625, "rewards/margins": -0.150390625, "rewards/rejected": -1.453125, "step": 1082 }, { "epoch": 2.2668759811616956, "grad_norm": 11.577229499816895, "learning_rate": 7.437521840142908e-08, "logits/chosen": 1.515625, "logits/rejected": 2.09375, "logps/chosen": -442.0, "logps/rejected": -408.0, "loss": 0.6232, "rewards/accuracies": 0.25, "rewards/chosen": -1.578125, "rewards/margins": 0.16796875, "rewards/rejected": -1.7421875, "step": 1083 }, { "epoch": 2.2689691261119833, "grad_norm": 11.189661979675293, "learning_rate": 7.397106219141791e-08, "logits/chosen": 2.359375, "logits/rejected": 1.5859375, "logps/chosen": -456.0, "logps/rejected": -500.0, "loss": 0.5966, "rewards/accuracies": 0.75, "rewards/chosen": -1.375, "rewards/margins": 0.46484375, "rewards/rejected": -1.84375, "step": 1084 }, { "epoch": 2.271062271062271, "grad_norm": 10.638973236083984, "learning_rate": 7.356781706440928e-08, "logits/chosen": 1.7421875, "logits/rejected": 2.3125, "logps/chosen": -576.0, "logps/rejected": -490.0, "loss": 0.5327, "rewards/accuracies": 0.75, "rewards/chosen": -1.3515625, "rewards/margins": 0.435546875, "rewards/rejected": -1.7890625, "step": 1085 }, { "epoch": 2.2731554160125587, "grad_norm": 11.273877143859863, "learning_rate": 7.316548510720549e-08, "logits/chosen": 2.671875, "logits/rejected": 1.8125, "logps/chosen": -464.0, "logps/rejected": -516.0, "loss": 0.571, "rewards/accuracies": 1.0, "rewards/chosen": -1.359375, "rewards/margins": 0.431640625, "rewards/rejected": -1.7890625, "step": 1086 }, { "epoch": 2.2752485609628468, "grad_norm": 11.14000129699707, "learning_rate": 7.276406840188328e-08, "logits/chosen": 1.4609375, "logits/rejected": 1.578125, "logps/chosen": -488.0, "logps/rejected": -588.0, "loss": 0.5683, "rewards/accuracies": 0.5, "rewards/chosen": -1.5546875, "rewards/margins": 0.02734375, "rewards/rejected": -1.578125, "step": 1087 }, { "epoch": 2.2773417059131345, "grad_norm": 13.463235855102539, "learning_rate": 7.236356902578304e-08, "logits/chosen": 2.1875, "logits/rejected": 2.3125, "logps/chosen": -556.0, "logps/rejected": -568.0, "loss": 0.6012, "rewards/accuracies": 0.5, "rewards/chosen": -1.4921875, "rewards/margins": 0.0615234375, "rewards/rejected": -1.5546875, "step": 1088 }, { "epoch": 2.279434850863422, "grad_norm": 11.114773750305176, "learning_rate": 7.196398905149775e-08, "logits/chosen": 1.2734375, "logits/rejected": 0.984375, "logps/chosen": -274.0, "logps/rejected": -348.0, "loss": 0.5931, "rewards/accuracies": 0.75, "rewards/chosen": -1.4296875, "rewards/margins": 0.34375, "rewards/rejected": -1.7734375, "step": 1089 }, { "epoch": 2.2815279958137102, "grad_norm": 10.774499893188477, "learning_rate": 7.156533054686264e-08, "logits/chosen": 2.171875, "logits/rejected": 2.09375, "logps/chosen": -512.0, "logps/rejected": -402.0, "loss": 0.548, "rewards/accuracies": 0.75, "rewards/chosen": -1.140625, "rewards/margins": 0.2099609375, "rewards/rejected": -1.34375, "step": 1090 }, { "epoch": 2.283621140763998, "grad_norm": 11.815675735473633, "learning_rate": 7.116759557494416e-08, "logits/chosen": 0.9765625, "logits/rejected": 1.65625, "logps/chosen": -504.0, "logps/rejected": -338.0, "loss": 0.6095, "rewards/accuracies": 0.75, "rewards/chosen": -1.390625, "rewards/margins": 0.119140625, "rewards/rejected": -1.5078125, "step": 1091 }, { "epoch": 2.2857142857142856, "grad_norm": 11.589235305786133, "learning_rate": 7.077078619402966e-08, "logits/chosen": 2.78125, "logits/rejected": 2.0625, "logps/chosen": -608.0, "logps/rejected": -752.0, "loss": 0.5925, "rewards/accuracies": 0.75, "rewards/chosen": -1.265625, "rewards/margins": 0.26171875, "rewards/rejected": -1.53125, "step": 1092 }, { "epoch": 2.2878074306645737, "grad_norm": 11.489415168762207, "learning_rate": 7.037490445761629e-08, "logits/chosen": 1.7734375, "logits/rejected": 2.65625, "logps/chosen": -604.0, "logps/rejected": -540.0, "loss": 0.612, "rewards/accuracies": 0.75, "rewards/chosen": -1.1328125, "rewards/margins": 0.5, "rewards/rejected": -1.6328125, "step": 1093 }, { "epoch": 2.2899005756148614, "grad_norm": 11.35988712310791, "learning_rate": 6.997995241440086e-08, "logits/chosen": 2.78125, "logits/rejected": 2.71875, "logps/chosen": -1120.0, "logps/rejected": -828.0, "loss": 0.5576, "rewards/accuracies": 0.75, "rewards/chosen": -1.0078125, "rewards/margins": 0.2890625, "rewards/rejected": -1.296875, "step": 1094 }, { "epoch": 2.291993720565149, "grad_norm": 11.50139331817627, "learning_rate": 6.958593210826879e-08, "logits/chosen": 1.5390625, "logits/rejected": 1.46875, "logps/chosen": -392.0, "logps/rejected": -406.0, "loss": 0.598, "rewards/accuracies": 1.0, "rewards/chosen": -1.3125, "rewards/margins": 0.361328125, "rewards/rejected": -1.671875, "step": 1095 }, { "epoch": 2.294086865515437, "grad_norm": 11.497011184692383, "learning_rate": 6.919284557828384e-08, "logits/chosen": 2.65625, "logits/rejected": 3.0, "logps/chosen": -498.0, "logps/rejected": -432.0, "loss": 0.5669, "rewards/accuracies": 0.75, "rewards/chosen": -1.53125, "rewards/margins": 0.271484375, "rewards/rejected": -1.796875, "step": 1096 }, { "epoch": 2.296180010465725, "grad_norm": 11.839705467224121, "learning_rate": 6.88006948586776e-08, "logits/chosen": 2.0, "logits/rejected": 1.9375, "logps/chosen": -326.0, "logps/rejected": -400.0, "loss": 0.5985, "rewards/accuracies": 0.75, "rewards/chosen": -1.484375, "rewards/margins": 0.62109375, "rewards/rejected": -2.09375, "step": 1097 }, { "epoch": 2.2982731554160125, "grad_norm": 12.152945518493652, "learning_rate": 6.840948197883847e-08, "logits/chosen": 1.484375, "logits/rejected": 1.21875, "logps/chosen": -362.0, "logps/rejected": -422.0, "loss": 0.5717, "rewards/accuracies": 1.0, "rewards/chosen": -1.171875, "rewards/margins": 0.51171875, "rewards/rejected": -1.6796875, "step": 1098 }, { "epoch": 2.3003663003663, "grad_norm": 11.379349708557129, "learning_rate": 6.80192089633019e-08, "logits/chosen": 1.34375, "logits/rejected": 2.0, "logps/chosen": -466.0, "logps/rejected": -424.0, "loss": 0.6013, "rewards/accuracies": 1.0, "rewards/chosen": -1.234375, "rewards/margins": 0.54296875, "rewards/rejected": -1.7734375, "step": 1099 }, { "epoch": 2.3024594453165883, "grad_norm": 11.67951488494873, "learning_rate": 6.762987783173914e-08, "logits/chosen": 2.8125, "logits/rejected": 2.71875, "logps/chosen": -772.0, "logps/rejected": -464.0, "loss": 0.576, "rewards/accuracies": 0.75, "rewards/chosen": -1.4140625, "rewards/margins": 0.216796875, "rewards/rejected": -1.625, "step": 1100 }, { "epoch": 2.304552590266876, "grad_norm": 11.653464317321777, "learning_rate": 6.724149059894758e-08, "logits/chosen": 1.9375, "logits/rejected": 2.671875, "logps/chosen": -392.0, "logps/rejected": -320.0, "loss": 0.5662, "rewards/accuracies": 0.75, "rewards/chosen": -1.1484375, "rewards/margins": 0.515625, "rewards/rejected": -1.6640625, "step": 1101 }, { "epoch": 2.3066457352171637, "grad_norm": 10.961610794067383, "learning_rate": 6.685404927483948e-08, "logits/chosen": 2.046875, "logits/rejected": 2.28125, "logps/chosen": -728.0, "logps/rejected": -516.0, "loss": 0.5507, "rewards/accuracies": 0.5, "rewards/chosen": -1.328125, "rewards/margins": 0.080078125, "rewards/rejected": -1.40625, "step": 1102 }, { "epoch": 2.3087388801674518, "grad_norm": 12.406999588012695, "learning_rate": 6.646755586443231e-08, "logits/chosen": 2.375, "logits/rejected": 3.78125, "logps/chosen": -540.0, "logps/rejected": -332.0, "loss": 0.6069, "rewards/accuracies": 0.75, "rewards/chosen": -1.0625, "rewards/margins": 0.2158203125, "rewards/rejected": -1.28125, "step": 1103 }, { "epoch": 2.3108320251177394, "grad_norm": 13.117595672607422, "learning_rate": 6.60820123678381e-08, "logits/chosen": 2.546875, "logits/rejected": 3.328125, "logps/chosen": -772.0, "logps/rejected": -532.0, "loss": 0.5875, "rewards/accuracies": 0.75, "rewards/chosen": -1.5546875, "rewards/margins": 0.55859375, "rewards/rejected": -2.109375, "step": 1104 }, { "epoch": 2.312925170068027, "grad_norm": 11.685809135437012, "learning_rate": 6.56974207802528e-08, "logits/chosen": 2.109375, "logits/rejected": 2.046875, "logps/chosen": -492.0, "logps/rejected": -336.0, "loss": 0.5702, "rewards/accuracies": 0.75, "rewards/chosen": -1.1015625, "rewards/margins": 0.392578125, "rewards/rejected": -1.5, "step": 1105 }, { "epoch": 2.315018315018315, "grad_norm": 10.609480857849121, "learning_rate": 6.531378309194625e-08, "logits/chosen": 1.1328125, "logits/rejected": 1.515625, "logps/chosen": -394.0, "logps/rejected": -248.0, "loss": 0.5857, "rewards/accuracies": 0.75, "rewards/chosen": -1.3203125, "rewards/margins": 0.234375, "rewards/rejected": -1.5546875, "step": 1106 }, { "epoch": 2.317111459968603, "grad_norm": 11.093987464904785, "learning_rate": 6.493110128825207e-08, "logits/chosen": 2.34375, "logits/rejected": 2.40625, "logps/chosen": -418.0, "logps/rejected": -344.0, "loss": 0.5887, "rewards/accuracies": 0.25, "rewards/chosen": -1.453125, "rewards/margins": -0.3125, "rewards/rejected": -1.140625, "step": 1107 }, { "epoch": 2.3192046049188906, "grad_norm": 11.48231029510498, "learning_rate": 6.454937734955702e-08, "logits/chosen": 2.265625, "logits/rejected": 2.328125, "logps/chosen": -600.0, "logps/rejected": -494.0, "loss": 0.5699, "rewards/accuracies": 0.25, "rewards/chosen": -1.53125, "rewards/margins": -0.0576171875, "rewards/rejected": -1.4765625, "step": 1108 }, { "epoch": 2.3212977498691783, "grad_norm": 11.149823188781738, "learning_rate": 6.416861325129081e-08, "logits/chosen": 2.4375, "logits/rejected": 2.296875, "logps/chosen": -544.0, "logps/rejected": -506.0, "loss": 0.5575, "rewards/accuracies": 0.5, "rewards/chosen": -1.265625, "rewards/margins": 0.52734375, "rewards/rejected": -1.796875, "step": 1109 }, { "epoch": 2.3233908948194664, "grad_norm": 12.775769233703613, "learning_rate": 6.378881096391602e-08, "logits/chosen": 1.859375, "logits/rejected": 1.984375, "logps/chosen": -398.0, "logps/rejected": -470.0, "loss": 0.5861, "rewards/accuracies": 0.25, "rewards/chosen": -1.59375, "rewards/margins": -0.033203125, "rewards/rejected": -1.5625, "step": 1110 }, { "epoch": 2.325484039769754, "grad_norm": 10.973607063293457, "learning_rate": 6.340997245291798e-08, "logits/chosen": 1.9296875, "logits/rejected": 2.4375, "logps/chosen": -488.0, "logps/rejected": -452.0, "loss": 0.5227, "rewards/accuracies": 0.0, "rewards/chosen": -1.546875, "rewards/margins": -0.19921875, "rewards/rejected": -1.3515625, "step": 1111 }, { "epoch": 2.3275771847200417, "grad_norm": 12.10044002532959, "learning_rate": 6.303209967879422e-08, "logits/chosen": 1.78125, "logits/rejected": 2.21875, "logps/chosen": -536.0, "logps/rejected": -584.0, "loss": 0.5664, "rewards/accuracies": 0.5, "rewards/chosen": -1.46875, "rewards/margins": 0.404296875, "rewards/rejected": -1.8671875, "step": 1112 }, { "epoch": 2.32967032967033, "grad_norm": 12.980206489562988, "learning_rate": 6.26551945970446e-08, "logits/chosen": 1.6875, "logits/rejected": 2.046875, "logps/chosen": -492.0, "logps/rejected": -412.0, "loss": 0.6242, "rewards/accuracies": 1.0, "rewards/chosen": -1.2109375, "rewards/margins": 0.423828125, "rewards/rejected": -1.640625, "step": 1113 }, { "epoch": 2.3317634746206175, "grad_norm": 10.602160453796387, "learning_rate": 6.22792591581613e-08, "logits/chosen": 2.875, "logits/rejected": 2.8125, "logps/chosen": -540.0, "logps/rejected": -568.0, "loss": 0.5601, "rewards/accuracies": 0.5, "rewards/chosen": -1.703125, "rewards/margins": 0.048828125, "rewards/rejected": -1.75, "step": 1114 }, { "epoch": 2.333856619570905, "grad_norm": 12.55278205871582, "learning_rate": 6.190429530761851e-08, "logits/chosen": 1.1875, "logits/rejected": 1.5, "logps/chosen": -384.0, "logps/rejected": -236.0, "loss": 0.633, "rewards/accuracies": 0.75, "rewards/chosen": -1.46875, "rewards/margins": -0.0048828125, "rewards/rejected": -1.46875, "step": 1115 }, { "epoch": 2.3359497645211933, "grad_norm": 11.216866493225098, "learning_rate": 6.153030498586239e-08, "logits/chosen": 1.671875, "logits/rejected": 1.671875, "logps/chosen": -318.0, "logps/rejected": -358.0, "loss": 0.5519, "rewards/accuracies": 1.0, "rewards/chosen": -1.1015625, "rewards/margins": 0.640625, "rewards/rejected": -1.75, "step": 1116 }, { "epoch": 2.338042909471481, "grad_norm": 11.164164543151855, "learning_rate": 6.115729012830089e-08, "logits/chosen": 1.171875, "logits/rejected": 0.64453125, "logps/chosen": -328.0, "logps/rejected": -520.0, "loss": 0.5421, "rewards/accuracies": 1.0, "rewards/chosen": -1.640625, "rewards/margins": 0.5078125, "rewards/rejected": -2.15625, "step": 1117 }, { "epoch": 2.3401360544217686, "grad_norm": 11.40911865234375, "learning_rate": 6.078525266529446e-08, "logits/chosen": 1.296875, "logits/rejected": 0.609375, "logps/chosen": -244.0, "logps/rejected": -372.0, "loss": 0.5939, "rewards/accuracies": 0.75, "rewards/chosen": -1.3359375, "rewards/margins": 0.33984375, "rewards/rejected": -1.671875, "step": 1118 }, { "epoch": 2.3422291993720563, "grad_norm": 10.803315162658691, "learning_rate": 6.041419452214497e-08, "logits/chosen": 1.375, "logits/rejected": 1.8203125, "logps/chosen": -424.0, "logps/rejected": -334.0, "loss": 0.5617, "rewards/accuracies": 0.5, "rewards/chosen": -1.1953125, "rewards/margins": 0.1533203125, "rewards/rejected": -1.34375, "step": 1119 }, { "epoch": 2.3443223443223444, "grad_norm": 10.840127944946289, "learning_rate": 6.00441176190864e-08, "logits/chosen": 1.6484375, "logits/rejected": 1.46875, "logps/chosen": -406.0, "logps/rejected": -528.0, "loss": 0.5366, "rewards/accuracies": 0.75, "rewards/chosen": -1.703125, "rewards/margins": 0.4453125, "rewards/rejected": -2.15625, "step": 1120 }, { "epoch": 2.346415489272632, "grad_norm": 11.171231269836426, "learning_rate": 5.967502387127494e-08, "logits/chosen": 1.171875, "logits/rejected": 1.3046875, "logps/chosen": -332.0, "logps/rejected": -344.0, "loss": 0.5896, "rewards/accuracies": 0.75, "rewards/chosen": -1.453125, "rewards/margins": 0.474609375, "rewards/rejected": -1.9296875, "step": 1121 }, { "epoch": 2.3485086342229198, "grad_norm": 11.783544540405273, "learning_rate": 5.930691518877897e-08, "logits/chosen": 1.6875, "logits/rejected": 1.640625, "logps/chosen": -446.0, "logps/rejected": -440.0, "loss": 0.5661, "rewards/accuracies": 0.75, "rewards/chosen": -1.421875, "rewards/margins": 0.271484375, "rewards/rejected": -1.6953125, "step": 1122 }, { "epoch": 2.350601779173208, "grad_norm": 12.148418426513672, "learning_rate": 5.8939793476568814e-08, "logits/chosen": 2.203125, "logits/rejected": 2.4375, "logps/chosen": -520.0, "logps/rejected": -752.0, "loss": 0.5712, "rewards/accuracies": 0.75, "rewards/chosen": -1.46875, "rewards/margins": 0.384765625, "rewards/rejected": -1.8515625, "step": 1123 }, { "epoch": 2.3526949241234956, "grad_norm": 12.935588836669922, "learning_rate": 5.857366063450755e-08, "logits/chosen": 1.984375, "logits/rejected": 2.515625, "logps/chosen": -528.0, "logps/rejected": -474.0, "loss": 0.6003, "rewards/accuracies": 0.75, "rewards/chosen": -1.5, "rewards/margins": 0.2265625, "rewards/rejected": -1.71875, "step": 1124 }, { "epoch": 2.3547880690737832, "grad_norm": 11.099451065063477, "learning_rate": 5.8208518557340725e-08, "logits/chosen": 0.9375, "logits/rejected": 2.4375, "logps/chosen": -772.0, "logps/rejected": -492.0, "loss": 0.5653, "rewards/accuracies": 1.0, "rewards/chosen": -1.3828125, "rewards/margins": 0.5703125, "rewards/rejected": -1.953125, "step": 1125 }, { "epoch": 2.3568812140240714, "grad_norm": 11.670310974121094, "learning_rate": 5.784436913468656e-08, "logits/chosen": 1.5390625, "logits/rejected": 1.125, "logps/chosen": -278.0, "logps/rejected": -378.0, "loss": 0.6023, "rewards/accuracies": 0.75, "rewards/chosen": -1.453125, "rewards/margins": 0.60546875, "rewards/rejected": -2.0625, "step": 1126 }, { "epoch": 2.358974358974359, "grad_norm": 11.362000465393066, "learning_rate": 5.7481214251026286e-08, "logits/chosen": 2.78125, "logits/rejected": 2.71875, "logps/chosen": -400.0, "logps/rejected": -446.0, "loss": 0.5735, "rewards/accuracies": 1.0, "rewards/chosen": -1.2578125, "rewards/margins": 0.59375, "rewards/rejected": -1.8515625, "step": 1127 }, { "epoch": 2.3610675039246467, "grad_norm": 11.208846092224121, "learning_rate": 5.7119055785694426e-08, "logits/chosen": 2.03125, "logits/rejected": 1.71875, "logps/chosen": -316.0, "logps/rejected": -552.0, "loss": 0.5515, "rewards/accuracies": 0.75, "rewards/chosen": -1.203125, "rewards/margins": 0.365234375, "rewards/rejected": -1.5703125, "step": 1128 }, { "epoch": 2.363160648874935, "grad_norm": 11.73038387298584, "learning_rate": 5.675789561286913e-08, "logits/chosen": 1.6640625, "logits/rejected": 2.65625, "logps/chosen": -464.0, "logps/rejected": -280.0, "loss": 0.5804, "rewards/accuracies": 0.5, "rewards/chosen": -1.5234375, "rewards/margins": 0.060546875, "rewards/rejected": -1.578125, "step": 1129 }, { "epoch": 2.3652537938252225, "grad_norm": 10.981866836547852, "learning_rate": 5.639773560156211e-08, "logits/chosen": 2.5, "logits/rejected": 1.9921875, "logps/chosen": -828.0, "logps/rejected": -892.0, "loss": 0.5706, "rewards/accuracies": 1.0, "rewards/chosen": -1.0859375, "rewards/margins": 0.921875, "rewards/rejected": -2.0, "step": 1130 }, { "epoch": 2.36734693877551, "grad_norm": 10.912532806396484, "learning_rate": 5.6038577615609356e-08, "logits/chosen": 2.359375, "logits/rejected": 2.59375, "logps/chosen": -532.0, "logps/rejected": -428.0, "loss": 0.5606, "rewards/accuracies": 0.75, "rewards/chosen": -1.25, "rewards/margins": 0.36328125, "rewards/rejected": -1.6171875, "step": 1131 }, { "epoch": 2.369440083725798, "grad_norm": 11.479551315307617, "learning_rate": 5.5680423513661484e-08, "logits/chosen": 2.28125, "logits/rejected": 2.65625, "logps/chosen": -544.0, "logps/rejected": -500.0, "loss": 0.5733, "rewards/accuracies": 0.75, "rewards/chosen": -1.2734375, "rewards/margins": 0.125, "rewards/rejected": -1.3984375, "step": 1132 }, { "epoch": 2.371533228676086, "grad_norm": 11.486126899719238, "learning_rate": 5.532327514917377e-08, "logits/chosen": 2.171875, "logits/rejected": 2.1875, "logps/chosen": -688.0, "logps/rejected": -418.0, "loss": 0.5891, "rewards/accuracies": 0.5, "rewards/chosen": -1.3359375, "rewards/margins": 0.369140625, "rewards/rejected": -1.703125, "step": 1133 }, { "epoch": 2.3736263736263736, "grad_norm": 11.69915771484375, "learning_rate": 5.496713437039675e-08, "logits/chosen": 2.6875, "logits/rejected": 3.09375, "logps/chosen": -480.0, "logps/rejected": -464.0, "loss": 0.6, "rewards/accuracies": 1.0, "rewards/chosen": -1.28125, "rewards/margins": 0.6640625, "rewards/rejected": -1.9375, "step": 1134 }, { "epoch": 2.3757195185766613, "grad_norm": 10.8568754196167, "learning_rate": 5.461200302036689e-08, "logits/chosen": 3.0, "logits/rejected": 2.359375, "logps/chosen": -440.0, "logps/rejected": -656.0, "loss": 0.6175, "rewards/accuracies": 0.75, "rewards/chosen": -1.1484375, "rewards/margins": 0.359375, "rewards/rejected": -1.5078125, "step": 1135 }, { "epoch": 2.3778126635269494, "grad_norm": 10.859973907470703, "learning_rate": 5.4257882936896834e-08, "logits/chosen": 0.388671875, "logits/rejected": 0.380859375, "logps/chosen": -193.0, "logps/rejected": -226.0, "loss": 0.574, "rewards/accuracies": 1.0, "rewards/chosen": -1.0703125, "rewards/margins": 0.21875, "rewards/rejected": -1.2890625, "step": 1136 }, { "epoch": 2.379905808477237, "grad_norm": 11.677450180053711, "learning_rate": 5.390477595256566e-08, "logits/chosen": 1.5625, "logits/rejected": 2.1875, "logps/chosen": -528.0, "logps/rejected": -464.0, "loss": 0.5571, "rewards/accuracies": 0.75, "rewards/chosen": -1.4921875, "rewards/margins": 0.69140625, "rewards/rejected": -2.1875, "step": 1137 }, { "epoch": 2.3819989534275248, "grad_norm": 11.75250244140625, "learning_rate": 5.355268389470979e-08, "logits/chosen": 2.328125, "logits/rejected": 2.53125, "logps/chosen": -680.0, "logps/rejected": -396.0, "loss": 0.5862, "rewards/accuracies": 0.5, "rewards/chosen": -1.53125, "rewards/margins": 0.048828125, "rewards/rejected": -1.5859375, "step": 1138 }, { "epoch": 2.3840920983778124, "grad_norm": 11.258225440979004, "learning_rate": 5.320160858541352e-08, "logits/chosen": 1.0859375, "logits/rejected": 1.6328125, "logps/chosen": -260.0, "logps/rejected": -206.0, "loss": 0.6036, "rewards/accuracies": 0.5, "rewards/chosen": -1.15625, "rewards/margins": 0.169921875, "rewards/rejected": -1.328125, "step": 1139 }, { "epoch": 2.3861852433281006, "grad_norm": 10.5697603225708, "learning_rate": 5.285155184149918e-08, "logits/chosen": 2.4375, "logits/rejected": 3.21875, "logps/chosen": -704.0, "logps/rejected": -632.0, "loss": 0.5534, "rewards/accuracies": 1.0, "rewards/chosen": -0.82421875, "rewards/margins": 0.828125, "rewards/rejected": -1.6484375, "step": 1140 }, { "epoch": 2.3882783882783882, "grad_norm": 10.948512077331543, "learning_rate": 5.2502515474518105e-08, "logits/chosen": 2.734375, "logits/rejected": 2.125, "logps/chosen": -498.0, "logps/rejected": -640.0, "loss": 0.5722, "rewards/accuracies": 0.75, "rewards/chosen": -1.296875, "rewards/margins": 0.390625, "rewards/rejected": -1.6875, "step": 1141 }, { "epoch": 2.390371533228676, "grad_norm": 10.868309020996094, "learning_rate": 5.2154501290741196e-08, "logits/chosen": 2.6875, "logits/rejected": 2.3125, "logps/chosen": -480.0, "logps/rejected": -588.0, "loss": 0.5845, "rewards/accuracies": 0.25, "rewards/chosen": -1.9375, "rewards/margins": -0.09375, "rewards/rejected": -1.84375, "step": 1142 }, { "epoch": 2.392464678178964, "grad_norm": 12.436022758483887, "learning_rate": 5.180751109114958e-08, "logits/chosen": 2.71875, "logits/rejected": 3.015625, "logps/chosen": -956.0, "logps/rejected": -572.0, "loss": 0.5957, "rewards/accuracies": 0.75, "rewards/chosen": -2.109375, "rewards/margins": -0.32421875, "rewards/rejected": -1.7890625, "step": 1143 }, { "epoch": 2.3945578231292517, "grad_norm": 11.055791854858398, "learning_rate": 5.146154667142509e-08, "logits/chosen": 2.1875, "logits/rejected": 1.9296875, "logps/chosen": -724.0, "logps/rejected": -588.0, "loss": 0.5652, "rewards/accuracies": 1.0, "rewards/chosen": -1.2265625, "rewards/margins": 0.53125, "rewards/rejected": -1.7578125, "step": 1144 }, { "epoch": 2.3966509680795394, "grad_norm": 11.497798919677734, "learning_rate": 5.1116609821941295e-08, "logits/chosen": 1.5625, "logits/rejected": 2.015625, "logps/chosen": -444.0, "logps/rejected": -258.0, "loss": 0.5711, "rewards/accuracies": 0.25, "rewards/chosen": -1.4765625, "rewards/margins": -0.23828125, "rewards/rejected": -1.234375, "step": 1145 }, { "epoch": 2.3987441130298275, "grad_norm": 11.741530418395996, "learning_rate": 5.0772702327753885e-08, "logits/chosen": 1.03125, "logits/rejected": 1.2890625, "logps/chosen": -398.0, "logps/rejected": -354.0, "loss": 0.5506, "rewards/accuracies": 0.5, "rewards/chosen": -1.203125, "rewards/margins": 0.55078125, "rewards/rejected": -1.7578125, "step": 1146 }, { "epoch": 2.400837257980115, "grad_norm": 11.657857894897461, "learning_rate": 5.042982596859181e-08, "logits/chosen": 2.375, "logits/rejected": 2.640625, "logps/chosen": -840.0, "logps/rejected": -422.0, "loss": 0.5945, "rewards/accuracies": 0.5, "rewards/chosen": -2.296875, "rewards/margins": -0.0908203125, "rewards/rejected": -2.203125, "step": 1147 }, { "epoch": 2.402930402930403, "grad_norm": 12.115666389465332, "learning_rate": 5.008798251884766e-08, "logits/chosen": 1.765625, "logits/rejected": 1.7265625, "logps/chosen": -304.0, "logps/rejected": -490.0, "loss": 0.5824, "rewards/accuracies": 1.0, "rewards/chosen": -0.9609375, "rewards/margins": 0.56640625, "rewards/rejected": -1.53125, "step": 1148 }, { "epoch": 2.405023547880691, "grad_norm": 13.008218765258789, "learning_rate": 4.97471737475689e-08, "logits/chosen": 2.234375, "logits/rejected": 2.5625, "logps/chosen": -540.0, "logps/rejected": -440.0, "loss": 0.6402, "rewards/accuracies": 0.75, "rewards/chosen": -1.6640625, "rewards/margins": 0.2021484375, "rewards/rejected": -1.8671875, "step": 1149 }, { "epoch": 2.4071166928309786, "grad_norm": 12.10905933380127, "learning_rate": 4.940740141844843e-08, "logits/chosen": 1.609375, "logits/rejected": 2.328125, "logps/chosen": -652.0, "logps/rejected": -450.0, "loss": 0.5795, "rewards/accuracies": 0.5, "rewards/chosen": -1.5625, "rewards/margins": 0.1015625, "rewards/rejected": -1.6640625, "step": 1150 }, { "epoch": 2.4092098377812663, "grad_norm": 11.480064392089844, "learning_rate": 4.9068667289815444e-08, "logits/chosen": 1.6328125, "logits/rejected": 2.109375, "logps/chosen": -478.0, "logps/rejected": -468.0, "loss": 0.602, "rewards/accuracies": 0.5, "rewards/chosen": -2.125, "rewards/margins": -0.115234375, "rewards/rejected": -2.0, "step": 1151 }, { "epoch": 2.411302982731554, "grad_norm": 11.225536346435547, "learning_rate": 4.873097311462662e-08, "logits/chosen": 1.8203125, "logits/rejected": 1.90625, "logps/chosen": -286.0, "logps/rejected": -362.0, "loss": 0.597, "rewards/accuracies": 1.0, "rewards/chosen": -0.9453125, "rewards/margins": 1.0234375, "rewards/rejected": -1.96875, "step": 1152 }, { "epoch": 2.413396127681842, "grad_norm": 12.10788345336914, "learning_rate": 4.839432064045664e-08, "logits/chosen": 1.9375, "logits/rejected": 2.1875, "logps/chosen": -422.0, "logps/rejected": -456.0, "loss": 0.5994, "rewards/accuracies": 0.5, "rewards/chosen": -1.390625, "rewards/margins": 0.212890625, "rewards/rejected": -1.6015625, "step": 1153 }, { "epoch": 2.4154892726321298, "grad_norm": 12.278083801269531, "learning_rate": 4.805871160948957e-08, "logits/chosen": 2.4375, "logits/rejected": 2.09375, "logps/chosen": -460.0, "logps/rejected": -472.0, "loss": 0.5527, "rewards/accuracies": 0.75, "rewards/chosen": -1.1953125, "rewards/margins": 0.63671875, "rewards/rejected": -1.828125, "step": 1154 }, { "epoch": 2.4175824175824174, "grad_norm": 12.179211616516113, "learning_rate": 4.772414775850942e-08, "logits/chosen": 1.7734375, "logits/rejected": 1.890625, "logps/chosen": -304.0, "logps/rejected": -1008.0, "loss": 0.5881, "rewards/accuracies": 0.5, "rewards/chosen": -1.2734375, "rewards/margins": 1.046875, "rewards/rejected": -2.3125, "step": 1155 }, { "epoch": 2.4196755625327055, "grad_norm": 12.0684175491333, "learning_rate": 4.739063081889161e-08, "logits/chosen": 2.734375, "logits/rejected": 2.921875, "logps/chosen": -552.0, "logps/rejected": -592.0, "loss": 0.5928, "rewards/accuracies": 0.25, "rewards/chosen": -1.953125, "rewards/margins": -0.208984375, "rewards/rejected": -1.7421875, "step": 1156 }, { "epoch": 2.421768707482993, "grad_norm": 13.287016868591309, "learning_rate": 4.705816251659352e-08, "logits/chosen": 2.359375, "logits/rejected": 1.7578125, "logps/chosen": -672.0, "logps/rejected": -648.0, "loss": 0.5639, "rewards/accuracies": 0.75, "rewards/chosen": -1.2109375, "rewards/margins": 0.326171875, "rewards/rejected": -1.5390625, "step": 1157 }, { "epoch": 2.423861852433281, "grad_norm": 10.981544494628906, "learning_rate": 4.6726744572145964e-08, "logits/chosen": 1.6015625, "logits/rejected": 1.484375, "logps/chosen": -464.0, "logps/rejected": -470.0, "loss": 0.559, "rewards/accuracies": 0.5, "rewards/chosen": -1.609375, "rewards/margins": 0.12890625, "rewards/rejected": -1.734375, "step": 1158 }, { "epoch": 2.4259549973835686, "grad_norm": 10.738006591796875, "learning_rate": 4.639637870064416e-08, "logits/chosen": 1.1015625, "logits/rejected": 1.234375, "logps/chosen": -372.0, "logps/rejected": -382.0, "loss": 0.5604, "rewards/accuracies": 0.75, "rewards/chosen": -1.46875, "rewards/margins": 0.189453125, "rewards/rejected": -1.65625, "step": 1159 }, { "epoch": 2.4280481423338567, "grad_norm": 11.973393440246582, "learning_rate": 4.606706661173869e-08, "logits/chosen": 2.125, "logits/rejected": 2.171875, "logps/chosen": -624.0, "logps/rejected": -480.0, "loss": 0.5848, "rewards/accuracies": 0.75, "rewards/chosen": -1.375, "rewards/margins": 0.59375, "rewards/rejected": -1.96875, "step": 1160 }, { "epoch": 2.4301412872841444, "grad_norm": 11.391327857971191, "learning_rate": 4.573881000962693e-08, "logits/chosen": 0.96875, "logits/rejected": 1.359375, "logps/chosen": -272.0, "logps/rejected": -222.0, "loss": 0.5727, "rewards/accuracies": 0.5, "rewards/chosen": -1.4375, "rewards/margins": 0.1806640625, "rewards/rejected": -1.6171875, "step": 1161 }, { "epoch": 2.4322344322344325, "grad_norm": 11.80610466003418, "learning_rate": 4.5411610593043916e-08, "logits/chosen": 2.546875, "logits/rejected": 2.9375, "logps/chosen": -680.0, "logps/rejected": -740.0, "loss": 0.5857, "rewards/accuracies": 0.25, "rewards/chosen": -1.5625, "rewards/margins": -0.0625, "rewards/rejected": -1.5, "step": 1162 }, { "epoch": 2.43432757718472, "grad_norm": 11.733301162719727, "learning_rate": 4.508547005525395e-08, "logits/chosen": 2.3125, "logits/rejected": 2.578125, "logps/chosen": -636.0, "logps/rejected": -498.0, "loss": 0.5774, "rewards/accuracies": 0.5, "rewards/chosen": -1.6953125, "rewards/margins": -0.0986328125, "rewards/rejected": -1.59375, "step": 1163 }, { "epoch": 2.436420722135008, "grad_norm": 11.919504165649414, "learning_rate": 4.4760390084041395e-08, "logits/chosen": 2.234375, "logits/rejected": 3.0, "logps/chosen": -474.0, "logps/rejected": -466.0, "loss": 0.5693, "rewards/accuracies": 0.75, "rewards/chosen": -1.09375, "rewards/margins": 0.38671875, "rewards/rejected": -1.484375, "step": 1164 }, { "epoch": 2.4385138670852955, "grad_norm": 10.880753517150879, "learning_rate": 4.4436372361702287e-08, "logits/chosen": 2.5, "logits/rejected": 3.015625, "logps/chosen": -752.0, "logps/rejected": -604.0, "loss": 0.5715, "rewards/accuracies": 1.0, "rewards/chosen": -1.2890625, "rewards/margins": 0.578125, "rewards/rejected": -1.859375, "step": 1165 }, { "epoch": 2.4406070120355836, "grad_norm": 11.72057819366455, "learning_rate": 4.4113418565035556e-08, "logits/chosen": 1.3828125, "logits/rejected": 1.21875, "logps/chosen": -304.0, "logps/rejected": -418.0, "loss": 0.5687, "rewards/accuracies": 0.75, "rewards/chosen": -1.625, "rewards/margins": 0.1015625, "rewards/rejected": -1.7265625, "step": 1166 }, { "epoch": 2.4427001569858713, "grad_norm": 11.343137741088867, "learning_rate": 4.379153036533411e-08, "logits/chosen": 0.75, "logits/rejected": 0.9921875, "logps/chosen": -436.0, "logps/rejected": -408.0, "loss": 0.5857, "rewards/accuracies": 0.5, "rewards/chosen": -1.421875, "rewards/margins": 0.625, "rewards/rejected": -2.046875, "step": 1167 }, { "epoch": 2.444793301936159, "grad_norm": 11.733400344848633, "learning_rate": 4.3470709428376414e-08, "logits/chosen": 2.0625, "logits/rejected": 2.25, "logps/chosen": -470.0, "logps/rejected": -416.0, "loss": 0.5901, "rewards/accuracies": 0.5, "rewards/chosen": -1.4375, "rewards/margins": 0.080078125, "rewards/rejected": -1.515625, "step": 1168 }, { "epoch": 2.446886446886447, "grad_norm": 12.811859130859375, "learning_rate": 4.315095741441796e-08, "logits/chosen": 1.359375, "logits/rejected": 1.4765625, "logps/chosen": -576.0, "logps/rejected": -384.0, "loss": 0.599, "rewards/accuracies": 0.5, "rewards/chosen": -1.7734375, "rewards/margins": 0.072265625, "rewards/rejected": -1.84375, "step": 1169 }, { "epoch": 2.4489795918367347, "grad_norm": 11.338305473327637, "learning_rate": 4.283227597818252e-08, "logits/chosen": 1.8125, "logits/rejected": 1.8828125, "logps/chosen": -580.0, "logps/rejected": -752.0, "loss": 0.5989, "rewards/accuracies": 1.0, "rewards/chosen": -1.34375, "rewards/margins": 0.5625, "rewards/rejected": -1.90625, "step": 1170 }, { "epoch": 2.4510727367870224, "grad_norm": 11.298980712890625, "learning_rate": 4.251466676885338e-08, "logits/chosen": 2.4375, "logits/rejected": 2.25, "logps/chosen": -588.0, "logps/rejected": -620.0, "loss": 0.5822, "rewards/accuracies": 0.5, "rewards/chosen": -1.5546875, "rewards/margins": 0.0263671875, "rewards/rejected": -1.578125, "step": 1171 }, { "epoch": 2.45316588173731, "grad_norm": 11.491308212280273, "learning_rate": 4.21981314300653e-08, "logits/chosen": 2.078125, "logits/rejected": 2.65625, "logps/chosen": -536.0, "logps/rejected": -708.0, "loss": 0.5373, "rewards/accuracies": 0.75, "rewards/chosen": -1.1484375, "rewards/margins": 0.1962890625, "rewards/rejected": -1.34375, "step": 1172 }, { "epoch": 2.455259026687598, "grad_norm": 11.813673973083496, "learning_rate": 4.188267159989565e-08, "logits/chosen": 1.5, "logits/rejected": 2.234375, "logps/chosen": -480.0, "logps/rejected": -310.0, "loss": 0.5628, "rewards/accuracies": 0.5, "rewards/chosen": -1.578125, "rewards/margins": 0.021484375, "rewards/rejected": -1.59375, "step": 1173 }, { "epoch": 2.457352171637886, "grad_norm": 12.04299259185791, "learning_rate": 4.156828891085592e-08, "logits/chosen": 1.640625, "logits/rejected": 1.71875, "logps/chosen": -336.0, "logps/rejected": -456.0, "loss": 0.5216, "rewards/accuracies": 0.75, "rewards/chosen": -1.1015625, "rewards/margins": 0.4140625, "rewards/rejected": -1.515625, "step": 1174 }, { "epoch": 2.4594453165881736, "grad_norm": 12.253011703491211, "learning_rate": 4.125498498988334e-08, "logits/chosen": 1.765625, "logits/rejected": 1.96875, "logps/chosen": -572.0, "logps/rejected": -488.0, "loss": 0.57, "rewards/accuracies": 1.0, "rewards/chosen": -1.265625, "rewards/margins": 0.828125, "rewards/rejected": -2.09375, "step": 1175 }, { "epoch": 2.4615384615384617, "grad_norm": 10.873589515686035, "learning_rate": 4.094276145833286e-08, "logits/chosen": 2.46875, "logits/rejected": 2.28125, "logps/chosen": -398.0, "logps/rejected": -572.0, "loss": 0.5735, "rewards/accuracies": 1.0, "rewards/chosen": -1.375, "rewards/margins": 0.6328125, "rewards/rejected": -2.0, "step": 1176 }, { "epoch": 2.4636316064887493, "grad_norm": 12.69218921661377, "learning_rate": 4.0631619931967995e-08, "logits/chosen": 1.5, "logits/rejected": 0.99609375, "logps/chosen": -306.0, "logps/rejected": -462.0, "loss": 0.6132, "rewards/accuracies": 0.75, "rewards/chosen": -1.3125, "rewards/margins": 0.875, "rewards/rejected": -2.1875, "step": 1177 }, { "epoch": 2.465724751439037, "grad_norm": 11.359230995178223, "learning_rate": 4.032156202095291e-08, "logits/chosen": 2.15625, "logits/rejected": 2.21875, "logps/chosen": -378.0, "logps/rejected": -620.0, "loss": 0.5502, "rewards/accuracies": 1.0, "rewards/chosen": -1.265625, "rewards/margins": 1.171875, "rewards/rejected": -2.4375, "step": 1178 }, { "epoch": 2.467817896389325, "grad_norm": 12.737220764160156, "learning_rate": 4.001258932984418e-08, "logits/chosen": 3.0625, "logits/rejected": 2.125, "logps/chosen": -768.0, "logps/rejected": -636.0, "loss": 0.5639, "rewards/accuracies": 1.0, "rewards/chosen": -1.2890625, "rewards/margins": 0.27734375, "rewards/rejected": -1.5625, "step": 1179 }, { "epoch": 2.469911041339613, "grad_norm": 11.188276290893555, "learning_rate": 3.970470345758236e-08, "logits/chosen": 2.421875, "logits/rejected": 1.9375, "logps/chosen": -808.0, "logps/rejected": -640.0, "loss": 0.5684, "rewards/accuracies": 1.0, "rewards/chosen": -1.09375, "rewards/margins": 1.3203125, "rewards/rejected": -2.40625, "step": 1180 }, { "epoch": 2.4720041862899005, "grad_norm": 12.295984268188477, "learning_rate": 3.939790599748357e-08, "logits/chosen": 2.84375, "logits/rejected": 2.90625, "logps/chosen": -680.0, "logps/rejected": -608.0, "loss": 0.6257, "rewards/accuracies": 0.75, "rewards/chosen": -1.390625, "rewards/margins": 0.5703125, "rewards/rejected": -1.9609375, "step": 1181 }, { "epoch": 2.4740973312401886, "grad_norm": 12.252960205078125, "learning_rate": 3.909219853723124e-08, "logits/chosen": 2.375, "logits/rejected": 2.5625, "logps/chosen": -728.0, "logps/rejected": -430.0, "loss": 0.5755, "rewards/accuracies": 1.0, "rewards/chosen": -1.1796875, "rewards/margins": 0.53515625, "rewards/rejected": -1.71875, "step": 1182 }, { "epoch": 2.4761904761904763, "grad_norm": 11.002705574035645, "learning_rate": 3.878758265886848e-08, "logits/chosen": 0.890625, "logits/rejected": 0.40625, "logps/chosen": -184.0, "logps/rejected": -226.0, "loss": 0.566, "rewards/accuracies": 0.5, "rewards/chosen": -1.0625, "rewards/margins": 0.13671875, "rewards/rejected": -1.203125, "step": 1183 }, { "epoch": 2.478283621140764, "grad_norm": 12.764391899108887, "learning_rate": 3.848405993878906e-08, "logits/chosen": 1.46875, "logits/rejected": 2.015625, "logps/chosen": -528.0, "logps/rejected": -628.0, "loss": 0.6226, "rewards/accuracies": 0.75, "rewards/chosen": -1.1171875, "rewards/margins": 1.0078125, "rewards/rejected": -2.125, "step": 1184 }, { "epoch": 2.4803767660910516, "grad_norm": 12.30169677734375, "learning_rate": 3.818163194772964e-08, "logits/chosen": 1.1640625, "logits/rejected": 1.3203125, "logps/chosen": -384.0, "logps/rejected": -320.0, "loss": 0.617, "rewards/accuracies": 0.5, "rewards/chosen": -1.859375, "rewards/margins": -0.072265625, "rewards/rejected": -1.7890625, "step": 1185 }, { "epoch": 2.4824699110413397, "grad_norm": 11.491966247558594, "learning_rate": 3.788030025076183e-08, "logits/chosen": 1.0390625, "logits/rejected": 1.6015625, "logps/chosen": -316.0, "logps/rejected": -314.0, "loss": 0.5852, "rewards/accuracies": 0.75, "rewards/chosen": -0.83203125, "rewards/margins": 0.80859375, "rewards/rejected": -1.640625, "step": 1186 }, { "epoch": 2.4845630559916274, "grad_norm": 11.358709335327148, "learning_rate": 3.758006640728381e-08, "logits/chosen": 1.7734375, "logits/rejected": 1.890625, "logps/chosen": -436.0, "logps/rejected": -404.0, "loss": 0.5957, "rewards/accuracies": 0.75, "rewards/chosen": -1.203125, "rewards/margins": 0.34375, "rewards/rejected": -1.546875, "step": 1187 }, { "epoch": 2.486656200941915, "grad_norm": 10.98845100402832, "learning_rate": 3.728093197101228e-08, "logits/chosen": 2.71875, "logits/rejected": 2.921875, "logps/chosen": -864.0, "logps/rejected": -584.0, "loss": 0.5767, "rewards/accuracies": 1.0, "rewards/chosen": -0.9453125, "rewards/margins": 0.9609375, "rewards/rejected": -1.90625, "step": 1188 }, { "epoch": 2.488749345892203, "grad_norm": 10.837740898132324, "learning_rate": 3.698289848997448e-08, "logits/chosen": 2.234375, "logits/rejected": 2.765625, "logps/chosen": -540.0, "logps/rejected": -544.0, "loss": 0.5729, "rewards/accuracies": 1.0, "rewards/chosen": -1.203125, "rewards/margins": 0.400390625, "rewards/rejected": -1.6015625, "step": 1189 }, { "epoch": 2.490842490842491, "grad_norm": 10.766701698303223, "learning_rate": 3.6685967506500306e-08, "logits/chosen": 1.1640625, "logits/rejected": 0.87890625, "logps/chosen": -314.0, "logps/rejected": -348.0, "loss": 0.5811, "rewards/accuracies": 0.75, "rewards/chosen": -1.046875, "rewards/margins": 0.234375, "rewards/rejected": -1.28125, "step": 1190 }, { "epoch": 2.4929356357927785, "grad_norm": 11.748739242553711, "learning_rate": 3.639014055721417e-08, "logits/chosen": 0.9765625, "logits/rejected": 0.91015625, "logps/chosen": -236.0, "logps/rejected": -266.0, "loss": 0.5899, "rewards/accuracies": 0.75, "rewards/chosen": -1.03125, "rewards/margins": 0.34765625, "rewards/rejected": -1.3828125, "step": 1191 }, { "epoch": 2.495028780743066, "grad_norm": 11.828984260559082, "learning_rate": 3.609541917302693e-08, "logits/chosen": 1.65625, "logits/rejected": 1.5703125, "logps/chosen": -368.0, "logps/rejected": -504.0, "loss": 0.5884, "rewards/accuracies": 0.5, "rewards/chosen": -1.1171875, "rewards/margins": 0.173828125, "rewards/rejected": -1.296875, "step": 1192 }, { "epoch": 2.4971219256933543, "grad_norm": 12.25663948059082, "learning_rate": 3.580180487912831e-08, "logits/chosen": 1.9921875, "logits/rejected": 2.5625, "logps/chosen": -652.0, "logps/rejected": -652.0, "loss": 0.5784, "rewards/accuracies": 0.75, "rewards/chosen": -1.5625, "rewards/margins": 0.609375, "rewards/rejected": -2.171875, "step": 1193 }, { "epoch": 2.499215070643642, "grad_norm": 13.72636604309082, "learning_rate": 3.550929919497876e-08, "logits/chosen": 1.5625, "logits/rejected": 1.7421875, "logps/chosen": -352.0, "logps/rejected": -510.0, "loss": 0.6214, "rewards/accuracies": 1.0, "rewards/chosen": -1.078125, "rewards/margins": 1.09375, "rewards/rejected": -2.171875, "step": 1194 }, { "epoch": 2.50130821559393, "grad_norm": 11.22259521484375, "learning_rate": 3.521790363430161e-08, "logits/chosen": 1.984375, "logits/rejected": 2.78125, "logps/chosen": -696.0, "logps/rejected": -616.0, "loss": 0.5602, "rewards/accuracies": 1.0, "rewards/chosen": -1.125, "rewards/margins": 0.61328125, "rewards/rejected": -1.734375, "step": 1195 }, { "epoch": 2.503401360544218, "grad_norm": 12.130874633789062, "learning_rate": 3.4927619705075236e-08, "logits/chosen": 1.9921875, "logits/rejected": 2.0625, "logps/chosen": -468.0, "logps/rejected": -400.0, "loss": 0.5579, "rewards/accuracies": 1.0, "rewards/chosen": -1.34375, "rewards/margins": 0.6953125, "rewards/rejected": -2.03125, "step": 1196 }, { "epoch": 2.5054945054945055, "grad_norm": 13.3442964553833, "learning_rate": 3.463844890952541e-08, "logits/chosen": 1.5625, "logits/rejected": 2.375, "logps/chosen": -498.0, "logps/rejected": -540.0, "loss": 0.6025, "rewards/accuracies": 0.75, "rewards/chosen": -1.3359375, "rewards/margins": 0.76953125, "rewards/rejected": -2.109375, "step": 1197 }, { "epoch": 2.507587650444793, "grad_norm": 11.669981002807617, "learning_rate": 3.4350392744117424e-08, "logits/chosen": 2.46875, "logits/rejected": 3.671875, "logps/chosen": -796.0, "logps/rejected": -368.0, "loss": 0.6119, "rewards/accuracies": 0.75, "rewards/chosen": -1.046875, "rewards/margins": 0.478515625, "rewards/rejected": -1.5234375, "step": 1198 }, { "epoch": 2.5096807953950813, "grad_norm": 11.026544570922852, "learning_rate": 3.406345269954817e-08, "logits/chosen": 1.75, "logits/rejected": 2.09375, "logps/chosen": -484.0, "logps/rejected": -312.0, "loss": 0.5777, "rewards/accuracies": 0.25, "rewards/chosen": -1.6875, "rewards/margins": -0.025390625, "rewards/rejected": -1.65625, "step": 1199 }, { "epoch": 2.511773940345369, "grad_norm": 12.441347122192383, "learning_rate": 3.3777630260738765e-08, "logits/chosen": 1.1171875, "logits/rejected": 1.46875, "logps/chosen": -476.0, "logps/rejected": -532.0, "loss": 0.6326, "rewards/accuracies": 0.5, "rewards/chosen": -1.5078125, "rewards/margins": 0.28125, "rewards/rejected": -1.7890625, "step": 1200 }, { "epoch": 2.5138670852956566, "grad_norm": 11.129204750061035, "learning_rate": 3.349292690682657e-08, "logits/chosen": 1.0546875, "logits/rejected": 1.4140625, "logps/chosen": -420.0, "logps/rejected": -438.0, "loss": 0.5981, "rewards/accuracies": 0.75, "rewards/chosen": -0.9140625, "rewards/margins": 0.703125, "rewards/rejected": -1.6171875, "step": 1201 }, { "epoch": 2.5159602302459447, "grad_norm": 11.990527153015137, "learning_rate": 3.320934411115776e-08, "logits/chosen": 2.0, "logits/rejected": 1.1796875, "logps/chosen": -250.0, "logps/rejected": -328.0, "loss": 0.5998, "rewards/accuracies": 0.5, "rewards/chosen": -1.3359375, "rewards/margins": 0.298828125, "rewards/rejected": -1.6328125, "step": 1202 }, { "epoch": 2.5180533751962324, "grad_norm": 11.312446594238281, "learning_rate": 3.2926883341279474e-08, "logits/chosen": 0.984375, "logits/rejected": 1.1484375, "logps/chosen": -372.0, "logps/rejected": -544.0, "loss": 0.5453, "rewards/accuracies": 0.75, "rewards/chosen": -1.4921875, "rewards/margins": 0.2021484375, "rewards/rejected": -1.6875, "step": 1203 }, { "epoch": 2.52014652014652, "grad_norm": 11.853565216064453, "learning_rate": 3.264554605893246e-08, "logits/chosen": 2.375, "logits/rejected": 2.59375, "logps/chosen": -556.0, "logps/rejected": -576.0, "loss": 0.5507, "rewards/accuracies": 0.5, "rewards/chosen": -1.34375, "rewards/margins": -0.0029296875, "rewards/rejected": -1.3359375, "step": 1204 }, { "epoch": 2.5222396650968077, "grad_norm": 10.736586570739746, "learning_rate": 3.236533372004338e-08, "logits/chosen": 1.984375, "logits/rejected": 2.28125, "logps/chosen": -444.0, "logps/rejected": -462.0, "loss": 0.5873, "rewards/accuracies": 0.5, "rewards/chosen": -1.375, "rewards/margins": 0.244140625, "rewards/rejected": -1.6171875, "step": 1205 }, { "epoch": 2.524332810047096, "grad_norm": 12.196707725524902, "learning_rate": 3.2086247774717155e-08, "logits/chosen": 2.296875, "logits/rejected": 2.953125, "logps/chosen": -592.0, "logps/rejected": -616.0, "loss": 0.6162, "rewards/accuracies": 0.75, "rewards/chosen": -1.375, "rewards/margins": 0.734375, "rewards/rejected": -2.109375, "step": 1206 }, { "epoch": 2.5264259549973835, "grad_norm": 11.649630546569824, "learning_rate": 3.1808289667229795e-08, "logits/chosen": 1.671875, "logits/rejected": 1.625, "logps/chosen": -388.0, "logps/rejected": -560.0, "loss": 0.5946, "rewards/accuracies": 0.75, "rewards/chosen": -1.359375, "rewards/margins": 0.59375, "rewards/rejected": -1.953125, "step": 1207 }, { "epoch": 2.528519099947671, "grad_norm": 11.674649238586426, "learning_rate": 3.153146083602052e-08, "logits/chosen": 0.8515625, "logits/rejected": 0.6171875, "logps/chosen": -215.0, "logps/rejected": -300.0, "loss": 0.556, "rewards/accuracies": 0.5, "rewards/chosen": -1.296875, "rewards/margins": 0.2734375, "rewards/rejected": -1.5703125, "step": 1208 }, { "epoch": 2.5306122448979593, "grad_norm": 13.151446342468262, "learning_rate": 3.12557627136847e-08, "logits/chosen": 2.03125, "logits/rejected": 1.171875, "logps/chosen": -230.0, "logps/rejected": -368.0, "loss": 0.6325, "rewards/accuracies": 0.5, "rewards/chosen": -1.4453125, "rewards/margins": 0.16015625, "rewards/rejected": -1.609375, "step": 1209 }, { "epoch": 2.532705389848247, "grad_norm": 11.518951416015625, "learning_rate": 3.098119672696622e-08, "logits/chosen": 1.09375, "logits/rejected": 1.5078125, "logps/chosen": -292.0, "logps/rejected": -245.0, "loss": 0.5736, "rewards/accuracies": 0.75, "rewards/chosen": -1.3046875, "rewards/margins": 0.0732421875, "rewards/rejected": -1.375, "step": 1210 }, { "epoch": 2.5347985347985347, "grad_norm": 10.688014030456543, "learning_rate": 3.070776429675003e-08, "logits/chosen": 1.828125, "logits/rejected": 2.34375, "logps/chosen": -548.0, "logps/rejected": -572.0, "loss": 0.6095, "rewards/accuracies": 0.75, "rewards/chosen": -1.4296875, "rewards/margins": 0.53125, "rewards/rejected": -1.953125, "step": 1211 }, { "epoch": 2.5368916797488223, "grad_norm": 11.002942085266113, "learning_rate": 3.0435466838054944e-08, "logits/chosen": 2.0625, "logits/rejected": 2.953125, "logps/chosen": -716.0, "logps/rejected": -544.0, "loss": 0.5662, "rewards/accuracies": 0.5, "rewards/chosen": -1.296875, "rewards/margins": 0.173828125, "rewards/rejected": -1.46875, "step": 1212 }, { "epoch": 2.5389848246991105, "grad_norm": 11.257405281066895, "learning_rate": 3.0164305760026364e-08, "logits/chosen": 1.1640625, "logits/rejected": 1.828125, "logps/chosen": -340.0, "logps/rejected": -292.0, "loss": 0.5656, "rewards/accuracies": 0.25, "rewards/chosen": -1.625, "rewards/margins": 0.13671875, "rewards/rejected": -1.765625, "step": 1213 }, { "epoch": 2.541077969649398, "grad_norm": 10.610968589782715, "learning_rate": 2.9894282465928896e-08, "logits/chosen": 0.87890625, "logits/rejected": 0.88671875, "logps/chosen": -230.0, "logps/rejected": -290.0, "loss": 0.5164, "rewards/accuracies": 1.0, "rewards/chosen": -1.4296875, "rewards/margins": 0.359375, "rewards/rejected": -1.7890625, "step": 1214 }, { "epoch": 2.5431711145996863, "grad_norm": 12.95569133758545, "learning_rate": 2.9625398353138885e-08, "logits/chosen": 1.5703125, "logits/rejected": 2.078125, "logps/chosen": -396.0, "logps/rejected": -386.0, "loss": 0.5799, "rewards/accuracies": 1.0, "rewards/chosen": -1.1484375, "rewards/margins": 0.8046875, "rewards/rejected": -1.953125, "step": 1215 }, { "epoch": 2.545264259549974, "grad_norm": 12.78900146484375, "learning_rate": 2.9357654813137606e-08, "logits/chosen": 1.3359375, "logits/rejected": 1.5, "logps/chosen": -242.0, "logps/rejected": -292.0, "loss": 0.6043, "rewards/accuracies": 1.0, "rewards/chosen": -1.1875, "rewards/margins": 0.369140625, "rewards/rejected": -1.5625, "step": 1216 }, { "epoch": 2.5473574045002616, "grad_norm": 11.339468002319336, "learning_rate": 2.9091053231503798e-08, "logits/chosen": 2.40625, "logits/rejected": 2.03125, "logps/chosen": -446.0, "logps/rejected": -636.0, "loss": 0.5539, "rewards/accuracies": 1.0, "rewards/chosen": -1.0078125, "rewards/margins": 1.125, "rewards/rejected": -2.125, "step": 1217 }, { "epoch": 2.5494505494505493, "grad_norm": 12.718293190002441, "learning_rate": 2.882559498790651e-08, "logits/chosen": 2.03125, "logits/rejected": 1.3359375, "logps/chosen": -560.0, "logps/rejected": -684.0, "loss": 0.6018, "rewards/accuracies": 1.0, "rewards/chosen": -1.25, "rewards/margins": 1.0234375, "rewards/rejected": -2.265625, "step": 1218 }, { "epoch": 2.5515436944008374, "grad_norm": 11.234786987304688, "learning_rate": 2.856128145609793e-08, "logits/chosen": 2.40625, "logits/rejected": 2.703125, "logps/chosen": -600.0, "logps/rejected": -444.0, "loss": 0.6019, "rewards/accuracies": 0.75, "rewards/chosen": -1.4453125, "rewards/margins": 0.0361328125, "rewards/rejected": -1.484375, "step": 1219 }, { "epoch": 2.553636839351125, "grad_norm": 12.245379447937012, "learning_rate": 2.8298114003906423e-08, "logits/chosen": 1.3515625, "logits/rejected": 1.71875, "logps/chosen": -442.0, "logps/rejected": -528.0, "loss": 0.5604, "rewards/accuracies": 0.75, "rewards/chosen": -1.4296875, "rewards/margins": 0.796875, "rewards/rejected": -2.234375, "step": 1220 }, { "epoch": 2.5557299843014127, "grad_norm": 11.113319396972656, "learning_rate": 2.8036093993229405e-08, "logits/chosen": 2.71875, "logits/rejected": 3.15625, "logps/chosen": -612.0, "logps/rejected": -524.0, "loss": 0.5613, "rewards/accuracies": 0.5, "rewards/chosen": -1.9921875, "rewards/margins": -0.15234375, "rewards/rejected": -1.84375, "step": 1221 }, { "epoch": 2.557823129251701, "grad_norm": 12.352375030517578, "learning_rate": 2.777522278002615e-08, "logits/chosen": 2.421875, "logits/rejected": 2.421875, "logps/chosen": -480.0, "logps/rejected": -426.0, "loss": 0.5971, "rewards/accuracies": 0.25, "rewards/chosen": -1.53125, "rewards/margins": -0.00390625, "rewards/rejected": -1.5234375, "step": 1222 }, { "epoch": 2.5599162742019885, "grad_norm": 11.425958633422852, "learning_rate": 2.7515501714310855e-08, "logits/chosen": 1.6015625, "logits/rejected": 1.28125, "logps/chosen": -208.0, "logps/rejected": -354.0, "loss": 0.5876, "rewards/accuracies": 1.0, "rewards/chosen": -1.3359375, "rewards/margins": 0.546875, "rewards/rejected": -1.8828125, "step": 1223 }, { "epoch": 2.562009419152276, "grad_norm": 11.302983283996582, "learning_rate": 2.7256932140145904e-08, "logits/chosen": 1.875, "logits/rejected": 2.625, "logps/chosen": -600.0, "logps/rejected": -352.0, "loss": 0.5834, "rewards/accuracies": 0.5, "rewards/chosen": -1.6875, "rewards/margins": 0.5078125, "rewards/rejected": -2.203125, "step": 1224 }, { "epoch": 2.564102564102564, "grad_norm": 13.678489685058594, "learning_rate": 2.6999515395634473e-08, "logits/chosen": 2.375, "logits/rejected": 3.203125, "logps/chosen": -648.0, "logps/rejected": -692.0, "loss": 0.6591, "rewards/accuracies": 0.75, "rewards/chosen": -1.2109375, "rewards/margins": 0.48828125, "rewards/rejected": -1.703125, "step": 1225 }, { "epoch": 2.566195709052852, "grad_norm": 11.15230655670166, "learning_rate": 2.6743252812913822e-08, "logits/chosen": 2.46875, "logits/rejected": 2.734375, "logps/chosen": -508.0, "logps/rejected": -432.0, "loss": 0.5963, "rewards/accuracies": 0.5, "rewards/chosen": -1.3984375, "rewards/margins": -0.1875, "rewards/rejected": -1.2109375, "step": 1226 }, { "epoch": 2.5682888540031397, "grad_norm": 12.096667289733887, "learning_rate": 2.6488145718148505e-08, "logits/chosen": 1.703125, "logits/rejected": 2.15625, "logps/chosen": -454.0, "logps/rejected": -422.0, "loss": 0.597, "rewards/accuracies": 1.0, "rewards/chosen": -1.1015625, "rewards/margins": 0.75, "rewards/rejected": -1.8515625, "step": 1227 }, { "epoch": 2.570381998953428, "grad_norm": 11.339020729064941, "learning_rate": 2.623419543152337e-08, "logits/chosen": 1.9375, "logits/rejected": 2.25, "logps/chosen": -560.0, "logps/rejected": -540.0, "loss": 0.5966, "rewards/accuracies": 0.75, "rewards/chosen": -1.3984375, "rewards/margins": 0.314453125, "rewards/rejected": -1.71875, "step": 1228 }, { "epoch": 2.5724751439037155, "grad_norm": 11.454265594482422, "learning_rate": 2.5981403267236717e-08, "logits/chosen": 1.171875, "logits/rejected": 0.69140625, "logps/chosen": -238.0, "logps/rejected": -348.0, "loss": 0.56, "rewards/accuracies": 0.5, "rewards/chosen": -1.234375, "rewards/margins": 0.380859375, "rewards/rejected": -1.609375, "step": 1229 }, { "epoch": 2.574568288854003, "grad_norm": 12.722302436828613, "learning_rate": 2.572977053349346e-08, "logits/chosen": 2.3125, "logits/rejected": 2.53125, "logps/chosen": -454.0, "logps/rejected": -368.0, "loss": 0.5985, "rewards/accuracies": 0.5, "rewards/chosen": -1.234375, "rewards/margins": 0.22265625, "rewards/rejected": -1.4609375, "step": 1230 }, { "epoch": 2.576661433804291, "grad_norm": 11.2119722366333, "learning_rate": 2.5479298532498732e-08, "logits/chosen": 1.1328125, "logits/rejected": 1.5625, "logps/chosen": -412.0, "logps/rejected": -418.0, "loss": 0.5958, "rewards/accuracies": 0.75, "rewards/chosen": -1.4765625, "rewards/margins": -0.0478515625, "rewards/rejected": -1.4296875, "step": 1231 }, { "epoch": 2.578754578754579, "grad_norm": 10.447643280029297, "learning_rate": 2.5229988560450544e-08, "logits/chosen": 1.0390625, "logits/rejected": 0.65234375, "logps/chosen": -294.0, "logps/rejected": -502.0, "loss": 0.5653, "rewards/accuracies": 1.0, "rewards/chosen": -1.328125, "rewards/margins": 0.94921875, "rewards/rejected": -2.28125, "step": 1232 }, { "epoch": 2.5808477237048666, "grad_norm": 13.349222183227539, "learning_rate": 2.498184190753343e-08, "logits/chosen": 1.2421875, "logits/rejected": 1.1640625, "logps/chosen": -394.0, "logps/rejected": -440.0, "loss": 0.6531, "rewards/accuracies": 0.5, "rewards/chosen": -1.2265625, "rewards/margins": 0.068359375, "rewards/rejected": -1.296875, "step": 1233 }, { "epoch": 2.5829408686551543, "grad_norm": 11.777934074401855, "learning_rate": 2.4734859857911862e-08, "logits/chosen": 2.046875, "logits/rejected": 2.125, "logps/chosen": -700.0, "logps/rejected": -632.0, "loss": 0.6121, "rewards/accuracies": 0.5, "rewards/chosen": -1.7421875, "rewards/margins": 0.48046875, "rewards/rejected": -2.21875, "step": 1234 }, { "epoch": 2.5850340136054424, "grad_norm": 12.234633445739746, "learning_rate": 2.4489043689723397e-08, "logits/chosen": 1.8515625, "logits/rejected": 2.578125, "logps/chosen": -400.0, "logps/rejected": -278.0, "loss": 0.6035, "rewards/accuracies": 0.5, "rewards/chosen": -1.578125, "rewards/margins": -0.0146484375, "rewards/rejected": -1.5625, "step": 1235 }, { "epoch": 2.58712715855573, "grad_norm": 11.81685733795166, "learning_rate": 2.4244394675072046e-08, "logits/chosen": 1.9921875, "logits/rejected": 2.203125, "logps/chosen": -470.0, "logps/rejected": -456.0, "loss": 0.6209, "rewards/accuracies": 0.5, "rewards/chosen": -1.765625, "rewards/margins": -0.1513671875, "rewards/rejected": -1.6171875, "step": 1236 }, { "epoch": 2.5892203035060177, "grad_norm": 11.48193359375, "learning_rate": 2.400091408002187e-08, "logits/chosen": 2.078125, "logits/rejected": 1.703125, "logps/chosen": -458.0, "logps/rejected": -668.0, "loss": 0.5771, "rewards/accuracies": 0.5, "rewards/chosen": -1.6796875, "rewards/margins": 0.6328125, "rewards/rejected": -2.3125, "step": 1237 }, { "epoch": 2.5913134484563054, "grad_norm": 10.787525177001953, "learning_rate": 2.3758603164590344e-08, "logits/chosen": 2.078125, "logits/rejected": 2.640625, "logps/chosen": -772.0, "logps/rejected": -812.0, "loss": 0.5692, "rewards/accuracies": 0.75, "rewards/chosen": -0.94921875, "rewards/margins": 0.6796875, "rewards/rejected": -1.625, "step": 1238 }, { "epoch": 2.5934065934065935, "grad_norm": 11.714813232421875, "learning_rate": 2.3517463182741777e-08, "logits/chosen": 1.46875, "logits/rejected": 1.7109375, "logps/chosen": -346.0, "logps/rejected": -388.0, "loss": 0.5601, "rewards/accuracies": 0.75, "rewards/chosen": -1.5859375, "rewards/margins": 0.2138671875, "rewards/rejected": -1.796875, "step": 1239 }, { "epoch": 2.595499738356881, "grad_norm": 12.473237037658691, "learning_rate": 2.3277495382380804e-08, "logits/chosen": 3.3125, "logits/rejected": 3.03125, "logps/chosen": -632.0, "logps/rejected": -552.0, "loss": 0.6142, "rewards/accuracies": 0.5, "rewards/chosen": -2.015625, "rewards/margins": -0.048828125, "rewards/rejected": -1.96875, "step": 1240 }, { "epoch": 2.597592883307169, "grad_norm": 12.750757217407227, "learning_rate": 2.3038701005346117e-08, "logits/chosen": 2.171875, "logits/rejected": 1.8046875, "logps/chosen": -432.0, "logps/rejected": -432.0, "loss": 0.6186, "rewards/accuracies": 1.0, "rewards/chosen": -1.34375, "rewards/margins": 0.427734375, "rewards/rejected": -1.765625, "step": 1241 }, { "epoch": 2.599686028257457, "grad_norm": 11.450703620910645, "learning_rate": 2.2801081287403963e-08, "logits/chosen": 2.734375, "logits/rejected": 2.53125, "logps/chosen": -296.0, "logps/rejected": -472.0, "loss": 0.5627, "rewards/accuracies": 0.75, "rewards/chosen": -1.21875, "rewards/margins": 0.40234375, "rewards/rejected": -1.625, "step": 1242 }, { "epoch": 2.6017791732077447, "grad_norm": 12.390066146850586, "learning_rate": 2.2564637458241473e-08, "logits/chosen": 1.859375, "logits/rejected": 2.890625, "logps/chosen": -588.0, "logps/rejected": -466.0, "loss": 0.6006, "rewards/accuracies": 0.75, "rewards/chosen": -1.203125, "rewards/margins": 0.50390625, "rewards/rejected": -1.7109375, "step": 1243 }, { "epoch": 2.6038723181580323, "grad_norm": 10.352241516113281, "learning_rate": 2.2329370741460762e-08, "logits/chosen": 0.734375, "logits/rejected": 0.84375, "logps/chosen": -200.0, "logps/rejected": -292.0, "loss": 0.5558, "rewards/accuracies": 0.75, "rewards/chosen": -1.1953125, "rewards/margins": 0.5703125, "rewards/rejected": -1.765625, "step": 1244 }, { "epoch": 2.60596546310832, "grad_norm": 12.06119155883789, "learning_rate": 2.2095282354572198e-08, "logits/chosen": 2.75, "logits/rejected": 2.734375, "logps/chosen": -548.0, "logps/rejected": -506.0, "loss": 0.6082, "rewards/accuracies": 0.75, "rewards/chosen": -1.609375, "rewards/margins": 0.330078125, "rewards/rejected": -1.9453125, "step": 1245 }, { "epoch": 2.608058608058608, "grad_norm": 11.703369140625, "learning_rate": 2.1862373508988392e-08, "logits/chosen": 1.6328125, "logits/rejected": 1.953125, "logps/chosen": -440.0, "logps/rejected": -384.0, "loss": 0.5871, "rewards/accuracies": 1.0, "rewards/chosen": -1.7109375, "rewards/margins": 0.4921875, "rewards/rejected": -2.203125, "step": 1246 }, { "epoch": 2.610151753008896, "grad_norm": 11.315919876098633, "learning_rate": 2.1630645410017693e-08, "logits/chosen": 2.125, "logits/rejected": 2.9375, "logps/chosen": -676.0, "logps/rejected": -334.0, "loss": 0.5555, "rewards/accuracies": 0.75, "rewards/chosen": -1.09375, "rewards/margins": 0.158203125, "rewards/rejected": -1.25, "step": 1247 }, { "epoch": 2.612244897959184, "grad_norm": 12.547874450683594, "learning_rate": 2.140009925685815e-08, "logits/chosen": 1.5234375, "logits/rejected": 2.09375, "logps/chosen": -756.0, "logps/rejected": -506.0, "loss": 0.6101, "rewards/accuracies": 0.75, "rewards/chosen": -1.09375, "rewards/margins": 0.43359375, "rewards/rejected": -1.5234375, "step": 1248 }, { "epoch": 2.6143380429094716, "grad_norm": 10.966303825378418, "learning_rate": 2.1170736242591206e-08, "logits/chosen": 2.1875, "logits/rejected": 1.9375, "logps/chosen": -540.0, "logps/rejected": -720.0, "loss": 0.5907, "rewards/accuracies": 0.25, "rewards/chosen": -1.5625, "rewards/margins": 0.26171875, "rewards/rejected": -1.8203125, "step": 1249 }, { "epoch": 2.6164311878597593, "grad_norm": 12.1686429977417, "learning_rate": 2.0942557554175444e-08, "logits/chosen": 2.71875, "logits/rejected": 3.09375, "logps/chosen": -588.0, "logps/rejected": -608.0, "loss": 0.5666, "rewards/accuracies": 0.75, "rewards/chosen": -1.3359375, "rewards/margins": 0.6796875, "rewards/rejected": -2.015625, "step": 1250 }, { "epoch": 2.618524332810047, "grad_norm": 10.310696601867676, "learning_rate": 2.0715564372440647e-08, "logits/chosen": 1.0390625, "logits/rejected": 0.66015625, "logps/chosen": -249.0, "logps/rejected": -456.0, "loss": 0.5393, "rewards/accuracies": 0.75, "rewards/chosen": -1.53125, "rewards/margins": 1.0390625, "rewards/rejected": -2.5625, "step": 1251 }, { "epoch": 2.620617477760335, "grad_norm": 11.116397857666016, "learning_rate": 2.0489757872081454e-08, "logits/chosen": 2.1875, "logits/rejected": 2.765625, "logps/chosen": -668.0, "logps/rejected": -528.0, "loss": 0.581, "rewards/accuracies": 0.5, "rewards/chosen": -1.203125, "rewards/margins": 0.5078125, "rewards/rejected": -1.7109375, "step": 1252 }, { "epoch": 2.6227106227106227, "grad_norm": 12.374103546142578, "learning_rate": 2.026513922165159e-08, "logits/chosen": 0.67578125, "logits/rejected": 0.76953125, "logps/chosen": -306.0, "logps/rejected": -352.0, "loss": 0.5843, "rewards/accuracies": 1.0, "rewards/chosen": -1.2890625, "rewards/margins": 0.375, "rewards/rejected": -1.6640625, "step": 1253 }, { "epoch": 2.6248037676609104, "grad_norm": 12.416691780090332, "learning_rate": 2.0041709583557405e-08, "logits/chosen": 2.65625, "logits/rejected": 2.34375, "logps/chosen": -576.0, "logps/rejected": -612.0, "loss": 0.6033, "rewards/accuracies": 0.75, "rewards/chosen": -1.5078125, "rewards/margins": 0.353515625, "rewards/rejected": -1.859375, "step": 1254 }, { "epoch": 2.6268969126111985, "grad_norm": 13.592479705810547, "learning_rate": 1.981947011405226e-08, "logits/chosen": 0.6484375, "logits/rejected": 0.91015625, "logps/chosen": -290.0, "logps/rejected": -298.0, "loss": 0.6248, "rewards/accuracies": 1.0, "rewards/chosen": -1.1953125, "rewards/margins": 0.494140625, "rewards/rejected": -1.6875, "step": 1255 }, { "epoch": 2.628990057561486, "grad_norm": 11.766193389892578, "learning_rate": 1.9598421963230253e-08, "logits/chosen": 1.96875, "logits/rejected": 1.5859375, "logps/chosen": -478.0, "logps/rejected": -500.0, "loss": 0.6101, "rewards/accuracies": 0.75, "rewards/chosen": -1.4296875, "rewards/margins": 0.4296875, "rewards/rejected": -1.859375, "step": 1256 }, { "epoch": 2.631083202511774, "grad_norm": 13.772175788879395, "learning_rate": 1.9378566275020433e-08, "logits/chosen": 1.421875, "logits/rejected": 1.625, "logps/chosen": -414.0, "logps/rejected": -324.0, "loss": 0.6685, "rewards/accuracies": 0.5, "rewards/chosen": -1.09375, "rewards/margins": 0.28125, "rewards/rejected": -1.375, "step": 1257 }, { "epoch": 2.6331763474620615, "grad_norm": 12.496501922607422, "learning_rate": 1.915990418718091e-08, "logits/chosen": 1.1484375, "logits/rejected": 1.8046875, "logps/chosen": -434.0, "logps/rejected": -350.0, "loss": 0.5714, "rewards/accuracies": 0.75, "rewards/chosen": -1.234375, "rewards/margins": 0.84765625, "rewards/rejected": -2.078125, "step": 1258 }, { "epoch": 2.6352694924123496, "grad_norm": 12.116174697875977, "learning_rate": 1.8942436831292678e-08, "logits/chosen": 2.078125, "logits/rejected": 2.265625, "logps/chosen": -560.0, "logps/rejected": -462.0, "loss": 0.6301, "rewards/accuracies": 0.75, "rewards/chosen": -1.6484375, "rewards/margins": 0.15625, "rewards/rejected": -1.8046875, "step": 1259 }, { "epoch": 2.6373626373626373, "grad_norm": 12.080883979797363, "learning_rate": 1.87261653327542e-08, "logits/chosen": 2.0, "logits/rejected": 1.828125, "logps/chosen": -476.0, "logps/rejected": -560.0, "loss": 0.6325, "rewards/accuracies": 0.5, "rewards/chosen": -1.3515625, "rewards/margins": 0.349609375, "rewards/rejected": -1.703125, "step": 1260 }, { "epoch": 2.6394557823129254, "grad_norm": 10.854859352111816, "learning_rate": 1.8511090810775125e-08, "logits/chosen": 1.71875, "logits/rejected": 2.359375, "logps/chosen": -430.0, "logps/rejected": -272.0, "loss": 0.5945, "rewards/accuracies": 0.5, "rewards/chosen": -1.703125, "rewards/margins": -0.373046875, "rewards/rejected": -1.328125, "step": 1261 }, { "epoch": 2.641548927263213, "grad_norm": 15.076943397521973, "learning_rate": 1.829721437837095e-08, "logits/chosen": 2.140625, "logits/rejected": 1.96875, "logps/chosen": -688.0, "logps/rejected": -506.0, "loss": 0.5931, "rewards/accuracies": 0.75, "rewards/chosen": -1.0078125, "rewards/margins": 0.375, "rewards/rejected": -1.3828125, "step": 1262 }, { "epoch": 2.643642072213501, "grad_norm": 11.498101234436035, "learning_rate": 1.8084537142356815e-08, "logits/chosen": 2.15625, "logits/rejected": 2.265625, "logps/chosen": -414.0, "logps/rejected": -450.0, "loss": 0.5866, "rewards/accuracies": 0.75, "rewards/chosen": -1.5546875, "rewards/margins": 0.345703125, "rewards/rejected": -1.8984375, "step": 1263 }, { "epoch": 2.6457352171637885, "grad_norm": 12.064689636230469, "learning_rate": 1.787306020334216e-08, "logits/chosen": 1.796875, "logits/rejected": 1.9609375, "logps/chosen": -548.0, "logps/rejected": -468.0, "loss": 0.5869, "rewards/accuracies": 0.75, "rewards/chosen": -0.8515625, "rewards/margins": 0.7734375, "rewards/rejected": -1.625, "step": 1264 }, { "epoch": 2.647828362114076, "grad_norm": 11.75654411315918, "learning_rate": 1.7662784655724857e-08, "logits/chosen": 1.3828125, "logits/rejected": 2.71875, "logps/chosen": -584.0, "logps/rejected": -400.0, "loss": 0.583, "rewards/accuracies": 0.5, "rewards/chosen": -1.59375, "rewards/margins": 0.08203125, "rewards/rejected": -1.671875, "step": 1265 }, { "epoch": 2.6499215070643642, "grad_norm": 11.494534492492676, "learning_rate": 1.745371158768539e-08, "logits/chosen": 0.625, "logits/rejected": 0.74609375, "logps/chosen": -290.0, "logps/rejected": -344.0, "loss": 0.5834, "rewards/accuracies": 1.0, "rewards/chosen": -1.59375, "rewards/margins": 0.8359375, "rewards/rejected": -2.4375, "step": 1266 }, { "epoch": 2.652014652014652, "grad_norm": 11.152542114257812, "learning_rate": 1.7245842081181468e-08, "logits/chosen": 1.703125, "logits/rejected": 1.828125, "logps/chosen": -748.0, "logps/rejected": -520.0, "loss": 0.6139, "rewards/accuracies": 0.75, "rewards/chosen": -1.3203125, "rewards/margins": 0.63671875, "rewards/rejected": -1.953125, "step": 1267 }, { "epoch": 2.65410779696494, "grad_norm": 11.165491104125977, "learning_rate": 1.7039177211942455e-08, "logits/chosen": 2.421875, "logits/rejected": 2.75, "logps/chosen": -620.0, "logps/rejected": -490.0, "loss": 0.5875, "rewards/accuracies": 0.5, "rewards/chosen": -1.7109375, "rewards/margins": -0.408203125, "rewards/rejected": -1.3046875, "step": 1268 }, { "epoch": 2.6562009419152277, "grad_norm": 12.534558296203613, "learning_rate": 1.6833718049463567e-08, "logits/chosen": 2.046875, "logits/rejected": 2.9375, "logps/chosen": -560.0, "logps/rejected": -344.0, "loss": 0.5836, "rewards/accuracies": 0.75, "rewards/chosen": -1.3203125, "rewards/margins": 0.279296875, "rewards/rejected": -1.59375, "step": 1269 }, { "epoch": 2.6582940868655154, "grad_norm": 11.990873336791992, "learning_rate": 1.6629465657000433e-08, "logits/chosen": 1.3515625, "logits/rejected": 1.5234375, "logps/chosen": -402.0, "logps/rejected": -420.0, "loss": 0.572, "rewards/accuracies": 1.0, "rewards/chosen": -1.28125, "rewards/margins": 0.5, "rewards/rejected": -1.78125, "step": 1270 }, { "epoch": 2.660387231815803, "grad_norm": 12.490434646606445, "learning_rate": 1.6426421091563755e-08, "logits/chosen": 2.1875, "logits/rejected": 1.9765625, "logps/chosen": -466.0, "logps/rejected": -492.0, "loss": 0.5329, "rewards/accuracies": 0.75, "rewards/chosen": -1.4921875, "rewards/margins": 0.166015625, "rewards/rejected": -1.65625, "step": 1271 }, { "epoch": 2.662480376766091, "grad_norm": 11.288036346435547, "learning_rate": 1.6224585403913625e-08, "logits/chosen": 3.125, "logits/rejected": 3.15625, "logps/chosen": -736.0, "logps/rejected": -660.0, "loss": 0.5632, "rewards/accuracies": 1.0, "rewards/chosen": -1.1484375, "rewards/margins": 0.78125, "rewards/rejected": -1.9296875, "step": 1272 }, { "epoch": 2.664573521716379, "grad_norm": 11.47856616973877, "learning_rate": 1.6023959638554143e-08, "logits/chosen": 1.171875, "logits/rejected": 1.5546875, "logps/chosen": -540.0, "logps/rejected": -528.0, "loss": 0.5548, "rewards/accuracies": 0.75, "rewards/chosen": -1.09375, "rewards/margins": 0.40625, "rewards/rejected": -1.5, "step": 1273 }, { "epoch": 2.6666666666666665, "grad_norm": 11.616024017333984, "learning_rate": 1.5824544833728e-08, "logits/chosen": 1.421875, "logits/rejected": 2.53125, "logps/chosen": -644.0, "logps/rejected": -612.0, "loss": 0.5916, "rewards/accuracies": 0.5, "rewards/chosen": -1.953125, "rewards/margins": 0.083984375, "rewards/rejected": -2.03125, "step": 1274 }, { "epoch": 2.6687598116169546, "grad_norm": 11.60714340209961, "learning_rate": 1.5626342021411292e-08, "logits/chosen": 2.75, "logits/rejected": 3.015625, "logps/chosen": -680.0, "logps/rejected": -528.0, "loss": 0.5459, "rewards/accuracies": 1.0, "rewards/chosen": -1.125, "rewards/margins": 0.77734375, "rewards/rejected": -1.90625, "step": 1275 }, { "epoch": 2.6708529565672423, "grad_norm": 11.576690673828125, "learning_rate": 1.542935222730791e-08, "logits/chosen": 2.328125, "logits/rejected": 2.65625, "logps/chosen": -600.0, "logps/rejected": -572.0, "loss": 0.5515, "rewards/accuracies": 0.75, "rewards/chosen": -1.078125, "rewards/margins": 0.8203125, "rewards/rejected": -1.8984375, "step": 1276 }, { "epoch": 2.67294610151753, "grad_norm": 10.313326835632324, "learning_rate": 1.5233576470844337e-08, "logits/chosen": 2.53125, "logits/rejected": 1.9375, "logps/chosen": -446.0, "logps/rejected": -512.0, "loss": 0.5445, "rewards/accuracies": 0.75, "rewards/chosen": -1.515625, "rewards/margins": 0.4921875, "rewards/rejected": -2.0, "step": 1277 }, { "epoch": 2.6750392464678177, "grad_norm": 11.188737869262695, "learning_rate": 1.5039015765164458e-08, "logits/chosen": 1.734375, "logits/rejected": 2.078125, "logps/chosen": -760.0, "logps/rejected": -402.0, "loss": 0.538, "rewards/accuracies": 0.75, "rewards/chosen": -1.0, "rewards/margins": 0.2109375, "rewards/rejected": -1.2109375, "step": 1278 }, { "epoch": 2.6771323914181058, "grad_norm": 12.750375747680664, "learning_rate": 1.4845671117124229e-08, "logits/chosen": 1.78125, "logits/rejected": 1.8125, "logps/chosen": -406.0, "logps/rejected": -384.0, "loss": 0.5843, "rewards/accuracies": 0.75, "rewards/chosen": -1.3828125, "rewards/margins": 0.6015625, "rewards/rejected": -1.984375, "step": 1279 }, { "epoch": 2.6792255363683934, "grad_norm": 11.617804527282715, "learning_rate": 1.4653543527286419e-08, "logits/chosen": 1.078125, "logits/rejected": 1.953125, "logps/chosen": -420.0, "logps/rejected": -360.0, "loss": 0.5954, "rewards/accuracies": 0.5, "rewards/chosen": -2.15625, "rewards/margins": -0.17578125, "rewards/rejected": -1.984375, "step": 1280 }, { "epoch": 2.6813186813186816, "grad_norm": 12.743657112121582, "learning_rate": 1.4462633989915488e-08, "logits/chosen": 2.5, "logits/rejected": 3.34375, "logps/chosen": -952.0, "logps/rejected": -600.0, "loss": 0.6118, "rewards/accuracies": 1.0, "rewards/chosen": -1.4921875, "rewards/margins": 0.447265625, "rewards/rejected": -1.9375, "step": 1281 }, { "epoch": 2.6834118262689692, "grad_norm": 11.60865306854248, "learning_rate": 1.4272943492972566e-08, "logits/chosen": 1.4140625, "logits/rejected": 1.84375, "logps/chosen": -560.0, "logps/rejected": -552.0, "loss": 0.5925, "rewards/accuracies": 0.75, "rewards/chosen": -1.53125, "rewards/margins": 0.80859375, "rewards/rejected": -2.34375, "step": 1282 }, { "epoch": 2.685504971219257, "grad_norm": 13.835541725158691, "learning_rate": 1.4084473018110164e-08, "logits/chosen": 1.7421875, "logits/rejected": 2.28125, "logps/chosen": -398.0, "logps/rejected": -394.0, "loss": 0.585, "rewards/accuracies": 0.75, "rewards/chosen": -1.5703125, "rewards/margins": 0.1064453125, "rewards/rejected": -1.671875, "step": 1283 }, { "epoch": 2.6875981161695446, "grad_norm": 12.084956169128418, "learning_rate": 1.3897223540667076e-08, "logits/chosen": 2.765625, "logits/rejected": 3.09375, "logps/chosen": -588.0, "logps/rejected": -580.0, "loss": 0.5902, "rewards/accuracies": 0.75, "rewards/chosen": -1.3359375, "rewards/margins": 0.5, "rewards/rejected": -1.8359375, "step": 1284 }, { "epoch": 2.6896912611198327, "grad_norm": 10.881200790405273, "learning_rate": 1.3711196029663487e-08, "logits/chosen": 1.9296875, "logits/rejected": 1.984375, "logps/chosen": -648.0, "logps/rejected": -414.0, "loss": 0.5565, "rewards/accuracies": 0.5, "rewards/chosen": -1.78125, "rewards/margins": 0.1796875, "rewards/rejected": -1.9609375, "step": 1285 }, { "epoch": 2.6917844060701204, "grad_norm": 11.821480751037598, "learning_rate": 1.3526391447795904e-08, "logits/chosen": 1.640625, "logits/rejected": 2.015625, "logps/chosen": -324.0, "logps/rejected": -438.0, "loss": 0.5904, "rewards/accuracies": 0.75, "rewards/chosen": -1.328125, "rewards/margins": 0.166015625, "rewards/rejected": -1.4921875, "step": 1286 }, { "epoch": 2.693877551020408, "grad_norm": 10.388792037963867, "learning_rate": 1.3342810751432064e-08, "logits/chosen": 2.203125, "logits/rejected": 1.0859375, "logps/chosen": -326.0, "logps/rejected": -536.0, "loss": 0.5561, "rewards/accuracies": 0.5, "rewards/chosen": -1.515625, "rewards/margins": 0.34375, "rewards/rejected": -1.859375, "step": 1287 }, { "epoch": 2.695970695970696, "grad_norm": 12.938568115234375, "learning_rate": 1.3160454890606067e-08, "logits/chosen": 1.546875, "logits/rejected": 1.609375, "logps/chosen": -284.0, "logps/rejected": -272.0, "loss": 0.5617, "rewards/accuracies": 1.0, "rewards/chosen": -1.4765625, "rewards/margins": 0.265625, "rewards/rejected": -1.7421875, "step": 1288 }, { "epoch": 2.698063840920984, "grad_norm": 11.72751522064209, "learning_rate": 1.2979324809013578e-08, "logits/chosen": 1.078125, "logits/rejected": 1.3359375, "logps/chosen": -255.0, "logps/rejected": -250.0, "loss": 0.5794, "rewards/accuracies": 0.75, "rewards/chosen": -1.2578125, "rewards/margins": 0.21484375, "rewards/rejected": -1.4765625, "step": 1289 }, { "epoch": 2.7001569858712715, "grad_norm": 10.565709114074707, "learning_rate": 1.2799421444006754e-08, "logits/chosen": 2.203125, "logits/rejected": 2.453125, "logps/chosen": -580.0, "logps/rejected": -600.0, "loss": 0.5436, "rewards/accuracies": 1.0, "rewards/chosen": -1.1640625, "rewards/margins": 1.1875, "rewards/rejected": -2.34375, "step": 1290 }, { "epoch": 2.702250130821559, "grad_norm": 11.568882942199707, "learning_rate": 1.2620745726589409e-08, "logits/chosen": 1.3125, "logits/rejected": 1.96875, "logps/chosen": -440.0, "logps/rejected": -430.0, "loss": 0.5578, "rewards/accuracies": 0.75, "rewards/chosen": -1.203125, "rewards/margins": 0.5390625, "rewards/rejected": -1.7421875, "step": 1291 }, { "epoch": 2.7043432757718473, "grad_norm": 11.32339096069336, "learning_rate": 1.2443298581412347e-08, "logits/chosen": 1.421875, "logits/rejected": 2.28125, "logps/chosen": -502.0, "logps/rejected": -372.0, "loss": 0.5872, "rewards/accuracies": 0.25, "rewards/chosen": -1.7265625, "rewards/margins": -0.0576171875, "rewards/rejected": -1.6640625, "step": 1292 }, { "epoch": 2.706436420722135, "grad_norm": 11.9345703125, "learning_rate": 1.2267080926768485e-08, "logits/chosen": 1.46875, "logits/rejected": 1.5859375, "logps/chosen": -506.0, "logps/rejected": -372.0, "loss": 0.5798, "rewards/accuracies": 0.25, "rewards/chosen": -1.640625, "rewards/margins": -0.33984375, "rewards/rejected": -1.296875, "step": 1293 }, { "epoch": 2.708529565672423, "grad_norm": 11.131210327148438, "learning_rate": 1.2092093674588059e-08, "logits/chosen": 1.4921875, "logits/rejected": 1.7734375, "logps/chosen": -412.0, "logps/rejected": -468.0, "loss": 0.5934, "rewards/accuracies": 0.25, "rewards/chosen": -1.953125, "rewards/margins": -0.06640625, "rewards/rejected": -1.8828125, "step": 1294 }, { "epoch": 2.7106227106227108, "grad_norm": 11.777026176452637, "learning_rate": 1.1918337730433852e-08, "logits/chosen": 2.59375, "logits/rejected": 2.65625, "logps/chosen": -616.0, "logps/rejected": -468.0, "loss": 0.591, "rewards/accuracies": 0.75, "rewards/chosen": -1.65625, "rewards/margins": 0.5859375, "rewards/rejected": -2.25, "step": 1295 }, { "epoch": 2.7127158555729984, "grad_norm": 11.28355598449707, "learning_rate": 1.1745813993496789e-08, "logits/chosen": 1.390625, "logits/rejected": 1.4765625, "logps/chosen": -364.0, "logps/rejected": -536.0, "loss": 0.5969, "rewards/accuracies": 0.75, "rewards/chosen": -1.4765625, "rewards/margins": 0.4140625, "rewards/rejected": -1.890625, "step": 1296 }, { "epoch": 2.714809000523286, "grad_norm": 11.327817916870117, "learning_rate": 1.157452335659099e-08, "logits/chosen": 1.8984375, "logits/rejected": 2.421875, "logps/chosen": -426.0, "logps/rejected": -436.0, "loss": 0.5378, "rewards/accuracies": 1.0, "rewards/chosen": -1.2890625, "rewards/margins": 0.765625, "rewards/rejected": -2.046875, "step": 1297 }, { "epoch": 2.716902145473574, "grad_norm": 12.087903022766113, "learning_rate": 1.1404466706149248e-08, "logits/chosen": 2.5, "logits/rejected": 2.140625, "logps/chosen": -556.0, "logps/rejected": -680.0, "loss": 0.5745, "rewards/accuracies": 0.75, "rewards/chosen": -1.7109375, "rewards/margins": -0.037109375, "rewards/rejected": -1.671875, "step": 1298 }, { "epoch": 2.718995290423862, "grad_norm": 11.75920581817627, "learning_rate": 1.1235644922218483e-08, "logits/chosen": 1.7265625, "logits/rejected": 2.140625, "logps/chosen": -608.0, "logps/rejected": -688.0, "loss": 0.5614, "rewards/accuracies": 1.0, "rewards/chosen": -1.40625, "rewards/margins": 1.0234375, "rewards/rejected": -2.4375, "step": 1299 }, { "epoch": 2.7210884353741496, "grad_norm": 11.290567398071289, "learning_rate": 1.1068058878455178e-08, "logits/chosen": 1.2890625, "logits/rejected": 1.8515625, "logps/chosen": -362.0, "logps/rejected": -408.0, "loss": 0.5772, "rewards/accuracies": 1.0, "rewards/chosen": -0.9765625, "rewards/margins": 0.828125, "rewards/rejected": -1.8046875, "step": 1300 }, { "epoch": 2.7231815803244377, "grad_norm": 10.751604080200195, "learning_rate": 1.0901709442120792e-08, "logits/chosen": 3.0625, "logits/rejected": 2.65625, "logps/chosen": -688.0, "logps/rejected": -648.0, "loss": 0.5944, "rewards/accuracies": 0.25, "rewards/chosen": -1.453125, "rewards/margins": -0.1015625, "rewards/rejected": -1.3515625, "step": 1301 }, { "epoch": 2.7252747252747254, "grad_norm": 12.113463401794434, "learning_rate": 1.0736597474077234e-08, "logits/chosen": 1.96875, "logits/rejected": 2.046875, "logps/chosen": -422.0, "logps/rejected": -528.0, "loss": 0.515, "rewards/accuracies": 1.0, "rewards/chosen": -1.234375, "rewards/margins": 1.1328125, "rewards/rejected": -2.359375, "step": 1302 }, { "epoch": 2.727367870225013, "grad_norm": 11.601572036743164, "learning_rate": 1.0572723828782626e-08, "logits/chosen": 1.640625, "logits/rejected": 1.1875, "logps/chosen": -252.0, "logps/rejected": -272.0, "loss": 0.556, "rewards/accuracies": 0.25, "rewards/chosen": -1.5, "rewards/margins": 0.033203125, "rewards/rejected": -1.5390625, "step": 1303 }, { "epoch": 2.7294610151753007, "grad_norm": 11.546135902404785, "learning_rate": 1.0410089354286747e-08, "logits/chosen": 2.21875, "logits/rejected": 3.40625, "logps/chosen": -520.0, "logps/rejected": -510.0, "loss": 0.5902, "rewards/accuracies": 1.0, "rewards/chosen": -1.109375, "rewards/margins": 0.73828125, "rewards/rejected": -1.8515625, "step": 1304 }, { "epoch": 2.731554160125589, "grad_norm": 12.792366027832031, "learning_rate": 1.0248694892226478e-08, "logits/chosen": 1.828125, "logits/rejected": 1.734375, "logps/chosen": -748.0, "logps/rejected": -612.0, "loss": 0.6293, "rewards/accuracies": 0.25, "rewards/chosen": -1.375, "rewards/margins": -0.087890625, "rewards/rejected": -1.2890625, "step": 1305 }, { "epoch": 2.7336473050758765, "grad_norm": 10.632676124572754, "learning_rate": 1.0088541277821808e-08, "logits/chosen": 2.25, "logits/rejected": 2.359375, "logps/chosen": -536.0, "logps/rejected": -540.0, "loss": 0.5731, "rewards/accuracies": 0.5, "rewards/chosen": -1.9921875, "rewards/margins": 0.0078125, "rewards/rejected": -2.0, "step": 1306 }, { "epoch": 2.735740450026164, "grad_norm": 11.74323844909668, "learning_rate": 9.92962933987112e-09, "logits/chosen": 2.25, "logits/rejected": 2.5625, "logps/chosen": -556.0, "logps/rejected": -544.0, "loss": 0.583, "rewards/accuracies": 1.0, "rewards/chosen": -1.0, "rewards/margins": 0.462890625, "rewards/rejected": -1.4609375, "step": 1307 }, { "epoch": 2.7378335949764523, "grad_norm": 13.4630708694458, "learning_rate": 9.771959900747297e-09, "logits/chosen": 1.625, "logits/rejected": 2.015625, "logps/chosen": -580.0, "logps/rejected": -504.0, "loss": 0.5989, "rewards/accuracies": 0.25, "rewards/chosen": -1.453125, "rewards/margins": 0.150390625, "rewards/rejected": -1.6015625, "step": 1308 }, { "epoch": 2.73992673992674, "grad_norm": 11.922073364257812, "learning_rate": 9.615533776393041e-09, "logits/chosen": 1.46875, "logits/rejected": 2.234375, "logps/chosen": -548.0, "logps/rejected": -410.0, "loss": 0.5334, "rewards/accuracies": 0.5, "rewards/chosen": -1.9921875, "rewards/margins": 0.140625, "rewards/rejected": -2.125, "step": 1309 }, { "epoch": 2.7420198848770276, "grad_norm": 13.078544616699219, "learning_rate": 9.460351776317071e-09, "logits/chosen": 1.78125, "logits/rejected": 1.3984375, "logps/chosen": -312.0, "logps/rejected": -272.0, "loss": 0.5964, "rewards/accuracies": 0.5, "rewards/chosen": -1.453125, "rewards/margins": -0.0322265625, "rewards/rejected": -1.421875, "step": 1310 }, { "epoch": 2.7441130298273153, "grad_norm": 12.194268226623535, "learning_rate": 9.30641470358964e-09, "logits/chosen": 1.78125, "logits/rejected": 1.9296875, "logps/chosen": -532.0, "logps/rejected": -548.0, "loss": 0.6033, "rewards/accuracies": 0.25, "rewards/chosen": -1.578125, "rewards/margins": -0.15234375, "rewards/rejected": -1.4296875, "step": 1311 }, { "epoch": 2.7462061747776034, "grad_norm": 12.832474708557129, "learning_rate": 9.153723354838447e-09, "logits/chosen": 2.546875, "logits/rejected": 2.6875, "logps/chosen": -472.0, "logps/rejected": -536.0, "loss": 0.6222, "rewards/accuracies": 0.75, "rewards/chosen": -1.1328125, "rewards/margins": 0.625, "rewards/rejected": -1.75, "step": 1312 }, { "epoch": 2.748299319727891, "grad_norm": 11.957188606262207, "learning_rate": 9.00227852024463e-09, "logits/chosen": 2.3125, "logits/rejected": 2.671875, "logps/chosen": -512.0, "logps/rejected": -408.0, "loss": 0.5906, "rewards/accuracies": 0.75, "rewards/chosen": -1.3671875, "rewards/margins": 0.8203125, "rewards/rejected": -2.1875, "step": 1313 }, { "epoch": 2.750392464678179, "grad_norm": 11.670499801635742, "learning_rate": 8.852080983538517e-09, "logits/chosen": 1.7734375, "logits/rejected": 2.75, "logps/chosen": -632.0, "logps/rejected": -388.0, "loss": 0.5963, "rewards/accuracies": 1.0, "rewards/chosen": -1.0234375, "rewards/margins": 1.0, "rewards/rejected": -2.03125, "step": 1314 }, { "epoch": 2.752485609628467, "grad_norm": 11.205465316772461, "learning_rate": 8.703131521995693e-09, "logits/chosen": 2.421875, "logits/rejected": 2.5625, "logps/chosen": -848.0, "logps/rejected": -776.0, "loss": 0.6083, "rewards/accuracies": 0.75, "rewards/chosen": -1.1328125, "rewards/margins": 0.4140625, "rewards/rejected": -1.546875, "step": 1315 }, { "epoch": 2.7545787545787546, "grad_norm": 11.558897018432617, "learning_rate": 8.555430906432838e-09, "logits/chosen": 1.78125, "logits/rejected": 2.40625, "logps/chosen": -480.0, "logps/rejected": -404.0, "loss": 0.5502, "rewards/accuracies": 0.75, "rewards/chosen": -1.2578125, "rewards/margins": 0.44921875, "rewards/rejected": -1.7109375, "step": 1316 }, { "epoch": 2.7566718995290422, "grad_norm": 11.394280433654785, "learning_rate": 8.408979901203941e-09, "logits/chosen": 1.8046875, "logits/rejected": 2.125, "logps/chosen": -440.0, "logps/rejected": -496.0, "loss": 0.5655, "rewards/accuracies": 0.5, "rewards/chosen": -1.7109375, "rewards/margins": -0.091796875, "rewards/rejected": -1.6171875, "step": 1317 }, { "epoch": 2.7587650444793304, "grad_norm": 11.302227973937988, "learning_rate": 8.263779264196152e-09, "logits/chosen": 2.15625, "logits/rejected": 2.59375, "logps/chosen": -490.0, "logps/rejected": -394.0, "loss": 0.5616, "rewards/accuracies": 0.25, "rewards/chosen": -1.671875, "rewards/margins": 0.3671875, "rewards/rejected": -2.046875, "step": 1318 }, { "epoch": 2.760858189429618, "grad_norm": 10.928841590881348, "learning_rate": 8.119829746825964e-09, "logits/chosen": 1.640625, "logits/rejected": 2.15625, "logps/chosen": -424.0, "logps/rejected": -520.0, "loss": 0.577, "rewards/accuracies": 0.75, "rewards/chosen": -1.3984375, "rewards/margins": 0.79296875, "rewards/rejected": -2.1875, "step": 1319 }, { "epoch": 2.7629513343799057, "grad_norm": 11.287101745605469, "learning_rate": 7.977132094035315e-09, "logits/chosen": 1.6640625, "logits/rejected": 2.015625, "logps/chosen": -420.0, "logps/rejected": -430.0, "loss": 0.5749, "rewards/accuracies": 0.75, "rewards/chosen": -1.671875, "rewards/margins": 0.412109375, "rewards/rejected": -2.09375, "step": 1320 }, { "epoch": 2.765044479330194, "grad_norm": 13.27101993560791, "learning_rate": 7.835687044287696e-09, "logits/chosen": 1.2578125, "logits/rejected": 1.5859375, "logps/chosen": -380.0, "logps/rejected": -460.0, "loss": 0.5573, "rewards/accuracies": 1.0, "rewards/chosen": -1.109375, "rewards/margins": 0.6328125, "rewards/rejected": -1.7421875, "step": 1321 }, { "epoch": 2.7671376242804815, "grad_norm": 11.39008617401123, "learning_rate": 7.695495329564341e-09, "logits/chosen": 2.109375, "logits/rejected": 3.46875, "logps/chosen": -720.0, "logps/rejected": -366.0, "loss": 0.6035, "rewards/accuracies": 0.75, "rewards/chosen": -1.4453125, "rewards/margins": 0.1376953125, "rewards/rejected": -1.578125, "step": 1322 }, { "epoch": 2.769230769230769, "grad_norm": 12.085970878601074, "learning_rate": 7.556557675360443e-09, "logits/chosen": 1.8515625, "logits/rejected": 1.953125, "logps/chosen": -532.0, "logps/rejected": -296.0, "loss": 0.5797, "rewards/accuracies": 0.75, "rewards/chosen": -1.21875, "rewards/margins": 0.33984375, "rewards/rejected": -1.5625, "step": 1323 }, { "epoch": 2.771323914181057, "grad_norm": 10.792349815368652, "learning_rate": 7.418874800681472e-09, "logits/chosen": 1.1953125, "logits/rejected": 1.484375, "logps/chosen": -328.0, "logps/rejected": -246.0, "loss": 0.5747, "rewards/accuracies": 0.25, "rewards/chosen": -1.1328125, "rewards/margins": 0.0703125, "rewards/rejected": -1.203125, "step": 1324 }, { "epoch": 2.773417059131345, "grad_norm": 11.899036407470703, "learning_rate": 7.2824474180393035e-09, "logits/chosen": 1.6171875, "logits/rejected": 1.84375, "logps/chosen": -620.0, "logps/rejected": -298.0, "loss": 0.6034, "rewards/accuracies": 0.25, "rewards/chosen": -2.265625, "rewards/margins": -0.52734375, "rewards/rejected": -1.734375, "step": 1325 }, { "epoch": 2.7755102040816326, "grad_norm": 11.257723808288574, "learning_rate": 7.1472762334486005e-09, "logits/chosen": 0.6796875, "logits/rejected": 0.81640625, "logps/chosen": -218.0, "logps/rejected": -316.0, "loss": 0.5686, "rewards/accuracies": 1.0, "rewards/chosen": -1.109375, "rewards/margins": 0.609375, "rewards/rejected": -1.71875, "step": 1326 }, { "epoch": 2.7776033490319203, "grad_norm": 13.037230491638184, "learning_rate": 7.013361946423297e-09, "logits/chosen": 2.046875, "logits/rejected": 3.15625, "logps/chosen": -628.0, "logps/rejected": -510.0, "loss": 0.551, "rewards/accuracies": 0.75, "rewards/chosen": -1.4453125, "rewards/margins": 0.41015625, "rewards/rejected": -1.859375, "step": 1327 }, { "epoch": 2.7796964939822084, "grad_norm": 11.820975303649902, "learning_rate": 6.880705249972762e-09, "logits/chosen": 2.671875, "logits/rejected": 2.90625, "logps/chosen": -1168.0, "logps/rejected": -640.0, "loss": 0.5556, "rewards/accuracies": 0.75, "rewards/chosen": -1.1875, "rewards/margins": 0.6171875, "rewards/rejected": -1.796875, "step": 1328 }, { "epoch": 2.781789638932496, "grad_norm": 12.489603042602539, "learning_rate": 6.749306830598223e-09, "logits/chosen": 2.375, "logits/rejected": 2.859375, "logps/chosen": -936.0, "logps/rejected": -436.0, "loss": 0.6013, "rewards/accuracies": 0.5, "rewards/chosen": -1.5546875, "rewards/margins": 0.130859375, "rewards/rejected": -1.6875, "step": 1329 }, { "epoch": 2.7838827838827838, "grad_norm": 10.987958908081055, "learning_rate": 6.619167368289517e-09, "logits/chosen": 1.7421875, "logits/rejected": 1.453125, "logps/chosen": -524.0, "logps/rejected": -480.0, "loss": 0.5985, "rewards/accuracies": 0.75, "rewards/chosen": -1.2578125, "rewards/margins": 0.1982421875, "rewards/rejected": -1.453125, "step": 1330 }, { "epoch": 2.7859759288330714, "grad_norm": 10.890115737915039, "learning_rate": 6.490287536521181e-09, "logits/chosen": 2.328125, "logits/rejected": 2.671875, "logps/chosen": -680.0, "logps/rejected": -652.0, "loss": 0.5408, "rewards/accuracies": 0.5, "rewards/chosen": -1.59375, "rewards/margins": 0.076171875, "rewards/rejected": -1.671875, "step": 1331 }, { "epoch": 2.7880690737833596, "grad_norm": 12.001659393310547, "learning_rate": 6.362668002249141e-09, "logits/chosen": 2.125, "logits/rejected": 2.59375, "logps/chosen": -548.0, "logps/rejected": -434.0, "loss": 0.5569, "rewards/accuracies": 0.5, "rewards/chosen": -1.609375, "rewards/margins": -0.03515625, "rewards/rejected": -1.5703125, "step": 1332 }, { "epoch": 2.7901622187336472, "grad_norm": 11.607378005981445, "learning_rate": 6.236309425907337e-09, "logits/chosen": 2.125, "logits/rejected": 3.28125, "logps/chosen": -462.0, "logps/rejected": -608.0, "loss": 0.5898, "rewards/accuracies": 0.5, "rewards/chosen": -1.78125, "rewards/margins": 0.390625, "rewards/rejected": -2.171875, "step": 1333 }, { "epoch": 2.7922553636839353, "grad_norm": 13.034173011779785, "learning_rate": 6.111212461404191e-09, "logits/chosen": 1.7890625, "logits/rejected": 1.390625, "logps/chosen": -532.0, "logps/rejected": -588.0, "loss": 0.5415, "rewards/accuracies": 0.75, "rewards/chosen": -1.3671875, "rewards/margins": 0.2431640625, "rewards/rejected": -1.609375, "step": 1334 }, { "epoch": 2.794348508634223, "grad_norm": 11.403310775756836, "learning_rate": 5.987377756119224e-09, "logits/chosen": 1.3046875, "logits/rejected": 1.4609375, "logps/chosen": -332.0, "logps/rejected": -370.0, "loss": 0.5802, "rewards/accuracies": 0.5, "rewards/chosen": -1.21875, "rewards/margins": 0.1875, "rewards/rejected": -1.40625, "step": 1335 }, { "epoch": 2.7964416535845107, "grad_norm": 11.478974342346191, "learning_rate": 5.864805950899722e-09, "logits/chosen": 3.09375, "logits/rejected": 2.84375, "logps/chosen": -564.0, "logps/rejected": -704.0, "loss": 0.6002, "rewards/accuracies": 0.75, "rewards/chosen": -1.6328125, "rewards/margins": 0.4296875, "rewards/rejected": -2.0625, "step": 1336 }, { "epoch": 2.7985347985347984, "grad_norm": 11.049171447753906, "learning_rate": 5.743497680057553e-09, "logits/chosen": 2.8125, "logits/rejected": 3.046875, "logps/chosen": -816.0, "logps/rejected": -784.0, "loss": 0.5882, "rewards/accuracies": 1.0, "rewards/chosen": -1.296875, "rewards/margins": 0.6328125, "rewards/rejected": -1.9296875, "step": 1337 }, { "epoch": 2.8006279434850865, "grad_norm": 11.94770622253418, "learning_rate": 5.623453571365659e-09, "logits/chosen": 2.0625, "logits/rejected": 2.1875, "logps/chosen": -536.0, "logps/rejected": -592.0, "loss": 0.5683, "rewards/accuracies": 1.0, "rewards/chosen": -1.484375, "rewards/margins": 0.8984375, "rewards/rejected": -2.375, "step": 1338 }, { "epoch": 2.802721088435374, "grad_norm": 11.677643775939941, "learning_rate": 5.504674246054929e-09, "logits/chosen": 1.890625, "logits/rejected": 2.40625, "logps/chosen": -316.0, "logps/rejected": -312.0, "loss": 0.607, "rewards/accuracies": 1.0, "rewards/chosen": -1.140625, "rewards/margins": 0.51953125, "rewards/rejected": -1.6640625, "step": 1339 }, { "epoch": 2.804814233385662, "grad_norm": 12.415672302246094, "learning_rate": 5.3871603188110015e-09, "logits/chosen": 2.578125, "logits/rejected": 2.46875, "logps/chosen": -624.0, "logps/rejected": -556.0, "loss": 0.604, "rewards/accuracies": 0.5, "rewards/chosen": -2.171875, "rewards/margins": -0.025390625, "rewards/rejected": -2.140625, "step": 1340 }, { "epoch": 2.80690737833595, "grad_norm": 11.957086563110352, "learning_rate": 5.270912397771023e-09, "logits/chosen": 2.375, "logits/rejected": 2.21875, "logps/chosen": -384.0, "logps/rejected": -592.0, "loss": 0.5621, "rewards/accuracies": 1.0, "rewards/chosen": -0.921875, "rewards/margins": 0.53125, "rewards/rejected": -1.453125, "step": 1341 }, { "epoch": 2.8090005232862376, "grad_norm": 11.723847389221191, "learning_rate": 5.1559310845205584e-09, "logits/chosen": 2.34375, "logits/rejected": 1.7734375, "logps/chosen": -326.0, "logps/rejected": -572.0, "loss": 0.5695, "rewards/accuracies": 0.5, "rewards/chosen": -1.46875, "rewards/margins": 0.34375, "rewards/rejected": -1.8125, "step": 1342 }, { "epoch": 2.8110936682365253, "grad_norm": 11.346755981445312, "learning_rate": 5.042216974090385e-09, "logits/chosen": 2.53125, "logits/rejected": 2.96875, "logps/chosen": -418.0, "logps/rejected": -356.0, "loss": 0.5903, "rewards/accuracies": 0.5, "rewards/chosen": -1.1484375, "rewards/margins": 0.1298828125, "rewards/rejected": -1.28125, "step": 1343 }, { "epoch": 2.813186813186813, "grad_norm": 11.852492332458496, "learning_rate": 4.9297706549536206e-09, "logits/chosen": 1.90625, "logits/rejected": 2.140625, "logps/chosen": -532.0, "logps/rejected": -528.0, "loss": 0.5812, "rewards/accuracies": 0.75, "rewards/chosen": -1.2890625, "rewards/margins": 0.33984375, "rewards/rejected": -1.6328125, "step": 1344 }, { "epoch": 2.815279958137101, "grad_norm": 11.91535472869873, "learning_rate": 4.818592709022374e-09, "logits/chosen": 1.4296875, "logits/rejected": 1.5078125, "logps/chosen": -456.0, "logps/rejected": -360.0, "loss": 0.5675, "rewards/accuracies": 0.25, "rewards/chosen": -2.0625, "rewards/margins": -0.359375, "rewards/rejected": -1.703125, "step": 1345 }, { "epoch": 2.8173731030873888, "grad_norm": 10.856095314025879, "learning_rate": 4.708683711644967e-09, "logits/chosen": 2.015625, "logits/rejected": 1.859375, "logps/chosen": -466.0, "logps/rejected": -600.0, "loss": 0.5566, "rewards/accuracies": 0.75, "rewards/chosen": -1.25, "rewards/margins": 0.70703125, "rewards/rejected": -1.953125, "step": 1346 }, { "epoch": 2.819466248037677, "grad_norm": 12.558600425720215, "learning_rate": 4.600044231602881e-09, "logits/chosen": 1.5546875, "logits/rejected": 2.09375, "logps/chosen": -548.0, "logps/rejected": -388.0, "loss": 0.565, "rewards/accuracies": 0.5, "rewards/chosen": -1.53125, "rewards/margins": 0.275390625, "rewards/rejected": -1.8046875, "step": 1347 }, { "epoch": 2.8215593929879645, "grad_norm": 11.158424377441406, "learning_rate": 4.492674831107842e-09, "logits/chosen": 1.765625, "logits/rejected": 1.578125, "logps/chosen": -272.0, "logps/rejected": -600.0, "loss": 0.5686, "rewards/accuracies": 0.75, "rewards/chosen": -1.3203125, "rewards/margins": 1.4296875, "rewards/rejected": -2.75, "step": 1348 }, { "epoch": 2.823652537938252, "grad_norm": 10.950251579284668, "learning_rate": 4.386576065798857e-09, "logits/chosen": 1.25, "logits/rejected": 1.296875, "logps/chosen": -192.0, "logps/rejected": -230.0, "loss": 0.5563, "rewards/accuracies": 0.75, "rewards/chosen": -0.90625, "rewards/margins": 0.265625, "rewards/rejected": -1.171875, "step": 1349 }, { "epoch": 2.82574568288854, "grad_norm": 11.691061973571777, "learning_rate": 4.281748484739318e-09, "logits/chosen": 2.046875, "logits/rejected": 1.6953125, "logps/chosen": -482.0, "logps/rejected": -556.0, "loss": 0.5678, "rewards/accuracies": 0.75, "rewards/chosen": -1.265625, "rewards/margins": 0.1474609375, "rewards/rejected": -1.4140625, "step": 1350 }, { "epoch": 2.8278388278388276, "grad_norm": 11.009173393249512, "learning_rate": 4.178192630414292e-09, "logits/chosen": 2.328125, "logits/rejected": 2.71875, "logps/chosen": -608.0, "logps/rejected": -368.0, "loss": 0.5479, "rewards/accuracies": 0.75, "rewards/chosen": -0.9765625, "rewards/margins": 0.462890625, "rewards/rejected": -1.4375, "step": 1351 }, { "epoch": 2.8299319727891157, "grad_norm": 11.051095008850098, "learning_rate": 4.0759090387276545e-09, "logits/chosen": 2.21875, "logits/rejected": 2.34375, "logps/chosen": -490.0, "logps/rejected": -418.0, "loss": 0.5898, "rewards/accuracies": 1.0, "rewards/chosen": -1.4453125, "rewards/margins": 0.63671875, "rewards/rejected": -2.078125, "step": 1352 }, { "epoch": 2.8320251177394034, "grad_norm": 10.837244987487793, "learning_rate": 3.974898238999182e-09, "logits/chosen": 2.25, "logits/rejected": 2.609375, "logps/chosen": -544.0, "logps/rejected": -432.0, "loss": 0.5494, "rewards/accuracies": 1.0, "rewards/chosen": -1.40625, "rewards/margins": 0.3984375, "rewards/rejected": -1.8046875, "step": 1353 }, { "epoch": 2.8341182626896915, "grad_norm": 11.459234237670898, "learning_rate": 3.875160753962021e-09, "logits/chosen": 0.173828125, "logits/rejected": 0.76953125, "logps/chosen": -246.0, "logps/rejected": -229.0, "loss": 0.5664, "rewards/accuracies": 0.5, "rewards/chosen": -1.3046875, "rewards/margins": 0.0859375, "rewards/rejected": -1.390625, "step": 1354 }, { "epoch": 2.836211407639979, "grad_norm": 11.394813537597656, "learning_rate": 3.776697099759833e-09, "logits/chosen": 1.515625, "logits/rejected": 2.0625, "logps/chosen": -536.0, "logps/rejected": -466.0, "loss": 0.5806, "rewards/accuracies": 0.75, "rewards/chosen": -1.3828125, "rewards/margins": 0.396484375, "rewards/rejected": -1.78125, "step": 1355 }, { "epoch": 2.838304552590267, "grad_norm": 13.518462181091309, "learning_rate": 3.679507785944185e-09, "logits/chosen": 1.015625, "logits/rejected": 1.2890625, "logps/chosen": -328.0, "logps/rejected": -384.0, "loss": 0.6375, "rewards/accuracies": 0.5, "rewards/chosen": -1.71875, "rewards/margins": 0.6171875, "rewards/rejected": -2.34375, "step": 1356 }, { "epoch": 2.8403976975405545, "grad_norm": 12.291327476501465, "learning_rate": 3.58359331547194e-09, "logits/chosen": 2.171875, "logits/rejected": 1.3671875, "logps/chosen": -414.0, "logps/rejected": -600.0, "loss": 0.6212, "rewards/accuracies": 0.75, "rewards/chosen": -1.53125, "rewards/margins": 0.7890625, "rewards/rejected": -2.328125, "step": 1357 }, { "epoch": 2.8424908424908426, "grad_norm": 12.211554527282715, "learning_rate": 3.4889541847025653e-09, "logits/chosen": 1.8515625, "logits/rejected": 2.25, "logps/chosen": -484.0, "logps/rejected": -466.0, "loss": 0.5661, "rewards/accuracies": 0.75, "rewards/chosen": -1.3125, "rewards/margins": 0.44921875, "rewards/rejected": -1.765625, "step": 1358 }, { "epoch": 2.8445839874411303, "grad_norm": 12.278834342956543, "learning_rate": 3.39559088339569e-09, "logits/chosen": 2.375, "logits/rejected": 2.109375, "logps/chosen": -664.0, "logps/rejected": -848.0, "loss": 0.6107, "rewards/accuracies": 0.75, "rewards/chosen": -1.421875, "rewards/margins": 0.1123046875, "rewards/rejected": -1.53125, "step": 1359 }, { "epoch": 2.846677132391418, "grad_norm": 11.289807319641113, "learning_rate": 3.303503894708414e-09, "logits/chosen": 2.375, "logits/rejected": 2.921875, "logps/chosen": -628.0, "logps/rejected": -584.0, "loss": 0.5567, "rewards/accuracies": 0.75, "rewards/chosen": -1.703125, "rewards/margins": 0.404296875, "rewards/rejected": -2.109375, "step": 1360 }, { "epoch": 2.848770277341706, "grad_norm": 11.582511901855469, "learning_rate": 3.2126936951929205e-09, "logits/chosen": 1.46875, "logits/rejected": 1.921875, "logps/chosen": -456.0, "logps/rejected": -660.0, "loss": 0.5281, "rewards/accuracies": 1.0, "rewards/chosen": -1.0078125, "rewards/margins": 0.98828125, "rewards/rejected": -2.0, "step": 1361 }, { "epoch": 2.8508634222919937, "grad_norm": 11.814677238464355, "learning_rate": 3.1231607547940605e-09, "logits/chosen": 1.65625, "logits/rejected": 1.8125, "logps/chosen": -284.0, "logps/rejected": -620.0, "loss": 0.5227, "rewards/accuracies": 0.75, "rewards/chosen": -1.53125, "rewards/margins": 0.458984375, "rewards/rejected": -1.9921875, "step": 1362 }, { "epoch": 2.8529565672422814, "grad_norm": 12.034936904907227, "learning_rate": 3.0349055368466632e-09, "logits/chosen": 1.65625, "logits/rejected": 1.484375, "logps/chosen": -400.0, "logps/rejected": -416.0, "loss": 0.5997, "rewards/accuracies": 0.25, "rewards/chosen": -1.2890625, "rewards/margins": 0.123046875, "rewards/rejected": -1.40625, "step": 1363 }, { "epoch": 2.855049712192569, "grad_norm": 12.554017066955566, "learning_rate": 2.9479284980735085e-09, "logits/chosen": 2.28125, "logits/rejected": 2.203125, "logps/chosen": -392.0, "logps/rejected": -580.0, "loss": 0.592, "rewards/accuracies": 0.5, "rewards/chosen": -1.5625, "rewards/margins": -0.33984375, "rewards/rejected": -1.21875, "step": 1364 }, { "epoch": 2.857142857142857, "grad_norm": 11.883865356445312, "learning_rate": 2.862230088582717e-09, "logits/chosen": 1.953125, "logits/rejected": 1.421875, "logps/chosen": -364.0, "logps/rejected": -644.0, "loss": 0.6202, "rewards/accuracies": 0.75, "rewards/chosen": -1.421875, "rewards/margins": 0.7734375, "rewards/rejected": -2.1875, "step": 1365 }, { "epoch": 2.859236002093145, "grad_norm": 10.784345626831055, "learning_rate": 2.7778107518653115e-09, "logits/chosen": 0.98828125, "logits/rejected": 1.359375, "logps/chosen": -398.0, "logps/rejected": -298.0, "loss": 0.6086, "rewards/accuracies": 0.75, "rewards/chosen": -1.453125, "rewards/margins": 0.138671875, "rewards/rejected": -1.59375, "step": 1366 }, { "epoch": 2.861329147043433, "grad_norm": 11.65639877319336, "learning_rate": 2.6946709247933257e-09, "logits/chosen": 1.28125, "logits/rejected": 1.5, "logps/chosen": -322.0, "logps/rejected": -356.0, "loss": 0.5634, "rewards/accuracies": 1.0, "rewards/chosen": -1.5234375, "rewards/margins": 0.46484375, "rewards/rejected": -1.9921875, "step": 1367 }, { "epoch": 2.8634222919937207, "grad_norm": 12.796161651611328, "learning_rate": 2.612811037617142e-09, "logits/chosen": 0.8125, "logits/rejected": 1.0703125, "logps/chosen": -462.0, "logps/rejected": -326.0, "loss": 0.6258, "rewards/accuracies": 0.75, "rewards/chosen": -1.4375, "rewards/margins": 0.400390625, "rewards/rejected": -1.8359375, "step": 1368 }, { "epoch": 2.8655154369440083, "grad_norm": 11.98444652557373, "learning_rate": 2.5322315139635215e-09, "logits/chosen": 1.765625, "logits/rejected": 1.4375, "logps/chosen": -362.0, "logps/rejected": -540.0, "loss": 0.5765, "rewards/accuracies": 0.75, "rewards/chosen": -1.1015625, "rewards/margins": 0.8515625, "rewards/rejected": -1.953125, "step": 1369 }, { "epoch": 2.867608581894296, "grad_norm": 11.5888032913208, "learning_rate": 2.4529327708332437e-09, "logits/chosen": 1.7890625, "logits/rejected": 2.40625, "logps/chosen": -450.0, "logps/rejected": -456.0, "loss": 0.5898, "rewards/accuracies": 0.5, "rewards/chosen": -1.546875, "rewards/margins": -0.1015625, "rewards/rejected": -1.4453125, "step": 1370 }, { "epoch": 2.869701726844584, "grad_norm": 11.986724853515625, "learning_rate": 2.374915218599025e-09, "logits/chosen": 2.515625, "logits/rejected": 2.265625, "logps/chosen": -848.0, "logps/rejected": -568.0, "loss": 0.5708, "rewards/accuracies": 0.75, "rewards/chosen": -1.5390625, "rewards/margins": 0.291015625, "rewards/rejected": -1.828125, "step": 1371 }, { "epoch": 2.871794871794872, "grad_norm": 12.023932456970215, "learning_rate": 2.2981792610034677e-09, "logits/chosen": 1.03125, "logits/rejected": 1.5078125, "logps/chosen": -352.0, "logps/rejected": -382.0, "loss": 0.6112, "rewards/accuracies": 0.5, "rewards/chosen": -1.5625, "rewards/margins": 0.138671875, "rewards/rejected": -1.703125, "step": 1372 }, { "epoch": 2.8738880167451595, "grad_norm": 11.441040992736816, "learning_rate": 2.222725295156808e-09, "logits/chosen": 2.1875, "logits/rejected": 3.359375, "logps/chosen": -848.0, "logps/rejected": -510.0, "loss": 0.5742, "rewards/accuracies": 0.5, "rewards/chosen": -1.328125, "rewards/margins": 0.23828125, "rewards/rejected": -1.5625, "step": 1373 }, { "epoch": 2.8759811616954476, "grad_norm": 11.501876831054688, "learning_rate": 2.1485537115350034e-09, "logits/chosen": 2.671875, "logits/rejected": 3.515625, "logps/chosen": -652.0, "logps/rejected": -540.0, "loss": 0.5568, "rewards/accuracies": 0.5, "rewards/chosen": -1.296875, "rewards/margins": 0.6328125, "rewards/rejected": -1.9375, "step": 1374 }, { "epoch": 2.8780743066457353, "grad_norm": 12.223243713378906, "learning_rate": 2.075664893977596e-09, "logits/chosen": 2.125, "logits/rejected": 2.015625, "logps/chosen": -728.0, "logps/rejected": -724.0, "loss": 0.5875, "rewards/accuracies": 0.5, "rewards/chosen": -1.78125, "rewards/margins": -0.021484375, "rewards/rejected": -1.7578125, "step": 1375 }, { "epoch": 2.880167451596023, "grad_norm": 11.807002067565918, "learning_rate": 2.004059219685879e-09, "logits/chosen": 1.453125, "logits/rejected": 1.875, "logps/chosen": -450.0, "logps/rejected": -458.0, "loss": 0.5473, "rewards/accuracies": 0.75, "rewards/chosen": -1.265625, "rewards/margins": 0.515625, "rewards/rejected": -1.78125, "step": 1376 }, { "epoch": 2.8822605965463106, "grad_norm": 11.525500297546387, "learning_rate": 1.9337370592207062e-09, "logits/chosen": 2.28125, "logits/rejected": 2.359375, "logps/chosen": -604.0, "logps/rejected": -424.0, "loss": 0.587, "rewards/accuracies": 0.5, "rewards/chosen": -1.328125, "rewards/margins": 0.392578125, "rewards/rejected": -1.71875, "step": 1377 }, { "epoch": 2.8843537414965987, "grad_norm": 11.223447799682617, "learning_rate": 1.8646987765008824e-09, "logits/chosen": 1.71875, "logits/rejected": 1.546875, "logps/chosen": -228.0, "logps/rejected": -354.0, "loss": 0.5342, "rewards/accuracies": 0.75, "rewards/chosen": -1.1796875, "rewards/margins": 0.640625, "rewards/rejected": -1.828125, "step": 1378 }, { "epoch": 2.8864468864468864, "grad_norm": 12.623505592346191, "learning_rate": 1.7969447288010238e-09, "logits/chosen": 1.9296875, "logits/rejected": 1.765625, "logps/chosen": -458.0, "logps/rejected": -398.0, "loss": 0.5929, "rewards/accuracies": 0.5, "rewards/chosen": -1.6015625, "rewards/margins": 0.017578125, "rewards/rejected": -1.625, "step": 1379 }, { "epoch": 2.8885400313971745, "grad_norm": 12.275703430175781, "learning_rate": 1.7304752667497843e-09, "logits/chosen": 2.109375, "logits/rejected": 1.2265625, "logps/chosen": -246.0, "logps/rejected": -346.0, "loss": 0.5909, "rewards/accuracies": 0.75, "rewards/chosen": -1.203125, "rewards/margins": 0.25, "rewards/rejected": -1.453125, "step": 1380 }, { "epoch": 2.890633176347462, "grad_norm": 10.699250221252441, "learning_rate": 1.6652907343281343e-09, "logits/chosen": 2.578125, "logits/rejected": 2.6875, "logps/chosen": -716.0, "logps/rejected": -504.0, "loss": 0.5286, "rewards/accuracies": 0.75, "rewards/chosen": -1.4765625, "rewards/margins": 0.6484375, "rewards/rejected": -2.125, "step": 1381 }, { "epoch": 2.89272632129775, "grad_norm": 10.823837280273438, "learning_rate": 1.6013914688674172e-09, "logits/chosen": 2.40625, "logits/rejected": 3.15625, "logps/chosen": -640.0, "logps/rejected": -604.0, "loss": 0.5155, "rewards/accuracies": 0.5, "rewards/chosen": -1.234375, "rewards/margins": 0.38671875, "rewards/rejected": -1.625, "step": 1382 }, { "epoch": 2.8948194662480375, "grad_norm": 12.310914039611816, "learning_rate": 1.5387778010477968e-09, "logits/chosen": 1.578125, "logits/rejected": 2.40625, "logps/chosen": -524.0, "logps/rejected": -482.0, "loss": 0.6252, "rewards/accuracies": 0.5, "rewards/chosen": -1.6015625, "rewards/margins": 0.3203125, "rewards/rejected": -1.921875, "step": 1383 }, { "epoch": 2.896912611198325, "grad_norm": 11.84903335571289, "learning_rate": 1.4774500548963405e-09, "logits/chosen": 1.75, "logits/rejected": 1.5625, "logps/chosen": -304.0, "logps/rejected": -452.0, "loss": 0.6086, "rewards/accuracies": 1.0, "rewards/chosen": -1.5625, "rewards/margins": 0.5703125, "rewards/rejected": -2.125, "step": 1384 }, { "epoch": 2.8990057561486133, "grad_norm": 11.620820999145508, "learning_rate": 1.4174085477854664e-09, "logits/chosen": 2.1875, "logits/rejected": 2.3125, "logps/chosen": -756.0, "logps/rejected": -498.0, "loss": 0.5592, "rewards/accuracies": 0.5, "rewards/chosen": -1.7578125, "rewards/margins": -0.0234375, "rewards/rejected": -1.734375, "step": 1385 }, { "epoch": 2.901098901098901, "grad_norm": 11.344687461853027, "learning_rate": 1.3586535904313612e-09, "logits/chosen": 1.640625, "logits/rejected": 2.140625, "logps/chosen": -476.0, "logps/rejected": -564.0, "loss": 0.5802, "rewards/accuracies": 0.25, "rewards/chosen": -1.375, "rewards/margins": 0.248046875, "rewards/rejected": -1.6171875, "step": 1386 }, { "epoch": 2.903192046049189, "grad_norm": 10.856180191040039, "learning_rate": 1.3011854868921756e-09, "logits/chosen": 1.859375, "logits/rejected": 1.8359375, "logps/chosen": -510.0, "logps/rejected": -536.0, "loss": 0.5505, "rewards/accuracies": 1.0, "rewards/chosen": -1.3046875, "rewards/margins": 0.60546875, "rewards/rejected": -1.9140625, "step": 1387 }, { "epoch": 2.905285190999477, "grad_norm": 11.679254531860352, "learning_rate": 1.2450045345665826e-09, "logits/chosen": 1.90625, "logits/rejected": 2.671875, "logps/chosen": -592.0, "logps/rejected": -344.0, "loss": 0.5609, "rewards/accuracies": 0.5, "rewards/chosen": -1.359375, "rewards/margins": 0.1689453125, "rewards/rejected": -1.53125, "step": 1388 }, { "epoch": 2.9073783359497645, "grad_norm": 11.584948539733887, "learning_rate": 1.1901110241923045e-09, "logits/chosen": 2.1875, "logits/rejected": 2.5, "logps/chosen": -500.0, "logps/rejected": -532.0, "loss": 0.5921, "rewards/accuracies": 0.5, "rewards/chosen": -1.640625, "rewards/margins": 0.203125, "rewards/rejected": -1.84375, "step": 1389 }, { "epoch": 2.909471480900052, "grad_norm": 12.864279747009277, "learning_rate": 1.1365052398444774e-09, "logits/chosen": 0.88671875, "logits/rejected": 0.62890625, "logps/chosen": -358.0, "logps/rejected": -556.0, "loss": 0.6319, "rewards/accuracies": 1.0, "rewards/chosen": -1.71875, "rewards/margins": 0.3828125, "rewards/rejected": -2.109375, "step": 1390 }, { "epoch": 2.9115646258503403, "grad_norm": 11.117484092712402, "learning_rate": 1.0841874589341515e-09, "logits/chosen": 1.5703125, "logits/rejected": 1.7734375, "logps/chosen": -360.0, "logps/rejected": -390.0, "loss": 0.5577, "rewards/accuracies": 1.0, "rewards/chosen": -0.93359375, "rewards/margins": 0.86328125, "rewards/rejected": -1.796875, "step": 1391 }, { "epoch": 2.913657770800628, "grad_norm": 11.658158302307129, "learning_rate": 1.033157952207015e-09, "logits/chosen": 2.0, "logits/rejected": 2.640625, "logps/chosen": -576.0, "logps/rejected": -450.0, "loss": 0.5708, "rewards/accuracies": 0.5, "rewards/chosen": -1.796875, "rewards/margins": 0.12109375, "rewards/rejected": -1.921875, "step": 1392 }, { "epoch": 2.9157509157509156, "grad_norm": 11.494297981262207, "learning_rate": 9.834169837419226e-10, "logits/chosen": 1.7890625, "logits/rejected": 2.671875, "logps/chosen": -556.0, "logps/rejected": -434.0, "loss": 0.5964, "rewards/accuracies": 0.75, "rewards/chosen": -1.2109375, "rewards/margins": 0.376953125, "rewards/rejected": -1.5859375, "step": 1393 }, { "epoch": 2.9178440607012037, "grad_norm": 11.906575202941895, "learning_rate": 9.349648109494255e-10, "logits/chosen": 1.1171875, "logits/rejected": 1.1953125, "logps/chosen": -576.0, "logps/rejected": -448.0, "loss": 0.5775, "rewards/accuracies": 0.75, "rewards/chosen": -1.5859375, "rewards/margins": 0.087890625, "rewards/rejected": -1.671875, "step": 1394 }, { "epoch": 2.9199372056514914, "grad_norm": 11.835088729858398, "learning_rate": 8.878016845706324e-10, "logits/chosen": 1.171875, "logits/rejected": 1.7578125, "logps/chosen": -482.0, "logps/rejected": -340.0, "loss": 0.5616, "rewards/accuracies": 0.5, "rewards/chosen": -1.734375, "rewards/margins": 0.10546875, "rewards/rejected": -1.84375, "step": 1395 }, { "epoch": 2.922030350601779, "grad_norm": 11.811446189880371, "learning_rate": 8.419278486757394e-10, "logits/chosen": 2.1875, "logits/rejected": 2.234375, "logps/chosen": -418.0, "logps/rejected": -500.0, "loss": 0.5857, "rewards/accuracies": 0.75, "rewards/chosen": -1.8125, "rewards/margins": 0.1484375, "rewards/rejected": -1.9609375, "step": 1396 }, { "epoch": 2.9241234955520667, "grad_norm": 12.178156852722168, "learning_rate": 7.973435406628644e-10, "logits/chosen": 2.171875, "logits/rejected": 2.546875, "logps/chosen": -592.0, "logps/rejected": -772.0, "loss": 0.6156, "rewards/accuracies": 0.75, "rewards/chosen": -1.171875, "rewards/margins": 0.9296875, "rewards/rejected": -2.09375, "step": 1397 }, { "epoch": 2.926216640502355, "grad_norm": 10.631257057189941, "learning_rate": 7.540489912567702e-10, "logits/chosen": 2.40625, "logits/rejected": 2.78125, "logps/chosen": -486.0, "logps/rejected": -422.0, "loss": 0.5974, "rewards/accuracies": 1.0, "rewards/chosen": -1.25, "rewards/margins": 0.384765625, "rewards/rejected": -1.640625, "step": 1398 }, { "epoch": 2.9283097854526425, "grad_norm": 10.772899627685547, "learning_rate": 7.120444245076987e-10, "logits/chosen": 1.546875, "logits/rejected": 1.7578125, "logps/chosen": -608.0, "logps/rejected": -624.0, "loss": 0.5379, "rewards/accuracies": 1.0, "rewards/chosen": -1.84375, "rewards/margins": 0.296875, "rewards/rejected": -2.140625, "step": 1399 }, { "epoch": 2.9304029304029307, "grad_norm": 12.620939254760742, "learning_rate": 6.713300577902336e-10, "logits/chosen": 1.5703125, "logits/rejected": 1.875, "logps/chosen": -482.0, "logps/rejected": -470.0, "loss": 0.6188, "rewards/accuracies": 1.0, "rewards/chosen": -1.4453125, "rewards/margins": 0.62890625, "rewards/rejected": -2.078125, "step": 1400 }, { "epoch": 2.9324960753532183, "grad_norm": 11.615793228149414, "learning_rate": 6.319061018021064e-10, "logits/chosen": 1.7421875, "logits/rejected": 1.859375, "logps/chosen": -332.0, "logps/rejected": -406.0, "loss": 0.5764, "rewards/accuracies": 1.0, "rewards/chosen": -1.3515625, "rewards/margins": 0.2294921875, "rewards/rejected": -1.578125, "step": 1401 }, { "epoch": 2.934589220303506, "grad_norm": 11.255279541015625, "learning_rate": 5.937727605631422e-10, "logits/chosen": 1.8671875, "logits/rejected": 1.9609375, "logps/chosen": -552.0, "logps/rejected": -656.0, "loss": 0.5535, "rewards/accuracies": 0.75, "rewards/chosen": -1.359375, "rewards/margins": 0.349609375, "rewards/rejected": -1.703125, "step": 1402 }, { "epoch": 2.9366823652537937, "grad_norm": 11.204919815063477, "learning_rate": 5.56930231414233e-10, "logits/chosen": 1.671875, "logits/rejected": 2.203125, "logps/chosen": -436.0, "logps/rejected": -524.0, "loss": 0.5956, "rewards/accuracies": 0.5, "rewards/chosen": -1.703125, "rewards/margins": 0.15234375, "rewards/rejected": -1.859375, "step": 1403 }, { "epoch": 2.938775510204082, "grad_norm": 11.915118217468262, "learning_rate": 5.213787050162823e-10, "logits/chosen": 1.4296875, "logits/rejected": 2.046875, "logps/chosen": -624.0, "logps/rejected": -608.0, "loss": 0.6307, "rewards/accuracies": 0.25, "rewards/chosen": -2.515625, "rewards/margins": -0.9140625, "rewards/rejected": -1.609375, "step": 1404 }, { "epoch": 2.9408686551543695, "grad_norm": 12.069853782653809, "learning_rate": 4.871183653492071e-10, "logits/chosen": 1.796875, "logits/rejected": 1.7890625, "logps/chosen": -374.0, "logps/rejected": -416.0, "loss": 0.5981, "rewards/accuracies": 0.75, "rewards/chosen": -1.484375, "rewards/margins": 0.4609375, "rewards/rejected": -1.9453125, "step": 1405 }, { "epoch": 2.942961800104657, "grad_norm": 12.105071067810059, "learning_rate": 4.5414938971104906e-10, "logits/chosen": 2.5, "logits/rejected": 2.09375, "logps/chosen": -696.0, "logps/rejected": -502.0, "loss": 0.5965, "rewards/accuracies": 0.5, "rewards/chosen": -1.484375, "rewards/margins": -0.07421875, "rewards/rejected": -1.40625, "step": 1406 }, { "epoch": 2.9450549450549453, "grad_norm": 12.12260913848877, "learning_rate": 4.2247194871694753e-10, "logits/chosen": 2.0625, "logits/rejected": 1.5703125, "logps/chosen": -382.0, "logps/rejected": -516.0, "loss": 0.5959, "rewards/accuracies": 0.25, "rewards/chosen": -1.234375, "rewards/margins": -0.078125, "rewards/rejected": -1.15625, "step": 1407 }, { "epoch": 2.947148090005233, "grad_norm": 11.283514976501465, "learning_rate": 3.9208620629839086e-10, "logits/chosen": 2.40625, "logits/rejected": 2.671875, "logps/chosen": -648.0, "logps/rejected": -648.0, "loss": 0.5661, "rewards/accuracies": 0.75, "rewards/chosen": -1.140625, "rewards/margins": 0.2578125, "rewards/rejected": -1.3984375, "step": 1408 }, { "epoch": 2.9492412349555206, "grad_norm": 12.450947761535645, "learning_rate": 3.629923197022169e-10, "logits/chosen": 2.203125, "logits/rejected": 2.484375, "logps/chosen": -864.0, "logps/rejected": -644.0, "loss": 0.6292, "rewards/accuracies": 0.5, "rewards/chosen": -0.94140625, "rewards/margins": 0.2158203125, "rewards/rejected": -1.15625, "step": 1409 }, { "epoch": 2.9513343799058083, "grad_norm": 10.849802017211914, "learning_rate": 3.3519043948997476e-10, "logits/chosen": 3.0, "logits/rejected": 2.8125, "logps/chosen": -688.0, "logps/rejected": -736.0, "loss": 0.5535, "rewards/accuracies": 0.5, "rewards/chosen": -1.953125, "rewards/margins": 0.056640625, "rewards/rejected": -2.0, "step": 1410 }, { "epoch": 2.9534275248560964, "grad_norm": 10.051568031311035, "learning_rate": 3.086807095369811e-10, "logits/chosen": 2.046875, "logits/rejected": 1.453125, "logps/chosen": -390.0, "logps/rejected": -476.0, "loss": 0.5342, "rewards/accuracies": 0.75, "rewards/chosen": -1.0703125, "rewards/margins": 0.73046875, "rewards/rejected": -1.796875, "step": 1411 }, { "epoch": 2.955520669806384, "grad_norm": 11.31157398223877, "learning_rate": 2.8346326703168203e-10, "logits/chosen": 2.21875, "logits/rejected": 2.34375, "logps/chosen": -412.0, "logps/rejected": -472.0, "loss": 0.5827, "rewards/accuracies": 0.5, "rewards/chosen": -1.5390625, "rewards/margins": 0.306640625, "rewards/rejected": -1.84375, "step": 1412 }, { "epoch": 2.957613814756672, "grad_norm": 11.44780158996582, "learning_rate": 2.5953824247490364e-10, "logits/chosen": 2.765625, "logits/rejected": 2.671875, "logps/chosen": -648.0, "logps/rejected": -410.0, "loss": 0.5737, "rewards/accuracies": 0.25, "rewards/chosen": -1.5625, "rewards/margins": -0.2265625, "rewards/rejected": -1.3359375, "step": 1413 }, { "epoch": 2.95970695970696, "grad_norm": 11.903714179992676, "learning_rate": 2.3690575967915824e-10, "logits/chosen": 2.328125, "logits/rejected": 2.71875, "logps/chosen": -528.0, "logps/rejected": -544.0, "loss": 0.6146, "rewards/accuracies": 1.0, "rewards/chosen": -1.234375, "rewards/margins": 0.60546875, "rewards/rejected": -1.84375, "step": 1414 }, { "epoch": 2.9618001046572475, "grad_norm": 12.053587913513184, "learning_rate": 2.1556593576806152e-10, "logits/chosen": 2.0625, "logits/rejected": 2.5, "logps/chosen": -600.0, "logps/rejected": -620.0, "loss": 0.549, "rewards/accuracies": 0.5, "rewards/chosen": -1.265625, "rewards/margins": 0.494140625, "rewards/rejected": -1.765625, "step": 1415 }, { "epoch": 2.963893249607535, "grad_norm": 11.51821517944336, "learning_rate": 1.9551888117566647e-10, "logits/chosen": 2.796875, "logits/rejected": 2.90625, "logps/chosen": -640.0, "logps/rejected": -502.0, "loss": 0.5759, "rewards/accuracies": 0.5, "rewards/chosen": -1.125, "rewards/margins": 0.169921875, "rewards/rejected": -1.296875, "step": 1416 }, { "epoch": 2.965986394557823, "grad_norm": 12.681265830993652, "learning_rate": 1.7676469964590832e-10, "logits/chosen": 2.828125, "logits/rejected": 3.125, "logps/chosen": -880.0, "logps/rejected": -684.0, "loss": 0.6017, "rewards/accuracies": 0.75, "rewards/chosen": -1.7265625, "rewards/margins": 0.193359375, "rewards/rejected": -1.921875, "step": 1417 }, { "epoch": 2.968079539508111, "grad_norm": 11.360309600830078, "learning_rate": 1.5930348823207737e-10, "logits/chosen": 1.28125, "logits/rejected": 2.5, "logps/chosen": -360.0, "logps/rejected": -278.0, "loss": 0.5533, "rewards/accuracies": 0.75, "rewards/chosen": -1.421875, "rewards/margins": 0.29296875, "rewards/rejected": -1.71875, "step": 1418 }, { "epoch": 2.9701726844583987, "grad_norm": 11.54595947265625, "learning_rate": 1.4313533729634691e-10, "logits/chosen": 2.53125, "logits/rejected": 2.328125, "logps/chosen": -556.0, "logps/rejected": -640.0, "loss": 0.6055, "rewards/accuracies": 0.75, "rewards/chosen": -1.4765625, "rewards/margins": 0.59375, "rewards/rejected": -2.078125, "step": 1419 }, { "epoch": 2.9722658294086868, "grad_norm": 11.778307914733887, "learning_rate": 1.2826033050927406e-10, "logits/chosen": 1.6171875, "logits/rejected": 1.640625, "logps/chosen": -356.0, "logps/rejected": -600.0, "loss": 0.6007, "rewards/accuracies": 0.5, "rewards/chosen": -1.3984375, "rewards/margins": 0.4609375, "rewards/rejected": -1.859375, "step": 1420 }, { "epoch": 2.9743589743589745, "grad_norm": 12.374484062194824, "learning_rate": 1.146785448493276e-10, "logits/chosen": 1.25, "logits/rejected": 2.03125, "logps/chosen": -528.0, "logps/rejected": -442.0, "loss": 0.6091, "rewards/accuracies": 0.5, "rewards/chosen": -1.6875, "rewards/margins": 0.28515625, "rewards/rejected": -1.96875, "step": 1421 }, { "epoch": 2.976452119309262, "grad_norm": 11.62448787689209, "learning_rate": 1.0239005060252739e-10, "logits/chosen": 0.73046875, "logits/rejected": 1.65625, "logps/chosen": -338.0, "logps/rejected": -356.0, "loss": 0.5605, "rewards/accuracies": 1.0, "rewards/chosen": -1.3359375, "rewards/margins": 0.435546875, "rewards/rejected": -1.7734375, "step": 1422 }, { "epoch": 2.97854526425955, "grad_norm": 12.74441146850586, "learning_rate": 9.1394911362139e-11, "logits/chosen": 2.109375, "logits/rejected": 2.25, "logps/chosen": -728.0, "logps/rejected": -480.0, "loss": 0.6134, "rewards/accuracies": 0.25, "rewards/chosen": -2.0625, "rewards/margins": -0.224609375, "rewards/rejected": -1.828125, "step": 1423 }, { "epoch": 2.980638409209838, "grad_norm": 11.858436584472656, "learning_rate": 8.169318402820202e-11, "logits/chosen": 1.7734375, "logits/rejected": 1.6171875, "logps/chosen": -452.0, "logps/rejected": -832.0, "loss": 0.5728, "rewards/accuracies": 0.75, "rewards/chosen": -2.3125, "rewards/margins": 0.6875, "rewards/rejected": -3.0, "step": 1424 }, { "epoch": 2.9827315541601256, "grad_norm": 10.899884223937988, "learning_rate": 7.328491880741893e-11, "logits/chosen": 2.34375, "logits/rejected": 2.203125, "logps/chosen": -640.0, "logps/rejected": -510.0, "loss": 0.5538, "rewards/accuracies": 0.75, "rewards/chosen": -1.3125, "rewards/margins": 0.54296875, "rewards/rejected": -1.859375, "step": 1425 }, { "epoch": 2.9848246991104133, "grad_norm": 10.890758514404297, "learning_rate": 6.617015921273888e-11, "logits/chosen": 1.6953125, "logits/rejected": 2.03125, "logps/chosen": -400.0, "logps/rejected": -348.0, "loss": 0.5376, "rewards/accuracies": 0.75, "rewards/chosen": -1.515625, "rewards/margins": 0.302734375, "rewards/rejected": -1.8125, "step": 1426 }, { "epoch": 2.9869178440607014, "grad_norm": 11.535402297973633, "learning_rate": 6.03489420631634e-11, "logits/chosen": 1.171875, "logits/rejected": 1.4375, "logps/chosen": -368.0, "logps/rejected": -436.0, "loss": 0.5985, "rewards/accuracies": 0.25, "rewards/chosen": -1.234375, "rewards/margins": 0.08203125, "rewards/rejected": -1.3203125, "step": 1427 }, { "epoch": 2.989010989010989, "grad_norm": 11.802215576171875, "learning_rate": 5.5821297483635366e-11, "logits/chosen": 2.625, "logits/rejected": 3.46875, "logps/chosen": -572.0, "logps/rejected": -416.0, "loss": 0.5801, "rewards/accuracies": 0.75, "rewards/chosen": -0.94921875, "rewards/margins": 1.265625, "rewards/rejected": -2.203125, "step": 1428 }, { "epoch": 2.9911041339612767, "grad_norm": 12.465570449829102, "learning_rate": 5.258724890484477e-11, "logits/chosen": 2.703125, "logits/rejected": 2.28125, "logps/chosen": -380.0, "logps/rejected": -556.0, "loss": 0.5928, "rewards/accuracies": 0.5, "rewards/chosen": -1.453125, "rewards/margins": 0.0673828125, "rewards/rejected": -1.5234375, "step": 1429 }, { "epoch": 2.9931972789115644, "grad_norm": 12.42818832397461, "learning_rate": 5.0646813063034436e-11, "logits/chosen": 1.53125, "logits/rejected": 0.96484375, "logps/chosen": -260.0, "logps/rejected": -446.0, "loss": 0.5906, "rewards/accuracies": 0.5, "rewards/chosen": -1.5625, "rewards/margins": -0.046875, "rewards/rejected": -1.515625, "step": 1430 }, { "epoch": 2.9952904238618525, "grad_norm": 12.25236701965332, "learning_rate": 5e-11, "logits/chosen": 1.390625, "logits/rejected": 1.34375, "logps/chosen": -436.0, "logps/rejected": -231.0, "loss": 0.5959, "rewards/accuracies": 0.25, "rewards/chosen": -1.6328125, "rewards/margins": -0.1943359375, "rewards/rejected": -1.4375, "step": 1431 } ], "logging_steps": 1.0, "max_steps": 1431, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }