diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,5621 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 10.24, + "eval_steps": 500, + "global_step": 400, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.03, + "learning_rate": 3.846153846153846e-06, + "logits/chosen": 1.3807897567749023, + "logits/rejected": 1.1952139139175415, + "logps/chosen": -589.1343994140625, + "logps/rejected": -494.7060241699219, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1 + }, + { + "epoch": 0.05, + "learning_rate": 7.692307692307692e-06, + "logits/chosen": 1.2665337324142456, + "logits/rejected": 1.1713109016418457, + "logps/chosen": -559.9566650390625, + "logps/rejected": -549.1146850585938, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 2 + }, + { + "epoch": 0.08, + "learning_rate": 1.1538461538461538e-05, + "logits/chosen": 1.3811347484588623, + "logits/rejected": 1.216629981994629, + "logps/chosen": -559.24951171875, + "logps/rejected": -481.5151672363281, + "loss": 0.6956, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.002765989163890481, + "rewards/margins": -0.004863501060754061, + "rewards/rejected": 0.002097511198371649, + "step": 3 + }, + { + "epoch": 0.1, + "learning_rate": 1.5384615384615384e-05, + "logits/chosen": 1.3156853914260864, + "logits/rejected": 1.2506608963012695, + "logps/chosen": -554.9755249023438, + "logps/rejected": -558.6537475585938, + "loss": 0.6917, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.005945444107055664, + "rewards/margins": 0.003013968002051115, + "rewards/rejected": 0.0029314758721739054, + "step": 4 + }, + { + "epoch": 0.13, + "learning_rate": 1.9230769230769228e-05, + "logits/chosen": 1.3257012367248535, + "logits/rejected": 1.223274827003479, + "logps/chosen": -523.537109375, + "logps/rejected": -585.207763671875, + "loss": 0.6902, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.0114411236718297, + "rewards/margins": 0.005982697010040283, + "rewards/rejected": 0.005458426661789417, + "step": 5 + }, + { + "epoch": 0.15, + "learning_rate": 2.3076923076923076e-05, + "logits/chosen": 1.2537128925323486, + "logits/rejected": 1.1953891515731812, + "logps/chosen": -545.0478515625, + "logps/rejected": -489.9993896484375, + "loss": 0.6881, + "rewards/accuracies": 0.59375, + "rewards/chosen": 0.015848731622099876, + "rewards/margins": 0.01023783627897501, + "rewards/rejected": 0.0056108953431248665, + "step": 6 + }, + { + "epoch": 0.18, + "learning_rate": 2.692307692307692e-05, + "logits/chosen": 1.312497854232788, + "logits/rejected": 1.2055679559707642, + "logps/chosen": -545.1070556640625, + "logps/rejected": -509.77001953125, + "loss": 0.6864, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.02675754949450493, + "rewards/margins": 0.013801073655486107, + "rewards/rejected": 0.012956475839018822, + "step": 7 + }, + { + "epoch": 0.2, + "learning_rate": 3.076923076923077e-05, + "logits/chosen": 1.2600666284561157, + "logits/rejected": 1.1770424842834473, + "logps/chosen": -539.1686401367188, + "logps/rejected": -483.5211181640625, + "loss": 0.6871, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.033090125769376755, + "rewards/margins": 0.012454044073820114, + "rewards/rejected": 0.02063608169555664, + "step": 8 + }, + { + "epoch": 0.23, + "learning_rate": 3.461538461538461e-05, + "logits/chosen": 1.3227227926254272, + "logits/rejected": 1.2356113195419312, + "logps/chosen": -565.9663696289062, + "logps/rejected": -622.9007568359375, + "loss": 0.6839, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.04537828266620636, + "rewards/margins": 0.019218124449253082, + "rewards/rejected": 0.026160158216953278, + "step": 9 + }, + { + "epoch": 0.26, + "learning_rate": 3.8461538461538456e-05, + "logits/chosen": 1.3510740995407104, + "logits/rejected": 1.2296315431594849, + "logps/chosen": -593.7393798828125, + "logps/rejected": -594.9727172851562, + "loss": 0.6723, + "rewards/accuracies": 0.59375, + "rewards/chosen": 0.07614221423864365, + "rewards/margins": 0.0457853302359581, + "rewards/rejected": 0.030356884002685547, + "step": 10 + }, + { + "epoch": 0.28, + "learning_rate": 4.23076923076923e-05, + "logits/chosen": 1.2242865562438965, + "logits/rejected": 1.2467900514602661, + "logps/chosen": -508.84783935546875, + "logps/rejected": -598.6978759765625, + "loss": 0.6674, + "rewards/accuracies": 0.65625, + "rewards/chosen": 0.08538543432950974, + "rewards/margins": 0.05763913318514824, + "rewards/rejected": 0.027746297419071198, + "step": 11 + }, + { + "epoch": 0.31, + "learning_rate": 4.615384615384615e-05, + "logits/chosen": 1.3490934371948242, + "logits/rejected": 1.301114559173584, + "logps/chosen": -570.9324340820312, + "logps/rejected": -578.1993408203125, + "loss": 0.6625, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.09820537269115448, + "rewards/margins": 0.06798899918794632, + "rewards/rejected": 0.030216386541724205, + "step": 12 + }, + { + "epoch": 0.33, + "learning_rate": 4.9999999999999996e-05, + "logits/chosen": 1.31833016872406, + "logits/rejected": 1.2298263311386108, + "logps/chosen": -498.8169860839844, + "logps/rejected": -478.9023742675781, + "loss": 0.6617, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.11317408084869385, + "rewards/margins": 0.06994115561246872, + "rewards/rejected": 0.043232932686805725, + "step": 13 + }, + { + "epoch": 0.36, + "learning_rate": 5.384615384615384e-05, + "logits/chosen": 1.3441507816314697, + "logits/rejected": 1.2419227361679077, + "logps/chosen": -570.9219970703125, + "logps/rejected": -512.0221557617188, + "loss": 0.6168, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.13483530282974243, + "rewards/margins": 0.17350149154663086, + "rewards/rejected": -0.03866620361804962, + "step": 14 + }, + { + "epoch": 0.38, + "learning_rate": 5.769230769230769e-05, + "logits/chosen": 1.301425814628601, + "logits/rejected": 1.2630890607833862, + "logps/chosen": -525.2069091796875, + "logps/rejected": -515.6624145507812, + "loss": 0.6319, + "rewards/accuracies": 0.65625, + "rewards/chosen": 0.14281943440437317, + "rewards/margins": 0.1536417305469513, + "rewards/rejected": -0.010822296142578125, + "step": 15 + }, + { + "epoch": 0.41, + "learning_rate": 6.153846153846154e-05, + "logits/chosen": 1.3221077919006348, + "logits/rejected": 1.2412704229354858, + "logps/chosen": -522.0698852539062, + "logps/rejected": -479.1268615722656, + "loss": 0.6041, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.18193425238132477, + "rewards/margins": 0.20868375897407532, + "rewards/rejected": -0.026749493554234505, + "step": 16 + }, + { + "epoch": 0.44, + "learning_rate": 6.538461538461539e-05, + "logits/chosen": 1.2305197715759277, + "logits/rejected": 1.1909728050231934, + "logps/chosen": -591.547607421875, + "logps/rejected": -501.61956787109375, + "loss": 0.632, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.22359740734100342, + "rewards/margins": 0.16370725631713867, + "rewards/rejected": 0.059890177100896835, + "step": 17 + }, + { + "epoch": 0.46, + "learning_rate": 6.923076923076922e-05, + "logits/chosen": 1.2290271520614624, + "logits/rejected": 1.2503975629806519, + "logps/chosen": -583.2138671875, + "logps/rejected": -551.908447265625, + "loss": 0.5782, + "rewards/accuracies": 0.84375, + "rewards/chosen": 0.25082916021347046, + "rewards/margins": 0.29117509722709656, + "rewards/rejected": -0.04034590348601341, + "step": 18 + }, + { + "epoch": 0.49, + "learning_rate": 7.307692307692307e-05, + "logits/chosen": 1.2445173263549805, + "logits/rejected": 1.274112582206726, + "logps/chosen": -476.1685791015625, + "logps/rejected": -558.716796875, + "loss": 0.6356, + "rewards/accuracies": 0.65625, + "rewards/chosen": 0.2891727685928345, + "rewards/margins": 0.18064022064208984, + "rewards/rejected": 0.10853258520364761, + "step": 19 + }, + { + "epoch": 0.51, + "learning_rate": 7.692307692307691e-05, + "logits/chosen": 1.3747544288635254, + "logits/rejected": 1.1747362613677979, + "logps/chosen": -604.9407348632812, + "logps/rejected": -521.9097900390625, + "loss": 0.5778, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3479543924331665, + "rewards/margins": 0.2904755175113678, + "rewards/rejected": 0.05747886002063751, + "step": 20 + }, + { + "epoch": 0.54, + "learning_rate": 8.076923076923076e-05, + "logits/chosen": 1.215498447418213, + "logits/rejected": 1.1988316774368286, + "logps/chosen": -508.58685302734375, + "logps/rejected": -480.9490051269531, + "loss": 0.6143, + "rewards/accuracies": 0.65625, + "rewards/chosen": 0.3200737535953522, + "rewards/margins": 0.22501060366630554, + "rewards/rejected": 0.09506310522556305, + "step": 21 + }, + { + "epoch": 0.56, + "learning_rate": 8.46153846153846e-05, + "logits/chosen": 1.2625510692596436, + "logits/rejected": 1.2751553058624268, + "logps/chosen": -464.3768615722656, + "logps/rejected": -548.1248779296875, + "loss": 0.6585, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.2795701324939728, + "rewards/margins": 0.18192264437675476, + "rewards/rejected": 0.09764745086431503, + "step": 22 + }, + { + "epoch": 0.59, + "learning_rate": 8.846153846153845e-05, + "logits/chosen": 1.228266716003418, + "logits/rejected": 1.1854723691940308, + "logps/chosen": -542.0804443359375, + "logps/rejected": -593.2991943359375, + "loss": 0.5038, + "rewards/accuracies": 0.78125, + "rewards/chosen": 0.5573378801345825, + "rewards/margins": 0.559643566608429, + "rewards/rejected": -0.0023057162761688232, + "step": 23 + }, + { + "epoch": 0.61, + "learning_rate": 9.23076923076923e-05, + "logits/chosen": 1.3595786094665527, + "logits/rejected": 1.299391746520996, + "logps/chosen": -598.842041015625, + "logps/rejected": -521.9869384765625, + "loss": 0.58, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3911605179309845, + "rewards/margins": 0.45501241087913513, + "rewards/rejected": -0.06385190039873123, + "step": 24 + }, + { + "epoch": 0.64, + "learning_rate": 9.615384615384615e-05, + "logits/chosen": 1.244804859161377, + "logits/rejected": 1.2789154052734375, + "logps/chosen": -527.1282958984375, + "logps/rejected": -562.9415283203125, + "loss": 0.5638, + "rewards/accuracies": 0.71875, + "rewards/chosen": 0.3064769506454468, + "rewards/margins": 0.4528440833091736, + "rewards/rejected": -0.1463671177625656, + "step": 25 + }, + { + "epoch": 0.67, + "learning_rate": 9.999999999999999e-05, + "logits/chosen": 1.2605584859848022, + "logits/rejected": 1.257567048072815, + "logps/chosen": -518.2035522460938, + "logps/rejected": -553.550537109375, + "loss": 0.5622, + "rewards/accuracies": 0.65625, + "rewards/chosen": 0.3198173940181732, + "rewards/margins": 0.533422589302063, + "rewards/rejected": -0.21360518038272858, + "step": 26 + }, + { + "epoch": 0.69, + "learning_rate": 0.00010384615384615383, + "logits/chosen": 1.2905932664871216, + "logits/rejected": 1.252805233001709, + "logps/chosen": -502.51318359375, + "logps/rejected": -492.2623596191406, + "loss": 0.5688, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.2552236318588257, + "rewards/margins": 0.4563944339752197, + "rewards/rejected": -0.20117078721523285, + "step": 27 + }, + { + "epoch": 0.72, + "learning_rate": 0.00010769230769230768, + "logits/chosen": 1.360573172569275, + "logits/rejected": 1.246628999710083, + "logps/chosen": -593.9207153320312, + "logps/rejected": -521.7044677734375, + "loss": 0.4958, + "rewards/accuracies": 0.78125, + "rewards/chosen": 0.24957714974880219, + "rewards/margins": 0.6095183491706848, + "rewards/rejected": -0.35994118452072144, + "step": 28 + }, + { + "epoch": 0.74, + "learning_rate": 0.00011153846153846153, + "logits/chosen": 1.3290568590164185, + "logits/rejected": 1.1086769104003906, + "logps/chosen": -588.37451171875, + "logps/rejected": -555.8126220703125, + "loss": 0.4523, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.22330154478549957, + "rewards/margins": 0.8921126127243042, + "rewards/rejected": -0.6688110828399658, + "step": 29 + }, + { + "epoch": 0.77, + "learning_rate": 0.00011538461538461538, + "logits/chosen": 1.283523678779602, + "logits/rejected": 1.2930572032928467, + "logps/chosen": -533.7445678710938, + "logps/rejected": -590.4415283203125, + "loss": 0.4839, + "rewards/accuracies": 0.78125, + "rewards/chosen": 0.2506100535392761, + "rewards/margins": 0.8535110950469971, + "rewards/rejected": -0.6029011011123657, + "step": 30 + }, + { + "epoch": 0.79, + "learning_rate": 0.00011923076923076922, + "logits/chosen": 1.1903033256530762, + "logits/rejected": 1.2316640615463257, + "logps/chosen": -544.7385864257812, + "logps/rejected": -559.547607421875, + "loss": 0.453, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.16801050305366516, + "rewards/margins": 0.6988197565078735, + "rewards/rejected": -0.530809223651886, + "step": 31 + }, + { + "epoch": 0.82, + "learning_rate": 0.00012307692307692307, + "logits/chosen": 1.2974334955215454, + "logits/rejected": 1.2153936624526978, + "logps/chosen": -569.9619140625, + "logps/rejected": -532.5298461914062, + "loss": 0.3596, + "rewards/accuracies": 0.84375, + "rewards/chosen": 0.41700971126556396, + "rewards/margins": 1.2314927577972412, + "rewards/rejected": -0.814483106136322, + "step": 32 + }, + { + "epoch": 0.84, + "learning_rate": 0.0001269230769230769, + "logits/chosen": 1.20902681350708, + "logits/rejected": 1.1872200965881348, + "logps/chosen": -517.0770263671875, + "logps/rejected": -496.25469970703125, + "loss": 0.3913, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.6890252828598022, + "rewards/margins": 1.2427911758422852, + "rewards/rejected": -0.5537658929824829, + "step": 33 + }, + { + "epoch": 0.87, + "learning_rate": 0.00013076923076923077, + "logits/chosen": 1.1348516941070557, + "logits/rejected": 1.2495156526565552, + "logps/chosen": -473.9527893066406, + "logps/rejected": -583.898193359375, + "loss": 0.3421, + "rewards/accuracies": 0.84375, + "rewards/chosen": 0.3356397747993469, + "rewards/margins": 1.326012134552002, + "rewards/rejected": -0.9903723001480103, + "step": 34 + }, + { + "epoch": 0.9, + "learning_rate": 0.0001346153846153846, + "logits/chosen": 1.199057698249817, + "logits/rejected": 1.2061611413955688, + "logps/chosen": -508.8526611328125, + "logps/rejected": -528.3182983398438, + "loss": 0.5245, + "rewards/accuracies": 0.71875, + "rewards/chosen": 0.07617084681987762, + "rewards/margins": 0.7057029008865356, + "rewards/rejected": -0.6295321583747864, + "step": 35 + }, + { + "epoch": 0.92, + "learning_rate": 0.00013846153846153845, + "logits/chosen": 1.324285864830017, + "logits/rejected": 1.1698598861694336, + "logps/chosen": -636.7570190429688, + "logps/rejected": -569.9312744140625, + "loss": 0.3545, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.5845645666122437, + "rewards/margins": 1.4404823780059814, + "rewards/rejected": -0.855917751789093, + "step": 36 + }, + { + "epoch": 0.95, + "learning_rate": 0.00014230769230769228, + "logits/chosen": 1.3375169038772583, + "logits/rejected": 1.226860761642456, + "logps/chosen": -550.1502685546875, + "logps/rejected": -568.7561645507812, + "loss": 0.2948, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.7275359034538269, + "rewards/margins": 1.519990086555481, + "rewards/rejected": -0.7924542427062988, + "step": 37 + }, + { + "epoch": 0.97, + "learning_rate": 0.00014615384615384615, + "logits/chosen": 1.3178166151046753, + "logits/rejected": 1.3225042819976807, + "logps/chosen": -583.7266235351562, + "logps/rejected": -640.8178100585938, + "loss": 0.2679, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.0492963790893555, + "rewards/margins": 1.9212793111801147, + "rewards/rejected": -0.8719831109046936, + "step": 38 + }, + { + "epoch": 1.0, + "learning_rate": 0.00015, + "logits/chosen": 1.424353837966919, + "logits/rejected": 1.2278132438659668, + "logps/chosen": -576.828125, + "logps/rejected": -532.112060546875, + "loss": 0.409, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.294251561164856, + "rewards/margins": 1.8459293842315674, + "rewards/rejected": -0.5516780018806458, + "step": 39 + }, + { + "epoch": 1.02, + "learning_rate": 0.00015384615384615382, + "logits/chosen": 1.3266693353652954, + "logits/rejected": 1.4058163166046143, + "logps/chosen": -523.482421875, + "logps/rejected": -534.4713134765625, + "loss": 0.2549, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.9553894996643066, + "rewards/margins": 1.72726571559906, + "rewards/rejected": -0.7718762159347534, + "step": 40 + }, + { + "epoch": 1.05, + "learning_rate": 0.0001576923076923077, + "logits/chosen": 1.2602558135986328, + "logits/rejected": 1.2625583410263062, + "logps/chosen": -522.94287109375, + "logps/rejected": -609.2876586914062, + "loss": 0.181, + "rewards/accuracies": 0.96875, + "rewards/chosen": 1.3178558349609375, + "rewards/margins": 2.475386381149292, + "rewards/rejected": -1.1575307846069336, + "step": 41 + }, + { + "epoch": 1.08, + "learning_rate": 0.00016153846153846153, + "logits/chosen": 1.2560456991195679, + "logits/rejected": 1.3124988079071045, + "logps/chosen": -557.4879150390625, + "logps/rejected": -604.8274536132812, + "loss": 0.2405, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.4575282335281372, + "rewards/margins": 2.372474193572998, + "rewards/rejected": -0.9149457812309265, + "step": 42 + }, + { + "epoch": 1.1, + "learning_rate": 0.0001653846153846154, + "logits/chosen": 1.2838218212127686, + "logits/rejected": 1.0645534992218018, + "logps/chosen": -572.41357421875, + "logps/rejected": -516.2637939453125, + "loss": 0.1404, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6796542406082153, + "rewards/margins": 2.779283285140991, + "rewards/rejected": -1.0996291637420654, + "step": 43 + }, + { + "epoch": 1.13, + "learning_rate": 0.0001692307692307692, + "logits/chosen": 1.1537644863128662, + "logits/rejected": 1.1177821159362793, + "logps/chosen": -455.26904296875, + "logps/rejected": -498.28900146484375, + "loss": 0.1726, + "rewards/accuracies": 0.96875, + "rewards/chosen": 1.2386869192123413, + "rewards/margins": 2.298076868057251, + "rewards/rejected": -1.0593899488449097, + "step": 44 + }, + { + "epoch": 1.15, + "learning_rate": 0.00017307692307692304, + "logits/chosen": 1.2277421951293945, + "logits/rejected": 1.1039767265319824, + "logps/chosen": -510.9812927246094, + "logps/rejected": -503.68829345703125, + "loss": 0.2239, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.2717504501342773, + "rewards/margins": 2.59214186668396, + "rewards/rejected": -1.3203915357589722, + "step": 45 + }, + { + "epoch": 1.18, + "learning_rate": 0.0001769230769230769, + "logits/chosen": 1.2631361484527588, + "logits/rejected": 1.221813440322876, + "logps/chosen": -528.2451171875, + "logps/rejected": -565.52783203125, + "loss": 0.1884, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7057870626449585, + "rewards/margins": 2.7309272289276123, + "rewards/rejected": -1.025140404701233, + "step": 46 + }, + { + "epoch": 1.2, + "learning_rate": 0.00018076923076923074, + "logits/chosen": 1.1752557754516602, + "logits/rejected": 1.2416189908981323, + "logps/chosen": -501.43609619140625, + "logps/rejected": -574.4725341796875, + "loss": 0.2208, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.349045753479004, + "rewards/margins": 2.619417190551758, + "rewards/rejected": -1.2703715562820435, + "step": 47 + }, + { + "epoch": 1.23, + "learning_rate": 0.0001846153846153846, + "logits/chosen": 1.2255234718322754, + "logits/rejected": 1.1879360675811768, + "logps/chosen": -533.389404296875, + "logps/rejected": -574.5704956054688, + "loss": 0.2814, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.4104804992675781, + "rewards/margins": 3.444915294647217, + "rewards/rejected": -2.0344350337982178, + "step": 48 + }, + { + "epoch": 1.25, + "learning_rate": 0.00018846153846153844, + "logits/chosen": 1.3353238105773926, + "logits/rejected": 1.133821964263916, + "logps/chosen": -516.5598754882812, + "logps/rejected": -498.43603515625, + "loss": 0.1799, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.9517850875854492, + "rewards/margins": 3.3812241554260254, + "rewards/rejected": -1.4294389486312866, + "step": 49 + }, + { + "epoch": 1.28, + "learning_rate": 0.0001923076923076923, + "logits/chosen": 1.4486721754074097, + "logits/rejected": 1.2504708766937256, + "logps/chosen": -577.9517822265625, + "logps/rejected": -578.573974609375, + "loss": 0.1502, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.9723610877990723, + "rewards/margins": 3.7397069931030273, + "rewards/rejected": -1.767345666885376, + "step": 50 + }, + { + "epoch": 1.31, + "learning_rate": 0.00019615384615384615, + "logits/chosen": 1.156247854232788, + "logits/rejected": 1.1714026927947998, + "logps/chosen": -544.3547973632812, + "logps/rejected": -576.8724365234375, + "loss": 0.2114, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.1724624633789062, + "rewards/margins": 2.6890792846679688, + "rewards/rejected": -1.5166168212890625, + "step": 51 + }, + { + "epoch": 1.33, + "learning_rate": 0.00019999999999999998, + "logits/chosen": 1.144045352935791, + "logits/rejected": 1.1430819034576416, + "logps/chosen": -508.1917724609375, + "logps/rejected": -617.259521484375, + "loss": 0.1589, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.5832880735397339, + "rewards/margins": 4.405728816986084, + "rewards/rejected": -2.8224408626556396, + "step": 52 + }, + { + "epoch": 1.36, + "learning_rate": 0.00020384615384615385, + "logits/chosen": 1.2338566780090332, + "logits/rejected": 1.1481688022613525, + "logps/chosen": -531.331298828125, + "logps/rejected": -501.3189697265625, + "loss": 0.1972, + "rewards/accuracies": 0.90625, + "rewards/chosen": 0.7195085287094116, + "rewards/margins": 3.0136146545410156, + "rewards/rejected": -2.2941062450408936, + "step": 53 + }, + { + "epoch": 1.38, + "learning_rate": 0.00020769230769230766, + "logits/chosen": 1.2266112565994263, + "logits/rejected": 1.192571997642517, + "logps/chosen": -549.5713500976562, + "logps/rejected": -617.6051025390625, + "loss": 0.2518, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.4716801345348358, + "rewards/margins": 4.073496341705322, + "rewards/rejected": -3.6018166542053223, + "step": 54 + }, + { + "epoch": 1.41, + "learning_rate": 0.00021153846153846152, + "logits/chosen": 1.1858152151107788, + "logits/rejected": 1.1164907217025757, + "logps/chosen": -556.048583984375, + "logps/rejected": -599.2432250976562, + "loss": 0.0601, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6191356182098389, + "rewards/margins": 4.675748825073242, + "rewards/rejected": -4.056613445281982, + "step": 55 + }, + { + "epoch": 1.43, + "learning_rate": 0.00021538461538461536, + "logits/chosen": 1.3085862398147583, + "logits/rejected": 1.105547547340393, + "logps/chosen": -603.694580078125, + "logps/rejected": -638.7554931640625, + "loss": 0.1046, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.4260401725769043, + "rewards/margins": 4.424009799957275, + "rewards/rejected": -3.997969150543213, + "step": 56 + }, + { + "epoch": 1.46, + "learning_rate": 0.0002192307692307692, + "logits/chosen": 1.0858182907104492, + "logits/rejected": 1.0118762254714966, + "logps/chosen": -547.671875, + "logps/rejected": -617.9951782226562, + "loss": 0.222, + "rewards/accuracies": 0.90625, + "rewards/chosen": 0.33517026901245117, + "rewards/margins": 3.591820001602173, + "rewards/rejected": -3.2566497325897217, + "step": 57 + }, + { + "epoch": 1.48, + "learning_rate": 0.00022307692307692306, + "logits/chosen": 1.0582268238067627, + "logits/rejected": 1.1294262409210205, + "logps/chosen": -482.9873962402344, + "logps/rejected": -656.79736328125, + "loss": 0.0781, + "rewards/accuracies": 0.96875, + "rewards/chosen": 0.6922726035118103, + "rewards/margins": 4.7888617515563965, + "rewards/rejected": -4.0965895652771, + "step": 58 + }, + { + "epoch": 1.51, + "learning_rate": 0.0002269230769230769, + "logits/chosen": 1.1849119663238525, + "logits/rejected": 0.989042341709137, + "logps/chosen": -579.6500854492188, + "logps/rejected": -521.362060546875, + "loss": 0.3419, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.1423124074935913, + "rewards/margins": 2.5491743087768555, + "rewards/rejected": -2.6914870738983154, + "step": 59 + }, + { + "epoch": 1.54, + "learning_rate": 0.00023076923076923076, + "logits/chosen": 1.2060493230819702, + "logits/rejected": 1.0908496379852295, + "logps/chosen": -503.88165283203125, + "logps/rejected": -486.8570251464844, + "loss": 0.2709, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.47895151376724243, + "rewards/margins": 3.0553998947143555, + "rewards/rejected": -2.576448917388916, + "step": 60 + }, + { + "epoch": 1.56, + "learning_rate": 0.0002346153846153846, + "logits/chosen": 1.331544041633606, + "logits/rejected": 1.2061303853988647, + "logps/chosen": -567.0886840820312, + "logps/rejected": -555.7452392578125, + "loss": 0.1176, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.0240767002105713, + "rewards/margins": 3.8347549438476562, + "rewards/rejected": -1.810678243637085, + "step": 61 + }, + { + "epoch": 1.59, + "learning_rate": 0.00023846153846153844, + "logits/chosen": 1.4309509992599487, + "logits/rejected": 1.2126150131225586, + "logps/chosen": -569.264892578125, + "logps/rejected": -555.525390625, + "loss": 0.1622, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.7101458311080933, + "rewards/margins": 3.121368885040283, + "rewards/rejected": -1.4112231731414795, + "step": 62 + }, + { + "epoch": 1.61, + "learning_rate": 0.0002423076923076923, + "logits/chosen": 1.4115931987762451, + "logits/rejected": 1.3143726587295532, + "logps/chosen": -567.4575805664062, + "logps/rejected": -551.8060302734375, + "loss": 0.1243, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.250972270965576, + "rewards/margins": 3.4197261333465576, + "rewards/rejected": -1.168753981590271, + "step": 63 + }, + { + "epoch": 1.64, + "learning_rate": 0.00024615384615384614, + "logits/chosen": 1.4023628234863281, + "logits/rejected": 1.4432225227355957, + "logps/chosen": -561.3869018554688, + "logps/rejected": -625.6553344726562, + "loss": 0.137, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7209982872009277, + "rewards/margins": 3.365004777908325, + "rewards/rejected": -0.6440060138702393, + "step": 64 + }, + { + "epoch": 1.66, + "learning_rate": 0.00025, + "logits/chosen": 1.403619647026062, + "logits/rejected": 1.3114700317382812, + "logps/chosen": -557.41796875, + "logps/rejected": -585.562744140625, + "loss": 0.0987, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.93491268157959, + "rewards/margins": 3.9586920738220215, + "rewards/rejected": -1.0237791538238525, + "step": 65 + }, + { + "epoch": 1.69, + "learning_rate": 0.0002538461538461538, + "logits/chosen": 1.2774848937988281, + "logits/rejected": 1.391822338104248, + "logps/chosen": -501.67236328125, + "logps/rejected": -607.6801147460938, + "loss": 0.1317, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.4067182540893555, + "rewards/margins": 3.5049643516540527, + "rewards/rejected": -1.0982458591461182, + "step": 66 + }, + { + "epoch": 1.72, + "learning_rate": 0.0002576923076923077, + "logits/chosen": 1.2345640659332275, + "logits/rejected": 1.3340271711349487, + "logps/chosen": -502.60589599609375, + "logps/rejected": -594.7681884765625, + "loss": 0.1882, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.6566812992095947, + "rewards/margins": 3.0219056606292725, + "rewards/rejected": -1.3652244806289673, + "step": 67 + }, + { + "epoch": 1.74, + "learning_rate": 0.00026153846153846154, + "logits/chosen": 1.3136622905731201, + "logits/rejected": 1.1759097576141357, + "logps/chosen": -554.1909790039062, + "logps/rejected": -477.70074462890625, + "loss": 0.1877, + "rewards/accuracies": 0.90625, + "rewards/chosen": 1.4338759183883667, + "rewards/margins": 3.288071393966675, + "rewards/rejected": -1.854195237159729, + "step": 68 + }, + { + "epoch": 1.77, + "learning_rate": 0.00026538461538461536, + "logits/chosen": 1.3607081174850464, + "logits/rejected": 1.173765778541565, + "logps/chosen": -581.3910522460938, + "logps/rejected": -505.4391784667969, + "loss": 0.1424, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.8081064224243164, + "rewards/margins": 4.335569858551025, + "rewards/rejected": -2.527463436126709, + "step": 69 + }, + { + "epoch": 1.79, + "learning_rate": 0.0002692307692307692, + "logits/chosen": 1.3691301345825195, + "logits/rejected": 1.3659199476242065, + "logps/chosen": -583.293212890625, + "logps/rejected": -593.8999633789062, + "loss": 0.1304, + "rewards/accuracies": 0.96875, + "rewards/chosen": 1.1818904876708984, + "rewards/margins": 3.6441099643707275, + "rewards/rejected": -2.46221923828125, + "step": 70 + }, + { + "epoch": 1.82, + "learning_rate": 0.00027307692307692303, + "logits/chosen": 1.1499935388565063, + "logits/rejected": 1.1766197681427002, + "logps/chosen": -482.293701171875, + "logps/rejected": -570.5851440429688, + "loss": 0.0857, + "rewards/accuracies": 0.96875, + "rewards/chosen": 1.165338158607483, + "rewards/margins": 3.8725168704986572, + "rewards/rejected": -2.7071785926818848, + "step": 71 + }, + { + "epoch": 1.84, + "learning_rate": 0.0002769230769230769, + "logits/chosen": 1.325207233428955, + "logits/rejected": 1.1687246561050415, + "logps/chosen": -571.1265869140625, + "logps/rejected": -600.61865234375, + "loss": 0.1814, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.56050443649292, + "rewards/margins": 3.7676329612731934, + "rewards/rejected": -2.2071282863616943, + "step": 72 + }, + { + "epoch": 1.87, + "learning_rate": 0.00028076923076923076, + "logits/chosen": 1.3450841903686523, + "logits/rejected": 1.151707410812378, + "logps/chosen": -538.774658203125, + "logps/rejected": -446.6949157714844, + "loss": 0.091, + "rewards/accuracies": 0.96875, + "rewards/chosen": 1.3523597717285156, + "rewards/margins": 3.761559247970581, + "rewards/rejected": -2.4091997146606445, + "step": 73 + }, + { + "epoch": 1.89, + "learning_rate": 0.00028461538461538457, + "logits/chosen": 1.3139710426330566, + "logits/rejected": 1.1975148916244507, + "logps/chosen": -544.6610107421875, + "logps/rejected": -459.76275634765625, + "loss": 0.2291, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.633323311805725, + "rewards/margins": 3.0829625129699707, + "rewards/rejected": -1.4496394395828247, + "step": 74 + }, + { + "epoch": 1.92, + "learning_rate": 0.00028846153846153843, + "logits/chosen": 1.4247934818267822, + "logits/rejected": 1.3758761882781982, + "logps/chosen": -518.5802612304688, + "logps/rejected": -527.0255737304688, + "loss": 0.1177, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.010237216949463, + "rewards/margins": 3.8918490409851074, + "rewards/rejected": -1.8816115856170654, + "step": 75 + }, + { + "epoch": 1.95, + "learning_rate": 0.0002923076923076923, + "logits/chosen": 1.3634833097457886, + "logits/rejected": 1.2164397239685059, + "logps/chosen": -534.68505859375, + "logps/rejected": -506.5450744628906, + "loss": 0.0595, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.536834239959717, + "rewards/margins": 3.897109031677246, + "rewards/rejected": -1.3602746725082397, + "step": 76 + }, + { + "epoch": 1.97, + "learning_rate": 0.00029615384615384616, + "logits/chosen": 1.3006478548049927, + "logits/rejected": 1.3856381177902222, + "logps/chosen": -469.55450439453125, + "logps/rejected": -642.709716796875, + "loss": 0.148, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.724798560142517, + "rewards/margins": 3.511629343032837, + "rewards/rejected": -1.7868304252624512, + "step": 77 + }, + { + "epoch": 2.0, + "learning_rate": 0.0003, + "logits/chosen": 1.328688621520996, + "logits/rejected": 1.1962082386016846, + "logps/chosen": -516.2116088867188, + "logps/rejected": -504.8653259277344, + "loss": 0.0937, + "rewards/accuracies": 0.96875, + "rewards/chosen": 1.6297885179519653, + "rewards/margins": 3.997544288635254, + "rewards/rejected": -2.36775541305542, + "step": 78 + }, + { + "epoch": 2.02, + "learning_rate": 0.00029957264957264953, + "logits/chosen": 1.3884762525558472, + "logits/rejected": 1.2696239948272705, + "logps/chosen": -575.4322509765625, + "logps/rejected": -583.3463745117188, + "loss": 0.0528, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6055470705032349, + "rewards/margins": 4.734403133392334, + "rewards/rejected": -3.1288557052612305, + "step": 79 + }, + { + "epoch": 2.05, + "learning_rate": 0.00029914529914529915, + "logits/chosen": 1.2306060791015625, + "logits/rejected": 1.201812982559204, + "logps/chosen": -526.1033935546875, + "logps/rejected": -625.164306640625, + "loss": 0.0742, + "rewards/accuracies": 0.96875, + "rewards/chosen": 0.5637297034263611, + "rewards/margins": 4.7755279541015625, + "rewards/rejected": -4.211798191070557, + "step": 80 + }, + { + "epoch": 2.07, + "learning_rate": 0.0002987179487179487, + "logits/chosen": 1.2625794410705566, + "logits/rejected": 1.1102485656738281, + "logps/chosen": -581.4782104492188, + "logps/rejected": -620.8692626953125, + "loss": 0.015, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.47795385122299194, + "rewards/margins": 5.5137505531311035, + "rewards/rejected": -5.035797119140625, + "step": 81 + }, + { + "epoch": 2.1, + "learning_rate": 0.00029829059829059826, + "logits/chosen": 1.193795919418335, + "logits/rejected": 1.13469660282135, + "logps/chosen": -534.1414794921875, + "logps/rejected": -540.5661010742188, + "loss": 0.0537, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.0871587023139, + "rewards/margins": 4.834710121154785, + "rewards/rejected": -4.747550964355469, + "step": 82 + }, + { + "epoch": 2.12, + "learning_rate": 0.0002978632478632478, + "logits/chosen": 1.245851755142212, + "logits/rejected": 1.0414592027664185, + "logps/chosen": -603.68212890625, + "logps/rejected": -521.8074340820312, + "loss": 0.046, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3754722476005554, + "rewards/margins": 4.623739719390869, + "rewards/rejected": -4.248267650604248, + "step": 83 + }, + { + "epoch": 2.15, + "learning_rate": 0.00029743589743589743, + "logits/chosen": 1.0934433937072754, + "logits/rejected": 1.0990025997161865, + "logps/chosen": -511.38897705078125, + "logps/rejected": -607.8187255859375, + "loss": 0.0567, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.49227023124694824, + "rewards/margins": 5.553871154785156, + "rewards/rejected": -6.046142101287842, + "step": 84 + }, + { + "epoch": 2.18, + "learning_rate": 0.000297008547008547, + "logits/chosen": 1.1143027544021606, + "logits/rejected": 1.1277693510055542, + "logps/chosen": -501.1644287109375, + "logps/rejected": -627.3696899414062, + "loss": 0.0149, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9094219207763672, + "rewards/margins": 5.872971534729004, + "rewards/rejected": -4.9635491371154785, + "step": 85 + }, + { + "epoch": 2.2, + "learning_rate": 0.00029658119658119655, + "logits/chosen": 1.2900817394256592, + "logits/rejected": 1.258035659790039, + "logps/chosen": -482.3966369628906, + "logps/rejected": -613.1928100585938, + "loss": 0.006, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0523629188537598, + "rewards/margins": 6.690260887145996, + "rewards/rejected": -5.6378984451293945, + "step": 86 + }, + { + "epoch": 2.23, + "learning_rate": 0.00029615384615384616, + "logits/chosen": 1.2223701477050781, + "logits/rejected": 1.1854349374771118, + "logps/chosen": -504.41375732421875, + "logps/rejected": -530.4273071289062, + "loss": 0.1105, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.9830568432807922, + "rewards/margins": 5.67386531829834, + "rewards/rejected": -4.6908087730407715, + "step": 87 + }, + { + "epoch": 2.25, + "learning_rate": 0.0002957264957264957, + "logits/chosen": 1.3008583784103394, + "logits/rejected": 1.2168331146240234, + "logps/chosen": -580.7012939453125, + "logps/rejected": -591.1567993164062, + "loss": 0.028, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.010232925415039, + "rewards/margins": 5.921438694000244, + "rewards/rejected": -4.911205768585205, + "step": 88 + }, + { + "epoch": 2.28, + "learning_rate": 0.0002952991452991453, + "logits/chosen": 1.4120073318481445, + "logits/rejected": 1.3543351888656616, + "logps/chosen": -551.34130859375, + "logps/rejected": -607.3863525390625, + "loss": 0.0082, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7222862243652344, + "rewards/margins": 6.246278762817383, + "rewards/rejected": -4.523993492126465, + "step": 89 + }, + { + "epoch": 2.3, + "learning_rate": 0.00029487179487179484, + "logits/chosen": 1.3433583974838257, + "logits/rejected": 1.2706623077392578, + "logps/chosen": -550.4794921875, + "logps/rejected": -574.7581787109375, + "loss": 0.0251, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.3056535720825195, + "rewards/margins": 6.369759559631348, + "rewards/rejected": -4.064105033874512, + "step": 90 + }, + { + "epoch": 2.33, + "learning_rate": 0.00029444444444444445, + "logits/chosen": 1.4810067415237427, + "logits/rejected": 1.3233308792114258, + "logps/chosen": -586.744384765625, + "logps/rejected": -591.3648071289062, + "loss": 0.0176, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.4182064533233643, + "rewards/margins": 6.562655448913574, + "rewards/rejected": -4.144449710845947, + "step": 91 + }, + { + "epoch": 2.36, + "learning_rate": 0.000294017094017094, + "logits/chosen": 1.4654786586761475, + "logits/rejected": 1.3706129789352417, + "logps/chosen": -496.12701416015625, + "logps/rejected": -567.374755859375, + "loss": 0.0221, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9136104583740234, + "rewards/margins": 5.71716833114624, + "rewards/rejected": -3.8035576343536377, + "step": 92 + }, + { + "epoch": 2.38, + "learning_rate": 0.00029358974358974357, + "logits/chosen": 1.4996682405471802, + "logits/rejected": 1.4585434198379517, + "logps/chosen": -568.8134155273438, + "logps/rejected": -659.0804443359375, + "loss": 0.0161, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9488861560821533, + "rewards/margins": 6.1611528396606445, + "rewards/rejected": -4.2122673988342285, + "step": 93 + }, + { + "epoch": 2.41, + "learning_rate": 0.00029316239316239313, + "logits/chosen": 1.433556318283081, + "logits/rejected": 1.4805834293365479, + "logps/chosen": -551.3663940429688, + "logps/rejected": -583.002685546875, + "loss": 0.039, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.608030080795288, + "rewards/margins": 5.825406551361084, + "rewards/rejected": -3.217377185821533, + "step": 94 + }, + { + "epoch": 2.43, + "learning_rate": 0.0002927350427350427, + "logits/chosen": 1.5423190593719482, + "logits/rejected": 1.3645411729812622, + "logps/chosen": -541.7529296875, + "logps/rejected": -540.1168212890625, + "loss": 0.0073, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.6581199169158936, + "rewards/margins": 6.796147346496582, + "rewards/rejected": -4.138028144836426, + "step": 95 + }, + { + "epoch": 2.46, + "learning_rate": 0.0002923076923076923, + "logits/chosen": 1.3319764137268066, + "logits/rejected": 1.3734033107757568, + "logps/chosen": -505.6252136230469, + "logps/rejected": -595.2819213867188, + "loss": 0.0207, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1945226192474365, + "rewards/margins": 5.589131832122803, + "rewards/rejected": -4.3946099281311035, + "step": 96 + }, + { + "epoch": 2.48, + "learning_rate": 0.00029188034188034186, + "logits/chosen": 1.326178789138794, + "logits/rejected": 1.4406118392944336, + "logps/chosen": -514.8160400390625, + "logps/rejected": -575.283447265625, + "loss": 0.0135, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6229735612869263, + "rewards/margins": 6.596070766448975, + "rewards/rejected": -4.973097324371338, + "step": 97 + }, + { + "epoch": 2.51, + "learning_rate": 0.0002914529914529914, + "logits/chosen": 1.408813714981079, + "logits/rejected": 1.3960859775543213, + "logps/chosen": -542.9385375976562, + "logps/rejected": -647.4051513671875, + "loss": 0.0683, + "rewards/accuracies": 0.96875, + "rewards/chosen": 0.39411336183547974, + "rewards/margins": 6.164831161499023, + "rewards/rejected": -5.770717620849609, + "step": 98 + }, + { + "epoch": 2.53, + "learning_rate": 0.000291025641025641, + "logits/chosen": 1.2414624691009521, + "logits/rejected": 1.2821427583694458, + "logps/chosen": -500.63568115234375, + "logps/rejected": -617.734375, + "loss": 0.1346, + "rewards/accuracies": 0.96875, + "rewards/chosen": 1.0469615459442139, + "rewards/margins": 6.8481831550598145, + "rewards/rejected": -5.8012213706970215, + "step": 99 + }, + { + "epoch": 2.56, + "learning_rate": 0.0002905982905982906, + "logits/chosen": 1.3173812627792358, + "logits/rejected": 1.1461554765701294, + "logps/chosen": -596.7745971679688, + "logps/rejected": -591.0558471679688, + "loss": 0.0567, + "rewards/accuracies": 0.96875, + "rewards/chosen": 0.7747964262962341, + "rewards/margins": 5.85588264465332, + "rewards/rejected": -5.081086158752441, + "step": 100 + }, + { + "epoch": 2.59, + "learning_rate": 0.00029017094017094015, + "logits/chosen": 1.1509640216827393, + "logits/rejected": 1.1098980903625488, + "logps/chosen": -474.10675048828125, + "logps/rejected": -535.8497924804688, + "loss": 0.031, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1605219841003418, + "rewards/margins": 6.742035865783691, + "rewards/rejected": -5.58151388168335, + "step": 101 + }, + { + "epoch": 2.61, + "learning_rate": 0.0002897435897435897, + "logits/chosen": 1.1271547079086304, + "logits/rejected": 0.9869892001152039, + "logps/chosen": -542.78662109375, + "logps/rejected": -566.442626953125, + "loss": 0.2008, + "rewards/accuracies": 0.96875, + "rewards/chosen": 0.9951988458633423, + "rewards/margins": 6.587618827819824, + "rewards/rejected": -5.5924201011657715, + "step": 102 + }, + { + "epoch": 2.64, + "learning_rate": 0.00028931623931623926, + "logits/chosen": 1.2000806331634521, + "logits/rejected": 1.0094119310379028, + "logps/chosen": -550.8510131835938, + "logps/rejected": -531.4238891601562, + "loss": 0.0239, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.3359670639038086, + "rewards/margins": 6.495780944824219, + "rewards/rejected": -5.159814834594727, + "step": 103 + }, + { + "epoch": 2.66, + "learning_rate": 0.0002888888888888888, + "logits/chosen": 1.102707862854004, + "logits/rejected": 1.2245404720306396, + "logps/chosen": -528.5265502929688, + "logps/rejected": -665.08544921875, + "loss": 0.0174, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7339667081832886, + "rewards/margins": 7.117589473724365, + "rewards/rejected": -6.383623123168945, + "step": 104 + }, + { + "epoch": 2.69, + "learning_rate": 0.00028846153846153843, + "logits/chosen": 1.1308355331420898, + "logits/rejected": 0.9417912364006042, + "logps/chosen": -553.8656005859375, + "logps/rejected": -555.5968627929688, + "loss": 0.0185, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9540010690689087, + "rewards/margins": 6.364682197570801, + "rewards/rejected": -4.410680770874023, + "step": 105 + }, + { + "epoch": 2.71, + "learning_rate": 0.000288034188034188, + "logits/chosen": 1.2123243808746338, + "logits/rejected": 0.9968570470809937, + "logps/chosen": -567.16748046875, + "logps/rejected": -521.2568359375, + "loss": 0.0109, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.0697741508483887, + "rewards/margins": 6.565426349639893, + "rewards/rejected": -4.495651721954346, + "step": 106 + }, + { + "epoch": 2.74, + "learning_rate": 0.00028760683760683755, + "logits/chosen": 1.1736996173858643, + "logits/rejected": 1.147781252861023, + "logps/chosen": -542.6943359375, + "logps/rejected": -622.96826171875, + "loss": 0.0085, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.328216075897217, + "rewards/margins": 7.282103538513184, + "rewards/rejected": -4.953887939453125, + "step": 107 + }, + { + "epoch": 2.76, + "learning_rate": 0.00028717948717948716, + "logits/chosen": 1.060608983039856, + "logits/rejected": 1.2086546421051025, + "logps/chosen": -499.45989990234375, + "logps/rejected": -617.087646484375, + "loss": 0.0268, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.5071005821228027, + "rewards/margins": 5.79173469543457, + "rewards/rejected": -3.2846338748931885, + "step": 108 + }, + { + "epoch": 2.79, + "learning_rate": 0.0002867521367521367, + "logits/chosen": 1.180185079574585, + "logits/rejected": 0.9707103967666626, + "logps/chosen": -530.9793701171875, + "logps/rejected": -531.7672119140625, + "loss": 0.0172, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.693030595779419, + "rewards/margins": 5.776254653930664, + "rewards/rejected": -3.083223819732666, + "step": 109 + }, + { + "epoch": 2.82, + "learning_rate": 0.0002863247863247863, + "logits/chosen": 1.2219749689102173, + "logits/rejected": 1.1085822582244873, + "logps/chosen": -579.6038818359375, + "logps/rejected": -576.358642578125, + "loss": 0.0091, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.139183521270752, + "rewards/margins": 5.98101806640625, + "rewards/rejected": -2.84183406829834, + "step": 110 + }, + { + "epoch": 2.84, + "learning_rate": 0.00028589743589743584, + "logits/chosen": 1.190647006034851, + "logits/rejected": 1.1908663511276245, + "logps/chosen": -514.8892211914062, + "logps/rejected": -611.48046875, + "loss": 0.0565, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.977041006088257, + "rewards/margins": 6.328646659851074, + "rewards/rejected": -3.3516054153442383, + "step": 111 + }, + { + "epoch": 2.87, + "learning_rate": 0.00028547008547008545, + "logits/chosen": 1.1982405185699463, + "logits/rejected": 1.1012368202209473, + "logps/chosen": -496.9002685546875, + "logps/rejected": -544.1046142578125, + "loss": 0.0649, + "rewards/accuracies": 0.96875, + "rewards/chosen": 3.5147013664245605, + "rewards/margins": 6.30265998840332, + "rewards/rejected": -2.787958860397339, + "step": 112 + }, + { + "epoch": 2.89, + "learning_rate": 0.000285042735042735, + "logits/chosen": 1.1536628007888794, + "logits/rejected": 1.0870661735534668, + "logps/chosen": -475.55023193359375, + "logps/rejected": -584.7400512695312, + "loss": 0.0315, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8786869049072266, + "rewards/margins": 6.310683250427246, + "rewards/rejected": -3.4319963455200195, + "step": 113 + }, + { + "epoch": 2.92, + "learning_rate": 0.00028461538461538457, + "logits/chosen": 1.129310965538025, + "logits/rejected": 1.0381569862365723, + "logps/chosen": -506.8164978027344, + "logps/rejected": -490.2034606933594, + "loss": 0.0435, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.066460609436035, + "rewards/margins": 5.390075206756592, + "rewards/rejected": -2.3236145973205566, + "step": 114 + }, + { + "epoch": 2.94, + "learning_rate": 0.0002841880341880342, + "logits/chosen": 1.2719249725341797, + "logits/rejected": 1.0853713750839233, + "logps/chosen": -580.2399291992188, + "logps/rejected": -545.1849975585938, + "loss": 0.0081, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0230274200439453, + "rewards/margins": 7.315037727355957, + "rewards/rejected": -4.292009353637695, + "step": 115 + }, + { + "epoch": 2.97, + "learning_rate": 0.00028376068376068374, + "logits/chosen": 1.2354512214660645, + "logits/rejected": 1.1801047325134277, + "logps/chosen": -482.8843994140625, + "logps/rejected": -646.3005981445312, + "loss": 0.0077, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2153114080429077, + "rewards/margins": 6.708934307098389, + "rewards/rejected": -5.493622779846191, + "step": 116 + }, + { + "epoch": 3.0, + "learning_rate": 0.0002833333333333333, + "logits/chosen": 0.9910479784011841, + "logits/rejected": 1.020785927772522, + "logps/chosen": -481.9186706542969, + "logps/rejected": -603.8145141601562, + "loss": 0.0455, + "rewards/accuracies": 0.96875, + "rewards/chosen": 0.37189701199531555, + "rewards/margins": 6.756865978240967, + "rewards/rejected": -6.384969711303711, + "step": 117 + }, + { + "epoch": 3.02, + "learning_rate": 0.0002829059829059829, + "logits/chosen": 1.066985845565796, + "logits/rejected": 1.0346630811691284, + "logps/chosen": -550.504638671875, + "logps/rejected": -601.9063720703125, + "loss": 0.0125, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5437355041503906, + "rewards/margins": 7.08186674118042, + "rewards/rejected": -6.538131237030029, + "step": 118 + }, + { + "epoch": 3.05, + "learning_rate": 0.00028247863247863247, + "logits/chosen": 1.1769685745239258, + "logits/rejected": 1.0711925029754639, + "logps/chosen": -572.137451171875, + "logps/rejected": -591.6756591796875, + "loss": 0.0185, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4036594033241272, + "rewards/margins": 7.711120128631592, + "rewards/rejected": -7.307460784912109, + "step": 119 + }, + { + "epoch": 3.07, + "learning_rate": 0.00028205128205128203, + "logits/chosen": 1.1407585144042969, + "logits/rejected": 1.0219597816467285, + "logps/chosen": -486.6500244140625, + "logps/rejected": -650.4639282226562, + "loss": 0.0075, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8903399705886841, + "rewards/margins": 8.448041915893555, + "rewards/rejected": -7.5577006340026855, + "step": 120 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002816239316239316, + "logits/chosen": 1.146453619003296, + "logits/rejected": 1.0628845691680908, + "logps/chosen": -531.998046875, + "logps/rejected": -557.2601928710938, + "loss": 0.0032, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.3119817972183228, + "rewards/margins": 7.6520514488220215, + "rewards/rejected": -6.340068817138672, + "step": 121 + }, + { + "epoch": 3.12, + "learning_rate": 0.0002811965811965812, + "logits/chosen": 1.0393480062484741, + "logits/rejected": 1.0280386209487915, + "logps/chosen": -470.7844543457031, + "logps/rejected": -552.33935546875, + "loss": 0.011, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.4702866077423096, + "rewards/margins": 7.459576606750488, + "rewards/rejected": -5.9892897605896, + "step": 122 + }, + { + "epoch": 3.15, + "learning_rate": 0.00028076923076923076, + "logits/chosen": 1.1644623279571533, + "logits/rejected": 1.0770840644836426, + "logps/chosen": -546.8715209960938, + "logps/rejected": -534.8888549804688, + "loss": 0.0046, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.770728349685669, + "rewards/margins": 7.339654922485352, + "rewards/rejected": -5.5689263343811035, + "step": 123 + }, + { + "epoch": 3.17, + "learning_rate": 0.0002803418803418803, + "logits/chosen": 1.0946919918060303, + "logits/rejected": 1.1111879348754883, + "logps/chosen": -448.84698486328125, + "logps/rejected": -623.4598388671875, + "loss": 0.0036, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.276505470275879, + "rewards/margins": 7.733612060546875, + "rewards/rejected": -5.457107067108154, + "step": 124 + }, + { + "epoch": 3.2, + "learning_rate": 0.00027991452991452993, + "logits/chosen": 1.083165168762207, + "logits/rejected": 1.1361708641052246, + "logps/chosen": -491.6983947753906, + "logps/rejected": -597.7027587890625, + "loss": 0.0045, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.072481632232666, + "rewards/margins": 8.625251770019531, + "rewards/rejected": -6.552770137786865, + "step": 125 + }, + { + "epoch": 3.23, + "learning_rate": 0.0002794871794871795, + "logits/chosen": 1.2724071741104126, + "logits/rejected": 1.118057370185852, + "logps/chosen": -525.2489013671875, + "logps/rejected": -523.6296997070312, + "loss": 0.0102, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.201770544052124, + "rewards/margins": 6.9069504737854, + "rewards/rejected": -4.7051801681518555, + "step": 126 + }, + { + "epoch": 3.25, + "learning_rate": 0.00027905982905982905, + "logits/chosen": 1.2534475326538086, + "logits/rejected": 1.2043638229370117, + "logps/chosen": -574.5486450195312, + "logps/rejected": -598.8119506835938, + "loss": 0.0061, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.390216112136841, + "rewards/margins": 8.325881004333496, + "rewards/rejected": -5.935665130615234, + "step": 127 + }, + { + "epoch": 3.28, + "learning_rate": 0.0002786324786324786, + "logits/chosen": 1.2600340843200684, + "logits/rejected": 1.2670692205429077, + "logps/chosen": -539.1239624023438, + "logps/rejected": -605.8324584960938, + "loss": 0.0025, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.585294485092163, + "rewards/margins": 8.082437515258789, + "rewards/rejected": -5.497143268585205, + "step": 128 + }, + { + "epoch": 3.3, + "learning_rate": 0.00027820512820512816, + "logits/chosen": 1.2851054668426514, + "logits/rejected": 1.1849486827850342, + "logps/chosen": -571.027587890625, + "logps/rejected": -601.7526245117188, + "loss": 0.0025, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7168142795562744, + "rewards/margins": 7.749199867248535, + "rewards/rejected": -5.03238582611084, + "step": 129 + }, + { + "epoch": 3.33, + "learning_rate": 0.0002777777777777778, + "logits/chosen": 1.3977611064910889, + "logits/rejected": 1.210925817489624, + "logps/chosen": -533.6973266601562, + "logps/rejected": -598.4097900390625, + "loss": 0.0021, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.4912829399108887, + "rewards/margins": 8.09334945678711, + "rewards/rejected": -5.602066516876221, + "step": 130 + }, + { + "epoch": 3.35, + "learning_rate": 0.00027735042735042734, + "logits/chosen": 1.3112545013427734, + "logits/rejected": 1.2536931037902832, + "logps/chosen": -516.1342163085938, + "logps/rejected": -645.7244873046875, + "loss": 0.0035, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.322323799133301, + "rewards/margins": 8.349126815795898, + "rewards/rejected": -6.026803493499756, + "step": 131 + }, + { + "epoch": 3.38, + "learning_rate": 0.0002769230769230769, + "logits/chosen": 1.3068538904190063, + "logits/rejected": 1.3506109714508057, + "logps/chosen": -503.5260925292969, + "logps/rejected": -658.4425659179688, + "loss": 0.0067, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.650004506111145, + "rewards/margins": 7.3509368896484375, + "rewards/rejected": -5.700932025909424, + "step": 132 + }, + { + "epoch": 3.4, + "learning_rate": 0.00027649572649572645, + "logits/chosen": 1.336862325668335, + "logits/rejected": 1.2281874418258667, + "logps/chosen": -541.9310302734375, + "logps/rejected": -600.86474609375, + "loss": 0.0064, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8469295501708984, + "rewards/margins": 7.918344974517822, + "rewards/rejected": -6.071415901184082, + "step": 133 + }, + { + "epoch": 3.43, + "learning_rate": 0.00027606837606837607, + "logits/chosen": 1.3074719905853271, + "logits/rejected": 1.3130128383636475, + "logps/chosen": -484.4212646484375, + "logps/rejected": -639.7544555664062, + "loss": 0.0861, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.4394211769104004, + "rewards/margins": 7.3523383140563965, + "rewards/rejected": -4.912917613983154, + "step": 134 + }, + { + "epoch": 3.46, + "learning_rate": 0.0002756410256410256, + "logits/chosen": 1.3250826597213745, + "logits/rejected": 1.2685434818267822, + "logps/chosen": -496.5311279296875, + "logps/rejected": -562.654541015625, + "loss": 0.09, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.504718065261841, + "rewards/margins": 7.967668056488037, + "rewards/rejected": -5.462949752807617, + "step": 135 + }, + { + "epoch": 3.48, + "learning_rate": 0.0002752136752136752, + "logits/chosen": 1.4271235466003418, + "logits/rejected": 1.2630449533462524, + "logps/chosen": -561.2323608398438, + "logps/rejected": -642.5577392578125, + "loss": 0.0043, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.4116668701171875, + "rewards/margins": 8.215126037597656, + "rewards/rejected": -5.803459167480469, + "step": 136 + }, + { + "epoch": 3.51, + "learning_rate": 0.00027478632478632474, + "logits/chosen": 1.3756340742111206, + "logits/rejected": 1.4128466844558716, + "logps/chosen": -559.175048828125, + "logps/rejected": -650.5951538085938, + "loss": 0.0045, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0436666011810303, + "rewards/margins": 7.851646900177002, + "rewards/rejected": -4.807980537414551, + "step": 137 + }, + { + "epoch": 3.53, + "learning_rate": 0.0002743589743589743, + "logits/chosen": 1.3762015104293823, + "logits/rejected": 1.2889195680618286, + "logps/chosen": -538.1447143554688, + "logps/rejected": -631.197998046875, + "loss": 0.0058, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8010246753692627, + "rewards/margins": 7.74313497543335, + "rewards/rejected": -4.942111015319824, + "step": 138 + }, + { + "epoch": 3.56, + "learning_rate": 0.0002739316239316239, + "logits/chosen": 1.4248151779174805, + "logits/rejected": 1.335301399230957, + "logps/chosen": -473.7148132324219, + "logps/rejected": -559.2073974609375, + "loss": 0.003, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2720112800598145, + "rewards/margins": 7.730058670043945, + "rewards/rejected": -4.458047389984131, + "step": 139 + }, + { + "epoch": 3.58, + "learning_rate": 0.00027350427350427347, + "logits/chosen": 1.3754342794418335, + "logits/rejected": 1.3729346990585327, + "logps/chosen": -519.9334106445312, + "logps/rejected": -599.0367431640625, + "loss": 0.0136, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4481189250946045, + "rewards/margins": 8.235373497009277, + "rewards/rejected": -4.787254810333252, + "step": 140 + }, + { + "epoch": 3.61, + "learning_rate": 0.00027307692307692303, + "logits/chosen": 1.3112382888793945, + "logits/rejected": 1.2730636596679688, + "logps/chosen": -535.9412231445312, + "logps/rejected": -491.223388671875, + "loss": 0.0156, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.4259979724884033, + "rewards/margins": 7.016923904418945, + "rewards/rejected": -4.590925693511963, + "step": 141 + }, + { + "epoch": 3.64, + "learning_rate": 0.0002726495726495726, + "logits/chosen": 1.4348691701889038, + "logits/rejected": 1.2340961694717407, + "logps/chosen": -533.677978515625, + "logps/rejected": -525.3228149414062, + "loss": 0.0146, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7481579780578613, + "rewards/margins": 7.346179008483887, + "rewards/rejected": -4.598021030426025, + "step": 142 + }, + { + "epoch": 3.66, + "learning_rate": 0.0002722222222222222, + "logits/chosen": 1.4624712467193604, + "logits/rejected": 1.4238498210906982, + "logps/chosen": -521.18310546875, + "logps/rejected": -628.2357788085938, + "loss": 0.0103, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.531442880630493, + "rewards/margins": 7.304967880249023, + "rewards/rejected": -4.773524761199951, + "step": 143 + }, + { + "epoch": 3.69, + "learning_rate": 0.00027179487179487176, + "logits/chosen": 1.3994379043579102, + "logits/rejected": 1.3517390489578247, + "logps/chosen": -487.254150390625, + "logps/rejected": -562.578369140625, + "loss": 0.0026, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.351099729537964, + "rewards/margins": 8.097498893737793, + "rewards/rejected": -4.746399402618408, + "step": 144 + }, + { + "epoch": 3.71, + "learning_rate": 0.0002713675213675213, + "logits/chosen": 1.4121302366256714, + "logits/rejected": 1.4421744346618652, + "logps/chosen": -520.80322265625, + "logps/rejected": -645.4122314453125, + "loss": 0.0025, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.2977559566497803, + "rewards/margins": 7.4320292472839355, + "rewards/rejected": -5.134273529052734, + "step": 145 + }, + { + "epoch": 3.74, + "learning_rate": 0.00027094017094017093, + "logits/chosen": 1.4837563037872314, + "logits/rejected": 1.379111886024475, + "logps/chosen": -601.663818359375, + "logps/rejected": -564.5067138671875, + "loss": 0.1342, + "rewards/accuracies": 0.96875, + "rewards/chosen": 1.5174392461776733, + "rewards/margins": 6.973749160766602, + "rewards/rejected": -5.456309795379639, + "step": 146 + }, + { + "epoch": 3.76, + "learning_rate": 0.0002705128205128205, + "logits/chosen": 1.3307445049285889, + "logits/rejected": 1.201188564300537, + "logps/chosen": -529.9027099609375, + "logps/rejected": -573.0830078125, + "loss": 0.0042, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5938829183578491, + "rewards/margins": 8.63363265991211, + "rewards/rejected": -7.039750099182129, + "step": 147 + }, + { + "epoch": 3.79, + "learning_rate": 0.00027008547008547005, + "logits/chosen": 1.2792227268218994, + "logits/rejected": 1.2923702001571655, + "logps/chosen": -547.6593017578125, + "logps/rejected": -678.7649536132812, + "loss": 0.0035, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6561351418495178, + "rewards/margins": 6.722534656524658, + "rewards/rejected": -6.066399574279785, + "step": 148 + }, + { + "epoch": 3.81, + "learning_rate": 0.0002696581196581196, + "logits/chosen": 1.4008458852767944, + "logits/rejected": 1.1927311420440674, + "logps/chosen": -628.3024291992188, + "logps/rejected": -578.4089965820312, + "loss": 0.0052, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8446162939071655, + "rewards/margins": 8.044957160949707, + "rewards/rejected": -7.200340747833252, + "step": 149 + }, + { + "epoch": 3.84, + "learning_rate": 0.0002692307692307692, + "logits/chosen": 1.1845999956130981, + "logits/rejected": 1.0530712604522705, + "logps/chosen": -499.4281005859375, + "logps/rejected": -573.0311889648438, + "loss": 0.0033, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9514065980911255, + "rewards/margins": 8.918203353881836, + "rewards/rejected": -7.966796875, + "step": 150 + }, + { + "epoch": 3.87, + "learning_rate": 0.0002688034188034188, + "logits/chosen": 1.1802606582641602, + "logits/rejected": 1.1234092712402344, + "logps/chosen": -572.6054077148438, + "logps/rejected": -626.6286010742188, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8314933180809021, + "rewards/margins": 9.328904151916504, + "rewards/rejected": -8.497410774230957, + "step": 151 + }, + { + "epoch": 3.89, + "learning_rate": 0.00026837606837606834, + "logits/chosen": 1.1486611366271973, + "logits/rejected": 1.0562578439712524, + "logps/chosen": -509.203125, + "logps/rejected": -584.0765991210938, + "loss": 0.003, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5767253637313843, + "rewards/margins": 8.173746109008789, + "rewards/rejected": -7.597021102905273, + "step": 152 + }, + { + "epoch": 3.92, + "learning_rate": 0.00026794871794871795, + "logits/chosen": 1.2486861944198608, + "logits/rejected": 1.123504400253296, + "logps/chosen": -586.3121337890625, + "logps/rejected": -666.2870483398438, + "loss": 0.0048, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.11837495118379593, + "rewards/margins": 8.070638656616211, + "rewards/rejected": -7.952263355255127, + "step": 153 + }, + { + "epoch": 3.94, + "learning_rate": 0.0002675213675213675, + "logits/chosen": 1.0615603923797607, + "logits/rejected": 1.0381104946136475, + "logps/chosen": -477.157470703125, + "logps/rejected": -577.363525390625, + "loss": 0.0042, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2932237982749939, + "rewards/margins": 8.428733825683594, + "rewards/rejected": -8.13551139831543, + "step": 154 + }, + { + "epoch": 3.97, + "learning_rate": 0.00026709401709401707, + "logits/chosen": 1.0924714803695679, + "logits/rejected": 1.0560393333435059, + "logps/chosen": -589.655517578125, + "logps/rejected": -621.06201171875, + "loss": 0.0741, + "rewards/accuracies": 0.96875, + "rewards/chosen": -0.8137645125389099, + "rewards/margins": 8.433317184448242, + "rewards/rejected": -9.247081756591797, + "step": 155 + }, + { + "epoch": 3.99, + "learning_rate": 0.0002666666666666666, + "logits/chosen": 1.1305707693099976, + "logits/rejected": 1.071329951286316, + "logps/chosen": -582.5031127929688, + "logps/rejected": -615.1607666015625, + "loss": 0.0036, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6762839555740356, + "rewards/margins": 8.447378158569336, + "rewards/rejected": -7.77109432220459, + "step": 156 + }, + { + "epoch": 4.02, + "learning_rate": 0.00026623931623931624, + "logits/chosen": 1.101088285446167, + "logits/rejected": 1.0743204355239868, + "logps/chosen": -503.9051208496094, + "logps/rejected": -705.6373291015625, + "loss": 0.0012, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.4264540672302246, + "rewards/margins": 9.59773063659668, + "rewards/rejected": -8.171276092529297, + "step": 157 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002658119658119658, + "logits/chosen": 1.021052598953247, + "logits/rejected": 1.0671635866165161, + "logps/chosen": -466.4878845214844, + "logps/rejected": -629.9095458984375, + "loss": 0.0012, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2989165782928467, + "rewards/margins": 9.68484878540039, + "rewards/rejected": -8.385932922363281, + "step": 158 + }, + { + "epoch": 4.07, + "learning_rate": 0.00026538461538461536, + "logits/chosen": 1.1435658931732178, + "logits/rejected": 1.1837159395217896, + "logps/chosen": -534.5050048828125, + "logps/rejected": -611.9518432617188, + "loss": 0.0038, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.3590706586837769, + "rewards/margins": 8.476469039916992, + "rewards/rejected": -7.117398262023926, + "step": 159 + }, + { + "epoch": 4.1, + "learning_rate": 0.00026495726495726497, + "logits/chosen": 1.146314263343811, + "logits/rejected": 1.1262887716293335, + "logps/chosen": -476.71453857421875, + "logps/rejected": -590.7471923828125, + "loss": 0.0145, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9563323259353638, + "rewards/margins": 9.145315170288086, + "rewards/rejected": -7.188984394073486, + "step": 160 + }, + { + "epoch": 4.12, + "learning_rate": 0.0002645299145299145, + "logits/chosen": 1.16382896900177, + "logits/rejected": 1.1415654420852661, + "logps/chosen": -532.4939575195312, + "logps/rejected": -640.6329345703125, + "loss": 0.0023, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.2008843421936035, + "rewards/margins": 8.335280418395996, + "rewards/rejected": -6.134396076202393, + "step": 161 + }, + { + "epoch": 4.15, + "learning_rate": 0.0002641025641025641, + "logits/chosen": 1.0377854108810425, + "logits/rejected": 1.0735015869140625, + "logps/chosen": -482.3515625, + "logps/rejected": -620.0985717773438, + "loss": 0.0009, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8429818153381348, + "rewards/margins": 8.701183319091797, + "rewards/rejected": -6.8582000732421875, + "step": 162 + }, + { + "epoch": 4.17, + "learning_rate": 0.00026367521367521364, + "logits/chosen": 1.2598010301589966, + "logits/rejected": 1.2114201784133911, + "logps/chosen": -497.3212890625, + "logps/rejected": -609.7880249023438, + "loss": 0.0028, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7050861120224, + "rewards/margins": 8.18321418762207, + "rewards/rejected": -6.478128433227539, + "step": 163 + }, + { + "epoch": 4.2, + "learning_rate": 0.00026324786324786326, + "logits/chosen": 1.2148414850234985, + "logits/rejected": 1.0941295623779297, + "logps/chosen": -575.4998779296875, + "logps/rejected": -580.6967163085938, + "loss": 0.0035, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.006260395050049, + "rewards/margins": 8.852553367614746, + "rewards/rejected": -6.846292972564697, + "step": 164 + }, + { + "epoch": 4.22, + "learning_rate": 0.0002628205128205128, + "logits/chosen": 1.2730900049209595, + "logits/rejected": 1.1539727449417114, + "logps/chosen": -587.6873779296875, + "logps/rejected": -640.8407592773438, + "loss": 0.0023, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.5435640811920166, + "rewards/margins": 8.264644622802734, + "rewards/rejected": -5.721080303192139, + "step": 165 + }, + { + "epoch": 4.25, + "learning_rate": 0.0002623931623931624, + "logits/chosen": 1.1310749053955078, + "logits/rejected": 1.074840784072876, + "logps/chosen": -539.7301025390625, + "logps/rejected": -588.243408203125, + "loss": 0.0016, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.725675344467163, + "rewards/margins": 8.484735488891602, + "rewards/rejected": -6.759060382843018, + "step": 166 + }, + { + "epoch": 4.28, + "learning_rate": 0.00026196581196581193, + "logits/chosen": 1.1921124458312988, + "logits/rejected": 1.1464745998382568, + "logps/chosen": -531.4241333007812, + "logps/rejected": -650.033447265625, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.208735704421997, + "rewards/margins": 8.580331802368164, + "rewards/rejected": -6.371595859527588, + "step": 167 + }, + { + "epoch": 4.3, + "learning_rate": 0.00026153846153846154, + "logits/chosen": 1.2101249694824219, + "logits/rejected": 1.18596613407135, + "logps/chosen": -563.632568359375, + "logps/rejected": -715.3417358398438, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.0189900398254395, + "rewards/margins": 9.561189651489258, + "rewards/rejected": -7.542199611663818, + "step": 168 + }, + { + "epoch": 4.33, + "learning_rate": 0.0002611111111111111, + "logits/chosen": 1.2898643016815186, + "logits/rejected": 1.1050164699554443, + "logps/chosen": -568.4158325195312, + "logps/rejected": -607.685302734375, + "loss": 0.0064, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8665090799331665, + "rewards/margins": 8.093311309814453, + "rewards/rejected": -6.226801872253418, + "step": 169 + }, + { + "epoch": 4.35, + "learning_rate": 0.00026068376068376066, + "logits/chosen": 1.1432087421417236, + "logits/rejected": 1.1123594045639038, + "logps/chosen": -532.8350219726562, + "logps/rejected": -636.5218505859375, + "loss": 0.001, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.5014150142669678, + "rewards/margins": 9.985570907592773, + "rewards/rejected": -7.48415470123291, + "step": 170 + }, + { + "epoch": 4.38, + "learning_rate": 0.0002602564102564102, + "logits/chosen": 1.0670151710510254, + "logits/rejected": 0.9835186004638672, + "logps/chosen": -525.0418090820312, + "logps/rejected": -603.8453369140625, + "loss": 0.0303, + "rewards/accuracies": 0.96875, + "rewards/chosen": 1.609043836593628, + "rewards/margins": 8.916692733764648, + "rewards/rejected": -7.307648658752441, + "step": 171 + }, + { + "epoch": 4.4, + "learning_rate": 0.0002598290598290598, + "logits/chosen": 1.1664202213287354, + "logits/rejected": 1.0911628007888794, + "logps/chosen": -546.672607421875, + "logps/rejected": -600.7014770507812, + "loss": 0.0023, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.016690254211426, + "rewards/margins": 8.022310256958008, + "rewards/rejected": -6.005620002746582, + "step": 172 + }, + { + "epoch": 4.43, + "learning_rate": 0.0002594017094017094, + "logits/chosen": 1.089374303817749, + "logits/rejected": 1.1528961658477783, + "logps/chosen": -515.8948364257812, + "logps/rejected": -608.8614501953125, + "loss": 0.0016, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.6085548400878906, + "rewards/margins": 9.069319725036621, + "rewards/rejected": -6.4607648849487305, + "step": 173 + }, + { + "epoch": 4.45, + "learning_rate": 0.00025897435897435895, + "logits/chosen": 1.2652084827423096, + "logits/rejected": 1.0107694864273071, + "logps/chosen": -570.3291015625, + "logps/rejected": -542.94384765625, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.6129395961761475, + "rewards/margins": 8.992415428161621, + "rewards/rejected": -5.3794755935668945, + "step": 174 + }, + { + "epoch": 4.48, + "learning_rate": 0.0002585470085470085, + "logits/chosen": 1.2760734558105469, + "logits/rejected": 1.1731884479522705, + "logps/chosen": -581.9400634765625, + "logps/rejected": -597.0192260742188, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.4896321296691895, + "rewards/margins": 8.106904983520508, + "rewards/rejected": -5.617273330688477, + "step": 175 + }, + { + "epoch": 4.51, + "learning_rate": 0.00025811965811965807, + "logits/chosen": 1.262199878692627, + "logits/rejected": 1.0435302257537842, + "logps/chosen": -577.8585205078125, + "logps/rejected": -599.6145629882812, + "loss": 0.0008, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.0292744636535645, + "rewards/margins": 9.197108268737793, + "rewards/rejected": -5.1678338050842285, + "step": 176 + }, + { + "epoch": 4.53, + "learning_rate": 0.0002576923076923077, + "logits/chosen": 1.0869362354278564, + "logits/rejected": 1.062146544456482, + "logps/chosen": -486.3373107910156, + "logps/rejected": -604.74609375, + "loss": 0.0031, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.719869613647461, + "rewards/margins": 9.07932186126709, + "rewards/rejected": -6.359452724456787, + "step": 177 + }, + { + "epoch": 4.56, + "learning_rate": 0.00025726495726495724, + "logits/chosen": 1.1699590682983398, + "logits/rejected": 1.0801599025726318, + "logps/chosen": -498.7552490234375, + "logps/rejected": -508.74609375, + "loss": 0.0024, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.849763870239258, + "rewards/margins": 8.253904342651367, + "rewards/rejected": -5.404139995574951, + "step": 178 + }, + { + "epoch": 4.58, + "learning_rate": 0.0002568376068376068, + "logits/chosen": 1.1467467546463013, + "logits/rejected": 1.1259756088256836, + "logps/chosen": -504.258544921875, + "logps/rejected": -571.6417846679688, + "loss": 0.0009, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4761364459991455, + "rewards/margins": 9.168277740478516, + "rewards/rejected": -5.692141056060791, + "step": 179 + }, + { + "epoch": 4.61, + "learning_rate": 0.00025641025641025636, + "logits/chosen": 1.1694316864013672, + "logits/rejected": 1.1096751689910889, + "logps/chosen": -521.1782836914062, + "logps/rejected": -578.571533203125, + "loss": 0.0017, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.831768035888672, + "rewards/margins": 8.435105323791504, + "rewards/rejected": -5.603337287902832, + "step": 180 + }, + { + "epoch": 4.63, + "learning_rate": 0.00025598290598290597, + "logits/chosen": 1.2126814126968384, + "logits/rejected": 1.0483447313308716, + "logps/chosen": -514.5982055664062, + "logps/rejected": -559.2354125976562, + "loss": 0.0014, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.6389248371124268, + "rewards/margins": 8.253149032592773, + "rewards/rejected": -5.614223957061768, + "step": 181 + }, + { + "epoch": 4.66, + "learning_rate": 0.00025555555555555553, + "logits/chosen": 1.1626149415969849, + "logits/rejected": 1.023645281791687, + "logps/chosen": -539.7805786132812, + "logps/rejected": -583.5364379882812, + "loss": 0.0018, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.5426697731018066, + "rewards/margins": 9.294103622436523, + "rewards/rejected": -5.751433372497559, + "step": 182 + }, + { + "epoch": 4.68, + "learning_rate": 0.0002551282051282051, + "logits/chosen": 1.2193539142608643, + "logits/rejected": 1.0917335748672485, + "logps/chosen": -482.9999694824219, + "logps/rejected": -574.2794189453125, + "loss": 0.001, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.089658737182617, + "rewards/margins": 8.874006271362305, + "rewards/rejected": -5.784348011016846, + "step": 183 + }, + { + "epoch": 4.71, + "learning_rate": 0.0002547008547008547, + "logits/chosen": 1.110312819480896, + "logits/rejected": 1.136667013168335, + "logps/chosen": -463.33062744140625, + "logps/rejected": -540.6802978515625, + "loss": 0.0009, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.5352556705474854, + "rewards/margins": 9.688406944274902, + "rewards/rejected": -7.1531524658203125, + "step": 184 + }, + { + "epoch": 4.74, + "learning_rate": 0.00025427350427350426, + "logits/chosen": 1.2557671070098877, + "logits/rejected": 1.1719632148742676, + "logps/chosen": -529.3836669921875, + "logps/rejected": -604.2679443359375, + "loss": 0.0017, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.274686098098755, + "rewards/margins": 8.664352416992188, + "rewards/rejected": -5.389666557312012, + "step": 185 + }, + { + "epoch": 4.76, + "learning_rate": 0.0002538461538461538, + "logits/chosen": 1.2389734983444214, + "logits/rejected": 1.084290623664856, + "logps/chosen": -544.71142578125, + "logps/rejected": -621.462158203125, + "loss": 0.0015, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.2781314849853516, + "rewards/margins": 8.4774808883667, + "rewards/rejected": -6.199349403381348, + "step": 186 + }, + { + "epoch": 4.79, + "learning_rate": 0.0002534188034188034, + "logits/chosen": 1.1379337310791016, + "logits/rejected": 1.1488574743270874, + "logps/chosen": -505.5488586425781, + "logps/rejected": -620.3075561523438, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.200209140777588, + "rewards/margins": 7.998414516448975, + "rewards/rejected": -5.798205375671387, + "step": 187 + }, + { + "epoch": 4.81, + "learning_rate": 0.000252991452991453, + "logits/chosen": 1.1832406520843506, + "logits/rejected": 1.1894774436950684, + "logps/chosen": -518.3919677734375, + "logps/rejected": -603.9647216796875, + "loss": 0.001, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2990570068359375, + "rewards/margins": 8.94902515411377, + "rewards/rejected": -5.649968147277832, + "step": 188 + }, + { + "epoch": 4.84, + "learning_rate": 0.00025256410256410255, + "logits/chosen": 1.1613863706588745, + "logits/rejected": 1.0867745876312256, + "logps/chosen": -509.91754150390625, + "logps/rejected": -581.9103393554688, + "loss": 0.0013, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.3365347385406494, + "rewards/margins": 8.27928352355957, + "rewards/rejected": -5.942748069763184, + "step": 189 + }, + { + "epoch": 4.86, + "learning_rate": 0.0002521367521367521, + "logits/chosen": 1.1696518659591675, + "logits/rejected": 1.0967986583709717, + "logps/chosen": -512.4561767578125, + "logps/rejected": -571.2943725585938, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8717398643493652, + "rewards/margins": 9.033393859863281, + "rewards/rejected": -6.161653995513916, + "step": 190 + }, + { + "epoch": 4.89, + "learning_rate": 0.0002517094017094017, + "logits/chosen": 1.211112141609192, + "logits/rejected": 0.9915167093276978, + "logps/chosen": -592.27294921875, + "logps/rejected": -545.3093872070312, + "loss": 0.0013, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7333691120147705, + "rewards/margins": 8.252148628234863, + "rewards/rejected": -5.518779754638672, + "step": 191 + }, + { + "epoch": 4.92, + "learning_rate": 0.0002512820512820513, + "logits/chosen": 1.1865314245224, + "logits/rejected": 1.083636999130249, + "logps/chosen": -500.9031982421875, + "logps/rejected": -634.9488525390625, + "loss": 0.0616, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.7689785957336426, + "rewards/margins": 9.048822402954102, + "rewards/rejected": -6.279844760894775, + "step": 192 + }, + { + "epoch": 4.94, + "learning_rate": 0.00025085470085470083, + "logits/chosen": 1.308659553527832, + "logits/rejected": 1.1389790773391724, + "logps/chosen": -530.4187622070312, + "logps/rejected": -617.7844848632812, + "loss": 0.0021, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9889013767242432, + "rewards/margins": 7.278841018676758, + "rewards/rejected": -5.2899394035339355, + "step": 193 + }, + { + "epoch": 4.97, + "learning_rate": 0.0002504273504273504, + "logits/chosen": 1.1351438760757446, + "logits/rejected": 1.0995919704437256, + "logps/chosen": -537.9273071289062, + "logps/rejected": -580.5325927734375, + "loss": 0.0019, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.307258129119873, + "rewards/margins": 8.442761421203613, + "rewards/rejected": -6.135504245758057, + "step": 194 + }, + { + "epoch": 4.99, + "learning_rate": 0.00025, + "logits/chosen": 1.1220672130584717, + "logits/rejected": 1.0801559686660767, + "logps/chosen": -520.331787109375, + "logps/rejected": -620.8114013671875, + "loss": 0.0014, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.103241205215454, + "rewards/margins": 8.628799438476562, + "rewards/rejected": -6.5255584716796875, + "step": 195 + }, + { + "epoch": 5.02, + "learning_rate": 0.00024957264957264956, + "logits/chosen": 1.2094953060150146, + "logits/rejected": 1.1247830390930176, + "logps/chosen": -482.1205749511719, + "logps/rejected": -588.2861328125, + "loss": 0.0009, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7330946922302246, + "rewards/margins": 8.031623840332031, + "rewards/rejected": -5.298529148101807, + "step": 196 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002491452991452991, + "logits/chosen": 1.1430044174194336, + "logits/rejected": 1.0147483348846436, + "logps/chosen": -540.6754760742188, + "logps/rejected": -578.9623413085938, + "loss": 0.001, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.216123104095459, + "rewards/margins": 8.867217063903809, + "rewards/rejected": -5.651093482971191, + "step": 197 + }, + { + "epoch": 5.07, + "learning_rate": 0.00024871794871794874, + "logits/chosen": 1.1910125017166138, + "logits/rejected": 1.0428566932678223, + "logps/chosen": -577.4218139648438, + "logps/rejected": -615.4971313476562, + "loss": 0.0152, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.6112916469573975, + "rewards/margins": 8.347122192382812, + "rewards/rejected": -5.735829830169678, + "step": 198 + }, + { + "epoch": 5.09, + "learning_rate": 0.0002482905982905983, + "logits/chosen": 1.189084768295288, + "logits/rejected": 1.023719310760498, + "logps/chosen": -505.85125732421875, + "logps/rejected": -607.1080322265625, + "loss": 0.0012, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.267016649246216, + "rewards/margins": 8.4349946975708, + "rewards/rejected": -6.1679768562316895, + "step": 199 + }, + { + "epoch": 5.12, + "learning_rate": 0.00024786324786324785, + "logits/chosen": 1.1190298795700073, + "logits/rejected": 1.0712878704071045, + "logps/chosen": -522.3341064453125, + "logps/rejected": -650.10107421875, + "loss": 0.0009, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.4759145975112915, + "rewards/margins": 8.425509452819824, + "rewards/rejected": -6.949594020843506, + "step": 200 + }, + { + "epoch": 5.15, + "learning_rate": 0.0002474358974358974, + "logits/chosen": 1.1855789422988892, + "logits/rejected": 1.1534652709960938, + "logps/chosen": -530.4508666992188, + "logps/rejected": -620.44873046875, + "loss": 0.0007, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7457334995269775, + "rewards/margins": 8.301987648010254, + "rewards/rejected": -5.5562543869018555, + "step": 201 + }, + { + "epoch": 5.17, + "learning_rate": 0.000247008547008547, + "logits/chosen": 1.2083404064178467, + "logits/rejected": 1.0335760116577148, + "logps/chosen": -544.8739624023438, + "logps/rejected": -582.562744140625, + "loss": 0.001, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7809946537017822, + "rewards/margins": 8.799093246459961, + "rewards/rejected": -6.018097877502441, + "step": 202 + }, + { + "epoch": 5.2, + "learning_rate": 0.0002465811965811966, + "logits/chosen": 1.1989009380340576, + "logits/rejected": 1.1295719146728516, + "logps/chosen": -480.67724609375, + "logps/rejected": -609.4390258789062, + "loss": 0.0016, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.5975394248962402, + "rewards/margins": 8.314066886901855, + "rewards/rejected": -5.716527938842773, + "step": 203 + }, + { + "epoch": 5.22, + "learning_rate": 0.00024615384615384614, + "logits/chosen": 1.1273399591445923, + "logits/rejected": 0.9690557718276978, + "logps/chosen": -563.5905151367188, + "logps/rejected": -566.3074951171875, + "loss": 0.002, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.03564715385437, + "rewards/margins": 8.314454078674316, + "rewards/rejected": -6.278806686401367, + "step": 204 + }, + { + "epoch": 5.25, + "learning_rate": 0.0002457264957264957, + "logits/chosen": 1.0755057334899902, + "logits/rejected": 1.1226409673690796, + "logps/chosen": -515.4642333984375, + "logps/rejected": -682.0799560546875, + "loss": 0.0026, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6836296319961548, + "rewards/margins": 8.099983215332031, + "rewards/rejected": -6.416353702545166, + "step": 205 + }, + { + "epoch": 5.27, + "learning_rate": 0.00024529914529914526, + "logits/chosen": 1.1585733890533447, + "logits/rejected": 1.1376292705535889, + "logps/chosen": -489.0839538574219, + "logps/rejected": -560.7153930664062, + "loss": 0.0016, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.028048276901245, + "rewards/margins": 7.495326995849609, + "rewards/rejected": -5.467278957366943, + "step": 206 + }, + { + "epoch": 5.3, + "learning_rate": 0.00024487179487179487, + "logits/chosen": 1.1711719036102295, + "logits/rejected": 1.2047771215438843, + "logps/chosen": -554.0427856445312, + "logps/rejected": -657.2614135742188, + "loss": 0.0014, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.6054043769836426, + "rewards/margins": 8.226700782775879, + "rewards/rejected": -5.621296405792236, + "step": 207 + }, + { + "epoch": 5.32, + "learning_rate": 0.00024444444444444443, + "logits/chosen": 1.1655393838882446, + "logits/rejected": 1.1469833850860596, + "logps/chosen": -568.0317993164062, + "logps/rejected": -558.951171875, + "loss": 0.0009, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.950896739959717, + "rewards/margins": 8.18274974822998, + "rewards/rejected": -5.231853008270264, + "step": 208 + }, + { + "epoch": 5.35, + "learning_rate": 0.00024401709401709401, + "logits/chosen": 1.188609004020691, + "logits/rejected": 1.1547000408172607, + "logps/chosen": -582.5511474609375, + "logps/rejected": -659.006591796875, + "loss": 0.0024, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.448970317840576, + "rewards/margins": 9.596809387207031, + "rewards/rejected": -7.147839069366455, + "step": 209 + }, + { + "epoch": 5.38, + "learning_rate": 0.00024358974358974357, + "logits/chosen": 1.1199637651443481, + "logits/rejected": 1.0847599506378174, + "logps/chosen": -521.1385498046875, + "logps/rejected": -580.2830810546875, + "loss": 0.0018, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.533172845840454, + "rewards/margins": 8.593878746032715, + "rewards/rejected": -6.060705184936523, + "step": 210 + }, + { + "epoch": 5.4, + "learning_rate": 0.00024316239316239313, + "logits/chosen": 1.1465306282043457, + "logits/rejected": 1.1031239032745361, + "logps/chosen": -483.8526611328125, + "logps/rejected": -559.978759765625, + "loss": 0.0058, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.983487844467163, + "rewards/margins": 8.161898612976074, + "rewards/rejected": -6.178411483764648, + "step": 211 + }, + { + "epoch": 5.43, + "learning_rate": 0.00024273504273504272, + "logits/chosen": 1.13920259475708, + "logits/rejected": 1.1220027208328247, + "logps/chosen": -512.029052734375, + "logps/rejected": -572.7222900390625, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.37949538230896, + "rewards/margins": 9.245460510253906, + "rewards/rejected": -6.865965843200684, + "step": 212 + }, + { + "epoch": 5.45, + "learning_rate": 0.0002423076923076923, + "logits/chosen": 1.1790162324905396, + "logits/rejected": 1.032013177871704, + "logps/chosen": -550.051025390625, + "logps/rejected": -632.330810546875, + "loss": 0.0008, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7981252670288086, + "rewards/margins": 9.512584686279297, + "rewards/rejected": -6.714459419250488, + "step": 213 + }, + { + "epoch": 5.48, + "learning_rate": 0.00024188034188034186, + "logits/chosen": 1.299055814743042, + "logits/rejected": 1.2317571640014648, + "logps/chosen": -517.1921997070312, + "logps/rejected": -619.0784301757812, + "loss": 0.0012, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.4681472778320312, + "rewards/margins": 8.650612831115723, + "rewards/rejected": -7.182465076446533, + "step": 214 + }, + { + "epoch": 5.5, + "learning_rate": 0.00024145299145299142, + "logits/chosen": 1.1555142402648926, + "logits/rejected": 1.1552680730819702, + "logps/chosen": -493.82232666015625, + "logps/rejected": -553.5524291992188, + "loss": 0.002, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.282465934753418, + "rewards/margins": 7.990402698516846, + "rewards/rejected": -5.707936763763428, + "step": 215 + }, + { + "epoch": 5.53, + "learning_rate": 0.000241025641025641, + "logits/chosen": 1.2630378007888794, + "logits/rejected": 1.145714282989502, + "logps/chosen": -566.5864868164062, + "logps/rejected": -555.4298095703125, + "loss": 0.0012, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.5014400482177734, + "rewards/margins": 9.682706832885742, + "rewards/rejected": -7.1812663078308105, + "step": 216 + }, + { + "epoch": 5.56, + "learning_rate": 0.00024059829059829056, + "logits/chosen": 1.275704026222229, + "logits/rejected": 1.1247011423110962, + "logps/chosen": -565.8131713867188, + "logps/rejected": -633.22509765625, + "loss": 0.0015, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.3491557836532593, + "rewards/margins": 8.835249900817871, + "rewards/rejected": -7.486093997955322, + "step": 217 + }, + { + "epoch": 5.58, + "learning_rate": 0.00024017094017094015, + "logits/chosen": 1.24131178855896, + "logits/rejected": 1.1392734050750732, + "logps/chosen": -515.2080078125, + "logps/rejected": -557.8899536132812, + "loss": 0.0015, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5449414253234863, + "rewards/margins": 8.077924728393555, + "rewards/rejected": -6.532983303070068, + "step": 218 + }, + { + "epoch": 5.61, + "learning_rate": 0.00023974358974358974, + "logits/chosen": 1.2522388696670532, + "logits/rejected": 1.049080729484558, + "logps/chosen": -607.8526611328125, + "logps/rejected": -644.0594482421875, + "loss": 0.1498, + "rewards/accuracies": 0.96875, + "rewards/chosen": 2.073587656021118, + "rewards/margins": 9.496702194213867, + "rewards/rejected": -7.42311429977417, + "step": 219 + }, + { + "epoch": 5.63, + "learning_rate": 0.0002393162393162393, + "logits/chosen": 1.1726680994033813, + "logits/rejected": 1.0583220720291138, + "logps/chosen": -537.9254150390625, + "logps/rejected": -589.2078857421875, + "loss": 0.0028, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.3364086151123047, + "rewards/margins": 9.462315559387207, + "rewards/rejected": -8.125906944274902, + "step": 220 + }, + { + "epoch": 5.66, + "learning_rate": 0.00023888888888888885, + "logits/chosen": 1.2105443477630615, + "logits/rejected": 1.0398310422897339, + "logps/chosen": -553.4381713867188, + "logps/rejected": -617.5463256835938, + "loss": 0.0009, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.809446096420288, + "rewards/margins": 9.608116149902344, + "rewards/rejected": -7.798670291900635, + "step": 221 + }, + { + "epoch": 5.68, + "learning_rate": 0.00023846153846153844, + "logits/chosen": 1.0151175260543823, + "logits/rejected": 1.137940764427185, + "logps/chosen": -474.15582275390625, + "logps/rejected": -601.0347900390625, + "loss": 0.0025, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9911149740219116, + "rewards/margins": 8.517168045043945, + "rewards/rejected": -7.526054382324219, + "step": 222 + }, + { + "epoch": 5.71, + "learning_rate": 0.00023803418803418802, + "logits/chosen": 1.1788804531097412, + "logits/rejected": 1.0858978033065796, + "logps/chosen": -538.7382202148438, + "logps/rejected": -580.581787109375, + "loss": 0.001, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0802903175354004, + "rewards/margins": 8.940625190734863, + "rewards/rejected": -7.860335350036621, + "step": 223 + }, + { + "epoch": 5.73, + "learning_rate": 0.00023760683760683758, + "logits/chosen": 1.209570288658142, + "logits/rejected": 1.1490302085876465, + "logps/chosen": -497.189697265625, + "logps/rejected": -623.16455078125, + "loss": 0.0028, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9234257936477661, + "rewards/margins": 9.893648147583008, + "rewards/rejected": -7.970221996307373, + "step": 224 + }, + { + "epoch": 5.76, + "learning_rate": 0.00023717948717948714, + "logits/chosen": 1.1075451374053955, + "logits/rejected": 1.0870707035064697, + "logps/chosen": -555.2899169921875, + "logps/rejected": -560.2510375976562, + "loss": 0.0014, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8097199201583862, + "rewards/margins": 8.50051498413086, + "rewards/rejected": -7.690794467926025, + "step": 225 + }, + { + "epoch": 5.79, + "learning_rate": 0.00023675213675213675, + "logits/chosen": 1.1817216873168945, + "logits/rejected": 1.018075942993164, + "logps/chosen": -529.1445922851562, + "logps/rejected": -607.93505859375, + "loss": 0.0009, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8409253358840942, + "rewards/margins": 9.230034828186035, + "rewards/rejected": -7.389110088348389, + "step": 226 + }, + { + "epoch": 5.81, + "learning_rate": 0.0002363247863247863, + "logits/chosen": 1.146735429763794, + "logits/rejected": 1.082593321800232, + "logps/chosen": -557.0680541992188, + "logps/rejected": -634.6128540039062, + "loss": 0.0007, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9302033185958862, + "rewards/margins": 9.59379768371582, + "rewards/rejected": -8.663594245910645, + "step": 227 + }, + { + "epoch": 5.84, + "learning_rate": 0.00023589743589743587, + "logits/chosen": 1.0844825506210327, + "logits/rejected": 1.050144076347351, + "logps/chosen": -458.246337890625, + "logps/rejected": -643.5533447265625, + "loss": 0.008, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.983664333820343, + "rewards/margins": 9.005660057067871, + "rewards/rejected": -8.021997451782227, + "step": 228 + }, + { + "epoch": 5.86, + "learning_rate": 0.00023547008547008543, + "logits/chosen": 1.1802949905395508, + "logits/rejected": 1.2055476903915405, + "logps/chosen": -538.8391723632812, + "logps/rejected": -667.3803100585938, + "loss": 0.0028, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4791993796825409, + "rewards/margins": 8.043685913085938, + "rewards/rejected": -7.564486026763916, + "step": 229 + }, + { + "epoch": 5.89, + "learning_rate": 0.00023504273504273504, + "logits/chosen": 1.217112421989441, + "logits/rejected": 1.0857105255126953, + "logps/chosen": -547.7744140625, + "logps/rejected": -650.827880859375, + "loss": 0.0013, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2695170640945435, + "rewards/margins": 10.668280601501465, + "rewards/rejected": -9.398763656616211, + "step": 230 + }, + { + "epoch": 5.91, + "learning_rate": 0.0002346153846153846, + "logits/chosen": 1.106930136680603, + "logits/rejected": 1.1642612218856812, + "logps/chosen": -534.817626953125, + "logps/rejected": -647.9102783203125, + "loss": 0.001, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.33475053310394287, + "rewards/margins": 9.073336601257324, + "rewards/rejected": -8.73858642578125, + "step": 231 + }, + { + "epoch": 5.94, + "learning_rate": 0.00023418803418803416, + "logits/chosen": 1.1831903457641602, + "logits/rejected": 1.1884675025939941, + "logps/chosen": -560.828857421875, + "logps/rejected": -663.0516357421875, + "loss": 0.001, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9667414426803589, + "rewards/margins": 10.761907577514648, + "rewards/rejected": -8.795166969299316, + "step": 232 + }, + { + "epoch": 5.96, + "learning_rate": 0.00023376068376068375, + "logits/chosen": 1.092282772064209, + "logits/rejected": 0.9970771670341492, + "logps/chosen": -529.67822265625, + "logps/rejected": -641.421142578125, + "loss": 0.0009, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.46319279074668884, + "rewards/margins": 10.089384078979492, + "rewards/rejected": -9.626192092895508, + "step": 233 + }, + { + "epoch": 5.99, + "learning_rate": 0.0002333333333333333, + "logits/chosen": 1.1541541814804077, + "logits/rejected": 0.9784144759178162, + "logps/chosen": -562.6727294921875, + "logps/rejected": -608.5543823242188, + "loss": 0.0012, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5622472763061523, + "rewards/margins": 9.451787948608398, + "rewards/rejected": -8.889540672302246, + "step": 234 + }, + { + "epoch": 6.02, + "learning_rate": 0.0002329059829059829, + "logits/chosen": 1.0728470087051392, + "logits/rejected": 1.0160434246063232, + "logps/chosen": -605.6416625976562, + "logps/rejected": -620.4939575195312, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3293549120426178, + "rewards/margins": 9.99177360534668, + "rewards/rejected": -9.662418365478516, + "step": 235 + }, + { + "epoch": 6.04, + "learning_rate": 0.00023247863247863245, + "logits/chosen": 1.0738935470581055, + "logits/rejected": 1.0548124313354492, + "logps/chosen": -494.51788330078125, + "logps/rejected": -601.1412353515625, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6489882469177246, + "rewards/margins": 9.996894836425781, + "rewards/rejected": -9.347906112670898, + "step": 236 + }, + { + "epoch": 6.07, + "learning_rate": 0.00023205128205128203, + "logits/chosen": 1.1992632150650024, + "logits/rejected": 1.1126775741577148, + "logps/chosen": -581.875244140625, + "logps/rejected": -654.9214477539062, + "loss": 0.0014, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1056530475616455, + "rewards/margins": 9.1024169921875, + "rewards/rejected": -10.208070755004883, + "step": 237 + }, + { + "epoch": 6.09, + "learning_rate": 0.0002316239316239316, + "logits/chosen": 1.024402379989624, + "logits/rejected": 1.0234527587890625, + "logps/chosen": -527.988037109375, + "logps/rejected": -600.8884887695312, + "loss": 0.0015, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.40097522735595703, + "rewards/margins": 10.08630084991455, + "rewards/rejected": -9.685325622558594, + "step": 238 + }, + { + "epoch": 6.12, + "learning_rate": 0.00023119658119658118, + "logits/chosen": 1.0920895338058472, + "logits/rejected": 0.925986647605896, + "logps/chosen": -526.6175537109375, + "logps/rejected": -593.0648803710938, + "loss": 0.0016, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.35913902521133423, + "rewards/margins": 10.041351318359375, + "rewards/rejected": -9.682212829589844, + "step": 239 + }, + { + "epoch": 6.14, + "learning_rate": 0.00023076923076923076, + "logits/chosen": 1.12273108959198, + "logits/rejected": 0.9582171440124512, + "logps/chosen": -566.5948486328125, + "logps/rejected": -640.344482421875, + "loss": 0.0013, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10319514572620392, + "rewards/margins": 10.264111518859863, + "rewards/rejected": -10.36730670928955, + "step": 240 + }, + { + "epoch": 6.17, + "learning_rate": 0.00023034188034188032, + "logits/chosen": 1.125780463218689, + "logits/rejected": 0.8733446598052979, + "logps/chosen": -502.71685791015625, + "logps/rejected": -526.7401123046875, + "loss": 0.0023, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7334610819816589, + "rewards/margins": 8.98703384399414, + "rewards/rejected": -8.253572463989258, + "step": 241 + }, + { + "epoch": 6.2, + "learning_rate": 0.00022991452991452988, + "logits/chosen": 1.007986307144165, + "logits/rejected": 0.9879658818244934, + "logps/chosen": -493.781982421875, + "logps/rejected": -631.9270629882812, + "loss": 0.0008, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.32805830240249634, + "rewards/margins": 10.0196533203125, + "rewards/rejected": -9.691594123840332, + "step": 242 + }, + { + "epoch": 6.22, + "learning_rate": 0.00022948717948717944, + "logits/chosen": 1.0395015478134155, + "logits/rejected": 1.004233479499817, + "logps/chosen": -519.82568359375, + "logps/rejected": -645.8541870117188, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.23589622974395752, + "rewards/margins": 10.204262733459473, + "rewards/rejected": -10.440156936645508, + "step": 243 + }, + { + "epoch": 6.25, + "learning_rate": 0.00022905982905982905, + "logits/chosen": 1.030265212059021, + "logits/rejected": 1.0151996612548828, + "logps/chosen": -490.25640869140625, + "logps/rejected": -603.8272705078125, + "loss": 0.0007, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.07852241396903992, + "rewards/margins": 10.176379203796387, + "rewards/rejected": -10.254899978637695, + "step": 244 + }, + { + "epoch": 6.27, + "learning_rate": 0.0002286324786324786, + "logits/chosen": 1.2312498092651367, + "logits/rejected": 0.9529620409011841, + "logps/chosen": -602.3244018554688, + "logps/rejected": -622.9336547851562, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.13221769034862518, + "rewards/margins": 10.062947273254395, + "rewards/rejected": -9.930729866027832, + "step": 245 + }, + { + "epoch": 6.3, + "learning_rate": 0.00022820512820512817, + "logits/chosen": 1.0476980209350586, + "logits/rejected": 0.9954835176467896, + "logps/chosen": -542.1054077148438, + "logps/rejected": -681.212646484375, + "loss": 0.001, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.028733327984809875, + "rewards/margins": 10.83343505859375, + "rewards/rejected": -10.86216926574707, + "step": 246 + }, + { + "epoch": 6.32, + "learning_rate": 0.00022777777777777778, + "logits/chosen": 0.9804601669311523, + "logits/rejected": 0.8998504281044006, + "logps/chosen": -525.8041381835938, + "logps/rejected": -585.2196655273438, + "loss": 0.0014, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6352589130401611, + "rewards/margins": 9.895197868347168, + "rewards/rejected": -9.25993824005127, + "step": 247 + }, + { + "epoch": 6.35, + "learning_rate": 0.00022735042735042734, + "logits/chosen": 0.9808767437934875, + "logits/rejected": 1.03694486618042, + "logps/chosen": -460.6586608886719, + "logps/rejected": -696.9314575195312, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.008252725005149841, + "rewards/margins": 10.02670955657959, + "rewards/rejected": -10.03496265411377, + "step": 248 + }, + { + "epoch": 6.37, + "learning_rate": 0.0002269230769230769, + "logits/chosen": 1.0145666599273682, + "logits/rejected": 1.0171821117401123, + "logps/chosen": -550.2952880859375, + "logps/rejected": -609.4617919921875, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.587495744228363, + "rewards/margins": 9.451133728027344, + "rewards/rejected": -10.038629531860352, + "step": 249 + }, + { + "epoch": 6.4, + "learning_rate": 0.00022649572649572646, + "logits/chosen": 1.1384074687957764, + "logits/rejected": 0.994137167930603, + "logps/chosen": -529.2269287109375, + "logps/rejected": -592.1962890625, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2150293588638306, + "rewards/margins": 9.835915565490723, + "rewards/rejected": -8.620885848999023, + "step": 250 + }, + { + "epoch": 6.43, + "learning_rate": 0.00022606837606837604, + "logits/chosen": 1.0672990083694458, + "logits/rejected": 1.043774127960205, + "logps/chosen": -530.2998046875, + "logps/rejected": -619.9190673828125, + "loss": 0.0008, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.16798219084739685, + "rewards/margins": 9.340568542480469, + "rewards/rejected": -9.508550643920898, + "step": 251 + }, + { + "epoch": 6.45, + "learning_rate": 0.00022564102564102563, + "logits/chosen": 1.0548663139343262, + "logits/rejected": 0.9898471832275391, + "logps/chosen": -515.4979858398438, + "logps/rejected": -591.4309692382812, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1059775352478027, + "rewards/margins": 9.55521297454834, + "rewards/rejected": -8.449234962463379, + "step": 252 + }, + { + "epoch": 6.48, + "learning_rate": 0.0002252136752136752, + "logits/chosen": 1.0579899549484253, + "logits/rejected": 1.0557491779327393, + "logps/chosen": -532.400634765625, + "logps/rejected": -688.6305541992188, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3727375268936157, + "rewards/margins": 10.276800155639648, + "rewards/rejected": -10.649538040161133, + "step": 253 + }, + { + "epoch": 6.5, + "learning_rate": 0.00022478632478632477, + "logits/chosen": 1.1566115617752075, + "logits/rejected": 1.0760446786880493, + "logps/chosen": -590.3530883789062, + "logps/rejected": -659.9254150390625, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8200367093086243, + "rewards/margins": 10.486039161682129, + "rewards/rejected": -9.666001319885254, + "step": 254 + }, + { + "epoch": 6.53, + "learning_rate": 0.00022435897435897433, + "logits/chosen": 1.169042944908142, + "logits/rejected": 1.092968225479126, + "logps/chosen": -562.0431518554688, + "logps/rejected": -657.502197265625, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6965909004211426, + "rewards/margins": 11.691349029541016, + "rewards/rejected": -9.994759559631348, + "step": 255 + }, + { + "epoch": 6.55, + "learning_rate": 0.00022393162393162392, + "logits/chosen": 1.0384266376495361, + "logits/rejected": 1.021849513053894, + "logps/chosen": -551.0145263671875, + "logps/rejected": -694.0051879882812, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5090039968490601, + "rewards/margins": 10.475794792175293, + "rewards/rejected": -9.966791152954102, + "step": 256 + }, + { + "epoch": 6.58, + "learning_rate": 0.0002235042735042735, + "logits/chosen": 1.1020841598510742, + "logits/rejected": 0.9978400468826294, + "logps/chosen": -552.3654174804688, + "logps/rejected": -584.1214599609375, + "loss": 0.0007, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.505805492401123, + "rewards/margins": 11.413491249084473, + "rewards/rejected": -8.907686233520508, + "step": 257 + }, + { + "epoch": 6.6, + "learning_rate": 0.00022307692307692306, + "logits/chosen": 1.1605815887451172, + "logits/rejected": 1.0998469591140747, + "logps/chosen": -532.1463623046875, + "logps/rejected": -632.1581420898438, + "loss": 0.0009, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.184004306793213, + "rewards/margins": 9.712126731872559, + "rewards/rejected": -8.528121948242188, + "step": 258 + }, + { + "epoch": 6.63, + "learning_rate": 0.00022264957264957262, + "logits/chosen": 1.0158207416534424, + "logits/rejected": 1.0649572610855103, + "logps/chosen": -537.4420166015625, + "logps/rejected": -689.2913818359375, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0612900257110596, + "rewards/margins": 11.472357749938965, + "rewards/rejected": -10.411066055297852, + "step": 259 + }, + { + "epoch": 6.66, + "learning_rate": 0.00022222222222222218, + "logits/chosen": 1.23757803440094, + "logits/rejected": 1.0114773511886597, + "logps/chosen": -557.8294677734375, + "logps/rejected": -592.405517578125, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.740067183971405, + "rewards/margins": 10.078370094299316, + "rewards/rejected": -9.338302612304688, + "step": 260 + }, + { + "epoch": 6.68, + "learning_rate": 0.0002217948717948718, + "logits/chosen": 1.2603142261505127, + "logits/rejected": 1.001814603805542, + "logps/chosen": -581.58935546875, + "logps/rejected": -544.1881103515625, + "loss": 0.0009, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5611451864242554, + "rewards/margins": 9.480655670166016, + "rewards/rejected": -7.9195098876953125, + "step": 261 + }, + { + "epoch": 6.71, + "learning_rate": 0.00022136752136752135, + "logits/chosen": 1.1516404151916504, + "logits/rejected": 1.1282165050506592, + "logps/chosen": -560.3634033203125, + "logps/rejected": -674.9133911132812, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5357306003570557, + "rewards/margins": 9.987811088562012, + "rewards/rejected": -9.452080726623535, + "step": 262 + }, + { + "epoch": 6.73, + "learning_rate": 0.0002209401709401709, + "logits/chosen": 1.1687240600585938, + "logits/rejected": 1.0638331174850464, + "logps/chosen": -589.182373046875, + "logps/rejected": -682.9474487304688, + "loss": 0.0014, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.275535225868225, + "rewards/margins": 11.063849449157715, + "rewards/rejected": -9.788314819335938, + "step": 263 + }, + { + "epoch": 6.76, + "learning_rate": 0.00022051282051282052, + "logits/chosen": 1.1243481636047363, + "logits/rejected": 0.967666745185852, + "logps/chosen": -575.9657592773438, + "logps/rejected": -614.3820190429688, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.424743115901947, + "rewards/margins": 9.780957221984863, + "rewards/rejected": -9.35621452331543, + "step": 264 + }, + { + "epoch": 6.78, + "learning_rate": 0.00022008547008547008, + "logits/chosen": 1.0041706562042236, + "logits/rejected": 0.996163547039032, + "logps/chosen": -587.704345703125, + "logps/rejected": -628.5338745117188, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7361750602722168, + "rewards/margins": 9.253312110900879, + "rewards/rejected": -8.51713752746582, + "step": 265 + }, + { + "epoch": 6.81, + "learning_rate": 0.00021965811965811964, + "logits/chosen": 1.1979098320007324, + "logits/rejected": 1.126028299331665, + "logps/chosen": -524.0247802734375, + "logps/rejected": -609.7775268554688, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6900510787963867, + "rewards/margins": 9.077376365661621, + "rewards/rejected": -7.387324333190918, + "step": 266 + }, + { + "epoch": 6.84, + "learning_rate": 0.0002192307692307692, + "logits/chosen": 1.1033226251602173, + "logits/rejected": 1.0556286573410034, + "logps/chosen": -534.7022094726562, + "logps/rejected": -597.9768676757812, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6321287155151367, + "rewards/margins": 10.527310371398926, + "rewards/rejected": -8.895181655883789, + "step": 267 + }, + { + "epoch": 6.86, + "learning_rate": 0.00021880341880341878, + "logits/chosen": 1.0708644390106201, + "logits/rejected": 1.0677733421325684, + "logps/chosen": -561.38525390625, + "logps/rejected": -664.557373046875, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4007779359817505, + "rewards/margins": 10.572273254394531, + "rewards/rejected": -10.17149543762207, + "step": 268 + }, + { + "epoch": 6.89, + "learning_rate": 0.00021837606837606837, + "logits/chosen": 1.0858148336410522, + "logits/rejected": 1.0668940544128418, + "logps/chosen": -580.924072265625, + "logps/rejected": -651.995849609375, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7338685989379883, + "rewards/margins": 10.77505874633789, + "rewards/rejected": -9.041191101074219, + "step": 269 + }, + { + "epoch": 6.91, + "learning_rate": 0.00021794871794871793, + "logits/chosen": 1.081437110900879, + "logits/rejected": 0.9860243797302246, + "logps/chosen": -484.07366943359375, + "logps/rejected": -648.0784912109375, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5497647523880005, + "rewards/margins": 11.513895034790039, + "rewards/rejected": -9.964130401611328, + "step": 270 + }, + { + "epoch": 6.94, + "learning_rate": 0.0002175213675213675, + "logits/chosen": 1.1952917575836182, + "logits/rejected": 1.1564627885818481, + "logps/chosen": -548.2731323242188, + "logps/rejected": -742.41845703125, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5800870656967163, + "rewards/margins": 10.232941627502441, + "rewards/rejected": -9.652854919433594, + "step": 271 + }, + { + "epoch": 6.96, + "learning_rate": 0.00021709401709401707, + "logits/chosen": 1.2948367595672607, + "logits/rejected": 1.1679692268371582, + "logps/chosen": -573.0979614257812, + "logps/rejected": -682.3110961914062, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2798572778701782, + "rewards/margins": 9.664986610412598, + "rewards/rejected": -8.385129928588867, + "step": 272 + }, + { + "epoch": 6.99, + "learning_rate": 0.00021666666666666666, + "logits/chosen": 1.1744760274887085, + "logits/rejected": 0.9845774173736572, + "logps/chosen": -558.6195068359375, + "logps/rejected": -652.9862060546875, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.773690104484558, + "rewards/margins": 10.953255653381348, + "rewards/rejected": -9.1795654296875, + "step": 273 + }, + { + "epoch": 7.01, + "learning_rate": 0.00021623931623931622, + "logits/chosen": 1.1232681274414062, + "logits/rejected": 1.0257112979888916, + "logps/chosen": -510.100341796875, + "logps/rejected": -643.0399169921875, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.46491277217865, + "rewards/margins": 9.883655548095703, + "rewards/rejected": -8.418743133544922, + "step": 274 + }, + { + "epoch": 7.04, + "learning_rate": 0.0002158119658119658, + "logits/chosen": 1.0357047319412231, + "logits/rejected": 1.001062035560608, + "logps/chosen": -501.3252868652344, + "logps/rejected": -553.6700439453125, + "loss": 0.0008, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0863351821899414, + "rewards/margins": 9.711782455444336, + "rewards/rejected": -8.625446319580078, + "step": 275 + }, + { + "epoch": 7.07, + "learning_rate": 0.00021538461538461536, + "logits/chosen": 1.1731458902359009, + "logits/rejected": 1.11858069896698, + "logps/chosen": -577.2032470703125, + "logps/rejected": -713.2083740234375, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0856976509094238, + "rewards/margins": 10.696051597595215, + "rewards/rejected": -9.610353469848633, + "step": 276 + }, + { + "epoch": 7.09, + "learning_rate": 0.00021495726495726492, + "logits/chosen": 1.0282161235809326, + "logits/rejected": 0.9538753032684326, + "logps/chosen": -494.20562744140625, + "logps/rejected": -624.65087890625, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.3367016315460205, + "rewards/margins": 11.350975036621094, + "rewards/rejected": -9.014272689819336, + "step": 277 + }, + { + "epoch": 7.12, + "learning_rate": 0.00021452991452991453, + "logits/chosen": 1.0994839668273926, + "logits/rejected": 1.1064229011535645, + "logps/chosen": -498.6627197265625, + "logps/rejected": -695.014404296875, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8084684014320374, + "rewards/margins": 10.440178871154785, + "rewards/rejected": -9.631710052490234, + "step": 278 + }, + { + "epoch": 7.14, + "learning_rate": 0.0002141025641025641, + "logits/chosen": 0.9965860843658447, + "logits/rejected": 0.9728628396987915, + "logps/chosen": -478.1365966796875, + "logps/rejected": -635.9570922851562, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1333738565444946, + "rewards/margins": 9.761213302612305, + "rewards/rejected": -8.627839088439941, + "step": 279 + }, + { + "epoch": 7.17, + "learning_rate": 0.00021367521367521365, + "logits/chosen": 1.2235289812088013, + "logits/rejected": 1.040520191192627, + "logps/chosen": -577.0011596679688, + "logps/rejected": -598.2988891601562, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.098937749862671, + "rewards/margins": 10.220855712890625, + "rewards/rejected": -9.121917724609375, + "step": 280 + }, + { + "epoch": 7.19, + "learning_rate": 0.0002132478632478632, + "logits/chosen": 1.1766057014465332, + "logits/rejected": 1.001685380935669, + "logps/chosen": -511.30352783203125, + "logps/rejected": -541.9244384765625, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6884262561798096, + "rewards/margins": 9.974132537841797, + "rewards/rejected": -8.28570556640625, + "step": 281 + }, + { + "epoch": 7.22, + "learning_rate": 0.00021282051282051282, + "logits/chosen": 1.193005084991455, + "logits/rejected": 1.118786096572876, + "logps/chosen": -552.6077270507812, + "logps/rejected": -715.137451171875, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0981065034866333, + "rewards/margins": 10.102052688598633, + "rewards/rejected": -9.003947257995605, + "step": 282 + }, + { + "epoch": 7.24, + "learning_rate": 0.00021239316239316238, + "logits/chosen": 1.1393274068832397, + "logits/rejected": 1.102120041847229, + "logps/chosen": -511.25286865234375, + "logps/rejected": -617.3232421875, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.180321216583252, + "rewards/margins": 11.865279197692871, + "rewards/rejected": -9.684957504272461, + "step": 283 + }, + { + "epoch": 7.27, + "learning_rate": 0.00021196581196581194, + "logits/chosen": 1.0302733182907104, + "logits/rejected": 1.0308837890625, + "logps/chosen": -504.82989501953125, + "logps/rejected": -603.8284301757812, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.966596782207489, + "rewards/margins": 9.79338264465332, + "rewards/rejected": -8.826786041259766, + "step": 284 + }, + { + "epoch": 7.3, + "learning_rate": 0.00021153846153846152, + "logits/chosen": 0.9913230538368225, + "logits/rejected": 0.9480158090591431, + "logps/chosen": -546.1568603515625, + "logps/rejected": -645.8260498046875, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7541160583496094, + "rewards/margins": 10.948417663574219, + "rewards/rejected": -10.19430160522461, + "step": 285 + }, + { + "epoch": 7.32, + "learning_rate": 0.0002111111111111111, + "logits/chosen": 1.1722790002822876, + "logits/rejected": 1.0763994455337524, + "logps/chosen": -604.4279174804688, + "logps/rejected": -637.1195068359375, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.36835515499115, + "rewards/margins": 10.110456466674805, + "rewards/rejected": -8.742100715637207, + "step": 286 + }, + { + "epoch": 7.35, + "learning_rate": 0.00021068376068376067, + "logits/chosen": 1.1397333145141602, + "logits/rejected": 1.1724354028701782, + "logps/chosen": -497.7437744140625, + "logps/rejected": -731.747314453125, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8257311582565308, + "rewards/margins": 10.88111686706543, + "rewards/rejected": -10.05538558959961, + "step": 287 + }, + { + "epoch": 7.37, + "learning_rate": 0.00021025641025641022, + "logits/chosen": 1.0152881145477295, + "logits/rejected": 0.9720747470855713, + "logps/chosen": -467.1788330078125, + "logps/rejected": -618.5647583007812, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0214035511016846, + "rewards/margins": 9.899874687194824, + "rewards/rejected": -8.878470420837402, + "step": 288 + }, + { + "epoch": 7.4, + "learning_rate": 0.0002098290598290598, + "logits/chosen": 1.2052510976791382, + "logits/rejected": 1.0738441944122314, + "logps/chosen": -559.755859375, + "logps/rejected": -672.936767578125, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2137142419815063, + "rewards/margins": 11.24556827545166, + "rewards/rejected": -10.031854629516602, + "step": 289 + }, + { + "epoch": 7.42, + "learning_rate": 0.0002094017094017094, + "logits/chosen": 1.2158238887786865, + "logits/rejected": 1.0811805725097656, + "logps/chosen": -546.0452270507812, + "logps/rejected": -623.5526733398438, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6926029920578003, + "rewards/margins": 10.92724895477295, + "rewards/rejected": -9.23464584350586, + "step": 290 + }, + { + "epoch": 7.45, + "learning_rate": 0.00020897435897435895, + "logits/chosen": 1.2231104373931885, + "logits/rejected": 1.1560981273651123, + "logps/chosen": -578.806640625, + "logps/rejected": -620.2879028320312, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.173689603805542, + "rewards/margins": 9.140408515930176, + "rewards/rejected": -7.966719150543213, + "step": 291 + }, + { + "epoch": 7.48, + "learning_rate": 0.00020854700854700854, + "logits/chosen": 1.1962541341781616, + "logits/rejected": 1.0524215698242188, + "logps/chosen": -575.8316650390625, + "logps/rejected": -602.4752807617188, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.0710935592651367, + "rewards/margins": 9.931817054748535, + "rewards/rejected": -7.860722541809082, + "step": 292 + }, + { + "epoch": 7.5, + "learning_rate": 0.0002081196581196581, + "logits/chosen": 1.2810760736465454, + "logits/rejected": 1.1952964067459106, + "logps/chosen": -611.9567260742188, + "logps/rejected": -695.635986328125, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.4828972816467285, + "rewards/margins": 10.279756546020508, + "rewards/rejected": -8.796858787536621, + "step": 293 + }, + { + "epoch": 7.53, + "learning_rate": 0.00020769230769230766, + "logits/chosen": 1.1195869445800781, + "logits/rejected": 1.032854437828064, + "logps/chosen": -496.8470153808594, + "logps/rejected": -573.8423461914062, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7543283700942993, + "rewards/margins": 10.867127418518066, + "rewards/rejected": -9.112799644470215, + "step": 294 + }, + { + "epoch": 7.55, + "learning_rate": 0.00020726495726495724, + "logits/chosen": 1.1649212837219238, + "logits/rejected": 1.0563678741455078, + "logps/chosen": -558.580322265625, + "logps/rejected": -644.250732421875, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.997902512550354, + "rewards/margins": 10.675620079040527, + "rewards/rejected": -9.677717208862305, + "step": 295 + }, + { + "epoch": 7.58, + "learning_rate": 0.00020683760683760683, + "logits/chosen": 1.1838792562484741, + "logits/rejected": 1.0918617248535156, + "logps/chosen": -534.3650512695312, + "logps/rejected": -626.9495849609375, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7307627201080322, + "rewards/margins": 9.962722778320312, + "rewards/rejected": -8.23196029663086, + "step": 296 + }, + { + "epoch": 7.6, + "learning_rate": 0.0002064102564102564, + "logits/chosen": 1.1973166465759277, + "logits/rejected": 1.1105926036834717, + "logps/chosen": -552.765625, + "logps/rejected": -571.7509765625, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9999276399612427, + "rewards/margins": 9.358254432678223, + "rewards/rejected": -7.358326435089111, + "step": 297 + }, + { + "epoch": 7.63, + "learning_rate": 0.00020598290598290595, + "logits/chosen": 1.1777927875518799, + "logits/rejected": 1.0511623620986938, + "logps/chosen": -482.9162292480469, + "logps/rejected": -596.0338745117188, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1938194036483765, + "rewards/margins": 10.532297134399414, + "rewards/rejected": -9.338478088378906, + "step": 298 + }, + { + "epoch": 7.65, + "learning_rate": 0.00020555555555555556, + "logits/chosen": 1.194580078125, + "logits/rejected": 1.0481842756271362, + "logps/chosen": -523.016845703125, + "logps/rejected": -585.9976806640625, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1020272970199585, + "rewards/margins": 10.315288543701172, + "rewards/rejected": -9.213261604309082, + "step": 299 + }, + { + "epoch": 7.68, + "learning_rate": 0.00020512820512820512, + "logits/chosen": 1.0988215208053589, + "logits/rejected": 1.024403691291809, + "logps/chosen": -481.1084289550781, + "logps/rejected": -562.4240112304688, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.069756507873535, + "rewards/margins": 10.040982246398926, + "rewards/rejected": -7.971225261688232, + "step": 300 + }, + { + "epoch": 7.71, + "learning_rate": 0.00020470085470085468, + "logits/chosen": 1.311755657196045, + "logits/rejected": 1.0829813480377197, + "logps/chosen": -606.013671875, + "logps/rejected": -694.0232543945312, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6969201564788818, + "rewards/margins": 9.998454093933105, + "rewards/rejected": -9.301534652709961, + "step": 301 + }, + { + "epoch": 7.73, + "learning_rate": 0.00020427350427350423, + "logits/chosen": 1.0659198760986328, + "logits/rejected": 1.0787678956985474, + "logps/chosen": -562.7803344726562, + "logps/rejected": -598.2239990234375, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5258029699325562, + "rewards/margins": 9.569708824157715, + "rewards/rejected": -8.043905258178711, + "step": 302 + }, + { + "epoch": 7.76, + "learning_rate": 0.00020384615384615385, + "logits/chosen": 1.1065874099731445, + "logits/rejected": 1.075424313545227, + "logps/chosen": -503.7167053222656, + "logps/rejected": -640.2783203125, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0811102390289307, + "rewards/margins": 9.623838424682617, + "rewards/rejected": -8.542729377746582, + "step": 303 + }, + { + "epoch": 7.78, + "learning_rate": 0.0002034188034188034, + "logits/chosen": 1.1548815965652466, + "logits/rejected": 1.1475387811660767, + "logps/chosen": -475.9542236328125, + "logps/rejected": -630.96826171875, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4979651868343353, + "rewards/margins": 10.597925186157227, + "rewards/rejected": -10.099960327148438, + "step": 304 + }, + { + "epoch": 7.81, + "learning_rate": 0.00020299145299145296, + "logits/chosen": 1.1043236255645752, + "logits/rejected": 1.0807740688323975, + "logps/chosen": -546.5986938476562, + "logps/rejected": -615.6447143554688, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.0489461421966553, + "rewards/margins": 10.541731834411621, + "rewards/rejected": -8.49278450012207, + "step": 305 + }, + { + "epoch": 7.83, + "learning_rate": 0.00020256410256410255, + "logits/chosen": 1.1738014221191406, + "logits/rejected": 1.0424628257751465, + "logps/chosen": -579.9978637695312, + "logps/rejected": -668.8394775390625, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2401783466339111, + "rewards/margins": 11.568252563476562, + "rewards/rejected": -10.328075408935547, + "step": 306 + }, + { + "epoch": 7.86, + "learning_rate": 0.00020213675213675214, + "logits/chosen": 1.221925973892212, + "logits/rejected": 1.0827970504760742, + "logps/chosen": -554.8446044921875, + "logps/rejected": -623.172119140625, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.212306261062622, + "rewards/margins": 9.68209457397461, + "rewards/rejected": -8.469788551330566, + "step": 307 + }, + { + "epoch": 7.88, + "learning_rate": 0.0002017094017094017, + "logits/chosen": 1.0534234046936035, + "logits/rejected": 1.1416208744049072, + "logps/chosen": -498.2744140625, + "logps/rejected": -689.6672973632812, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.162524938583374, + "rewards/margins": 11.897795677185059, + "rewards/rejected": -9.735269546508789, + "step": 308 + }, + { + "epoch": 7.91, + "learning_rate": 0.00020128205128205125, + "logits/chosen": 1.0100905895233154, + "logits/rejected": 1.1573420763015747, + "logps/chosen": -500.80841064453125, + "logps/rejected": -593.0250244140625, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8166165351867676, + "rewards/margins": 9.917183876037598, + "rewards/rejected": -8.100566864013672, + "step": 309 + }, + { + "epoch": 7.94, + "learning_rate": 0.00020085470085470084, + "logits/chosen": 1.1027199029922485, + "logits/rejected": 0.9867293238639832, + "logps/chosen": -524.3262329101562, + "logps/rejected": -591.5633544921875, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.232293963432312, + "rewards/margins": 10.08406925201416, + "rewards/rejected": -8.851776123046875, + "step": 310 + }, + { + "epoch": 7.96, + "learning_rate": 0.0002004273504273504, + "logits/chosen": 1.2790604829788208, + "logits/rejected": 1.0140596628189087, + "logps/chosen": -588.2280883789062, + "logps/rejected": -668.6362915039062, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8383196592330933, + "rewards/margins": 10.089158058166504, + "rewards/rejected": -9.250838279724121, + "step": 311 + }, + { + "epoch": 7.99, + "learning_rate": 0.00019999999999999998, + "logits/chosen": 1.082058310508728, + "logits/rejected": 0.9627883434295654, + "logps/chosen": -609.169677734375, + "logps/rejected": -605.4269409179688, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.0816116333007812, + "rewards/margins": 11.790176391601562, + "rewards/rejected": -9.708564758300781, + "step": 312 + }, + { + "epoch": 8.01, + "learning_rate": 0.00019957264957264957, + "logits/chosen": 1.1039892435073853, + "logits/rejected": 0.9567267298698425, + "logps/chosen": -473.4000244140625, + "logps/rejected": -618.5371704101562, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.4500510692596436, + "rewards/margins": 10.126335144042969, + "rewards/rejected": -8.676283836364746, + "step": 313 + }, + { + "epoch": 8.04, + "learning_rate": 0.00019914529914529913, + "logits/chosen": 1.1014786958694458, + "logits/rejected": 1.0613113641738892, + "logps/chosen": -511.72454833984375, + "logps/rejected": -694.4070434570312, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3977946639060974, + "rewards/margins": 11.06888484954834, + "rewards/rejected": -10.671089172363281, + "step": 314 + }, + { + "epoch": 8.06, + "learning_rate": 0.00019871794871794869, + "logits/chosen": 1.1368095874786377, + "logits/rejected": 0.9869575500488281, + "logps/chosen": -509.3475036621094, + "logps/rejected": -625.2825927734375, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8700177669525146, + "rewards/margins": 10.845271110534668, + "rewards/rejected": -9.97525405883789, + "step": 315 + }, + { + "epoch": 8.09, + "learning_rate": 0.00019829059829059824, + "logits/chosen": 1.1710002422332764, + "logits/rejected": 1.1424845457077026, + "logps/chosen": -548.1114501953125, + "logps/rejected": -658.926513671875, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.743558406829834, + "rewards/margins": 9.961552619934082, + "rewards/rejected": -9.217994689941406, + "step": 316 + }, + { + "epoch": 8.12, + "learning_rate": 0.00019786324786324786, + "logits/chosen": 1.227845311164856, + "logits/rejected": 1.1172688007354736, + "logps/chosen": -615.0020751953125, + "logps/rejected": -655.31787109375, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7781198024749756, + "rewards/margins": 11.058065414428711, + "rewards/rejected": -9.279945373535156, + "step": 317 + }, + { + "epoch": 8.14, + "learning_rate": 0.00019743589743589742, + "logits/chosen": 1.2156200408935547, + "logits/rejected": 0.9318048357963562, + "logps/chosen": -561.3159790039062, + "logps/rejected": -528.769287109375, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.775212287902832, + "rewards/margins": 9.198701858520508, + "rewards/rejected": -7.423489570617676, + "step": 318 + }, + { + "epoch": 8.17, + "learning_rate": 0.00019700854700854697, + "logits/chosen": 1.1288306713104248, + "logits/rejected": 1.0163847208023071, + "logps/chosen": -566.7800903320312, + "logps/rejected": -621.5507202148438, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5739163160324097, + "rewards/margins": 11.509315490722656, + "rewards/rejected": -9.935400009155273, + "step": 319 + }, + { + "epoch": 8.19, + "learning_rate": 0.00019658119658119659, + "logits/chosen": 1.1232361793518066, + "logits/rejected": 1.1592121124267578, + "logps/chosen": -528.2422485351562, + "logps/rejected": -708.33642578125, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.655145287513733, + "rewards/margins": 11.109955787658691, + "rewards/rejected": -9.454811096191406, + "step": 320 + }, + { + "epoch": 8.22, + "learning_rate": 0.00019615384615384615, + "logits/chosen": 1.1231738328933716, + "logits/rejected": 1.117080807685852, + "logps/chosen": -498.927001953125, + "logps/rejected": -631.9031982421875, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.47227901220321655, + "rewards/margins": 10.479837417602539, + "rewards/rejected": -10.007558822631836, + "step": 321 + }, + { + "epoch": 8.24, + "learning_rate": 0.0001957264957264957, + "logits/chosen": 1.0491048097610474, + "logits/rejected": 0.9988434314727783, + "logps/chosen": -494.6644287109375, + "logps/rejected": -610.76806640625, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.1051549911499023, + "rewards/margins": 11.895588874816895, + "rewards/rejected": -9.790433883666992, + "step": 322 + }, + { + "epoch": 8.27, + "learning_rate": 0.00019529914529914526, + "logits/chosen": 1.098575234413147, + "logits/rejected": 1.1674755811691284, + "logps/chosen": -514.0235595703125, + "logps/rejected": -722.014892578125, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.734897255897522, + "rewards/margins": 10.937726974487305, + "rewards/rejected": -10.202829360961914, + "step": 323 + }, + { + "epoch": 8.29, + "learning_rate": 0.00019487179487179487, + "logits/chosen": 1.1288853883743286, + "logits/rejected": 1.1453090906143188, + "logps/chosen": -488.9378662109375, + "logps/rejected": -641.2897338867188, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5595530271530151, + "rewards/margins": 10.641010284423828, + "rewards/rejected": -9.081456184387207, + "step": 324 + }, + { + "epoch": 8.32, + "learning_rate": 0.00019444444444444443, + "logits/chosen": 1.265305519104004, + "logits/rejected": 0.9294592142105103, + "logps/chosen": -613.603271484375, + "logps/rejected": -536.7325439453125, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.3043055534362793, + "rewards/margins": 10.669670104980469, + "rewards/rejected": -8.365365028381348, + "step": 325 + }, + { + "epoch": 8.35, + "learning_rate": 0.000194017094017094, + "logits/chosen": 1.1310415267944336, + "logits/rejected": 1.0312024354934692, + "logps/chosen": -516.1328125, + "logps/rejected": -636.2833251953125, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.218218207359314, + "rewards/margins": 10.852701187133789, + "rewards/rejected": -9.634482383728027, + "step": 326 + }, + { + "epoch": 8.37, + "learning_rate": 0.00019358974358974358, + "logits/chosen": 1.1042028665542603, + "logits/rejected": 1.0703749656677246, + "logps/chosen": -577.5679321289062, + "logps/rejected": -605.3977661132812, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8515387773513794, + "rewards/margins": 9.11039924621582, + "rewards/rejected": -8.25886058807373, + "step": 327 + }, + { + "epoch": 8.4, + "learning_rate": 0.00019316239316239314, + "logits/chosen": 1.1480742692947388, + "logits/rejected": 1.0245976448059082, + "logps/chosen": -544.492919921875, + "logps/rejected": -650.3825073242188, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1441433429718018, + "rewards/margins": 10.524991989135742, + "rewards/rejected": -9.38084888458252, + "step": 328 + }, + { + "epoch": 8.42, + "learning_rate": 0.00019273504273504272, + "logits/chosen": 0.9963136315345764, + "logits/rejected": 1.0162067413330078, + "logps/chosen": -543.5288696289062, + "logps/rejected": -677.895751953125, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5060915946960449, + "rewards/margins": 10.86108684539795, + "rewards/rejected": -10.354994773864746, + "step": 329 + }, + { + "epoch": 8.45, + "learning_rate": 0.0001923076923076923, + "logits/chosen": 1.192360758781433, + "logits/rejected": 1.1079771518707275, + "logps/chosen": -512.4086303710938, + "logps/rejected": -630.524169921875, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9825226068496704, + "rewards/margins": 11.417821884155273, + "rewards/rejected": -9.435300827026367, + "step": 330 + }, + { + "epoch": 8.47, + "learning_rate": 0.00019188034188034187, + "logits/chosen": 1.0749591588974, + "logits/rejected": 1.0543586015701294, + "logps/chosen": -502.4964599609375, + "logps/rejected": -628.3140869140625, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1525578498840332, + "rewards/margins": 10.45914363861084, + "rewards/rejected": -9.306587219238281, + "step": 331 + }, + { + "epoch": 8.5, + "learning_rate": 0.00019145299145299142, + "logits/chosen": 1.1045258045196533, + "logits/rejected": 1.094617486000061, + "logps/chosen": -556.7658081054688, + "logps/rejected": -661.129638671875, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5904799103736877, + "rewards/margins": 10.494197845458984, + "rewards/rejected": -9.903717994689941, + "step": 332 + }, + { + "epoch": 8.52, + "learning_rate": 0.00019102564102564098, + "logits/chosen": 1.0999786853790283, + "logits/rejected": 1.0287926197052002, + "logps/chosen": -559.324951171875, + "logps/rejected": -685.59716796875, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8768333196640015, + "rewards/margins": 11.342018127441406, + "rewards/rejected": -10.465184211730957, + "step": 333 + }, + { + "epoch": 8.55, + "learning_rate": 0.0001905982905982906, + "logits/chosen": 1.2270386219024658, + "logits/rejected": 1.0695297718048096, + "logps/chosen": -555.893310546875, + "logps/rejected": -616.703125, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.512807846069336, + "rewards/margins": 11.095309257507324, + "rewards/rejected": -8.582502365112305, + "step": 334 + }, + { + "epoch": 8.58, + "learning_rate": 0.00019017094017094015, + "logits/chosen": 1.1461352109909058, + "logits/rejected": 1.0352705717086792, + "logps/chosen": -500.9500427246094, + "logps/rejected": -638.1965942382812, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5767711400985718, + "rewards/margins": 10.275659561157227, + "rewards/rejected": -9.698890686035156, + "step": 335 + }, + { + "epoch": 8.6, + "learning_rate": 0.0001897435897435897, + "logits/chosen": 1.1884602308273315, + "logits/rejected": 0.9545145630836487, + "logps/chosen": -553.399658203125, + "logps/rejected": -571.7633056640625, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.724369764328003, + "rewards/margins": 9.780606269836426, + "rewards/rejected": -8.056236267089844, + "step": 336 + }, + { + "epoch": 8.63, + "learning_rate": 0.00018931623931623933, + "logits/chosen": 1.1067692041397095, + "logits/rejected": 1.0162923336029053, + "logps/chosen": -517.6556396484375, + "logps/rejected": -604.726318359375, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7874058485031128, + "rewards/margins": 10.558215141296387, + "rewards/rejected": -8.770809173583984, + "step": 337 + }, + { + "epoch": 8.65, + "learning_rate": 0.00018888888888888888, + "logits/chosen": 1.217395544052124, + "logits/rejected": 1.084112286567688, + "logps/chosen": -542.150634765625, + "logps/rejected": -678.0106201171875, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0376150608062744, + "rewards/margins": 10.951888084411621, + "rewards/rejected": -9.914274215698242, + "step": 338 + }, + { + "epoch": 8.68, + "learning_rate": 0.00018846153846153844, + "logits/chosen": 1.1382925510406494, + "logits/rejected": 1.0875835418701172, + "logps/chosen": -545.2322387695312, + "logps/rejected": -629.0198974609375, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7386005520820618, + "rewards/margins": 10.408458709716797, + "rewards/rejected": -9.669858932495117, + "step": 339 + }, + { + "epoch": 8.7, + "learning_rate": 0.000188034188034188, + "logits/chosen": 1.115515947341919, + "logits/rejected": 1.066940426826477, + "logps/chosen": -523.6985473632812, + "logps/rejected": -574.7987670898438, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0293325185775757, + "rewards/margins": 9.218039512634277, + "rewards/rejected": -8.18870735168457, + "step": 340 + }, + { + "epoch": 8.73, + "learning_rate": 0.00018760683760683761, + "logits/chosen": 1.0013961791992188, + "logits/rejected": 1.0823533535003662, + "logps/chosen": -485.6407775878906, + "logps/rejected": -657.2274169921875, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.3279895782470703, + "rewards/margins": 10.64334774017334, + "rewards/rejected": -9.31535816192627, + "step": 341 + }, + { + "epoch": 8.76, + "learning_rate": 0.00018717948717948717, + "logits/chosen": 1.0347654819488525, + "logits/rejected": 1.0151424407958984, + "logps/chosen": -497.17730712890625, + "logps/rejected": -617.05029296875, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.067275881767273, + "rewards/margins": 9.780403137207031, + "rewards/rejected": -8.713126182556152, + "step": 342 + }, + { + "epoch": 8.78, + "learning_rate": 0.00018675213675213673, + "logits/chosen": 1.1175577640533447, + "logits/rejected": 1.0508739948272705, + "logps/chosen": -543.9364013671875, + "logps/rejected": -730.9745483398438, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.06183135509490967, + "rewards/margins": 12.12753963470459, + "rewards/rejected": -12.065709114074707, + "step": 343 + }, + { + "epoch": 8.81, + "learning_rate": 0.00018632478632478632, + "logits/chosen": 1.07142174243927, + "logits/rejected": 1.0519976615905762, + "logps/chosen": -506.71435546875, + "logps/rejected": -652.1842041015625, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.239646315574646, + "rewards/margins": 10.84978199005127, + "rewards/rejected": -9.61013412475586, + "step": 344 + }, + { + "epoch": 8.83, + "learning_rate": 0.00018589743589743588, + "logits/chosen": 1.1885634660720825, + "logits/rejected": 1.0062313079833984, + "logps/chosen": -569.5838623046875, + "logps/rejected": -602.7799072265625, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2498093843460083, + "rewards/margins": 9.729214668273926, + "rewards/rejected": -8.47940444946289, + "step": 345 + }, + { + "epoch": 8.86, + "learning_rate": 0.00018547008547008546, + "logits/chosen": 1.2486610412597656, + "logits/rejected": 0.9658511877059937, + "logps/chosen": -563.5016479492188, + "logps/rejected": -559.8602294921875, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7746890783309937, + "rewards/margins": 10.259527206420898, + "rewards/rejected": -8.484838485717773, + "step": 346 + }, + { + "epoch": 8.88, + "learning_rate": 0.00018504273504273502, + "logits/chosen": 1.0988224744796753, + "logits/rejected": 1.032260775566101, + "logps/chosen": -590.3836669921875, + "logps/rejected": -605.6322021484375, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1525167226791382, + "rewards/margins": 10.179304122924805, + "rewards/rejected": -9.026787757873535, + "step": 347 + }, + { + "epoch": 8.91, + "learning_rate": 0.0001846153846153846, + "logits/chosen": 1.07295823097229, + "logits/rejected": 0.9503864645957947, + "logps/chosen": -567.7842407226562, + "logps/rejected": -599.153564453125, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1233148574829102, + "rewards/margins": 10.49275016784668, + "rewards/rejected": -9.369434356689453, + "step": 348 + }, + { + "epoch": 8.93, + "learning_rate": 0.00018418803418803416, + "logits/chosen": 1.034525752067566, + "logits/rejected": 1.0498316287994385, + "logps/chosen": -501.63323974609375, + "logps/rejected": -582.5183715820312, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7673141956329346, + "rewards/margins": 10.9537353515625, + "rewards/rejected": -9.186420440673828, + "step": 349 + }, + { + "epoch": 8.96, + "learning_rate": 0.00018376068376068372, + "logits/chosen": 1.1317795515060425, + "logits/rejected": 1.0368475914001465, + "logps/chosen": -592.8737182617188, + "logps/rejected": -618.9924926757812, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9373716115951538, + "rewards/margins": 10.743789672851562, + "rewards/rejected": -9.806417465209961, + "step": 350 + }, + { + "epoch": 8.99, + "learning_rate": 0.00018333333333333334, + "logits/chosen": 1.236878514289856, + "logits/rejected": 1.0824024677276611, + "logps/chosen": -592.6204223632812, + "logps/rejected": -678.1072387695312, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1566966772079468, + "rewards/margins": 12.701655387878418, + "rewards/rejected": -11.544958114624023, + "step": 351 + }, + { + "epoch": 9.01, + "learning_rate": 0.0001829059829059829, + "logits/chosen": 1.0362117290496826, + "logits/rejected": 1.0344483852386475, + "logps/chosen": -514.698486328125, + "logps/rejected": -682.2589721679688, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7533162832260132, + "rewards/margins": 10.993279457092285, + "rewards/rejected": -10.23996353149414, + "step": 352 + }, + { + "epoch": 9.04, + "learning_rate": 0.00018247863247863245, + "logits/chosen": 1.0642633438110352, + "logits/rejected": 0.9820662140846252, + "logps/chosen": -531.2005004882812, + "logps/rejected": -594.51171875, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5738836526870728, + "rewards/margins": 11.221702575683594, + "rewards/rejected": -9.647819519042969, + "step": 353 + }, + { + "epoch": 9.06, + "learning_rate": 0.000182051282051282, + "logits/chosen": 1.1875312328338623, + "logits/rejected": 1.0477038621902466, + "logps/chosen": -519.474609375, + "logps/rejected": -641.5570678710938, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.4246861934661865, + "rewards/margins": 10.708345413208008, + "rewards/rejected": -9.283658981323242, + "step": 354 + }, + { + "epoch": 9.09, + "learning_rate": 0.00018162393162393162, + "logits/chosen": 1.300011157989502, + "logits/rejected": 1.1615049839019775, + "logps/chosen": -594.492919921875, + "logps/rejected": -714.0015258789062, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.3209400177001953, + "rewards/margins": 11.944135665893555, + "rewards/rejected": -10.62319564819336, + "step": 355 + }, + { + "epoch": 9.11, + "learning_rate": 0.00018119658119658118, + "logits/chosen": 1.2041311264038086, + "logits/rejected": 1.09273099899292, + "logps/chosen": -517.081787109375, + "logps/rejected": -623.0797729492188, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7040494680404663, + "rewards/margins": 9.923104286193848, + "rewards/rejected": -8.21905517578125, + "step": 356 + }, + { + "epoch": 9.14, + "learning_rate": 0.00018076923076923074, + "logits/chosen": 1.1552023887634277, + "logits/rejected": 1.1544487476348877, + "logps/chosen": -527.7257080078125, + "logps/rejected": -654.458740234375, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5392783880233765, + "rewards/margins": 10.782899856567383, + "rewards/rejected": -9.243619918823242, + "step": 357 + }, + { + "epoch": 9.16, + "learning_rate": 0.00018034188034188035, + "logits/chosen": 1.0705739259719849, + "logits/rejected": 1.0257068872451782, + "logps/chosen": -529.324951171875, + "logps/rejected": -572.942138671875, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1362345218658447, + "rewards/margins": 10.217846870422363, + "rewards/rejected": -9.081612586975098, + "step": 358 + }, + { + "epoch": 9.19, + "learning_rate": 0.0001799145299145299, + "logits/chosen": 1.0554429292678833, + "logits/rejected": 0.9397602081298828, + "logps/chosen": -497.97955322265625, + "logps/rejected": -575.80810546875, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.4867826700210571, + "rewards/margins": 11.263152122497559, + "rewards/rejected": -9.77637004852295, + "step": 359 + }, + { + "epoch": 9.22, + "learning_rate": 0.00017948717948717947, + "logits/chosen": 1.028283953666687, + "logits/rejected": 1.1357170343399048, + "logps/chosen": -486.7881164550781, + "logps/rejected": -638.11083984375, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6323189735412598, + "rewards/margins": 9.90245246887207, + "rewards/rejected": -8.270133972167969, + "step": 360 + }, + { + "epoch": 9.24, + "learning_rate": 0.00017905982905982903, + "logits/chosen": 1.140254259109497, + "logits/rejected": 0.9276759028434753, + "logps/chosen": -533.4109497070312, + "logps/rejected": -542.5346069335938, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5379778146743774, + "rewards/margins": 10.987845420837402, + "rewards/rejected": -9.449868202209473, + "step": 361 + }, + { + "epoch": 9.27, + "learning_rate": 0.00017863247863247861, + "logits/chosen": 1.2161970138549805, + "logits/rejected": 1.1362658739089966, + "logps/chosen": -508.8780212402344, + "logps/rejected": -620.1040649414062, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2322115898132324, + "rewards/margins": 10.1491060256958, + "rewards/rejected": -8.91689395904541, + "step": 362 + }, + { + "epoch": 9.29, + "learning_rate": 0.0001782051282051282, + "logits/chosen": 1.075786828994751, + "logits/rejected": 1.056262731552124, + "logps/chosen": -545.98876953125, + "logps/rejected": -626.4110717773438, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6408146023750305, + "rewards/margins": 11.502391815185547, + "rewards/rejected": -10.861577033996582, + "step": 363 + }, + { + "epoch": 9.32, + "learning_rate": 0.00017777777777777776, + "logits/chosen": 1.127282738685608, + "logits/rejected": 1.0822765827178955, + "logps/chosen": -518.8118286132812, + "logps/rejected": -693.237548828125, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.05411094054579735, + "rewards/margins": 11.227949142456055, + "rewards/rejected": -11.282060623168945, + "step": 364 + }, + { + "epoch": 9.34, + "learning_rate": 0.00017735042735042734, + "logits/chosen": 1.1585055589675903, + "logits/rejected": 1.031551718711853, + "logps/chosen": -549.4298706054688, + "logps/rejected": -594.13720703125, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8158806562423706, + "rewards/margins": 10.627920150756836, + "rewards/rejected": -8.81203842163086, + "step": 365 + }, + { + "epoch": 9.37, + "learning_rate": 0.0001769230769230769, + "logits/chosen": 1.2148640155792236, + "logits/rejected": 1.1051499843597412, + "logps/chosen": -618.556884765625, + "logps/rejected": -655.5272827148438, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7870438098907471, + "rewards/margins": 10.324081420898438, + "rewards/rejected": -9.537036895751953, + "step": 366 + }, + { + "epoch": 9.4, + "learning_rate": 0.00017649572649572646, + "logits/chosen": 1.0680745840072632, + "logits/rejected": 0.9647752046585083, + "logps/chosen": -520.0435791015625, + "logps/rejected": -650.8917236328125, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.3966693878173828, + "rewards/margins": 11.41585922241211, + "rewards/rejected": -10.019189834594727, + "step": 367 + }, + { + "epoch": 9.42, + "learning_rate": 0.00017606837606837605, + "logits/chosen": 0.974884033203125, + "logits/rejected": 0.9622063636779785, + "logps/chosen": -551.198486328125, + "logps/rejected": -553.2161865234375, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7198470830917358, + "rewards/margins": 10.737578392028809, + "rewards/rejected": -9.017731666564941, + "step": 368 + }, + { + "epoch": 9.45, + "learning_rate": 0.00017564102564102563, + "logits/chosen": 1.1338629722595215, + "logits/rejected": 1.0500080585479736, + "logps/chosen": -519.1705322265625, + "logps/rejected": -645.0868530273438, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2686916589736938, + "rewards/margins": 11.208242416381836, + "rewards/rejected": -9.939552307128906, + "step": 369 + }, + { + "epoch": 9.47, + "learning_rate": 0.0001752136752136752, + "logits/chosen": 1.1133869886398315, + "logits/rejected": 1.0957475900650024, + "logps/chosen": -509.5137939453125, + "logps/rejected": -634.0670776367188, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1420785188674927, + "rewards/margins": 10.913748741149902, + "rewards/rejected": -9.771669387817383, + "step": 370 + }, + { + "epoch": 9.5, + "learning_rate": 0.00017478632478632475, + "logits/chosen": 1.0674755573272705, + "logits/rejected": 1.0430006980895996, + "logps/chosen": -549.5125732421875, + "logps/rejected": -615.5984497070312, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9752965569496155, + "rewards/margins": 10.462637901306152, + "rewards/rejected": -9.487340927124023, + "step": 371 + }, + { + "epoch": 9.52, + "learning_rate": 0.00017435897435897436, + "logits/chosen": 1.0406773090362549, + "logits/rejected": 0.9974726438522339, + "logps/chosen": -554.4515380859375, + "logps/rejected": -671.939697265625, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2197209894657135, + "rewards/margins": 10.627167701721191, + "rewards/rejected": -10.40744686126709, + "step": 372 + }, + { + "epoch": 9.55, + "learning_rate": 0.00017393162393162392, + "logits/chosen": 1.0458401441574097, + "logits/rejected": 0.9767919182777405, + "logps/chosen": -508.51275634765625, + "logps/rejected": -617.0277099609375, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.3107894659042358, + "rewards/margins": 11.249536514282227, + "rewards/rejected": -9.938748359680176, + "step": 373 + }, + { + "epoch": 9.57, + "learning_rate": 0.00017350427350427348, + "logits/chosen": 1.1613295078277588, + "logits/rejected": 1.067457914352417, + "logps/chosen": -583.6506958007812, + "logps/rejected": -683.9884033203125, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.4901715517044067, + "rewards/margins": 11.270017623901367, + "rewards/rejected": -9.779845237731934, + "step": 374 + }, + { + "epoch": 9.6, + "learning_rate": 0.00017307692307692304, + "logits/chosen": 1.0676989555358887, + "logits/rejected": 1.048395037651062, + "logps/chosen": -553.3953857421875, + "logps/rejected": -713.2011108398438, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.901606559753418, + "rewards/margins": 11.026727676391602, + "rewards/rejected": -10.125120162963867, + "step": 375 + }, + { + "epoch": 9.63, + "learning_rate": 0.00017264957264957265, + "logits/chosen": 1.196079969406128, + "logits/rejected": 1.0154253244400024, + "logps/chosen": -588.3291015625, + "logps/rejected": -622.0186767578125, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1913900375366211, + "rewards/margins": 10.078536987304688, + "rewards/rejected": -9.887145042419434, + "step": 376 + }, + { + "epoch": 9.65, + "learning_rate": 0.0001722222222222222, + "logits/chosen": 1.0178331136703491, + "logits/rejected": 0.9668864011764526, + "logps/chosen": -546.368408203125, + "logps/rejected": -615.4050903320312, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2646788954734802, + "rewards/margins": 10.304734230041504, + "rewards/rejected": -10.040055274963379, + "step": 377 + }, + { + "epoch": 9.68, + "learning_rate": 0.00017179487179487177, + "logits/chosen": 1.1032341718673706, + "logits/rejected": 0.9035695791244507, + "logps/chosen": -604.7924194335938, + "logps/rejected": -652.8895874023438, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5645720958709717, + "rewards/margins": 11.598380088806152, + "rewards/rejected": -10.033807754516602, + "step": 378 + }, + { + "epoch": 9.7, + "learning_rate": 0.00017136752136752135, + "logits/chosen": 1.1787011623382568, + "logits/rejected": 0.9833663702011108, + "logps/chosen": -574.9915771484375, + "logps/rejected": -650.5631713867188, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0073214769363403, + "rewards/margins": 10.402047157287598, + "rewards/rejected": -9.394725799560547, + "step": 379 + }, + { + "epoch": 9.73, + "learning_rate": 0.00017094017094017094, + "logits/chosen": 1.1996289491653442, + "logits/rejected": 1.039535403251648, + "logps/chosen": -556.2608642578125, + "logps/rejected": -642.0311279296875, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.216808557510376, + "rewards/margins": 11.361566543579102, + "rewards/rejected": -10.144757270812988, + "step": 380 + }, + { + "epoch": 9.75, + "learning_rate": 0.0001705128205128205, + "logits/chosen": 1.1434451341629028, + "logits/rejected": 1.051792860031128, + "logps/chosen": -493.62506103515625, + "logps/rejected": -622.7883911132812, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9653478860855103, + "rewards/margins": 10.048986434936523, + "rewards/rejected": -9.083638191223145, + "step": 381 + }, + { + "epoch": 9.78, + "learning_rate": 0.00017008547008547006, + "logits/chosen": 1.2439961433410645, + "logits/rejected": 0.9698901772499084, + "logps/chosen": -590.2651977539062, + "logps/rejected": -601.9655151367188, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7650651931762695, + "rewards/margins": 10.54482650756836, + "rewards/rejected": -8.779762268066406, + "step": 382 + }, + { + "epoch": 9.8, + "learning_rate": 0.00016965811965811964, + "logits/chosen": 1.0514246225357056, + "logits/rejected": 1.0069361925125122, + "logps/chosen": -528.1588745117188, + "logps/rejected": -668.963623046875, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9377299547195435, + "rewards/margins": 11.108814239501953, + "rewards/rejected": -10.171082496643066, + "step": 383 + }, + { + "epoch": 9.83, + "learning_rate": 0.0001692307692307692, + "logits/chosen": 1.0299386978149414, + "logits/rejected": 0.9614180326461792, + "logps/chosen": -461.4678955078125, + "logps/rejected": -622.8182373046875, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3360067903995514, + "rewards/margins": 11.178654670715332, + "rewards/rejected": -10.842646598815918, + "step": 384 + }, + { + "epoch": 9.86, + "learning_rate": 0.0001688034188034188, + "logits/chosen": 1.069563627243042, + "logits/rejected": 0.9999057054519653, + "logps/chosen": -532.8282470703125, + "logps/rejected": -611.0808715820312, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6393276453018188, + "rewards/margins": 11.510586738586426, + "rewards/rejected": -9.871259689331055, + "step": 385 + }, + { + "epoch": 9.88, + "learning_rate": 0.00016837606837606837, + "logits/chosen": 1.0466080904006958, + "logits/rejected": 0.9922081232070923, + "logps/chosen": -516.3175048828125, + "logps/rejected": -622.3922729492188, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9020825028419495, + "rewards/margins": 11.032547950744629, + "rewards/rejected": -10.13046646118164, + "step": 386 + }, + { + "epoch": 9.91, + "learning_rate": 0.00016794871794871793, + "logits/chosen": 1.068638801574707, + "logits/rejected": 1.0634409189224243, + "logps/chosen": -518.2888793945312, + "logps/rejected": -700.1104736328125, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8084597587585449, + "rewards/margins": 11.80289363861084, + "rewards/rejected": -10.994433403015137, + "step": 387 + }, + { + "epoch": 9.93, + "learning_rate": 0.0001675213675213675, + "logits/chosen": 1.252096176147461, + "logits/rejected": 1.12416672706604, + "logps/chosen": -559.9757080078125, + "logps/rejected": -640.3218994140625, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.250156283378601, + "rewards/margins": 10.136184692382812, + "rewards/rejected": -8.886027336120605, + "step": 388 + }, + { + "epoch": 9.96, + "learning_rate": 0.00016709401709401708, + "logits/chosen": 1.0118858814239502, + "logits/rejected": 1.0544030666351318, + "logps/chosen": -550.706298828125, + "logps/rejected": -651.165283203125, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7763742208480835, + "rewards/margins": 11.286043167114258, + "rewards/rejected": -10.50966739654541, + "step": 389 + }, + { + "epoch": 9.98, + "learning_rate": 0.00016666666666666666, + "logits/chosen": 1.2127994298934937, + "logits/rejected": 0.9555975198745728, + "logps/chosen": -586.635009765625, + "logps/rejected": -619.5669555664062, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7170848846435547, + "rewards/margins": 11.147682189941406, + "rewards/rejected": -9.430597305297852, + "step": 390 + }, + { + "epoch": 10.01, + "learning_rate": 0.00016623931623931622, + "logits/chosen": 0.9974936246871948, + "logits/rejected": 0.98292076587677, + "logps/chosen": -537.190185546875, + "logps/rejected": -637.2457275390625, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7226758599281311, + "rewards/margins": 10.400409698486328, + "rewards/rejected": -9.677732467651367, + "step": 391 + }, + { + "epoch": 10.04, + "learning_rate": 0.00016581196581196578, + "logits/chosen": 0.9824466109275818, + "logits/rejected": 1.0442546606063843, + "logps/chosen": -524.7149047851562, + "logps/rejected": -685.2274169921875, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7316232919692993, + "rewards/margins": 12.584775924682617, + "rewards/rejected": -10.85315227508545, + "step": 392 + }, + { + "epoch": 10.06, + "learning_rate": 0.0001653846153846154, + "logits/chosen": 1.0178762674331665, + "logits/rejected": 1.0314674377441406, + "logps/chosen": -530.5228271484375, + "logps/rejected": -601.8655395507812, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4628024697303772, + "rewards/margins": 9.492361068725586, + "rewards/rejected": -9.029558181762695, + "step": 393 + }, + { + "epoch": 10.09, + "learning_rate": 0.00016495726495726495, + "logits/chosen": 1.1514480113983154, + "logits/rejected": 0.9793822765350342, + "logps/chosen": -569.4765625, + "logps/rejected": -589.162109375, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.2061307430267334, + "rewards/margins": 11.210150718688965, + "rewards/rejected": -9.004018783569336, + "step": 394 + }, + { + "epoch": 10.11, + "learning_rate": 0.0001645299145299145, + "logits/chosen": 1.008737564086914, + "logits/rejected": 0.9937309622764587, + "logps/chosen": -473.2950134277344, + "logps/rejected": -667.3652954101562, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11462172865867615, + "rewards/margins": 11.188936233520508, + "rewards/rejected": -11.303558349609375, + "step": 395 + }, + { + "epoch": 10.14, + "learning_rate": 0.0001641025641025641, + "logits/chosen": 1.1840193271636963, + "logits/rejected": 0.9543963670730591, + "logps/chosen": -582.7732543945312, + "logps/rejected": -608.0531616210938, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1386045217514038, + "rewards/margins": 10.291077613830566, + "rewards/rejected": -9.152473449707031, + "step": 396 + }, + { + "epoch": 10.16, + "learning_rate": 0.00016367521367521368, + "logits/chosen": 1.157091736793518, + "logits/rejected": 1.1324167251586914, + "logps/chosen": -568.5440063476562, + "logps/rejected": -697.5151977539062, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1107374429702759, + "rewards/margins": 11.52519702911377, + "rewards/rejected": -10.414460182189941, + "step": 397 + }, + { + "epoch": 10.19, + "learning_rate": 0.00016324786324786324, + "logits/chosen": 1.0599353313446045, + "logits/rejected": 0.8371300101280212, + "logps/chosen": -523.8178100585938, + "logps/rejected": -568.3763427734375, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7378228306770325, + "rewards/margins": 10.31213092803955, + "rewards/rejected": -9.574308395385742, + "step": 398 + }, + { + "epoch": 10.21, + "learning_rate": 0.0001628205128205128, + "logits/chosen": 1.1668627262115479, + "logits/rejected": 1.1840052604675293, + "logps/chosen": -513.0997314453125, + "logps/rejected": -679.572998046875, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.532392680644989, + "rewards/margins": 10.64517593383789, + "rewards/rejected": -10.112783432006836, + "step": 399 + }, + { + "epoch": 10.24, + "learning_rate": 0.00016239316239316238, + "logits/chosen": 1.058180570602417, + "logits/rejected": 0.9799319505691528, + "logps/chosen": -521.548095703125, + "logps/rejected": -572.1917724609375, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7191563844680786, + "rewards/margins": 9.96951675415039, + "rewards/rejected": -8.250360488891602, + "step": 400 + } + ], + "logging_steps": 1, + "max_steps": 780, + "num_input_tokens_seen": 0, + "num_train_epochs": 20, + "save_steps": 50, + "total_flos": 0.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}