cosmetic-test / trainer_state.json
seyeon-shijuan's picture
Upload folder using huggingface_hub
949447e verified
raw
history blame
No virus
185 kB
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 10.24,
"eval_steps": 500,
"global_step": 400,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.03,
"learning_rate": 3.846153846153846e-06,
"logits/chosen": 1.3807897567749023,
"logits/rejected": 1.1952139139175415,
"logps/chosen": -589.1343994140625,
"logps/rejected": -494.7060241699219,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 1
},
{
"epoch": 0.05,
"learning_rate": 7.692307692307692e-06,
"logits/chosen": 1.2665337324142456,
"logits/rejected": 1.1713109016418457,
"logps/chosen": -559.9566650390625,
"logps/rejected": -549.1146850585938,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 2
},
{
"epoch": 0.08,
"learning_rate": 1.1538461538461538e-05,
"logits/chosen": 1.3811347484588623,
"logits/rejected": 1.216629981994629,
"logps/chosen": -559.24951171875,
"logps/rejected": -481.5151672363281,
"loss": 0.6956,
"rewards/accuracies": 0.375,
"rewards/chosen": -0.002765989163890481,
"rewards/margins": -0.004863501060754061,
"rewards/rejected": 0.002097511198371649,
"step": 3
},
{
"epoch": 0.1,
"learning_rate": 1.5384615384615384e-05,
"logits/chosen": 1.3156853914260864,
"logits/rejected": 1.2506608963012695,
"logps/chosen": -554.9755249023438,
"logps/rejected": -558.6537475585938,
"loss": 0.6917,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.005945444107055664,
"rewards/margins": 0.003013968002051115,
"rewards/rejected": 0.0029314758721739054,
"step": 4
},
{
"epoch": 0.13,
"learning_rate": 1.9230769230769228e-05,
"logits/chosen": 1.3257012367248535,
"logits/rejected": 1.223274827003479,
"logps/chosen": -523.537109375,
"logps/rejected": -585.207763671875,
"loss": 0.6902,
"rewards/accuracies": 0.5625,
"rewards/chosen": 0.0114411236718297,
"rewards/margins": 0.005982697010040283,
"rewards/rejected": 0.005458426661789417,
"step": 5
},
{
"epoch": 0.15,
"learning_rate": 2.3076923076923076e-05,
"logits/chosen": 1.2537128925323486,
"logits/rejected": 1.1953891515731812,
"logps/chosen": -545.0478515625,
"logps/rejected": -489.9993896484375,
"loss": 0.6881,
"rewards/accuracies": 0.59375,
"rewards/chosen": 0.015848731622099876,
"rewards/margins": 0.01023783627897501,
"rewards/rejected": 0.0056108953431248665,
"step": 6
},
{
"epoch": 0.18,
"learning_rate": 2.692307692307692e-05,
"logits/chosen": 1.312497854232788,
"logits/rejected": 1.2055679559707642,
"logps/chosen": -545.1070556640625,
"logps/rejected": -509.77001953125,
"loss": 0.6864,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.02675754949450493,
"rewards/margins": 0.013801073655486107,
"rewards/rejected": 0.012956475839018822,
"step": 7
},
{
"epoch": 0.2,
"learning_rate": 3.076923076923077e-05,
"logits/chosen": 1.2600666284561157,
"logits/rejected": 1.1770424842834473,
"logps/chosen": -539.1686401367188,
"logps/rejected": -483.5211181640625,
"loss": 0.6871,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.033090125769376755,
"rewards/margins": 0.012454044073820114,
"rewards/rejected": 0.02063608169555664,
"step": 8
},
{
"epoch": 0.23,
"learning_rate": 3.461538461538461e-05,
"logits/chosen": 1.3227227926254272,
"logits/rejected": 1.2356113195419312,
"logps/chosen": -565.9663696289062,
"logps/rejected": -622.9007568359375,
"loss": 0.6839,
"rewards/accuracies": 0.5625,
"rewards/chosen": 0.04537828266620636,
"rewards/margins": 0.019218124449253082,
"rewards/rejected": 0.026160158216953278,
"step": 9
},
{
"epoch": 0.26,
"learning_rate": 3.8461538461538456e-05,
"logits/chosen": 1.3510740995407104,
"logits/rejected": 1.2296315431594849,
"logps/chosen": -593.7393798828125,
"logps/rejected": -594.9727172851562,
"loss": 0.6723,
"rewards/accuracies": 0.59375,
"rewards/chosen": 0.07614221423864365,
"rewards/margins": 0.0457853302359581,
"rewards/rejected": 0.030356884002685547,
"step": 10
},
{
"epoch": 0.28,
"learning_rate": 4.23076923076923e-05,
"logits/chosen": 1.2242865562438965,
"logits/rejected": 1.2467900514602661,
"logps/chosen": -508.84783935546875,
"logps/rejected": -598.6978759765625,
"loss": 0.6674,
"rewards/accuracies": 0.65625,
"rewards/chosen": 0.08538543432950974,
"rewards/margins": 0.05763913318514824,
"rewards/rejected": 0.027746297419071198,
"step": 11
},
{
"epoch": 0.31,
"learning_rate": 4.615384615384615e-05,
"logits/chosen": 1.3490934371948242,
"logits/rejected": 1.301114559173584,
"logps/chosen": -570.9324340820312,
"logps/rejected": -578.1993408203125,
"loss": 0.6625,
"rewards/accuracies": 0.6875,
"rewards/chosen": 0.09820537269115448,
"rewards/margins": 0.06798899918794632,
"rewards/rejected": 0.030216386541724205,
"step": 12
},
{
"epoch": 0.33,
"learning_rate": 4.9999999999999996e-05,
"logits/chosen": 1.31833016872406,
"logits/rejected": 1.2298263311386108,
"logps/chosen": -498.8169860839844,
"logps/rejected": -478.9023742675781,
"loss": 0.6617,
"rewards/accuracies": 0.6875,
"rewards/chosen": 0.11317408084869385,
"rewards/margins": 0.06994115561246872,
"rewards/rejected": 0.043232932686805725,
"step": 13
},
{
"epoch": 0.36,
"learning_rate": 5.384615384615384e-05,
"logits/chosen": 1.3441507816314697,
"logits/rejected": 1.2419227361679077,
"logps/chosen": -570.9219970703125,
"logps/rejected": -512.0221557617188,
"loss": 0.6168,
"rewards/accuracies": 0.8125,
"rewards/chosen": 0.13483530282974243,
"rewards/margins": 0.17350149154663086,
"rewards/rejected": -0.03866620361804962,
"step": 14
},
{
"epoch": 0.38,
"learning_rate": 5.769230769230769e-05,
"logits/chosen": 1.301425814628601,
"logits/rejected": 1.2630890607833862,
"logps/chosen": -525.2069091796875,
"logps/rejected": -515.6624145507812,
"loss": 0.6319,
"rewards/accuracies": 0.65625,
"rewards/chosen": 0.14281943440437317,
"rewards/margins": 0.1536417305469513,
"rewards/rejected": -0.010822296142578125,
"step": 15
},
{
"epoch": 0.41,
"learning_rate": 6.153846153846154e-05,
"logits/chosen": 1.3221077919006348,
"logits/rejected": 1.2412704229354858,
"logps/chosen": -522.0698852539062,
"logps/rejected": -479.1268615722656,
"loss": 0.6041,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.18193425238132477,
"rewards/margins": 0.20868375897407532,
"rewards/rejected": -0.026749493554234505,
"step": 16
},
{
"epoch": 0.44,
"learning_rate": 6.538461538461539e-05,
"logits/chosen": 1.2305197715759277,
"logits/rejected": 1.1909728050231934,
"logps/chosen": -591.547607421875,
"logps/rejected": -501.61956787109375,
"loss": 0.632,
"rewards/accuracies": 0.6875,
"rewards/chosen": 0.22359740734100342,
"rewards/margins": 0.16370725631713867,
"rewards/rejected": 0.059890177100896835,
"step": 17
},
{
"epoch": 0.46,
"learning_rate": 6.923076923076922e-05,
"logits/chosen": 1.2290271520614624,
"logits/rejected": 1.2503975629806519,
"logps/chosen": -583.2138671875,
"logps/rejected": -551.908447265625,
"loss": 0.5782,
"rewards/accuracies": 0.84375,
"rewards/chosen": 0.25082916021347046,
"rewards/margins": 0.29117509722709656,
"rewards/rejected": -0.04034590348601341,
"step": 18
},
{
"epoch": 0.49,
"learning_rate": 7.307692307692307e-05,
"logits/chosen": 1.2445173263549805,
"logits/rejected": 1.274112582206726,
"logps/chosen": -476.1685791015625,
"logps/rejected": -558.716796875,
"loss": 0.6356,
"rewards/accuracies": 0.65625,
"rewards/chosen": 0.2891727685928345,
"rewards/margins": 0.18064022064208984,
"rewards/rejected": 0.10853258520364761,
"step": 19
},
{
"epoch": 0.51,
"learning_rate": 7.692307692307691e-05,
"logits/chosen": 1.3747544288635254,
"logits/rejected": 1.1747362613677979,
"logps/chosen": -604.9407348632812,
"logps/rejected": -521.9097900390625,
"loss": 0.5778,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.3479543924331665,
"rewards/margins": 0.2904755175113678,
"rewards/rejected": 0.05747886002063751,
"step": 20
},
{
"epoch": 0.54,
"learning_rate": 8.076923076923076e-05,
"logits/chosen": 1.215498447418213,
"logits/rejected": 1.1988316774368286,
"logps/chosen": -508.58685302734375,
"logps/rejected": -480.9490051269531,
"loss": 0.6143,
"rewards/accuracies": 0.65625,
"rewards/chosen": 0.3200737535953522,
"rewards/margins": 0.22501060366630554,
"rewards/rejected": 0.09506310522556305,
"step": 21
},
{
"epoch": 0.56,
"learning_rate": 8.46153846153846e-05,
"logits/chosen": 1.2625510692596436,
"logits/rejected": 1.2751553058624268,
"logps/chosen": -464.3768615722656,
"logps/rejected": -548.1248779296875,
"loss": 0.6585,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.2795701324939728,
"rewards/margins": 0.18192264437675476,
"rewards/rejected": 0.09764745086431503,
"step": 22
},
{
"epoch": 0.59,
"learning_rate": 8.846153846153845e-05,
"logits/chosen": 1.228266716003418,
"logits/rejected": 1.1854723691940308,
"logps/chosen": -542.0804443359375,
"logps/rejected": -593.2991943359375,
"loss": 0.5038,
"rewards/accuracies": 0.78125,
"rewards/chosen": 0.5573378801345825,
"rewards/margins": 0.559643566608429,
"rewards/rejected": -0.0023057162761688232,
"step": 23
},
{
"epoch": 0.61,
"learning_rate": 9.23076923076923e-05,
"logits/chosen": 1.3595786094665527,
"logits/rejected": 1.299391746520996,
"logps/chosen": -598.842041015625,
"logps/rejected": -521.9869384765625,
"loss": 0.58,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.3911605179309845,
"rewards/margins": 0.45501241087913513,
"rewards/rejected": -0.06385190039873123,
"step": 24
},
{
"epoch": 0.64,
"learning_rate": 9.615384615384615e-05,
"logits/chosen": 1.244804859161377,
"logits/rejected": 1.2789154052734375,
"logps/chosen": -527.1282958984375,
"logps/rejected": -562.9415283203125,
"loss": 0.5638,
"rewards/accuracies": 0.71875,
"rewards/chosen": 0.3064769506454468,
"rewards/margins": 0.4528440833091736,
"rewards/rejected": -0.1463671177625656,
"step": 25
},
{
"epoch": 0.67,
"learning_rate": 9.999999999999999e-05,
"logits/chosen": 1.2605584859848022,
"logits/rejected": 1.257567048072815,
"logps/chosen": -518.2035522460938,
"logps/rejected": -553.550537109375,
"loss": 0.5622,
"rewards/accuracies": 0.65625,
"rewards/chosen": 0.3198173940181732,
"rewards/margins": 0.533422589302063,
"rewards/rejected": -0.21360518038272858,
"step": 26
},
{
"epoch": 0.69,
"learning_rate": 0.00010384615384615383,
"logits/chosen": 1.2905932664871216,
"logits/rejected": 1.252805233001709,
"logps/chosen": -502.51318359375,
"logps/rejected": -492.2623596191406,
"loss": 0.5688,
"rewards/accuracies": 0.8125,
"rewards/chosen": 0.2552236318588257,
"rewards/margins": 0.4563944339752197,
"rewards/rejected": -0.20117078721523285,
"step": 27
},
{
"epoch": 0.72,
"learning_rate": 0.00010769230769230768,
"logits/chosen": 1.360573172569275,
"logits/rejected": 1.246628999710083,
"logps/chosen": -593.9207153320312,
"logps/rejected": -521.7044677734375,
"loss": 0.4958,
"rewards/accuracies": 0.78125,
"rewards/chosen": 0.24957714974880219,
"rewards/margins": 0.6095183491706848,
"rewards/rejected": -0.35994118452072144,
"step": 28
},
{
"epoch": 0.74,
"learning_rate": 0.00011153846153846153,
"logits/chosen": 1.3290568590164185,
"logits/rejected": 1.1086769104003906,
"logps/chosen": -588.37451171875,
"logps/rejected": -555.8126220703125,
"loss": 0.4523,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.22330154478549957,
"rewards/margins": 0.8921126127243042,
"rewards/rejected": -0.6688110828399658,
"step": 29
},
{
"epoch": 0.77,
"learning_rate": 0.00011538461538461538,
"logits/chosen": 1.283523678779602,
"logits/rejected": 1.2930572032928467,
"logps/chosen": -533.7445678710938,
"logps/rejected": -590.4415283203125,
"loss": 0.4839,
"rewards/accuracies": 0.78125,
"rewards/chosen": 0.2506100535392761,
"rewards/margins": 0.8535110950469971,
"rewards/rejected": -0.6029011011123657,
"step": 30
},
{
"epoch": 0.79,
"learning_rate": 0.00011923076923076922,
"logits/chosen": 1.1903033256530762,
"logits/rejected": 1.2316640615463257,
"logps/chosen": -544.7385864257812,
"logps/rejected": -559.547607421875,
"loss": 0.453,
"rewards/accuracies": 0.875,
"rewards/chosen": 0.16801050305366516,
"rewards/margins": 0.6988197565078735,
"rewards/rejected": -0.530809223651886,
"step": 31
},
{
"epoch": 0.82,
"learning_rate": 0.00012307692307692307,
"logits/chosen": 1.2974334955215454,
"logits/rejected": 1.2153936624526978,
"logps/chosen": -569.9619140625,
"logps/rejected": -532.5298461914062,
"loss": 0.3596,
"rewards/accuracies": 0.84375,
"rewards/chosen": 0.41700971126556396,
"rewards/margins": 1.2314927577972412,
"rewards/rejected": -0.814483106136322,
"step": 32
},
{
"epoch": 0.84,
"learning_rate": 0.0001269230769230769,
"logits/chosen": 1.20902681350708,
"logits/rejected": 1.1872200965881348,
"logps/chosen": -517.0770263671875,
"logps/rejected": -496.25469970703125,
"loss": 0.3913,
"rewards/accuracies": 0.8125,
"rewards/chosen": 0.6890252828598022,
"rewards/margins": 1.2427911758422852,
"rewards/rejected": -0.5537658929824829,
"step": 33
},
{
"epoch": 0.87,
"learning_rate": 0.00013076923076923077,
"logits/chosen": 1.1348516941070557,
"logits/rejected": 1.2495156526565552,
"logps/chosen": -473.9527893066406,
"logps/rejected": -583.898193359375,
"loss": 0.3421,
"rewards/accuracies": 0.84375,
"rewards/chosen": 0.3356397747993469,
"rewards/margins": 1.326012134552002,
"rewards/rejected": -0.9903723001480103,
"step": 34
},
{
"epoch": 0.9,
"learning_rate": 0.0001346153846153846,
"logits/chosen": 1.199057698249817,
"logits/rejected": 1.2061611413955688,
"logps/chosen": -508.8526611328125,
"logps/rejected": -528.3182983398438,
"loss": 0.5245,
"rewards/accuracies": 0.71875,
"rewards/chosen": 0.07617084681987762,
"rewards/margins": 0.7057029008865356,
"rewards/rejected": -0.6295321583747864,
"step": 35
},
{
"epoch": 0.92,
"learning_rate": 0.00013846153846153845,
"logits/chosen": 1.324285864830017,
"logits/rejected": 1.1698598861694336,
"logps/chosen": -636.7570190429688,
"logps/rejected": -569.9312744140625,
"loss": 0.3545,
"rewards/accuracies": 0.875,
"rewards/chosen": 0.5845645666122437,
"rewards/margins": 1.4404823780059814,
"rewards/rejected": -0.855917751789093,
"step": 36
},
{
"epoch": 0.95,
"learning_rate": 0.00014230769230769228,
"logits/chosen": 1.3375169038772583,
"logits/rejected": 1.226860761642456,
"logps/chosen": -550.1502685546875,
"logps/rejected": -568.7561645507812,
"loss": 0.2948,
"rewards/accuracies": 0.9375,
"rewards/chosen": 0.7275359034538269,
"rewards/margins": 1.519990086555481,
"rewards/rejected": -0.7924542427062988,
"step": 37
},
{
"epoch": 0.97,
"learning_rate": 0.00014615384615384615,
"logits/chosen": 1.3178166151046753,
"logits/rejected": 1.3225042819976807,
"logps/chosen": -583.7266235351562,
"logps/rejected": -640.8178100585938,
"loss": 0.2679,
"rewards/accuracies": 0.9375,
"rewards/chosen": 1.0492963790893555,
"rewards/margins": 1.9212793111801147,
"rewards/rejected": -0.8719831109046936,
"step": 38
},
{
"epoch": 1.0,
"learning_rate": 0.00015,
"logits/chosen": 1.424353837966919,
"logits/rejected": 1.2278132438659668,
"logps/chosen": -576.828125,
"logps/rejected": -532.112060546875,
"loss": 0.409,
"rewards/accuracies": 0.90625,
"rewards/chosen": 1.294251561164856,
"rewards/margins": 1.8459293842315674,
"rewards/rejected": -0.5516780018806458,
"step": 39
},
{
"epoch": 1.02,
"learning_rate": 0.00015384615384615382,
"logits/chosen": 1.3266693353652954,
"logits/rejected": 1.4058163166046143,
"logps/chosen": -523.482421875,
"logps/rejected": -534.4713134765625,
"loss": 0.2549,
"rewards/accuracies": 0.9375,
"rewards/chosen": 0.9553894996643066,
"rewards/margins": 1.72726571559906,
"rewards/rejected": -0.7718762159347534,
"step": 40
},
{
"epoch": 1.05,
"learning_rate": 0.0001576923076923077,
"logits/chosen": 1.2602558135986328,
"logits/rejected": 1.2625583410263062,
"logps/chosen": -522.94287109375,
"logps/rejected": -609.2876586914062,
"loss": 0.181,
"rewards/accuracies": 0.96875,
"rewards/chosen": 1.3178558349609375,
"rewards/margins": 2.475386381149292,
"rewards/rejected": -1.1575307846069336,
"step": 41
},
{
"epoch": 1.08,
"learning_rate": 0.00016153846153846153,
"logits/chosen": 1.2560456991195679,
"logits/rejected": 1.3124988079071045,
"logps/chosen": -557.4879150390625,
"logps/rejected": -604.8274536132812,
"loss": 0.2405,
"rewards/accuracies": 0.9375,
"rewards/chosen": 1.4575282335281372,
"rewards/margins": 2.372474193572998,
"rewards/rejected": -0.9149457812309265,
"step": 42
},
{
"epoch": 1.1,
"learning_rate": 0.0001653846153846154,
"logits/chosen": 1.2838218212127686,
"logits/rejected": 1.0645534992218018,
"logps/chosen": -572.41357421875,
"logps/rejected": -516.2637939453125,
"loss": 0.1404,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.6796542406082153,
"rewards/margins": 2.779283285140991,
"rewards/rejected": -1.0996291637420654,
"step": 43
},
{
"epoch": 1.13,
"learning_rate": 0.0001692307692307692,
"logits/chosen": 1.1537644863128662,
"logits/rejected": 1.1177821159362793,
"logps/chosen": -455.26904296875,
"logps/rejected": -498.28900146484375,
"loss": 0.1726,
"rewards/accuracies": 0.96875,
"rewards/chosen": 1.2386869192123413,
"rewards/margins": 2.298076868057251,
"rewards/rejected": -1.0593899488449097,
"step": 44
},
{
"epoch": 1.15,
"learning_rate": 0.00017307692307692304,
"logits/chosen": 1.2277421951293945,
"logits/rejected": 1.1039767265319824,
"logps/chosen": -510.9812927246094,
"logps/rejected": -503.68829345703125,
"loss": 0.2239,
"rewards/accuracies": 0.9375,
"rewards/chosen": 1.2717504501342773,
"rewards/margins": 2.59214186668396,
"rewards/rejected": -1.3203915357589722,
"step": 45
},
{
"epoch": 1.18,
"learning_rate": 0.0001769230769230769,
"logits/chosen": 1.2631361484527588,
"logits/rejected": 1.221813440322876,
"logps/chosen": -528.2451171875,
"logps/rejected": -565.52783203125,
"loss": 0.1884,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.7057870626449585,
"rewards/margins": 2.7309272289276123,
"rewards/rejected": -1.025140404701233,
"step": 46
},
{
"epoch": 1.2,
"learning_rate": 0.00018076923076923074,
"logits/chosen": 1.1752557754516602,
"logits/rejected": 1.2416189908981323,
"logps/chosen": -501.43609619140625,
"logps/rejected": -574.4725341796875,
"loss": 0.2208,
"rewards/accuracies": 0.90625,
"rewards/chosen": 1.349045753479004,
"rewards/margins": 2.619417190551758,
"rewards/rejected": -1.2703715562820435,
"step": 47
},
{
"epoch": 1.23,
"learning_rate": 0.0001846153846153846,
"logits/chosen": 1.2255234718322754,
"logits/rejected": 1.1879360675811768,
"logps/chosen": -533.389404296875,
"logps/rejected": -574.5704956054688,
"loss": 0.2814,
"rewards/accuracies": 0.875,
"rewards/chosen": 1.4104804992675781,
"rewards/margins": 3.444915294647217,
"rewards/rejected": -2.0344350337982178,
"step": 48
},
{
"epoch": 1.25,
"learning_rate": 0.00018846153846153844,
"logits/chosen": 1.3353238105773926,
"logits/rejected": 1.133821964263916,
"logps/chosen": -516.5598754882812,
"logps/rejected": -498.43603515625,
"loss": 0.1799,
"rewards/accuracies": 0.90625,
"rewards/chosen": 1.9517850875854492,
"rewards/margins": 3.3812241554260254,
"rewards/rejected": -1.4294389486312866,
"step": 49
},
{
"epoch": 1.28,
"learning_rate": 0.0001923076923076923,
"logits/chosen": 1.4486721754074097,
"logits/rejected": 1.2504708766937256,
"logps/chosen": -577.9517822265625,
"logps/rejected": -578.573974609375,
"loss": 0.1502,
"rewards/accuracies": 0.9375,
"rewards/chosen": 1.9723610877990723,
"rewards/margins": 3.7397069931030273,
"rewards/rejected": -1.767345666885376,
"step": 50
},
{
"epoch": 1.31,
"learning_rate": 0.00019615384615384615,
"logits/chosen": 1.156247854232788,
"logits/rejected": 1.1714026927947998,
"logps/chosen": -544.3547973632812,
"logps/rejected": -576.8724365234375,
"loss": 0.2114,
"rewards/accuracies": 0.90625,
"rewards/chosen": 1.1724624633789062,
"rewards/margins": 2.6890792846679688,
"rewards/rejected": -1.5166168212890625,
"step": 51
},
{
"epoch": 1.33,
"learning_rate": 0.00019999999999999998,
"logits/chosen": 1.144045352935791,
"logits/rejected": 1.1430819034576416,
"logps/chosen": -508.1917724609375,
"logps/rejected": -617.259521484375,
"loss": 0.1589,
"rewards/accuracies": 0.9375,
"rewards/chosen": 1.5832880735397339,
"rewards/margins": 4.405728816986084,
"rewards/rejected": -2.8224408626556396,
"step": 52
},
{
"epoch": 1.36,
"learning_rate": 0.00020384615384615385,
"logits/chosen": 1.2338566780090332,
"logits/rejected": 1.1481688022613525,
"logps/chosen": -531.331298828125,
"logps/rejected": -501.3189697265625,
"loss": 0.1972,
"rewards/accuracies": 0.90625,
"rewards/chosen": 0.7195085287094116,
"rewards/margins": 3.0136146545410156,
"rewards/rejected": -2.2941062450408936,
"step": 53
},
{
"epoch": 1.38,
"learning_rate": 0.00020769230769230766,
"logits/chosen": 1.2266112565994263,
"logits/rejected": 1.192571997642517,
"logps/chosen": -549.5713500976562,
"logps/rejected": -617.6051025390625,
"loss": 0.2518,
"rewards/accuracies": 0.875,
"rewards/chosen": 0.4716801345348358,
"rewards/margins": 4.073496341705322,
"rewards/rejected": -3.6018166542053223,
"step": 54
},
{
"epoch": 1.41,
"learning_rate": 0.00021153846153846152,
"logits/chosen": 1.1858152151107788,
"logits/rejected": 1.1164907217025757,
"logps/chosen": -556.048583984375,
"logps/rejected": -599.2432250976562,
"loss": 0.0601,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.6191356182098389,
"rewards/margins": 4.675748825073242,
"rewards/rejected": -4.056613445281982,
"step": 55
},
{
"epoch": 1.43,
"learning_rate": 0.00021538461538461536,
"logits/chosen": 1.3085862398147583,
"logits/rejected": 1.105547547340393,
"logps/chosen": -603.694580078125,
"logps/rejected": -638.7554931640625,
"loss": 0.1046,
"rewards/accuracies": 0.9375,
"rewards/chosen": 0.4260401725769043,
"rewards/margins": 4.424009799957275,
"rewards/rejected": -3.997969150543213,
"step": 56
},
{
"epoch": 1.46,
"learning_rate": 0.0002192307692307692,
"logits/chosen": 1.0858182907104492,
"logits/rejected": 1.0118762254714966,
"logps/chosen": -547.671875,
"logps/rejected": -617.9951782226562,
"loss": 0.222,
"rewards/accuracies": 0.90625,
"rewards/chosen": 0.33517026901245117,
"rewards/margins": 3.591820001602173,
"rewards/rejected": -3.2566497325897217,
"step": 57
},
{
"epoch": 1.48,
"learning_rate": 0.00022307692307692306,
"logits/chosen": 1.0582268238067627,
"logits/rejected": 1.1294262409210205,
"logps/chosen": -482.9873962402344,
"logps/rejected": -656.79736328125,
"loss": 0.0781,
"rewards/accuracies": 0.96875,
"rewards/chosen": 0.6922726035118103,
"rewards/margins": 4.7888617515563965,
"rewards/rejected": -4.0965895652771,
"step": 58
},
{
"epoch": 1.51,
"learning_rate": 0.0002269230769230769,
"logits/chosen": 1.1849119663238525,
"logits/rejected": 0.989042341709137,
"logps/chosen": -579.6500854492188,
"logps/rejected": -521.362060546875,
"loss": 0.3419,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.1423124074935913,
"rewards/margins": 2.5491743087768555,
"rewards/rejected": -2.6914870738983154,
"step": 59
},
{
"epoch": 1.54,
"learning_rate": 0.00023076923076923076,
"logits/chosen": 1.2060493230819702,
"logits/rejected": 1.0908496379852295,
"logps/chosen": -503.88165283203125,
"logps/rejected": -486.8570251464844,
"loss": 0.2709,
"rewards/accuracies": 0.9375,
"rewards/chosen": 0.47895151376724243,
"rewards/margins": 3.0553998947143555,
"rewards/rejected": -2.576448917388916,
"step": 60
},
{
"epoch": 1.56,
"learning_rate": 0.0002346153846153846,
"logits/chosen": 1.331544041633606,
"logits/rejected": 1.2061303853988647,
"logps/chosen": -567.0886840820312,
"logps/rejected": -555.7452392578125,
"loss": 0.1176,
"rewards/accuracies": 0.9375,
"rewards/chosen": 2.0240767002105713,
"rewards/margins": 3.8347549438476562,
"rewards/rejected": -1.810678243637085,
"step": 61
},
{
"epoch": 1.59,
"learning_rate": 0.00023846153846153844,
"logits/chosen": 1.4309509992599487,
"logits/rejected": 1.2126150131225586,
"logps/chosen": -569.264892578125,
"logps/rejected": -555.525390625,
"loss": 0.1622,
"rewards/accuracies": 0.9375,
"rewards/chosen": 1.7101458311080933,
"rewards/margins": 3.121368885040283,
"rewards/rejected": -1.4112231731414795,
"step": 62
},
{
"epoch": 1.61,
"learning_rate": 0.0002423076923076923,
"logits/chosen": 1.4115931987762451,
"logits/rejected": 1.3143726587295532,
"logps/chosen": -567.4575805664062,
"logps/rejected": -551.8060302734375,
"loss": 0.1243,
"rewards/accuracies": 0.96875,
"rewards/chosen": 2.250972270965576,
"rewards/margins": 3.4197261333465576,
"rewards/rejected": -1.168753981590271,
"step": 63
},
{
"epoch": 1.64,
"learning_rate": 0.00024615384615384614,
"logits/chosen": 1.4023628234863281,
"logits/rejected": 1.4432225227355957,
"logps/chosen": -561.3869018554688,
"logps/rejected": -625.6553344726562,
"loss": 0.137,
"rewards/accuracies": 0.96875,
"rewards/chosen": 2.7209982872009277,
"rewards/margins": 3.365004777908325,
"rewards/rejected": -0.6440060138702393,
"step": 64
},
{
"epoch": 1.66,
"learning_rate": 0.00025,
"logits/chosen": 1.403619647026062,
"logits/rejected": 1.3114700317382812,
"logps/chosen": -557.41796875,
"logps/rejected": -585.562744140625,
"loss": 0.0987,
"rewards/accuracies": 0.96875,
"rewards/chosen": 2.93491268157959,
"rewards/margins": 3.9586920738220215,
"rewards/rejected": -1.0237791538238525,
"step": 65
},
{
"epoch": 1.69,
"learning_rate": 0.0002538461538461538,
"logits/chosen": 1.2774848937988281,
"logits/rejected": 1.391822338104248,
"logps/chosen": -501.67236328125,
"logps/rejected": -607.6801147460938,
"loss": 0.1317,
"rewards/accuracies": 0.96875,
"rewards/chosen": 2.4067182540893555,
"rewards/margins": 3.5049643516540527,
"rewards/rejected": -1.0982458591461182,
"step": 66
},
{
"epoch": 1.72,
"learning_rate": 0.0002576923076923077,
"logits/chosen": 1.2345640659332275,
"logits/rejected": 1.3340271711349487,
"logps/chosen": -502.60589599609375,
"logps/rejected": -594.7681884765625,
"loss": 0.1882,
"rewards/accuracies": 0.9375,
"rewards/chosen": 1.6566812992095947,
"rewards/margins": 3.0219056606292725,
"rewards/rejected": -1.3652244806289673,
"step": 67
},
{
"epoch": 1.74,
"learning_rate": 0.00026153846153846154,
"logits/chosen": 1.3136622905731201,
"logits/rejected": 1.1759097576141357,
"logps/chosen": -554.1909790039062,
"logps/rejected": -477.70074462890625,
"loss": 0.1877,
"rewards/accuracies": 0.90625,
"rewards/chosen": 1.4338759183883667,
"rewards/margins": 3.288071393966675,
"rewards/rejected": -1.854195237159729,
"step": 68
},
{
"epoch": 1.77,
"learning_rate": 0.00026538461538461536,
"logits/chosen": 1.3607081174850464,
"logits/rejected": 1.173765778541565,
"logps/chosen": -581.3910522460938,
"logps/rejected": -505.4391784667969,
"loss": 0.1424,
"rewards/accuracies": 0.9375,
"rewards/chosen": 1.8081064224243164,
"rewards/margins": 4.335569858551025,
"rewards/rejected": -2.527463436126709,
"step": 69
},
{
"epoch": 1.79,
"learning_rate": 0.0002692307692307692,
"logits/chosen": 1.3691301345825195,
"logits/rejected": 1.3659199476242065,
"logps/chosen": -583.293212890625,
"logps/rejected": -593.8999633789062,
"loss": 0.1304,
"rewards/accuracies": 0.96875,
"rewards/chosen": 1.1818904876708984,
"rewards/margins": 3.6441099643707275,
"rewards/rejected": -2.46221923828125,
"step": 70
},
{
"epoch": 1.82,
"learning_rate": 0.00027307692307692303,
"logits/chosen": 1.1499935388565063,
"logits/rejected": 1.1766197681427002,
"logps/chosen": -482.293701171875,
"logps/rejected": -570.5851440429688,
"loss": 0.0857,
"rewards/accuracies": 0.96875,
"rewards/chosen": 1.165338158607483,
"rewards/margins": 3.8725168704986572,
"rewards/rejected": -2.7071785926818848,
"step": 71
},
{
"epoch": 1.84,
"learning_rate": 0.0002769230769230769,
"logits/chosen": 1.325207233428955,
"logits/rejected": 1.1687246561050415,
"logps/chosen": -571.1265869140625,
"logps/rejected": -600.61865234375,
"loss": 0.1814,
"rewards/accuracies": 0.9375,
"rewards/chosen": 1.56050443649292,
"rewards/margins": 3.7676329612731934,
"rewards/rejected": -2.2071282863616943,
"step": 72
},
{
"epoch": 1.87,
"learning_rate": 0.00028076923076923076,
"logits/chosen": 1.3450841903686523,
"logits/rejected": 1.151707410812378,
"logps/chosen": -538.774658203125,
"logps/rejected": -446.6949157714844,
"loss": 0.091,
"rewards/accuracies": 0.96875,
"rewards/chosen": 1.3523597717285156,
"rewards/margins": 3.761559247970581,
"rewards/rejected": -2.4091997146606445,
"step": 73
},
{
"epoch": 1.89,
"learning_rate": 0.00028461538461538457,
"logits/chosen": 1.3139710426330566,
"logits/rejected": 1.1975148916244507,
"logps/chosen": -544.6610107421875,
"logps/rejected": -459.76275634765625,
"loss": 0.2291,
"rewards/accuracies": 0.9375,
"rewards/chosen": 1.633323311805725,
"rewards/margins": 3.0829625129699707,
"rewards/rejected": -1.4496394395828247,
"step": 74
},
{
"epoch": 1.92,
"learning_rate": 0.00028846153846153843,
"logits/chosen": 1.4247934818267822,
"logits/rejected": 1.3758761882781982,
"logps/chosen": -518.5802612304688,
"logps/rejected": -527.0255737304688,
"loss": 0.1177,
"rewards/accuracies": 0.9375,
"rewards/chosen": 2.010237216949463,
"rewards/margins": 3.8918490409851074,
"rewards/rejected": -1.8816115856170654,
"step": 75
},
{
"epoch": 1.95,
"learning_rate": 0.0002923076923076923,
"logits/chosen": 1.3634833097457886,
"logits/rejected": 1.2164397239685059,
"logps/chosen": -534.68505859375,
"logps/rejected": -506.5450744628906,
"loss": 0.0595,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.536834239959717,
"rewards/margins": 3.897109031677246,
"rewards/rejected": -1.3602746725082397,
"step": 76
},
{
"epoch": 1.97,
"learning_rate": 0.00029615384615384616,
"logits/chosen": 1.3006478548049927,
"logits/rejected": 1.3856381177902222,
"logps/chosen": -469.55450439453125,
"logps/rejected": -642.709716796875,
"loss": 0.148,
"rewards/accuracies": 0.9375,
"rewards/chosen": 1.724798560142517,
"rewards/margins": 3.511629343032837,
"rewards/rejected": -1.7868304252624512,
"step": 77
},
{
"epoch": 2.0,
"learning_rate": 0.0003,
"logits/chosen": 1.328688621520996,
"logits/rejected": 1.1962082386016846,
"logps/chosen": -516.2116088867188,
"logps/rejected": -504.8653259277344,
"loss": 0.0937,
"rewards/accuracies": 0.96875,
"rewards/chosen": 1.6297885179519653,
"rewards/margins": 3.997544288635254,
"rewards/rejected": -2.36775541305542,
"step": 78
},
{
"epoch": 2.02,
"learning_rate": 0.00029957264957264953,
"logits/chosen": 1.3884762525558472,
"logits/rejected": 1.2696239948272705,
"logps/chosen": -575.4322509765625,
"logps/rejected": -583.3463745117188,
"loss": 0.0528,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.6055470705032349,
"rewards/margins": 4.734403133392334,
"rewards/rejected": -3.1288557052612305,
"step": 79
},
{
"epoch": 2.05,
"learning_rate": 0.00029914529914529915,
"logits/chosen": 1.2306060791015625,
"logits/rejected": 1.201812982559204,
"logps/chosen": -526.1033935546875,
"logps/rejected": -625.164306640625,
"loss": 0.0742,
"rewards/accuracies": 0.96875,
"rewards/chosen": 0.5637297034263611,
"rewards/margins": 4.7755279541015625,
"rewards/rejected": -4.211798191070557,
"step": 80
},
{
"epoch": 2.07,
"learning_rate": 0.0002987179487179487,
"logits/chosen": 1.2625794410705566,
"logits/rejected": 1.1102485656738281,
"logps/chosen": -581.4782104492188,
"logps/rejected": -620.8692626953125,
"loss": 0.015,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.47795385122299194,
"rewards/margins": 5.5137505531311035,
"rewards/rejected": -5.035797119140625,
"step": 81
},
{
"epoch": 2.1,
"learning_rate": 0.00029829059829059826,
"logits/chosen": 1.193795919418335,
"logits/rejected": 1.13469660282135,
"logps/chosen": -534.1414794921875,
"logps/rejected": -540.5661010742188,
"loss": 0.0537,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.0871587023139,
"rewards/margins": 4.834710121154785,
"rewards/rejected": -4.747550964355469,
"step": 82
},
{
"epoch": 2.12,
"learning_rate": 0.0002978632478632478,
"logits/chosen": 1.245851755142212,
"logits/rejected": 1.0414592027664185,
"logps/chosen": -603.68212890625,
"logps/rejected": -521.8074340820312,
"loss": 0.046,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.3754722476005554,
"rewards/margins": 4.623739719390869,
"rewards/rejected": -4.248267650604248,
"step": 83
},
{
"epoch": 2.15,
"learning_rate": 0.00029743589743589743,
"logits/chosen": 1.0934433937072754,
"logits/rejected": 1.0990025997161865,
"logps/chosen": -511.38897705078125,
"logps/rejected": -607.8187255859375,
"loss": 0.0567,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.49227023124694824,
"rewards/margins": 5.553871154785156,
"rewards/rejected": -6.046142101287842,
"step": 84
},
{
"epoch": 2.18,
"learning_rate": 0.000297008547008547,
"logits/chosen": 1.1143027544021606,
"logits/rejected": 1.1277693510055542,
"logps/chosen": -501.1644287109375,
"logps/rejected": -627.3696899414062,
"loss": 0.0149,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.9094219207763672,
"rewards/margins": 5.872971534729004,
"rewards/rejected": -4.9635491371154785,
"step": 85
},
{
"epoch": 2.2,
"learning_rate": 0.00029658119658119655,
"logits/chosen": 1.2900817394256592,
"logits/rejected": 1.258035659790039,
"logps/chosen": -482.3966369628906,
"logps/rejected": -613.1928100585938,
"loss": 0.006,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.0523629188537598,
"rewards/margins": 6.690260887145996,
"rewards/rejected": -5.6378984451293945,
"step": 86
},
{
"epoch": 2.23,
"learning_rate": 0.00029615384615384616,
"logits/chosen": 1.2223701477050781,
"logits/rejected": 1.1854349374771118,
"logps/chosen": -504.41375732421875,
"logps/rejected": -530.4273071289062,
"loss": 0.1105,
"rewards/accuracies": 0.9375,
"rewards/chosen": 0.9830568432807922,
"rewards/margins": 5.67386531829834,
"rewards/rejected": -4.6908087730407715,
"step": 87
},
{
"epoch": 2.25,
"learning_rate": 0.0002957264957264957,
"logits/chosen": 1.3008583784103394,
"logits/rejected": 1.2168331146240234,
"logps/chosen": -580.7012939453125,
"logps/rejected": -591.1567993164062,
"loss": 0.028,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.010232925415039,
"rewards/margins": 5.921438694000244,
"rewards/rejected": -4.911205768585205,
"step": 88
},
{
"epoch": 2.28,
"learning_rate": 0.0002952991452991453,
"logits/chosen": 1.4120073318481445,
"logits/rejected": 1.3543351888656616,
"logps/chosen": -551.34130859375,
"logps/rejected": -607.3863525390625,
"loss": 0.0082,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.7222862243652344,
"rewards/margins": 6.246278762817383,
"rewards/rejected": -4.523993492126465,
"step": 89
},
{
"epoch": 2.3,
"learning_rate": 0.00029487179487179484,
"logits/chosen": 1.3433583974838257,
"logits/rejected": 1.2706623077392578,
"logps/chosen": -550.4794921875,
"logps/rejected": -574.7581787109375,
"loss": 0.0251,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.3056535720825195,
"rewards/margins": 6.369759559631348,
"rewards/rejected": -4.064105033874512,
"step": 90
},
{
"epoch": 2.33,
"learning_rate": 0.00029444444444444445,
"logits/chosen": 1.4810067415237427,
"logits/rejected": 1.3233308792114258,
"logps/chosen": -586.744384765625,
"logps/rejected": -591.3648071289062,
"loss": 0.0176,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.4182064533233643,
"rewards/margins": 6.562655448913574,
"rewards/rejected": -4.144449710845947,
"step": 91
},
{
"epoch": 2.36,
"learning_rate": 0.000294017094017094,
"logits/chosen": 1.4654786586761475,
"logits/rejected": 1.3706129789352417,
"logps/chosen": -496.12701416015625,
"logps/rejected": -567.374755859375,
"loss": 0.0221,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.9136104583740234,
"rewards/margins": 5.71716833114624,
"rewards/rejected": -3.8035576343536377,
"step": 92
},
{
"epoch": 2.38,
"learning_rate": 0.00029358974358974357,
"logits/chosen": 1.4996682405471802,
"logits/rejected": 1.4585434198379517,
"logps/chosen": -568.8134155273438,
"logps/rejected": -659.0804443359375,
"loss": 0.0161,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.9488861560821533,
"rewards/margins": 6.1611528396606445,
"rewards/rejected": -4.2122673988342285,
"step": 93
},
{
"epoch": 2.41,
"learning_rate": 0.00029316239316239313,
"logits/chosen": 1.433556318283081,
"logits/rejected": 1.4805834293365479,
"logps/chosen": -551.3663940429688,
"logps/rejected": -583.002685546875,
"loss": 0.039,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.608030080795288,
"rewards/margins": 5.825406551361084,
"rewards/rejected": -3.217377185821533,
"step": 94
},
{
"epoch": 2.43,
"learning_rate": 0.0002927350427350427,
"logits/chosen": 1.5423190593719482,
"logits/rejected": 1.3645411729812622,
"logps/chosen": -541.7529296875,
"logps/rejected": -540.1168212890625,
"loss": 0.0073,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.6581199169158936,
"rewards/margins": 6.796147346496582,
"rewards/rejected": -4.138028144836426,
"step": 95
},
{
"epoch": 2.46,
"learning_rate": 0.0002923076923076923,
"logits/chosen": 1.3319764137268066,
"logits/rejected": 1.3734033107757568,
"logps/chosen": -505.6252136230469,
"logps/rejected": -595.2819213867188,
"loss": 0.0207,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.1945226192474365,
"rewards/margins": 5.589131832122803,
"rewards/rejected": -4.3946099281311035,
"step": 96
},
{
"epoch": 2.48,
"learning_rate": 0.00029188034188034186,
"logits/chosen": 1.326178789138794,
"logits/rejected": 1.4406118392944336,
"logps/chosen": -514.8160400390625,
"logps/rejected": -575.283447265625,
"loss": 0.0135,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.6229735612869263,
"rewards/margins": 6.596070766448975,
"rewards/rejected": -4.973097324371338,
"step": 97
},
{
"epoch": 2.51,
"learning_rate": 0.0002914529914529914,
"logits/chosen": 1.408813714981079,
"logits/rejected": 1.3960859775543213,
"logps/chosen": -542.9385375976562,
"logps/rejected": -647.4051513671875,
"loss": 0.0683,
"rewards/accuracies": 0.96875,
"rewards/chosen": 0.39411336183547974,
"rewards/margins": 6.164831161499023,
"rewards/rejected": -5.770717620849609,
"step": 98
},
{
"epoch": 2.53,
"learning_rate": 0.000291025641025641,
"logits/chosen": 1.2414624691009521,
"logits/rejected": 1.2821427583694458,
"logps/chosen": -500.63568115234375,
"logps/rejected": -617.734375,
"loss": 0.1346,
"rewards/accuracies": 0.96875,
"rewards/chosen": 1.0469615459442139,
"rewards/margins": 6.8481831550598145,
"rewards/rejected": -5.8012213706970215,
"step": 99
},
{
"epoch": 2.56,
"learning_rate": 0.0002905982905982906,
"logits/chosen": 1.3173812627792358,
"logits/rejected": 1.1461554765701294,
"logps/chosen": -596.7745971679688,
"logps/rejected": -591.0558471679688,
"loss": 0.0567,
"rewards/accuracies": 0.96875,
"rewards/chosen": 0.7747964262962341,
"rewards/margins": 5.85588264465332,
"rewards/rejected": -5.081086158752441,
"step": 100
},
{
"epoch": 2.59,
"learning_rate": 0.00029017094017094015,
"logits/chosen": 1.1509640216827393,
"logits/rejected": 1.1098980903625488,
"logps/chosen": -474.10675048828125,
"logps/rejected": -535.8497924804688,
"loss": 0.031,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.1605219841003418,
"rewards/margins": 6.742035865783691,
"rewards/rejected": -5.58151388168335,
"step": 101
},
{
"epoch": 2.61,
"learning_rate": 0.0002897435897435897,
"logits/chosen": 1.1271547079086304,
"logits/rejected": 0.9869892001152039,
"logps/chosen": -542.78662109375,
"logps/rejected": -566.442626953125,
"loss": 0.2008,
"rewards/accuracies": 0.96875,
"rewards/chosen": 0.9951988458633423,
"rewards/margins": 6.587618827819824,
"rewards/rejected": -5.5924201011657715,
"step": 102
},
{
"epoch": 2.64,
"learning_rate": 0.00028931623931623926,
"logits/chosen": 1.2000806331634521,
"logits/rejected": 1.0094119310379028,
"logps/chosen": -550.8510131835938,
"logps/rejected": -531.4238891601562,
"loss": 0.0239,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.3359670639038086,
"rewards/margins": 6.495780944824219,
"rewards/rejected": -5.159814834594727,
"step": 103
},
{
"epoch": 2.66,
"learning_rate": 0.0002888888888888888,
"logits/chosen": 1.102707862854004,
"logits/rejected": 1.2245404720306396,
"logps/chosen": -528.5265502929688,
"logps/rejected": -665.08544921875,
"loss": 0.0174,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.7339667081832886,
"rewards/margins": 7.117589473724365,
"rewards/rejected": -6.383623123168945,
"step": 104
},
{
"epoch": 2.69,
"learning_rate": 0.00028846153846153843,
"logits/chosen": 1.1308355331420898,
"logits/rejected": 0.9417912364006042,
"logps/chosen": -553.8656005859375,
"logps/rejected": -555.5968627929688,
"loss": 0.0185,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.9540010690689087,
"rewards/margins": 6.364682197570801,
"rewards/rejected": -4.410680770874023,
"step": 105
},
{
"epoch": 2.71,
"learning_rate": 0.000288034188034188,
"logits/chosen": 1.2123243808746338,
"logits/rejected": 0.9968570470809937,
"logps/chosen": -567.16748046875,
"logps/rejected": -521.2568359375,
"loss": 0.0109,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.0697741508483887,
"rewards/margins": 6.565426349639893,
"rewards/rejected": -4.495651721954346,
"step": 106
},
{
"epoch": 2.74,
"learning_rate": 0.00028760683760683755,
"logits/chosen": 1.1736996173858643,
"logits/rejected": 1.147781252861023,
"logps/chosen": -542.6943359375,
"logps/rejected": -622.96826171875,
"loss": 0.0085,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.328216075897217,
"rewards/margins": 7.282103538513184,
"rewards/rejected": -4.953887939453125,
"step": 107
},
{
"epoch": 2.76,
"learning_rate": 0.00028717948717948716,
"logits/chosen": 1.060608983039856,
"logits/rejected": 1.2086546421051025,
"logps/chosen": -499.45989990234375,
"logps/rejected": -617.087646484375,
"loss": 0.0268,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.5071005821228027,
"rewards/margins": 5.79173469543457,
"rewards/rejected": -3.2846338748931885,
"step": 108
},
{
"epoch": 2.79,
"learning_rate": 0.0002867521367521367,
"logits/chosen": 1.180185079574585,
"logits/rejected": 0.9707103967666626,
"logps/chosen": -530.9793701171875,
"logps/rejected": -531.7672119140625,
"loss": 0.0172,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.693030595779419,
"rewards/margins": 5.776254653930664,
"rewards/rejected": -3.083223819732666,
"step": 109
},
{
"epoch": 2.82,
"learning_rate": 0.0002863247863247863,
"logits/chosen": 1.2219749689102173,
"logits/rejected": 1.1085822582244873,
"logps/chosen": -579.6038818359375,
"logps/rejected": -576.358642578125,
"loss": 0.0091,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.139183521270752,
"rewards/margins": 5.98101806640625,
"rewards/rejected": -2.84183406829834,
"step": 110
},
{
"epoch": 2.84,
"learning_rate": 0.00028589743589743584,
"logits/chosen": 1.190647006034851,
"logits/rejected": 1.1908663511276245,
"logps/chosen": -514.8892211914062,
"logps/rejected": -611.48046875,
"loss": 0.0565,
"rewards/accuracies": 0.96875,
"rewards/chosen": 2.977041006088257,
"rewards/margins": 6.328646659851074,
"rewards/rejected": -3.3516054153442383,
"step": 111
},
{
"epoch": 2.87,
"learning_rate": 0.00028547008547008545,
"logits/chosen": 1.1982405185699463,
"logits/rejected": 1.1012368202209473,
"logps/chosen": -496.9002685546875,
"logps/rejected": -544.1046142578125,
"loss": 0.0649,
"rewards/accuracies": 0.96875,
"rewards/chosen": 3.5147013664245605,
"rewards/margins": 6.30265998840332,
"rewards/rejected": -2.787958860397339,
"step": 112
},
{
"epoch": 2.89,
"learning_rate": 0.000285042735042735,
"logits/chosen": 1.1536628007888794,
"logits/rejected": 1.0870661735534668,
"logps/chosen": -475.55023193359375,
"logps/rejected": -584.7400512695312,
"loss": 0.0315,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.8786869049072266,
"rewards/margins": 6.310683250427246,
"rewards/rejected": -3.4319963455200195,
"step": 113
},
{
"epoch": 2.92,
"learning_rate": 0.00028461538461538457,
"logits/chosen": 1.129310965538025,
"logits/rejected": 1.0381569862365723,
"logps/chosen": -506.8164978027344,
"logps/rejected": -490.2034606933594,
"loss": 0.0435,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.066460609436035,
"rewards/margins": 5.390075206756592,
"rewards/rejected": -2.3236145973205566,
"step": 114
},
{
"epoch": 2.94,
"learning_rate": 0.0002841880341880342,
"logits/chosen": 1.2719249725341797,
"logits/rejected": 1.0853713750839233,
"logps/chosen": -580.2399291992188,
"logps/rejected": -545.1849975585938,
"loss": 0.0081,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.0230274200439453,
"rewards/margins": 7.315037727355957,
"rewards/rejected": -4.292009353637695,
"step": 115
},
{
"epoch": 2.97,
"learning_rate": 0.00028376068376068374,
"logits/chosen": 1.2354512214660645,
"logits/rejected": 1.1801047325134277,
"logps/chosen": -482.8843994140625,
"logps/rejected": -646.3005981445312,
"loss": 0.0077,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.2153114080429077,
"rewards/margins": 6.708934307098389,
"rewards/rejected": -5.493622779846191,
"step": 116
},
{
"epoch": 3.0,
"learning_rate": 0.0002833333333333333,
"logits/chosen": 0.9910479784011841,
"logits/rejected": 1.020785927772522,
"logps/chosen": -481.9186706542969,
"logps/rejected": -603.8145141601562,
"loss": 0.0455,
"rewards/accuracies": 0.96875,
"rewards/chosen": 0.37189701199531555,
"rewards/margins": 6.756865978240967,
"rewards/rejected": -6.384969711303711,
"step": 117
},
{
"epoch": 3.02,
"learning_rate": 0.0002829059829059829,
"logits/chosen": 1.066985845565796,
"logits/rejected": 1.0346630811691284,
"logps/chosen": -550.504638671875,
"logps/rejected": -601.9063720703125,
"loss": 0.0125,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.5437355041503906,
"rewards/margins": 7.08186674118042,
"rewards/rejected": -6.538131237030029,
"step": 118
},
{
"epoch": 3.05,
"learning_rate": 0.00028247863247863247,
"logits/chosen": 1.1769685745239258,
"logits/rejected": 1.0711925029754639,
"logps/chosen": -572.137451171875,
"logps/rejected": -591.6756591796875,
"loss": 0.0185,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.4036594033241272,
"rewards/margins": 7.711120128631592,
"rewards/rejected": -7.307460784912109,
"step": 119
},
{
"epoch": 3.07,
"learning_rate": 0.00028205128205128203,
"logits/chosen": 1.1407585144042969,
"logits/rejected": 1.0219597816467285,
"logps/chosen": -486.6500244140625,
"logps/rejected": -650.4639282226562,
"loss": 0.0075,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.8903399705886841,
"rewards/margins": 8.448041915893555,
"rewards/rejected": -7.5577006340026855,
"step": 120
},
{
"epoch": 3.1,
"learning_rate": 0.0002816239316239316,
"logits/chosen": 1.146453619003296,
"logits/rejected": 1.0628845691680908,
"logps/chosen": -531.998046875,
"logps/rejected": -557.2601928710938,
"loss": 0.0032,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.3119817972183228,
"rewards/margins": 7.6520514488220215,
"rewards/rejected": -6.340068817138672,
"step": 121
},
{
"epoch": 3.12,
"learning_rate": 0.0002811965811965812,
"logits/chosen": 1.0393480062484741,
"logits/rejected": 1.0280386209487915,
"logps/chosen": -470.7844543457031,
"logps/rejected": -552.33935546875,
"loss": 0.011,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.4702866077423096,
"rewards/margins": 7.459576606750488,
"rewards/rejected": -5.9892897605896,
"step": 122
},
{
"epoch": 3.15,
"learning_rate": 0.00028076923076923076,
"logits/chosen": 1.1644623279571533,
"logits/rejected": 1.0770840644836426,
"logps/chosen": -546.8715209960938,
"logps/rejected": -534.8888549804688,
"loss": 0.0046,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.770728349685669,
"rewards/margins": 7.339654922485352,
"rewards/rejected": -5.5689263343811035,
"step": 123
},
{
"epoch": 3.17,
"learning_rate": 0.0002803418803418803,
"logits/chosen": 1.0946919918060303,
"logits/rejected": 1.1111879348754883,
"logps/chosen": -448.84698486328125,
"logps/rejected": -623.4598388671875,
"loss": 0.0036,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.276505470275879,
"rewards/margins": 7.733612060546875,
"rewards/rejected": -5.457107067108154,
"step": 124
},
{
"epoch": 3.2,
"learning_rate": 0.00027991452991452993,
"logits/chosen": 1.083165168762207,
"logits/rejected": 1.1361708641052246,
"logps/chosen": -491.6983947753906,
"logps/rejected": -597.7027587890625,
"loss": 0.0045,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.072481632232666,
"rewards/margins": 8.625251770019531,
"rewards/rejected": -6.552770137786865,
"step": 125
},
{
"epoch": 3.23,
"learning_rate": 0.0002794871794871795,
"logits/chosen": 1.2724071741104126,
"logits/rejected": 1.118057370185852,
"logps/chosen": -525.2489013671875,
"logps/rejected": -523.6296997070312,
"loss": 0.0102,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.201770544052124,
"rewards/margins": 6.9069504737854,
"rewards/rejected": -4.7051801681518555,
"step": 126
},
{
"epoch": 3.25,
"learning_rate": 0.00027905982905982905,
"logits/chosen": 1.2534475326538086,
"logits/rejected": 1.2043638229370117,
"logps/chosen": -574.5486450195312,
"logps/rejected": -598.8119506835938,
"loss": 0.0061,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.390216112136841,
"rewards/margins": 8.325881004333496,
"rewards/rejected": -5.935665130615234,
"step": 127
},
{
"epoch": 3.28,
"learning_rate": 0.0002786324786324786,
"logits/chosen": 1.2600340843200684,
"logits/rejected": 1.2670692205429077,
"logps/chosen": -539.1239624023438,
"logps/rejected": -605.8324584960938,
"loss": 0.0025,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.585294485092163,
"rewards/margins": 8.082437515258789,
"rewards/rejected": -5.497143268585205,
"step": 128
},
{
"epoch": 3.3,
"learning_rate": 0.00027820512820512816,
"logits/chosen": 1.2851054668426514,
"logits/rejected": 1.1849486827850342,
"logps/chosen": -571.027587890625,
"logps/rejected": -601.7526245117188,
"loss": 0.0025,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.7168142795562744,
"rewards/margins": 7.749199867248535,
"rewards/rejected": -5.03238582611084,
"step": 129
},
{
"epoch": 3.33,
"learning_rate": 0.0002777777777777778,
"logits/chosen": 1.3977611064910889,
"logits/rejected": 1.210925817489624,
"logps/chosen": -533.6973266601562,
"logps/rejected": -598.4097900390625,
"loss": 0.0021,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.4912829399108887,
"rewards/margins": 8.09334945678711,
"rewards/rejected": -5.602066516876221,
"step": 130
},
{
"epoch": 3.35,
"learning_rate": 0.00027735042735042734,
"logits/chosen": 1.3112545013427734,
"logits/rejected": 1.2536931037902832,
"logps/chosen": -516.1342163085938,
"logps/rejected": -645.7244873046875,
"loss": 0.0035,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.322323799133301,
"rewards/margins": 8.349126815795898,
"rewards/rejected": -6.026803493499756,
"step": 131
},
{
"epoch": 3.38,
"learning_rate": 0.0002769230769230769,
"logits/chosen": 1.3068538904190063,
"logits/rejected": 1.3506109714508057,
"logps/chosen": -503.5260925292969,
"logps/rejected": -658.4425659179688,
"loss": 0.0067,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.650004506111145,
"rewards/margins": 7.3509368896484375,
"rewards/rejected": -5.700932025909424,
"step": 132
},
{
"epoch": 3.4,
"learning_rate": 0.00027649572649572645,
"logits/chosen": 1.336862325668335,
"logits/rejected": 1.2281874418258667,
"logps/chosen": -541.9310302734375,
"logps/rejected": -600.86474609375,
"loss": 0.0064,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.8469295501708984,
"rewards/margins": 7.918344974517822,
"rewards/rejected": -6.071415901184082,
"step": 133
},
{
"epoch": 3.43,
"learning_rate": 0.00027606837606837607,
"logits/chosen": 1.3074719905853271,
"logits/rejected": 1.3130128383636475,
"logps/chosen": -484.4212646484375,
"logps/rejected": -639.7544555664062,
"loss": 0.0861,
"rewards/accuracies": 0.96875,
"rewards/chosen": 2.4394211769104004,
"rewards/margins": 7.3523383140563965,
"rewards/rejected": -4.912917613983154,
"step": 134
},
{
"epoch": 3.46,
"learning_rate": 0.0002756410256410256,
"logits/chosen": 1.3250826597213745,
"logits/rejected": 1.2685434818267822,
"logps/chosen": -496.5311279296875,
"logps/rejected": -562.654541015625,
"loss": 0.09,
"rewards/accuracies": 0.96875,
"rewards/chosen": 2.504718065261841,
"rewards/margins": 7.967668056488037,
"rewards/rejected": -5.462949752807617,
"step": 135
},
{
"epoch": 3.48,
"learning_rate": 0.0002752136752136752,
"logits/chosen": 1.4271235466003418,
"logits/rejected": 1.2630449533462524,
"logps/chosen": -561.2323608398438,
"logps/rejected": -642.5577392578125,
"loss": 0.0043,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.4116668701171875,
"rewards/margins": 8.215126037597656,
"rewards/rejected": -5.803459167480469,
"step": 136
},
{
"epoch": 3.51,
"learning_rate": 0.00027478632478632474,
"logits/chosen": 1.3756340742111206,
"logits/rejected": 1.4128466844558716,
"logps/chosen": -559.175048828125,
"logps/rejected": -650.5951538085938,
"loss": 0.0045,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.0436666011810303,
"rewards/margins": 7.851646900177002,
"rewards/rejected": -4.807980537414551,
"step": 137
},
{
"epoch": 3.53,
"learning_rate": 0.0002743589743589743,
"logits/chosen": 1.3762015104293823,
"logits/rejected": 1.2889195680618286,
"logps/chosen": -538.1447143554688,
"logps/rejected": -631.197998046875,
"loss": 0.0058,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.8010246753692627,
"rewards/margins": 7.74313497543335,
"rewards/rejected": -4.942111015319824,
"step": 138
},
{
"epoch": 3.56,
"learning_rate": 0.0002739316239316239,
"logits/chosen": 1.4248151779174805,
"logits/rejected": 1.335301399230957,
"logps/chosen": -473.7148132324219,
"logps/rejected": -559.2073974609375,
"loss": 0.003,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.2720112800598145,
"rewards/margins": 7.730058670043945,
"rewards/rejected": -4.458047389984131,
"step": 139
},
{
"epoch": 3.58,
"learning_rate": 0.00027350427350427347,
"logits/chosen": 1.3754342794418335,
"logits/rejected": 1.3729346990585327,
"logps/chosen": -519.9334106445312,
"logps/rejected": -599.0367431640625,
"loss": 0.0136,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.4481189250946045,
"rewards/margins": 8.235373497009277,
"rewards/rejected": -4.787254810333252,
"step": 140
},
{
"epoch": 3.61,
"learning_rate": 0.00027307692307692303,
"logits/chosen": 1.3112382888793945,
"logits/rejected": 1.2730636596679688,
"logps/chosen": -535.9412231445312,
"logps/rejected": -491.223388671875,
"loss": 0.0156,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.4259979724884033,
"rewards/margins": 7.016923904418945,
"rewards/rejected": -4.590925693511963,
"step": 141
},
{
"epoch": 3.64,
"learning_rate": 0.0002726495726495726,
"logits/chosen": 1.4348691701889038,
"logits/rejected": 1.2340961694717407,
"logps/chosen": -533.677978515625,
"logps/rejected": -525.3228149414062,
"loss": 0.0146,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.7481579780578613,
"rewards/margins": 7.346179008483887,
"rewards/rejected": -4.598021030426025,
"step": 142
},
{
"epoch": 3.66,
"learning_rate": 0.0002722222222222222,
"logits/chosen": 1.4624712467193604,
"logits/rejected": 1.4238498210906982,
"logps/chosen": -521.18310546875,
"logps/rejected": -628.2357788085938,
"loss": 0.0103,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.531442880630493,
"rewards/margins": 7.304967880249023,
"rewards/rejected": -4.773524761199951,
"step": 143
},
{
"epoch": 3.69,
"learning_rate": 0.00027179487179487176,
"logits/chosen": 1.3994379043579102,
"logits/rejected": 1.3517390489578247,
"logps/chosen": -487.254150390625,
"logps/rejected": -562.578369140625,
"loss": 0.0026,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.351099729537964,
"rewards/margins": 8.097498893737793,
"rewards/rejected": -4.746399402618408,
"step": 144
},
{
"epoch": 3.71,
"learning_rate": 0.0002713675213675213,
"logits/chosen": 1.4121302366256714,
"logits/rejected": 1.4421744346618652,
"logps/chosen": -520.80322265625,
"logps/rejected": -645.4122314453125,
"loss": 0.0025,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.2977559566497803,
"rewards/margins": 7.4320292472839355,
"rewards/rejected": -5.134273529052734,
"step": 145
},
{
"epoch": 3.74,
"learning_rate": 0.00027094017094017093,
"logits/chosen": 1.4837563037872314,
"logits/rejected": 1.379111886024475,
"logps/chosen": -601.663818359375,
"logps/rejected": -564.5067138671875,
"loss": 0.1342,
"rewards/accuracies": 0.96875,
"rewards/chosen": 1.5174392461776733,
"rewards/margins": 6.973749160766602,
"rewards/rejected": -5.456309795379639,
"step": 146
},
{
"epoch": 3.76,
"learning_rate": 0.0002705128205128205,
"logits/chosen": 1.3307445049285889,
"logits/rejected": 1.201188564300537,
"logps/chosen": -529.9027099609375,
"logps/rejected": -573.0830078125,
"loss": 0.0042,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.5938829183578491,
"rewards/margins": 8.63363265991211,
"rewards/rejected": -7.039750099182129,
"step": 147
},
{
"epoch": 3.79,
"learning_rate": 0.00027008547008547005,
"logits/chosen": 1.2792227268218994,
"logits/rejected": 1.2923702001571655,
"logps/chosen": -547.6593017578125,
"logps/rejected": -678.7649536132812,
"loss": 0.0035,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.6561351418495178,
"rewards/margins": 6.722534656524658,
"rewards/rejected": -6.066399574279785,
"step": 148
},
{
"epoch": 3.81,
"learning_rate": 0.0002696581196581196,
"logits/chosen": 1.4008458852767944,
"logits/rejected": 1.1927311420440674,
"logps/chosen": -628.3024291992188,
"logps/rejected": -578.4089965820312,
"loss": 0.0052,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.8446162939071655,
"rewards/margins": 8.044957160949707,
"rewards/rejected": -7.200340747833252,
"step": 149
},
{
"epoch": 3.84,
"learning_rate": 0.0002692307692307692,
"logits/chosen": 1.1845999956130981,
"logits/rejected": 1.0530712604522705,
"logps/chosen": -499.4281005859375,
"logps/rejected": -573.0311889648438,
"loss": 0.0033,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.9514065980911255,
"rewards/margins": 8.918203353881836,
"rewards/rejected": -7.966796875,
"step": 150
},
{
"epoch": 3.87,
"learning_rate": 0.0002688034188034188,
"logits/chosen": 1.1802606582641602,
"logits/rejected": 1.1234092712402344,
"logps/chosen": -572.6054077148438,
"logps/rejected": -626.6286010742188,
"loss": 0.0022,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.8314933180809021,
"rewards/margins": 9.328904151916504,
"rewards/rejected": -8.497410774230957,
"step": 151
},
{
"epoch": 3.89,
"learning_rate": 0.00026837606837606834,
"logits/chosen": 1.1486611366271973,
"logits/rejected": 1.0562578439712524,
"logps/chosen": -509.203125,
"logps/rejected": -584.0765991210938,
"loss": 0.003,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.5767253637313843,
"rewards/margins": 8.173746109008789,
"rewards/rejected": -7.597021102905273,
"step": 152
},
{
"epoch": 3.92,
"learning_rate": 0.00026794871794871795,
"logits/chosen": 1.2486861944198608,
"logits/rejected": 1.123504400253296,
"logps/chosen": -586.3121337890625,
"logps/rejected": -666.2870483398438,
"loss": 0.0048,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.11837495118379593,
"rewards/margins": 8.070638656616211,
"rewards/rejected": -7.952263355255127,
"step": 153
},
{
"epoch": 3.94,
"learning_rate": 0.0002675213675213675,
"logits/chosen": 1.0615603923797607,
"logits/rejected": 1.0381104946136475,
"logps/chosen": -477.157470703125,
"logps/rejected": -577.363525390625,
"loss": 0.0042,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.2932237982749939,
"rewards/margins": 8.428733825683594,
"rewards/rejected": -8.13551139831543,
"step": 154
},
{
"epoch": 3.97,
"learning_rate": 0.00026709401709401707,
"logits/chosen": 1.0924714803695679,
"logits/rejected": 1.0560393333435059,
"logps/chosen": -589.655517578125,
"logps/rejected": -621.06201171875,
"loss": 0.0741,
"rewards/accuracies": 0.96875,
"rewards/chosen": -0.8137645125389099,
"rewards/margins": 8.433317184448242,
"rewards/rejected": -9.247081756591797,
"step": 155
},
{
"epoch": 3.99,
"learning_rate": 0.0002666666666666666,
"logits/chosen": 1.1305707693099976,
"logits/rejected": 1.071329951286316,
"logps/chosen": -582.5031127929688,
"logps/rejected": -615.1607666015625,
"loss": 0.0036,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.6762839555740356,
"rewards/margins": 8.447378158569336,
"rewards/rejected": -7.77109432220459,
"step": 156
},
{
"epoch": 4.02,
"learning_rate": 0.00026623931623931624,
"logits/chosen": 1.101088285446167,
"logits/rejected": 1.0743204355239868,
"logps/chosen": -503.9051208496094,
"logps/rejected": -705.6373291015625,
"loss": 0.0012,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.4264540672302246,
"rewards/margins": 9.59773063659668,
"rewards/rejected": -8.171276092529297,
"step": 157
},
{
"epoch": 4.04,
"learning_rate": 0.0002658119658119658,
"logits/chosen": 1.021052598953247,
"logits/rejected": 1.0671635866165161,
"logps/chosen": -466.4878845214844,
"logps/rejected": -629.9095458984375,
"loss": 0.0012,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.2989165782928467,
"rewards/margins": 9.68484878540039,
"rewards/rejected": -8.385932922363281,
"step": 158
},
{
"epoch": 4.07,
"learning_rate": 0.00026538461538461536,
"logits/chosen": 1.1435658931732178,
"logits/rejected": 1.1837159395217896,
"logps/chosen": -534.5050048828125,
"logps/rejected": -611.9518432617188,
"loss": 0.0038,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.3590706586837769,
"rewards/margins": 8.476469039916992,
"rewards/rejected": -7.117398262023926,
"step": 159
},
{
"epoch": 4.1,
"learning_rate": 0.00026495726495726497,
"logits/chosen": 1.146314263343811,
"logits/rejected": 1.1262887716293335,
"logps/chosen": -476.71453857421875,
"logps/rejected": -590.7471923828125,
"loss": 0.0145,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.9563323259353638,
"rewards/margins": 9.145315170288086,
"rewards/rejected": -7.188984394073486,
"step": 160
},
{
"epoch": 4.12,
"learning_rate": 0.0002645299145299145,
"logits/chosen": 1.16382896900177,
"logits/rejected": 1.1415654420852661,
"logps/chosen": -532.4939575195312,
"logps/rejected": -640.6329345703125,
"loss": 0.0023,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.2008843421936035,
"rewards/margins": 8.335280418395996,
"rewards/rejected": -6.134396076202393,
"step": 161
},
{
"epoch": 4.15,
"learning_rate": 0.0002641025641025641,
"logits/chosen": 1.0377854108810425,
"logits/rejected": 1.0735015869140625,
"logps/chosen": -482.3515625,
"logps/rejected": -620.0985717773438,
"loss": 0.0009,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.8429818153381348,
"rewards/margins": 8.701183319091797,
"rewards/rejected": -6.8582000732421875,
"step": 162
},
{
"epoch": 4.17,
"learning_rate": 0.00026367521367521364,
"logits/chosen": 1.2598010301589966,
"logits/rejected": 1.2114201784133911,
"logps/chosen": -497.3212890625,
"logps/rejected": -609.7880249023438,
"loss": 0.0028,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.7050861120224,
"rewards/margins": 8.18321418762207,
"rewards/rejected": -6.478128433227539,
"step": 163
},
{
"epoch": 4.2,
"learning_rate": 0.00026324786324786326,
"logits/chosen": 1.2148414850234985,
"logits/rejected": 1.0941295623779297,
"logps/chosen": -575.4998779296875,
"logps/rejected": -580.6967163085938,
"loss": 0.0035,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.006260395050049,
"rewards/margins": 8.852553367614746,
"rewards/rejected": -6.846292972564697,
"step": 164
},
{
"epoch": 4.22,
"learning_rate": 0.0002628205128205128,
"logits/chosen": 1.2730900049209595,
"logits/rejected": 1.1539727449417114,
"logps/chosen": -587.6873779296875,
"logps/rejected": -640.8407592773438,
"loss": 0.0023,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.5435640811920166,
"rewards/margins": 8.264644622802734,
"rewards/rejected": -5.721080303192139,
"step": 165
},
{
"epoch": 4.25,
"learning_rate": 0.0002623931623931624,
"logits/chosen": 1.1310749053955078,
"logits/rejected": 1.074840784072876,
"logps/chosen": -539.7301025390625,
"logps/rejected": -588.243408203125,
"loss": 0.0016,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.725675344467163,
"rewards/margins": 8.484735488891602,
"rewards/rejected": -6.759060382843018,
"step": 166
},
{
"epoch": 4.28,
"learning_rate": 0.00026196581196581193,
"logits/chosen": 1.1921124458312988,
"logits/rejected": 1.1464745998382568,
"logps/chosen": -531.4241333007812,
"logps/rejected": -650.033447265625,
"loss": 0.0011,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.208735704421997,
"rewards/margins": 8.580331802368164,
"rewards/rejected": -6.371595859527588,
"step": 167
},
{
"epoch": 4.3,
"learning_rate": 0.00026153846153846154,
"logits/chosen": 1.2101249694824219,
"logits/rejected": 1.18596613407135,
"logps/chosen": -563.632568359375,
"logps/rejected": -715.3417358398438,
"loss": 0.0011,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.0189900398254395,
"rewards/margins": 9.561189651489258,
"rewards/rejected": -7.542199611663818,
"step": 168
},
{
"epoch": 4.33,
"learning_rate": 0.0002611111111111111,
"logits/chosen": 1.2898643016815186,
"logits/rejected": 1.1050164699554443,
"logps/chosen": -568.4158325195312,
"logps/rejected": -607.685302734375,
"loss": 0.0064,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.8665090799331665,
"rewards/margins": 8.093311309814453,
"rewards/rejected": -6.226801872253418,
"step": 169
},
{
"epoch": 4.35,
"learning_rate": 0.00026068376068376066,
"logits/chosen": 1.1432087421417236,
"logits/rejected": 1.1123594045639038,
"logps/chosen": -532.8350219726562,
"logps/rejected": -636.5218505859375,
"loss": 0.001,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.5014150142669678,
"rewards/margins": 9.985570907592773,
"rewards/rejected": -7.48415470123291,
"step": 170
},
{
"epoch": 4.38,
"learning_rate": 0.0002602564102564102,
"logits/chosen": 1.0670151710510254,
"logits/rejected": 0.9835186004638672,
"logps/chosen": -525.0418090820312,
"logps/rejected": -603.8453369140625,
"loss": 0.0303,
"rewards/accuracies": 0.96875,
"rewards/chosen": 1.609043836593628,
"rewards/margins": 8.916692733764648,
"rewards/rejected": -7.307648658752441,
"step": 171
},
{
"epoch": 4.4,
"learning_rate": 0.0002598290598290598,
"logits/chosen": 1.1664202213287354,
"logits/rejected": 1.0911628007888794,
"logps/chosen": -546.672607421875,
"logps/rejected": -600.7014770507812,
"loss": 0.0023,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.016690254211426,
"rewards/margins": 8.022310256958008,
"rewards/rejected": -6.005620002746582,
"step": 172
},
{
"epoch": 4.43,
"learning_rate": 0.0002594017094017094,
"logits/chosen": 1.089374303817749,
"logits/rejected": 1.1528961658477783,
"logps/chosen": -515.8948364257812,
"logps/rejected": -608.8614501953125,
"loss": 0.0016,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.6085548400878906,
"rewards/margins": 9.069319725036621,
"rewards/rejected": -6.4607648849487305,
"step": 173
},
{
"epoch": 4.45,
"learning_rate": 0.00025897435897435895,
"logits/chosen": 1.2652084827423096,
"logits/rejected": 1.0107694864273071,
"logps/chosen": -570.3291015625,
"logps/rejected": -542.94384765625,
"loss": 0.0022,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.6129395961761475,
"rewards/margins": 8.992415428161621,
"rewards/rejected": -5.3794755935668945,
"step": 174
},
{
"epoch": 4.48,
"learning_rate": 0.0002585470085470085,
"logits/chosen": 1.2760734558105469,
"logits/rejected": 1.1731884479522705,
"logps/chosen": -581.9400634765625,
"logps/rejected": -597.0192260742188,
"loss": 0.0011,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.4896321296691895,
"rewards/margins": 8.106904983520508,
"rewards/rejected": -5.617273330688477,
"step": 175
},
{
"epoch": 4.51,
"learning_rate": 0.00025811965811965807,
"logits/chosen": 1.262199878692627,
"logits/rejected": 1.0435302257537842,
"logps/chosen": -577.8585205078125,
"logps/rejected": -599.6145629882812,
"loss": 0.0008,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.0292744636535645,
"rewards/margins": 9.197108268737793,
"rewards/rejected": -5.1678338050842285,
"step": 176
},
{
"epoch": 4.53,
"learning_rate": 0.0002576923076923077,
"logits/chosen": 1.0869362354278564,
"logits/rejected": 1.062146544456482,
"logps/chosen": -486.3373107910156,
"logps/rejected": -604.74609375,
"loss": 0.0031,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.719869613647461,
"rewards/margins": 9.07932186126709,
"rewards/rejected": -6.359452724456787,
"step": 177
},
{
"epoch": 4.56,
"learning_rate": 0.00025726495726495724,
"logits/chosen": 1.1699590682983398,
"logits/rejected": 1.0801599025726318,
"logps/chosen": -498.7552490234375,
"logps/rejected": -508.74609375,
"loss": 0.0024,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.849763870239258,
"rewards/margins": 8.253904342651367,
"rewards/rejected": -5.404139995574951,
"step": 178
},
{
"epoch": 4.58,
"learning_rate": 0.0002568376068376068,
"logits/chosen": 1.1467467546463013,
"logits/rejected": 1.1259756088256836,
"logps/chosen": -504.258544921875,
"logps/rejected": -571.6417846679688,
"loss": 0.0009,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.4761364459991455,
"rewards/margins": 9.168277740478516,
"rewards/rejected": -5.692141056060791,
"step": 179
},
{
"epoch": 4.61,
"learning_rate": 0.00025641025641025636,
"logits/chosen": 1.1694316864013672,
"logits/rejected": 1.1096751689910889,
"logps/chosen": -521.1782836914062,
"logps/rejected": -578.571533203125,
"loss": 0.0017,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.831768035888672,
"rewards/margins": 8.435105323791504,
"rewards/rejected": -5.603337287902832,
"step": 180
},
{
"epoch": 4.63,
"learning_rate": 0.00025598290598290597,
"logits/chosen": 1.2126814126968384,
"logits/rejected": 1.0483447313308716,
"logps/chosen": -514.5982055664062,
"logps/rejected": -559.2354125976562,
"loss": 0.0014,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.6389248371124268,
"rewards/margins": 8.253149032592773,
"rewards/rejected": -5.614223957061768,
"step": 181
},
{
"epoch": 4.66,
"learning_rate": 0.00025555555555555553,
"logits/chosen": 1.1626149415969849,
"logits/rejected": 1.023645281791687,
"logps/chosen": -539.7805786132812,
"logps/rejected": -583.5364379882812,
"loss": 0.0018,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.5426697731018066,
"rewards/margins": 9.294103622436523,
"rewards/rejected": -5.751433372497559,
"step": 182
},
{
"epoch": 4.68,
"learning_rate": 0.0002551282051282051,
"logits/chosen": 1.2193539142608643,
"logits/rejected": 1.0917335748672485,
"logps/chosen": -482.9999694824219,
"logps/rejected": -574.2794189453125,
"loss": 0.001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.089658737182617,
"rewards/margins": 8.874006271362305,
"rewards/rejected": -5.784348011016846,
"step": 183
},
{
"epoch": 4.71,
"learning_rate": 0.0002547008547008547,
"logits/chosen": 1.110312819480896,
"logits/rejected": 1.136667013168335,
"logps/chosen": -463.33062744140625,
"logps/rejected": -540.6802978515625,
"loss": 0.0009,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.5352556705474854,
"rewards/margins": 9.688406944274902,
"rewards/rejected": -7.1531524658203125,
"step": 184
},
{
"epoch": 4.74,
"learning_rate": 0.00025427350427350426,
"logits/chosen": 1.2557671070098877,
"logits/rejected": 1.1719632148742676,
"logps/chosen": -529.3836669921875,
"logps/rejected": -604.2679443359375,
"loss": 0.0017,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.274686098098755,
"rewards/margins": 8.664352416992188,
"rewards/rejected": -5.389666557312012,
"step": 185
},
{
"epoch": 4.76,
"learning_rate": 0.0002538461538461538,
"logits/chosen": 1.2389734983444214,
"logits/rejected": 1.084290623664856,
"logps/chosen": -544.71142578125,
"logps/rejected": -621.462158203125,
"loss": 0.0015,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.2781314849853516,
"rewards/margins": 8.4774808883667,
"rewards/rejected": -6.199349403381348,
"step": 186
},
{
"epoch": 4.79,
"learning_rate": 0.0002534188034188034,
"logits/chosen": 1.1379337310791016,
"logits/rejected": 1.1488574743270874,
"logps/chosen": -505.5488586425781,
"logps/rejected": -620.3075561523438,
"loss": 0.0011,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.200209140777588,
"rewards/margins": 7.998414516448975,
"rewards/rejected": -5.798205375671387,
"step": 187
},
{
"epoch": 4.81,
"learning_rate": 0.000252991452991453,
"logits/chosen": 1.1832406520843506,
"logits/rejected": 1.1894774436950684,
"logps/chosen": -518.3919677734375,
"logps/rejected": -603.9647216796875,
"loss": 0.001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.2990570068359375,
"rewards/margins": 8.94902515411377,
"rewards/rejected": -5.649968147277832,
"step": 188
},
{
"epoch": 4.84,
"learning_rate": 0.00025256410256410255,
"logits/chosen": 1.1613863706588745,
"logits/rejected": 1.0867745876312256,
"logps/chosen": -509.91754150390625,
"logps/rejected": -581.9103393554688,
"loss": 0.0013,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.3365347385406494,
"rewards/margins": 8.27928352355957,
"rewards/rejected": -5.942748069763184,
"step": 189
},
{
"epoch": 4.86,
"learning_rate": 0.0002521367521367521,
"logits/chosen": 1.1696518659591675,
"logits/rejected": 1.0967986583709717,
"logps/chosen": -512.4561767578125,
"logps/rejected": -571.2943725585938,
"loss": 0.0011,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.8717398643493652,
"rewards/margins": 9.033393859863281,
"rewards/rejected": -6.161653995513916,
"step": 190
},
{
"epoch": 4.89,
"learning_rate": 0.0002517094017094017,
"logits/chosen": 1.211112141609192,
"logits/rejected": 0.9915167093276978,
"logps/chosen": -592.27294921875,
"logps/rejected": -545.3093872070312,
"loss": 0.0013,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.7333691120147705,
"rewards/margins": 8.252148628234863,
"rewards/rejected": -5.518779754638672,
"step": 191
},
{
"epoch": 4.92,
"learning_rate": 0.0002512820512820513,
"logits/chosen": 1.1865314245224,
"logits/rejected": 1.083636999130249,
"logps/chosen": -500.9031982421875,
"logps/rejected": -634.9488525390625,
"loss": 0.0616,
"rewards/accuracies": 0.96875,
"rewards/chosen": 2.7689785957336426,
"rewards/margins": 9.048822402954102,
"rewards/rejected": -6.279844760894775,
"step": 192
},
{
"epoch": 4.94,
"learning_rate": 0.00025085470085470083,
"logits/chosen": 1.308659553527832,
"logits/rejected": 1.1389790773391724,
"logps/chosen": -530.4187622070312,
"logps/rejected": -617.7844848632812,
"loss": 0.0021,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.9889013767242432,
"rewards/margins": 7.278841018676758,
"rewards/rejected": -5.2899394035339355,
"step": 193
},
{
"epoch": 4.97,
"learning_rate": 0.0002504273504273504,
"logits/chosen": 1.1351438760757446,
"logits/rejected": 1.0995919704437256,
"logps/chosen": -537.9273071289062,
"logps/rejected": -580.5325927734375,
"loss": 0.0019,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.307258129119873,
"rewards/margins": 8.442761421203613,
"rewards/rejected": -6.135504245758057,
"step": 194
},
{
"epoch": 4.99,
"learning_rate": 0.00025,
"logits/chosen": 1.1220672130584717,
"logits/rejected": 1.0801559686660767,
"logps/chosen": -520.331787109375,
"logps/rejected": -620.8114013671875,
"loss": 0.0014,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.103241205215454,
"rewards/margins": 8.628799438476562,
"rewards/rejected": -6.5255584716796875,
"step": 195
},
{
"epoch": 5.02,
"learning_rate": 0.00024957264957264956,
"logits/chosen": 1.2094953060150146,
"logits/rejected": 1.1247830390930176,
"logps/chosen": -482.1205749511719,
"logps/rejected": -588.2861328125,
"loss": 0.0009,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.7330946922302246,
"rewards/margins": 8.031623840332031,
"rewards/rejected": -5.298529148101807,
"step": 196
},
{
"epoch": 5.04,
"learning_rate": 0.0002491452991452991,
"logits/chosen": 1.1430044174194336,
"logits/rejected": 1.0147483348846436,
"logps/chosen": -540.6754760742188,
"logps/rejected": -578.9623413085938,
"loss": 0.001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.216123104095459,
"rewards/margins": 8.867217063903809,
"rewards/rejected": -5.651093482971191,
"step": 197
},
{
"epoch": 5.07,
"learning_rate": 0.00024871794871794874,
"logits/chosen": 1.1910125017166138,
"logits/rejected": 1.0428566932678223,
"logps/chosen": -577.4218139648438,
"logps/rejected": -615.4971313476562,
"loss": 0.0152,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.6112916469573975,
"rewards/margins": 8.347122192382812,
"rewards/rejected": -5.735829830169678,
"step": 198
},
{
"epoch": 5.09,
"learning_rate": 0.0002482905982905983,
"logits/chosen": 1.189084768295288,
"logits/rejected": 1.023719310760498,
"logps/chosen": -505.85125732421875,
"logps/rejected": -607.1080322265625,
"loss": 0.0012,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.267016649246216,
"rewards/margins": 8.4349946975708,
"rewards/rejected": -6.1679768562316895,
"step": 199
},
{
"epoch": 5.12,
"learning_rate": 0.00024786324786324785,
"logits/chosen": 1.1190298795700073,
"logits/rejected": 1.0712878704071045,
"logps/chosen": -522.3341064453125,
"logps/rejected": -650.10107421875,
"loss": 0.0009,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.4759145975112915,
"rewards/margins": 8.425509452819824,
"rewards/rejected": -6.949594020843506,
"step": 200
},
{
"epoch": 5.15,
"learning_rate": 0.0002474358974358974,
"logits/chosen": 1.1855789422988892,
"logits/rejected": 1.1534652709960938,
"logps/chosen": -530.4508666992188,
"logps/rejected": -620.44873046875,
"loss": 0.0007,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.7457334995269775,
"rewards/margins": 8.301987648010254,
"rewards/rejected": -5.5562543869018555,
"step": 201
},
{
"epoch": 5.17,
"learning_rate": 0.000247008547008547,
"logits/chosen": 1.2083404064178467,
"logits/rejected": 1.0335760116577148,
"logps/chosen": -544.8739624023438,
"logps/rejected": -582.562744140625,
"loss": 0.001,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.7809946537017822,
"rewards/margins": 8.799093246459961,
"rewards/rejected": -6.018097877502441,
"step": 202
},
{
"epoch": 5.2,
"learning_rate": 0.0002465811965811966,
"logits/chosen": 1.1989009380340576,
"logits/rejected": 1.1295719146728516,
"logps/chosen": -480.67724609375,
"logps/rejected": -609.4390258789062,
"loss": 0.0016,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.5975394248962402,
"rewards/margins": 8.314066886901855,
"rewards/rejected": -5.716527938842773,
"step": 203
},
{
"epoch": 5.22,
"learning_rate": 0.00024615384615384614,
"logits/chosen": 1.1273399591445923,
"logits/rejected": 0.9690557718276978,
"logps/chosen": -563.5905151367188,
"logps/rejected": -566.3074951171875,
"loss": 0.002,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.03564715385437,
"rewards/margins": 8.314454078674316,
"rewards/rejected": -6.278806686401367,
"step": 204
},
{
"epoch": 5.25,
"learning_rate": 0.0002457264957264957,
"logits/chosen": 1.0755057334899902,
"logits/rejected": 1.1226409673690796,
"logps/chosen": -515.4642333984375,
"logps/rejected": -682.0799560546875,
"loss": 0.0026,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.6836296319961548,
"rewards/margins": 8.099983215332031,
"rewards/rejected": -6.416353702545166,
"step": 205
},
{
"epoch": 5.27,
"learning_rate": 0.00024529914529914526,
"logits/chosen": 1.1585733890533447,
"logits/rejected": 1.1376292705535889,
"logps/chosen": -489.0839538574219,
"logps/rejected": -560.7153930664062,
"loss": 0.0016,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.028048276901245,
"rewards/margins": 7.495326995849609,
"rewards/rejected": -5.467278957366943,
"step": 206
},
{
"epoch": 5.3,
"learning_rate": 0.00024487179487179487,
"logits/chosen": 1.1711719036102295,
"logits/rejected": 1.2047771215438843,
"logps/chosen": -554.0427856445312,
"logps/rejected": -657.2614135742188,
"loss": 0.0014,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.6054043769836426,
"rewards/margins": 8.226700782775879,
"rewards/rejected": -5.621296405792236,
"step": 207
},
{
"epoch": 5.32,
"learning_rate": 0.00024444444444444443,
"logits/chosen": 1.1655393838882446,
"logits/rejected": 1.1469833850860596,
"logps/chosen": -568.0317993164062,
"logps/rejected": -558.951171875,
"loss": 0.0009,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.950896739959717,
"rewards/margins": 8.18274974822998,
"rewards/rejected": -5.231853008270264,
"step": 208
},
{
"epoch": 5.35,
"learning_rate": 0.00024401709401709401,
"logits/chosen": 1.188609004020691,
"logits/rejected": 1.1547000408172607,
"logps/chosen": -582.5511474609375,
"logps/rejected": -659.006591796875,
"loss": 0.0024,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.448970317840576,
"rewards/margins": 9.596809387207031,
"rewards/rejected": -7.147839069366455,
"step": 209
},
{
"epoch": 5.38,
"learning_rate": 0.00024358974358974357,
"logits/chosen": 1.1199637651443481,
"logits/rejected": 1.0847599506378174,
"logps/chosen": -521.1385498046875,
"logps/rejected": -580.2830810546875,
"loss": 0.0018,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.533172845840454,
"rewards/margins": 8.593878746032715,
"rewards/rejected": -6.060705184936523,
"step": 210
},
{
"epoch": 5.4,
"learning_rate": 0.00024316239316239313,
"logits/chosen": 1.1465306282043457,
"logits/rejected": 1.1031239032745361,
"logps/chosen": -483.8526611328125,
"logps/rejected": -559.978759765625,
"loss": 0.0058,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.983487844467163,
"rewards/margins": 8.161898612976074,
"rewards/rejected": -6.178411483764648,
"step": 211
},
{
"epoch": 5.43,
"learning_rate": 0.00024273504273504272,
"logits/chosen": 1.13920259475708,
"logits/rejected": 1.1220027208328247,
"logps/chosen": -512.029052734375,
"logps/rejected": -572.7222900390625,
"loss": 0.0011,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.37949538230896,
"rewards/margins": 9.245460510253906,
"rewards/rejected": -6.865965843200684,
"step": 212
},
{
"epoch": 5.45,
"learning_rate": 0.0002423076923076923,
"logits/chosen": 1.1790162324905396,
"logits/rejected": 1.032013177871704,
"logps/chosen": -550.051025390625,
"logps/rejected": -632.330810546875,
"loss": 0.0008,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.7981252670288086,
"rewards/margins": 9.512584686279297,
"rewards/rejected": -6.714459419250488,
"step": 213
},
{
"epoch": 5.48,
"learning_rate": 0.00024188034188034186,
"logits/chosen": 1.299055814743042,
"logits/rejected": 1.2317571640014648,
"logps/chosen": -517.1921997070312,
"logps/rejected": -619.0784301757812,
"loss": 0.0012,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.4681472778320312,
"rewards/margins": 8.650612831115723,
"rewards/rejected": -7.182465076446533,
"step": 214
},
{
"epoch": 5.5,
"learning_rate": 0.00024145299145299142,
"logits/chosen": 1.1555142402648926,
"logits/rejected": 1.1552680730819702,
"logps/chosen": -493.82232666015625,
"logps/rejected": -553.5524291992188,
"loss": 0.002,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.282465934753418,
"rewards/margins": 7.990402698516846,
"rewards/rejected": -5.707936763763428,
"step": 215
},
{
"epoch": 5.53,
"learning_rate": 0.000241025641025641,
"logits/chosen": 1.2630378007888794,
"logits/rejected": 1.145714282989502,
"logps/chosen": -566.5864868164062,
"logps/rejected": -555.4298095703125,
"loss": 0.0012,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.5014400482177734,
"rewards/margins": 9.682706832885742,
"rewards/rejected": -7.1812663078308105,
"step": 216
},
{
"epoch": 5.56,
"learning_rate": 0.00024059829059829056,
"logits/chosen": 1.275704026222229,
"logits/rejected": 1.1247011423110962,
"logps/chosen": -565.8131713867188,
"logps/rejected": -633.22509765625,
"loss": 0.0015,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.3491557836532593,
"rewards/margins": 8.835249900817871,
"rewards/rejected": -7.486093997955322,
"step": 217
},
{
"epoch": 5.58,
"learning_rate": 0.00024017094017094015,
"logits/chosen": 1.24131178855896,
"logits/rejected": 1.1392734050750732,
"logps/chosen": -515.2080078125,
"logps/rejected": -557.8899536132812,
"loss": 0.0015,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.5449414253234863,
"rewards/margins": 8.077924728393555,
"rewards/rejected": -6.532983303070068,
"step": 218
},
{
"epoch": 5.61,
"learning_rate": 0.00023974358974358974,
"logits/chosen": 1.2522388696670532,
"logits/rejected": 1.049080729484558,
"logps/chosen": -607.8526611328125,
"logps/rejected": -644.0594482421875,
"loss": 0.1498,
"rewards/accuracies": 0.96875,
"rewards/chosen": 2.073587656021118,
"rewards/margins": 9.496702194213867,
"rewards/rejected": -7.42311429977417,
"step": 219
},
{
"epoch": 5.63,
"learning_rate": 0.0002393162393162393,
"logits/chosen": 1.1726680994033813,
"logits/rejected": 1.0583220720291138,
"logps/chosen": -537.9254150390625,
"logps/rejected": -589.2078857421875,
"loss": 0.0028,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.3364086151123047,
"rewards/margins": 9.462315559387207,
"rewards/rejected": -8.125906944274902,
"step": 220
},
{
"epoch": 5.66,
"learning_rate": 0.00023888888888888885,
"logits/chosen": 1.2105443477630615,
"logits/rejected": 1.0398310422897339,
"logps/chosen": -553.4381713867188,
"logps/rejected": -617.5463256835938,
"loss": 0.0009,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.809446096420288,
"rewards/margins": 9.608116149902344,
"rewards/rejected": -7.798670291900635,
"step": 221
},
{
"epoch": 5.68,
"learning_rate": 0.00023846153846153844,
"logits/chosen": 1.0151175260543823,
"logits/rejected": 1.137940764427185,
"logps/chosen": -474.15582275390625,
"logps/rejected": -601.0347900390625,
"loss": 0.0025,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.9911149740219116,
"rewards/margins": 8.517168045043945,
"rewards/rejected": -7.526054382324219,
"step": 222
},
{
"epoch": 5.71,
"learning_rate": 0.00023803418803418802,
"logits/chosen": 1.1788804531097412,
"logits/rejected": 1.0858978033065796,
"logps/chosen": -538.7382202148438,
"logps/rejected": -580.581787109375,
"loss": 0.001,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.0802903175354004,
"rewards/margins": 8.940625190734863,
"rewards/rejected": -7.860335350036621,
"step": 223
},
{
"epoch": 5.73,
"learning_rate": 0.00023760683760683758,
"logits/chosen": 1.209570288658142,
"logits/rejected": 1.1490302085876465,
"logps/chosen": -497.189697265625,
"logps/rejected": -623.16455078125,
"loss": 0.0028,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.9234257936477661,
"rewards/margins": 9.893648147583008,
"rewards/rejected": -7.970221996307373,
"step": 224
},
{
"epoch": 5.76,
"learning_rate": 0.00023717948717948714,
"logits/chosen": 1.1075451374053955,
"logits/rejected": 1.0870707035064697,
"logps/chosen": -555.2899169921875,
"logps/rejected": -560.2510375976562,
"loss": 0.0014,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.8097199201583862,
"rewards/margins": 8.50051498413086,
"rewards/rejected": -7.690794467926025,
"step": 225
},
{
"epoch": 5.79,
"learning_rate": 0.00023675213675213675,
"logits/chosen": 1.1817216873168945,
"logits/rejected": 1.018075942993164,
"logps/chosen": -529.1445922851562,
"logps/rejected": -607.93505859375,
"loss": 0.0009,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.8409253358840942,
"rewards/margins": 9.230034828186035,
"rewards/rejected": -7.389110088348389,
"step": 226
},
{
"epoch": 5.81,
"learning_rate": 0.0002363247863247863,
"logits/chosen": 1.146735429763794,
"logits/rejected": 1.082593321800232,
"logps/chosen": -557.0680541992188,
"logps/rejected": -634.6128540039062,
"loss": 0.0007,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.9302033185958862,
"rewards/margins": 9.59379768371582,
"rewards/rejected": -8.663594245910645,
"step": 227
},
{
"epoch": 5.84,
"learning_rate": 0.00023589743589743587,
"logits/chosen": 1.0844825506210327,
"logits/rejected": 1.050144076347351,
"logps/chosen": -458.246337890625,
"logps/rejected": -643.5533447265625,
"loss": 0.008,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.983664333820343,
"rewards/margins": 9.005660057067871,
"rewards/rejected": -8.021997451782227,
"step": 228
},
{
"epoch": 5.86,
"learning_rate": 0.00023547008547008543,
"logits/chosen": 1.1802949905395508,
"logits/rejected": 1.2055476903915405,
"logps/chosen": -538.8391723632812,
"logps/rejected": -667.3803100585938,
"loss": 0.0028,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.4791993796825409,
"rewards/margins": 8.043685913085938,
"rewards/rejected": -7.564486026763916,
"step": 229
},
{
"epoch": 5.89,
"learning_rate": 0.00023504273504273504,
"logits/chosen": 1.217112421989441,
"logits/rejected": 1.0857105255126953,
"logps/chosen": -547.7744140625,
"logps/rejected": -650.827880859375,
"loss": 0.0013,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.2695170640945435,
"rewards/margins": 10.668280601501465,
"rewards/rejected": -9.398763656616211,
"step": 230
},
{
"epoch": 5.91,
"learning_rate": 0.0002346153846153846,
"logits/chosen": 1.106930136680603,
"logits/rejected": 1.1642612218856812,
"logps/chosen": -534.817626953125,
"logps/rejected": -647.9102783203125,
"loss": 0.001,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.33475053310394287,
"rewards/margins": 9.073336601257324,
"rewards/rejected": -8.73858642578125,
"step": 231
},
{
"epoch": 5.94,
"learning_rate": 0.00023418803418803416,
"logits/chosen": 1.1831903457641602,
"logits/rejected": 1.1884675025939941,
"logps/chosen": -560.828857421875,
"logps/rejected": -663.0516357421875,
"loss": 0.001,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.9667414426803589,
"rewards/margins": 10.761907577514648,
"rewards/rejected": -8.795166969299316,
"step": 232
},
{
"epoch": 5.96,
"learning_rate": 0.00023376068376068375,
"logits/chosen": 1.092282772064209,
"logits/rejected": 0.9970771670341492,
"logps/chosen": -529.67822265625,
"logps/rejected": -641.421142578125,
"loss": 0.0009,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.46319279074668884,
"rewards/margins": 10.089384078979492,
"rewards/rejected": -9.626192092895508,
"step": 233
},
{
"epoch": 5.99,
"learning_rate": 0.0002333333333333333,
"logits/chosen": 1.1541541814804077,
"logits/rejected": 0.9784144759178162,
"logps/chosen": -562.6727294921875,
"logps/rejected": -608.5543823242188,
"loss": 0.0012,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.5622472763061523,
"rewards/margins": 9.451787948608398,
"rewards/rejected": -8.889540672302246,
"step": 234
},
{
"epoch": 6.02,
"learning_rate": 0.0002329059829059829,
"logits/chosen": 1.0728470087051392,
"logits/rejected": 1.0160434246063232,
"logps/chosen": -605.6416625976562,
"logps/rejected": -620.4939575195312,
"loss": 0.0005,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.3293549120426178,
"rewards/margins": 9.99177360534668,
"rewards/rejected": -9.662418365478516,
"step": 235
},
{
"epoch": 6.04,
"learning_rate": 0.00023247863247863245,
"logits/chosen": 1.0738935470581055,
"logits/rejected": 1.0548124313354492,
"logps/chosen": -494.51788330078125,
"logps/rejected": -601.1412353515625,
"loss": 0.0005,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.6489882469177246,
"rewards/margins": 9.996894836425781,
"rewards/rejected": -9.347906112670898,
"step": 236
},
{
"epoch": 6.07,
"learning_rate": 0.00023205128205128203,
"logits/chosen": 1.1992632150650024,
"logits/rejected": 1.1126775741577148,
"logps/chosen": -581.875244140625,
"logps/rejected": -654.9214477539062,
"loss": 0.0014,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.1056530475616455,
"rewards/margins": 9.1024169921875,
"rewards/rejected": -10.208070755004883,
"step": 237
},
{
"epoch": 6.09,
"learning_rate": 0.0002316239316239316,
"logits/chosen": 1.024402379989624,
"logits/rejected": 1.0234527587890625,
"logps/chosen": -527.988037109375,
"logps/rejected": -600.8884887695312,
"loss": 0.0015,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.40097522735595703,
"rewards/margins": 10.08630084991455,
"rewards/rejected": -9.685325622558594,
"step": 238
},
{
"epoch": 6.12,
"learning_rate": 0.00023119658119658118,
"logits/chosen": 1.0920895338058472,
"logits/rejected": 0.925986647605896,
"logps/chosen": -526.6175537109375,
"logps/rejected": -593.0648803710938,
"loss": 0.0016,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.35913902521133423,
"rewards/margins": 10.041351318359375,
"rewards/rejected": -9.682212829589844,
"step": 239
},
{
"epoch": 6.14,
"learning_rate": 0.00023076923076923076,
"logits/chosen": 1.12273108959198,
"logits/rejected": 0.9582171440124512,
"logps/chosen": -566.5948486328125,
"logps/rejected": -640.344482421875,
"loss": 0.0013,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.10319514572620392,
"rewards/margins": 10.264111518859863,
"rewards/rejected": -10.36730670928955,
"step": 240
},
{
"epoch": 6.17,
"learning_rate": 0.00023034188034188032,
"logits/chosen": 1.125780463218689,
"logits/rejected": 0.8733446598052979,
"logps/chosen": -502.71685791015625,
"logps/rejected": -526.7401123046875,
"loss": 0.0023,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.7334610819816589,
"rewards/margins": 8.98703384399414,
"rewards/rejected": -8.253572463989258,
"step": 241
},
{
"epoch": 6.2,
"learning_rate": 0.00022991452991452988,
"logits/chosen": 1.007986307144165,
"logits/rejected": 0.9879658818244934,
"logps/chosen": -493.781982421875,
"logps/rejected": -631.9270629882812,
"loss": 0.0008,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.32805830240249634,
"rewards/margins": 10.0196533203125,
"rewards/rejected": -9.691594123840332,
"step": 242
},
{
"epoch": 6.22,
"learning_rate": 0.00022948717948717944,
"logits/chosen": 1.0395015478134155,
"logits/rejected": 1.004233479499817,
"logps/chosen": -519.82568359375,
"logps/rejected": -645.8541870117188,
"loss": 0.0006,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.23589622974395752,
"rewards/margins": 10.204262733459473,
"rewards/rejected": -10.440156936645508,
"step": 243
},
{
"epoch": 6.25,
"learning_rate": 0.00022905982905982905,
"logits/chosen": 1.030265212059021,
"logits/rejected": 1.0151996612548828,
"logps/chosen": -490.25640869140625,
"logps/rejected": -603.8272705078125,
"loss": 0.0007,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.07852241396903992,
"rewards/margins": 10.176379203796387,
"rewards/rejected": -10.254899978637695,
"step": 244
},
{
"epoch": 6.27,
"learning_rate": 0.0002286324786324786,
"logits/chosen": 1.2312498092651367,
"logits/rejected": 0.9529620409011841,
"logps/chosen": -602.3244018554688,
"logps/rejected": -622.9336547851562,
"loss": 0.0005,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.13221769034862518,
"rewards/margins": 10.062947273254395,
"rewards/rejected": -9.930729866027832,
"step": 245
},
{
"epoch": 6.3,
"learning_rate": 0.00022820512820512817,
"logits/chosen": 1.0476980209350586,
"logits/rejected": 0.9954835176467896,
"logps/chosen": -542.1054077148438,
"logps/rejected": -681.212646484375,
"loss": 0.001,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.028733327984809875,
"rewards/margins": 10.83343505859375,
"rewards/rejected": -10.86216926574707,
"step": 246
},
{
"epoch": 6.32,
"learning_rate": 0.00022777777777777778,
"logits/chosen": 0.9804601669311523,
"logits/rejected": 0.8998504281044006,
"logps/chosen": -525.8041381835938,
"logps/rejected": -585.2196655273438,
"loss": 0.0014,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.6352589130401611,
"rewards/margins": 9.895197868347168,
"rewards/rejected": -9.25993824005127,
"step": 247
},
{
"epoch": 6.35,
"learning_rate": 0.00022735042735042734,
"logits/chosen": 0.9808767437934875,
"logits/rejected": 1.03694486618042,
"logps/chosen": -460.6586608886719,
"logps/rejected": -696.9314575195312,
"loss": 0.0004,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.008252725005149841,
"rewards/margins": 10.02670955657959,
"rewards/rejected": -10.03496265411377,
"step": 248
},
{
"epoch": 6.37,
"learning_rate": 0.0002269230769230769,
"logits/chosen": 1.0145666599273682,
"logits/rejected": 1.0171821117401123,
"logps/chosen": -550.2952880859375,
"logps/rejected": -609.4617919921875,
"loss": 0.0011,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.587495744228363,
"rewards/margins": 9.451133728027344,
"rewards/rejected": -10.038629531860352,
"step": 249
},
{
"epoch": 6.4,
"learning_rate": 0.00022649572649572646,
"logits/chosen": 1.1384074687957764,
"logits/rejected": 0.994137167930603,
"logps/chosen": -529.2269287109375,
"logps/rejected": -592.1962890625,
"loss": 0.0004,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.2150293588638306,
"rewards/margins": 9.835915565490723,
"rewards/rejected": -8.620885848999023,
"step": 250
},
{
"epoch": 6.43,
"learning_rate": 0.00022606837606837604,
"logits/chosen": 1.0672990083694458,
"logits/rejected": 1.043774127960205,
"logps/chosen": -530.2998046875,
"logps/rejected": -619.9190673828125,
"loss": 0.0008,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.16798219084739685,
"rewards/margins": 9.340568542480469,
"rewards/rejected": -9.508550643920898,
"step": 251
},
{
"epoch": 6.45,
"learning_rate": 0.00022564102564102563,
"logits/chosen": 1.0548663139343262,
"logits/rejected": 0.9898471832275391,
"logps/chosen": -515.4979858398438,
"logps/rejected": -591.4309692382812,
"loss": 0.0005,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.1059775352478027,
"rewards/margins": 9.55521297454834,
"rewards/rejected": -8.449234962463379,
"step": 252
},
{
"epoch": 6.48,
"learning_rate": 0.0002252136752136752,
"logits/chosen": 1.0579899549484253,
"logits/rejected": 1.0557491779327393,
"logps/chosen": -532.400634765625,
"logps/rejected": -688.6305541992188,
"loss": 0.0003,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.3727375268936157,
"rewards/margins": 10.276800155639648,
"rewards/rejected": -10.649538040161133,
"step": 253
},
{
"epoch": 6.5,
"learning_rate": 0.00022478632478632477,
"logits/chosen": 1.1566115617752075,
"logits/rejected": 1.0760446786880493,
"logps/chosen": -590.3530883789062,
"logps/rejected": -659.9254150390625,
"loss": 0.0003,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.8200367093086243,
"rewards/margins": 10.486039161682129,
"rewards/rejected": -9.666001319885254,
"step": 254
},
{
"epoch": 6.53,
"learning_rate": 0.00022435897435897433,
"logits/chosen": 1.169042944908142,
"logits/rejected": 1.092968225479126,
"logps/chosen": -562.0431518554688,
"logps/rejected": -657.502197265625,
"loss": 0.0003,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.6965909004211426,
"rewards/margins": 11.691349029541016,
"rewards/rejected": -9.994759559631348,
"step": 255
},
{
"epoch": 6.55,
"learning_rate": 0.00022393162393162392,
"logits/chosen": 1.0384266376495361,
"logits/rejected": 1.021849513053894,
"logps/chosen": -551.0145263671875,
"logps/rejected": -694.0051879882812,
"loss": 0.0004,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.5090039968490601,
"rewards/margins": 10.475794792175293,
"rewards/rejected": -9.966791152954102,
"step": 256
},
{
"epoch": 6.58,
"learning_rate": 0.0002235042735042735,
"logits/chosen": 1.1020841598510742,
"logits/rejected": 0.9978400468826294,
"logps/chosen": -552.3654174804688,
"logps/rejected": -584.1214599609375,
"loss": 0.0007,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.505805492401123,
"rewards/margins": 11.413491249084473,
"rewards/rejected": -8.907686233520508,
"step": 257
},
{
"epoch": 6.6,
"learning_rate": 0.00022307692307692306,
"logits/chosen": 1.1605815887451172,
"logits/rejected": 1.0998469591140747,
"logps/chosen": -532.1463623046875,
"logps/rejected": -632.1581420898438,
"loss": 0.0009,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.184004306793213,
"rewards/margins": 9.712126731872559,
"rewards/rejected": -8.528121948242188,
"step": 258
},
{
"epoch": 6.63,
"learning_rate": 0.00022264957264957262,
"logits/chosen": 1.0158207416534424,
"logits/rejected": 1.0649572610855103,
"logps/chosen": -537.4420166015625,
"logps/rejected": -689.2913818359375,
"loss": 0.0006,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.0612900257110596,
"rewards/margins": 11.472357749938965,
"rewards/rejected": -10.411066055297852,
"step": 259
},
{
"epoch": 6.66,
"learning_rate": 0.00022222222222222218,
"logits/chosen": 1.23757803440094,
"logits/rejected": 1.0114773511886597,
"logps/chosen": -557.8294677734375,
"logps/rejected": -592.405517578125,
"loss": 0.0006,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.740067183971405,
"rewards/margins": 10.078370094299316,
"rewards/rejected": -9.338302612304688,
"step": 260
},
{
"epoch": 6.68,
"learning_rate": 0.0002217948717948718,
"logits/chosen": 1.2603142261505127,
"logits/rejected": 1.001814603805542,
"logps/chosen": -581.58935546875,
"logps/rejected": -544.1881103515625,
"loss": 0.0009,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.5611451864242554,
"rewards/margins": 9.480655670166016,
"rewards/rejected": -7.9195098876953125,
"step": 261
},
{
"epoch": 6.71,
"learning_rate": 0.00022136752136752135,
"logits/chosen": 1.1516404151916504,
"logits/rejected": 1.1282165050506592,
"logps/chosen": -560.3634033203125,
"logps/rejected": -674.9133911132812,
"loss": 0.0003,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.5357306003570557,
"rewards/margins": 9.987811088562012,
"rewards/rejected": -9.452080726623535,
"step": 262
},
{
"epoch": 6.73,
"learning_rate": 0.0002209401709401709,
"logits/chosen": 1.1687240600585938,
"logits/rejected": 1.0638331174850464,
"logps/chosen": -589.182373046875,
"logps/rejected": -682.9474487304688,
"loss": 0.0014,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.275535225868225,
"rewards/margins": 11.063849449157715,
"rewards/rejected": -9.788314819335938,
"step": 263
},
{
"epoch": 6.76,
"learning_rate": 0.00022051282051282052,
"logits/chosen": 1.1243481636047363,
"logits/rejected": 0.967666745185852,
"logps/chosen": -575.9657592773438,
"logps/rejected": -614.3820190429688,
"loss": 0.0004,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.424743115901947,
"rewards/margins": 9.780957221984863,
"rewards/rejected": -9.35621452331543,
"step": 264
},
{
"epoch": 6.78,
"learning_rate": 0.00022008547008547008,
"logits/chosen": 1.0041706562042236,
"logits/rejected": 0.996163547039032,
"logps/chosen": -587.704345703125,
"logps/rejected": -628.5338745117188,
"loss": 0.0004,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.7361750602722168,
"rewards/margins": 9.253312110900879,
"rewards/rejected": -8.51713752746582,
"step": 265
},
{
"epoch": 6.81,
"learning_rate": 0.00021965811965811964,
"logits/chosen": 1.1979098320007324,
"logits/rejected": 1.126028299331665,
"logps/chosen": -524.0247802734375,
"logps/rejected": -609.7775268554688,
"loss": 0.0006,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.6900510787963867,
"rewards/margins": 9.077376365661621,
"rewards/rejected": -7.387324333190918,
"step": 266
},
{
"epoch": 6.84,
"learning_rate": 0.0002192307692307692,
"logits/chosen": 1.1033226251602173,
"logits/rejected": 1.0556286573410034,
"logps/chosen": -534.7022094726562,
"logps/rejected": -597.9768676757812,
"loss": 0.0005,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.6321287155151367,
"rewards/margins": 10.527310371398926,
"rewards/rejected": -8.895181655883789,
"step": 267
},
{
"epoch": 6.86,
"learning_rate": 0.00021880341880341878,
"logits/chosen": 1.0708644390106201,
"logits/rejected": 1.0677733421325684,
"logps/chosen": -561.38525390625,
"logps/rejected": -664.557373046875,
"loss": 0.0003,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.4007779359817505,
"rewards/margins": 10.572273254394531,
"rewards/rejected": -10.17149543762207,
"step": 268
},
{
"epoch": 6.89,
"learning_rate": 0.00021837606837606837,
"logits/chosen": 1.0858148336410522,
"logits/rejected": 1.0668940544128418,
"logps/chosen": -580.924072265625,
"logps/rejected": -651.995849609375,
"loss": 0.0004,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.7338685989379883,
"rewards/margins": 10.77505874633789,
"rewards/rejected": -9.041191101074219,
"step": 269
},
{
"epoch": 6.91,
"learning_rate": 0.00021794871794871793,
"logits/chosen": 1.081437110900879,
"logits/rejected": 0.9860243797302246,
"logps/chosen": -484.07366943359375,
"logps/rejected": -648.0784912109375,
"loss": 0.0003,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.5497647523880005,
"rewards/margins": 11.513895034790039,
"rewards/rejected": -9.964130401611328,
"step": 270
},
{
"epoch": 6.94,
"learning_rate": 0.0002175213675213675,
"logits/chosen": 1.1952917575836182,
"logits/rejected": 1.1564627885818481,
"logps/chosen": -548.2731323242188,
"logps/rejected": -742.41845703125,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.5800870656967163,
"rewards/margins": 10.232941627502441,
"rewards/rejected": -9.652854919433594,
"step": 271
},
{
"epoch": 6.96,
"learning_rate": 0.00021709401709401707,
"logits/chosen": 1.2948367595672607,
"logits/rejected": 1.1679692268371582,
"logps/chosen": -573.0979614257812,
"logps/rejected": -682.3110961914062,
"loss": 0.0006,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.2798572778701782,
"rewards/margins": 9.664986610412598,
"rewards/rejected": -8.385129928588867,
"step": 272
},
{
"epoch": 6.99,
"learning_rate": 0.00021666666666666666,
"logits/chosen": 1.1744760274887085,
"logits/rejected": 0.9845774173736572,
"logps/chosen": -558.6195068359375,
"logps/rejected": -652.9862060546875,
"loss": 0.0003,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.773690104484558,
"rewards/margins": 10.953255653381348,
"rewards/rejected": -9.1795654296875,
"step": 273
},
{
"epoch": 7.01,
"learning_rate": 0.00021623931623931622,
"logits/chosen": 1.1232681274414062,
"logits/rejected": 1.0257112979888916,
"logps/chosen": -510.100341796875,
"logps/rejected": -643.0399169921875,
"loss": 0.0003,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.46491277217865,
"rewards/margins": 9.883655548095703,
"rewards/rejected": -8.418743133544922,
"step": 274
},
{
"epoch": 7.04,
"learning_rate": 0.0002158119658119658,
"logits/chosen": 1.0357047319412231,
"logits/rejected": 1.001062035560608,
"logps/chosen": -501.3252868652344,
"logps/rejected": -553.6700439453125,
"loss": 0.0008,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.0863351821899414,
"rewards/margins": 9.711782455444336,
"rewards/rejected": -8.625446319580078,
"step": 275
},
{
"epoch": 7.07,
"learning_rate": 0.00021538461538461536,
"logits/chosen": 1.1731458902359009,
"logits/rejected": 1.11858069896698,
"logps/chosen": -577.2032470703125,
"logps/rejected": -713.2083740234375,
"loss": 0.0003,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.0856976509094238,
"rewards/margins": 10.696051597595215,
"rewards/rejected": -9.610353469848633,
"step": 276
},
{
"epoch": 7.09,
"learning_rate": 0.00021495726495726492,
"logits/chosen": 1.0282161235809326,
"logits/rejected": 0.9538753032684326,
"logps/chosen": -494.20562744140625,
"logps/rejected": -624.65087890625,
"loss": 0.0003,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.3367016315460205,
"rewards/margins": 11.350975036621094,
"rewards/rejected": -9.014272689819336,
"step": 277
},
{
"epoch": 7.12,
"learning_rate": 0.00021452991452991453,
"logits/chosen": 1.0994839668273926,
"logits/rejected": 1.1064229011535645,
"logps/chosen": -498.6627197265625,
"logps/rejected": -695.014404296875,
"loss": 0.0003,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.8084684014320374,
"rewards/margins": 10.440178871154785,
"rewards/rejected": -9.631710052490234,
"step": 278
},
{
"epoch": 7.14,
"learning_rate": 0.0002141025641025641,
"logits/chosen": 0.9965860843658447,
"logits/rejected": 0.9728628396987915,
"logps/chosen": -478.1365966796875,
"logps/rejected": -635.9570922851562,
"loss": 0.0003,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.1333738565444946,
"rewards/margins": 9.761213302612305,
"rewards/rejected": -8.627839088439941,
"step": 279
},
{
"epoch": 7.17,
"learning_rate": 0.00021367521367521365,
"logits/chosen": 1.2235289812088013,
"logits/rejected": 1.040520191192627,
"logps/chosen": -577.0011596679688,
"logps/rejected": -598.2988891601562,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.098937749862671,
"rewards/margins": 10.220855712890625,
"rewards/rejected": -9.121917724609375,
"step": 280
},
{
"epoch": 7.19,
"learning_rate": 0.0002132478632478632,
"logits/chosen": 1.1766057014465332,
"logits/rejected": 1.001685380935669,
"logps/chosen": -511.30352783203125,
"logps/rejected": -541.9244384765625,
"loss": 0.0003,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.6884262561798096,
"rewards/margins": 9.974132537841797,
"rewards/rejected": -8.28570556640625,
"step": 281
},
{
"epoch": 7.22,
"learning_rate": 0.00021282051282051282,
"logits/chosen": 1.193005084991455,
"logits/rejected": 1.118786096572876,
"logps/chosen": -552.6077270507812,
"logps/rejected": -715.137451171875,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.0981065034866333,
"rewards/margins": 10.102052688598633,
"rewards/rejected": -9.003947257995605,
"step": 282
},
{
"epoch": 7.24,
"learning_rate": 0.00021239316239316238,
"logits/chosen": 1.1393274068832397,
"logits/rejected": 1.102120041847229,
"logps/chosen": -511.25286865234375,
"logps/rejected": -617.3232421875,
"loss": 0.0003,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.180321216583252,
"rewards/margins": 11.865279197692871,
"rewards/rejected": -9.684957504272461,
"step": 283
},
{
"epoch": 7.27,
"learning_rate": 0.00021196581196581194,
"logits/chosen": 1.0302733182907104,
"logits/rejected": 1.0308837890625,
"logps/chosen": -504.82989501953125,
"logps/rejected": -603.8284301757812,
"loss": 0.0003,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.966596782207489,
"rewards/margins": 9.79338264465332,
"rewards/rejected": -8.826786041259766,
"step": 284
},
{
"epoch": 7.3,
"learning_rate": 0.00021153846153846152,
"logits/chosen": 0.9913230538368225,
"logits/rejected": 0.9480158090591431,
"logps/chosen": -546.1568603515625,
"logps/rejected": -645.8260498046875,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.7541160583496094,
"rewards/margins": 10.948417663574219,
"rewards/rejected": -10.19430160522461,
"step": 285
},
{
"epoch": 7.32,
"learning_rate": 0.0002111111111111111,
"logits/chosen": 1.1722790002822876,
"logits/rejected": 1.0763994455337524,
"logps/chosen": -604.4279174804688,
"logps/rejected": -637.1195068359375,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.36835515499115,
"rewards/margins": 10.110456466674805,
"rewards/rejected": -8.742100715637207,
"step": 286
},
{
"epoch": 7.35,
"learning_rate": 0.00021068376068376067,
"logits/chosen": 1.1397333145141602,
"logits/rejected": 1.1724354028701782,
"logps/chosen": -497.7437744140625,
"logps/rejected": -731.747314453125,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.8257311582565308,
"rewards/margins": 10.88111686706543,
"rewards/rejected": -10.05538558959961,
"step": 287
},
{
"epoch": 7.37,
"learning_rate": 0.00021025641025641022,
"logits/chosen": 1.0152881145477295,
"logits/rejected": 0.9720747470855713,
"logps/chosen": -467.1788330078125,
"logps/rejected": -618.5647583007812,
"loss": 0.0003,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.0214035511016846,
"rewards/margins": 9.899874687194824,
"rewards/rejected": -8.878470420837402,
"step": 288
},
{
"epoch": 7.4,
"learning_rate": 0.0002098290598290598,
"logits/chosen": 1.2052510976791382,
"logits/rejected": 1.0738441944122314,
"logps/chosen": -559.755859375,
"logps/rejected": -672.936767578125,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.2137142419815063,
"rewards/margins": 11.24556827545166,
"rewards/rejected": -10.031854629516602,
"step": 289
},
{
"epoch": 7.42,
"learning_rate": 0.0002094017094017094,
"logits/chosen": 1.2158238887786865,
"logits/rejected": 1.0811805725097656,
"logps/chosen": -546.0452270507812,
"logps/rejected": -623.5526733398438,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.6926029920578003,
"rewards/margins": 10.92724895477295,
"rewards/rejected": -9.23464584350586,
"step": 290
},
{
"epoch": 7.45,
"learning_rate": 0.00020897435897435895,
"logits/chosen": 1.2231104373931885,
"logits/rejected": 1.1560981273651123,
"logps/chosen": -578.806640625,
"logps/rejected": -620.2879028320312,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.173689603805542,
"rewards/margins": 9.140408515930176,
"rewards/rejected": -7.966719150543213,
"step": 291
},
{
"epoch": 7.48,
"learning_rate": 0.00020854700854700854,
"logits/chosen": 1.1962541341781616,
"logits/rejected": 1.0524215698242188,
"logps/chosen": -575.8316650390625,
"logps/rejected": -602.4752807617188,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.0710935592651367,
"rewards/margins": 9.931817054748535,
"rewards/rejected": -7.860722541809082,
"step": 292
},
{
"epoch": 7.5,
"learning_rate": 0.0002081196581196581,
"logits/chosen": 1.2810760736465454,
"logits/rejected": 1.1952964067459106,
"logps/chosen": -611.9567260742188,
"logps/rejected": -695.635986328125,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.4828972816467285,
"rewards/margins": 10.279756546020508,
"rewards/rejected": -8.796858787536621,
"step": 293
},
{
"epoch": 7.53,
"learning_rate": 0.00020769230769230766,
"logits/chosen": 1.1195869445800781,
"logits/rejected": 1.032854437828064,
"logps/chosen": -496.8470153808594,
"logps/rejected": -573.8423461914062,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.7543283700942993,
"rewards/margins": 10.867127418518066,
"rewards/rejected": -9.112799644470215,
"step": 294
},
{
"epoch": 7.55,
"learning_rate": 0.00020726495726495724,
"logits/chosen": 1.1649212837219238,
"logits/rejected": 1.0563678741455078,
"logps/chosen": -558.580322265625,
"logps/rejected": -644.250732421875,
"loss": 0.0003,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.997902512550354,
"rewards/margins": 10.675620079040527,
"rewards/rejected": -9.677717208862305,
"step": 295
},
{
"epoch": 7.58,
"learning_rate": 0.00020683760683760683,
"logits/chosen": 1.1838792562484741,
"logits/rejected": 1.0918617248535156,
"logps/chosen": -534.3650512695312,
"logps/rejected": -626.9495849609375,
"loss": 0.0003,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.7307627201080322,
"rewards/margins": 9.962722778320312,
"rewards/rejected": -8.23196029663086,
"step": 296
},
{
"epoch": 7.6,
"learning_rate": 0.0002064102564102564,
"logits/chosen": 1.1973166465759277,
"logits/rejected": 1.1105926036834717,
"logps/chosen": -552.765625,
"logps/rejected": -571.7509765625,
"loss": 0.0003,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.9999276399612427,
"rewards/margins": 9.358254432678223,
"rewards/rejected": -7.358326435089111,
"step": 297
},
{
"epoch": 7.63,
"learning_rate": 0.00020598290598290595,
"logits/chosen": 1.1777927875518799,
"logits/rejected": 1.0511623620986938,
"logps/chosen": -482.9162292480469,
"logps/rejected": -596.0338745117188,
"loss": 0.0006,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.1938194036483765,
"rewards/margins": 10.532297134399414,
"rewards/rejected": -9.338478088378906,
"step": 298
},
{
"epoch": 7.65,
"learning_rate": 0.00020555555555555556,
"logits/chosen": 1.194580078125,
"logits/rejected": 1.0481842756271362,
"logps/chosen": -523.016845703125,
"logps/rejected": -585.9976806640625,
"loss": 0.0004,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.1020272970199585,
"rewards/margins": 10.315288543701172,
"rewards/rejected": -9.213261604309082,
"step": 299
},
{
"epoch": 7.68,
"learning_rate": 0.00020512820512820512,
"logits/chosen": 1.0988215208053589,
"logits/rejected": 1.024403691291809,
"logps/chosen": -481.1084289550781,
"logps/rejected": -562.4240112304688,
"loss": 0.0003,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.069756507873535,
"rewards/margins": 10.040982246398926,
"rewards/rejected": -7.971225261688232,
"step": 300
},
{
"epoch": 7.71,
"learning_rate": 0.00020470085470085468,
"logits/chosen": 1.311755657196045,
"logits/rejected": 1.0829813480377197,
"logps/chosen": -606.013671875,
"logps/rejected": -694.0232543945312,
"loss": 0.0003,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.6969201564788818,
"rewards/margins": 9.998454093933105,
"rewards/rejected": -9.301534652709961,
"step": 301
},
{
"epoch": 7.73,
"learning_rate": 0.00020427350427350423,
"logits/chosen": 1.0659198760986328,
"logits/rejected": 1.0787678956985474,
"logps/chosen": -562.7803344726562,
"logps/rejected": -598.2239990234375,
"loss": 0.0003,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.5258029699325562,
"rewards/margins": 9.569708824157715,
"rewards/rejected": -8.043905258178711,
"step": 302
},
{
"epoch": 7.76,
"learning_rate": 0.00020384615384615385,
"logits/chosen": 1.1065874099731445,
"logits/rejected": 1.075424313545227,
"logps/chosen": -503.7167053222656,
"logps/rejected": -640.2783203125,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.0811102390289307,
"rewards/margins": 9.623838424682617,
"rewards/rejected": -8.542729377746582,
"step": 303
},
{
"epoch": 7.78,
"learning_rate": 0.0002034188034188034,
"logits/chosen": 1.1548815965652466,
"logits/rejected": 1.1475387811660767,
"logps/chosen": -475.9542236328125,
"logps/rejected": -630.96826171875,
"loss": 0.0003,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.4979651868343353,
"rewards/margins": 10.597925186157227,
"rewards/rejected": -10.099960327148438,
"step": 304
},
{
"epoch": 7.81,
"learning_rate": 0.00020299145299145296,
"logits/chosen": 1.1043236255645752,
"logits/rejected": 1.0807740688323975,
"logps/chosen": -546.5986938476562,
"logps/rejected": -615.6447143554688,
"loss": 0.0003,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.0489461421966553,
"rewards/margins": 10.541731834411621,
"rewards/rejected": -8.49278450012207,
"step": 305
},
{
"epoch": 7.83,
"learning_rate": 0.00020256410256410255,
"logits/chosen": 1.1738014221191406,
"logits/rejected": 1.0424628257751465,
"logps/chosen": -579.9978637695312,
"logps/rejected": -668.8394775390625,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.2401783466339111,
"rewards/margins": 11.568252563476562,
"rewards/rejected": -10.328075408935547,
"step": 306
},
{
"epoch": 7.86,
"learning_rate": 0.00020213675213675214,
"logits/chosen": 1.221925973892212,
"logits/rejected": 1.0827970504760742,
"logps/chosen": -554.8446044921875,
"logps/rejected": -623.172119140625,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.212306261062622,
"rewards/margins": 9.68209457397461,
"rewards/rejected": -8.469788551330566,
"step": 307
},
{
"epoch": 7.88,
"learning_rate": 0.0002017094017094017,
"logits/chosen": 1.0534234046936035,
"logits/rejected": 1.1416208744049072,
"logps/chosen": -498.2744140625,
"logps/rejected": -689.6672973632812,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.162524938583374,
"rewards/margins": 11.897795677185059,
"rewards/rejected": -9.735269546508789,
"step": 308
},
{
"epoch": 7.91,
"learning_rate": 0.00020128205128205125,
"logits/chosen": 1.0100905895233154,
"logits/rejected": 1.1573420763015747,
"logps/chosen": -500.80841064453125,
"logps/rejected": -593.0250244140625,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.8166165351867676,
"rewards/margins": 9.917183876037598,
"rewards/rejected": -8.100566864013672,
"step": 309
},
{
"epoch": 7.94,
"learning_rate": 0.00020085470085470084,
"logits/chosen": 1.1027199029922485,
"logits/rejected": 0.9867293238639832,
"logps/chosen": -524.3262329101562,
"logps/rejected": -591.5633544921875,
"loss": 0.0004,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.232293963432312,
"rewards/margins": 10.08406925201416,
"rewards/rejected": -8.851776123046875,
"step": 310
},
{
"epoch": 7.96,
"learning_rate": 0.0002004273504273504,
"logits/chosen": 1.2790604829788208,
"logits/rejected": 1.0140596628189087,
"logps/chosen": -588.2280883789062,
"logps/rejected": -668.6362915039062,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.8383196592330933,
"rewards/margins": 10.089158058166504,
"rewards/rejected": -9.250838279724121,
"step": 311
},
{
"epoch": 7.99,
"learning_rate": 0.00019999999999999998,
"logits/chosen": 1.082058310508728,
"logits/rejected": 0.9627883434295654,
"logps/chosen": -609.169677734375,
"logps/rejected": -605.4269409179688,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.0816116333007812,
"rewards/margins": 11.790176391601562,
"rewards/rejected": -9.708564758300781,
"step": 312
},
{
"epoch": 8.01,
"learning_rate": 0.00019957264957264957,
"logits/chosen": 1.1039892435073853,
"logits/rejected": 0.9567267298698425,
"logps/chosen": -473.4000244140625,
"logps/rejected": -618.5371704101562,
"loss": 0.0003,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.4500510692596436,
"rewards/margins": 10.126335144042969,
"rewards/rejected": -8.676283836364746,
"step": 313
},
{
"epoch": 8.04,
"learning_rate": 0.00019914529914529913,
"logits/chosen": 1.1014786958694458,
"logits/rejected": 1.0613113641738892,
"logps/chosen": -511.72454833984375,
"logps/rejected": -694.4070434570312,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.3977946639060974,
"rewards/margins": 11.06888484954834,
"rewards/rejected": -10.671089172363281,
"step": 314
},
{
"epoch": 8.06,
"learning_rate": 0.00019871794871794869,
"logits/chosen": 1.1368095874786377,
"logits/rejected": 0.9869575500488281,
"logps/chosen": -509.3475036621094,
"logps/rejected": -625.2825927734375,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.8700177669525146,
"rewards/margins": 10.845271110534668,
"rewards/rejected": -9.97525405883789,
"step": 315
},
{
"epoch": 8.09,
"learning_rate": 0.00019829059829059824,
"logits/chosen": 1.1710002422332764,
"logits/rejected": 1.1424845457077026,
"logps/chosen": -548.1114501953125,
"logps/rejected": -658.926513671875,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.743558406829834,
"rewards/margins": 9.961552619934082,
"rewards/rejected": -9.217994689941406,
"step": 316
},
{
"epoch": 8.12,
"learning_rate": 0.00019786324786324786,
"logits/chosen": 1.227845311164856,
"logits/rejected": 1.1172688007354736,
"logps/chosen": -615.0020751953125,
"logps/rejected": -655.31787109375,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.7781198024749756,
"rewards/margins": 11.058065414428711,
"rewards/rejected": -9.279945373535156,
"step": 317
},
{
"epoch": 8.14,
"learning_rate": 0.00019743589743589742,
"logits/chosen": 1.2156200408935547,
"logits/rejected": 0.9318048357963562,
"logps/chosen": -561.3159790039062,
"logps/rejected": -528.769287109375,
"loss": 0.0003,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.775212287902832,
"rewards/margins": 9.198701858520508,
"rewards/rejected": -7.423489570617676,
"step": 318
},
{
"epoch": 8.17,
"learning_rate": 0.00019700854700854697,
"logits/chosen": 1.1288306713104248,
"logits/rejected": 1.0163847208023071,
"logps/chosen": -566.7800903320312,
"logps/rejected": -621.5507202148438,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.5739163160324097,
"rewards/margins": 11.509315490722656,
"rewards/rejected": -9.935400009155273,
"step": 319
},
{
"epoch": 8.19,
"learning_rate": 0.00019658119658119659,
"logits/chosen": 1.1232361793518066,
"logits/rejected": 1.1592121124267578,
"logps/chosen": -528.2422485351562,
"logps/rejected": -708.33642578125,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.655145287513733,
"rewards/margins": 11.109955787658691,
"rewards/rejected": -9.454811096191406,
"step": 320
},
{
"epoch": 8.22,
"learning_rate": 0.00019615384615384615,
"logits/chosen": 1.1231738328933716,
"logits/rejected": 1.117080807685852,
"logps/chosen": -498.927001953125,
"logps/rejected": -631.9031982421875,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.47227901220321655,
"rewards/margins": 10.479837417602539,
"rewards/rejected": -10.007558822631836,
"step": 321
},
{
"epoch": 8.24,
"learning_rate": 0.0001957264957264957,
"logits/chosen": 1.0491048097610474,
"logits/rejected": 0.9988434314727783,
"logps/chosen": -494.6644287109375,
"logps/rejected": -610.76806640625,
"loss": 0.0003,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.1051549911499023,
"rewards/margins": 11.895588874816895,
"rewards/rejected": -9.790433883666992,
"step": 322
},
{
"epoch": 8.27,
"learning_rate": 0.00019529914529914526,
"logits/chosen": 1.098575234413147,
"logits/rejected": 1.1674755811691284,
"logps/chosen": -514.0235595703125,
"logps/rejected": -722.014892578125,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.734897255897522,
"rewards/margins": 10.937726974487305,
"rewards/rejected": -10.202829360961914,
"step": 323
},
{
"epoch": 8.29,
"learning_rate": 0.00019487179487179487,
"logits/chosen": 1.1288853883743286,
"logits/rejected": 1.1453090906143188,
"logps/chosen": -488.9378662109375,
"logps/rejected": -641.2897338867188,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.5595530271530151,
"rewards/margins": 10.641010284423828,
"rewards/rejected": -9.081456184387207,
"step": 324
},
{
"epoch": 8.32,
"learning_rate": 0.00019444444444444443,
"logits/chosen": 1.265305519104004,
"logits/rejected": 0.9294592142105103,
"logps/chosen": -613.603271484375,
"logps/rejected": -536.7325439453125,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.3043055534362793,
"rewards/margins": 10.669670104980469,
"rewards/rejected": -8.365365028381348,
"step": 325
},
{
"epoch": 8.35,
"learning_rate": 0.000194017094017094,
"logits/chosen": 1.1310415267944336,
"logits/rejected": 1.0312024354934692,
"logps/chosen": -516.1328125,
"logps/rejected": -636.2833251953125,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.218218207359314,
"rewards/margins": 10.852701187133789,
"rewards/rejected": -9.634482383728027,
"step": 326
},
{
"epoch": 8.37,
"learning_rate": 0.00019358974358974358,
"logits/chosen": 1.1042028665542603,
"logits/rejected": 1.0703749656677246,
"logps/chosen": -577.5679321289062,
"logps/rejected": -605.3977661132812,
"loss": 0.0003,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.8515387773513794,
"rewards/margins": 9.11039924621582,
"rewards/rejected": -8.25886058807373,
"step": 327
},
{
"epoch": 8.4,
"learning_rate": 0.00019316239316239314,
"logits/chosen": 1.1480742692947388,
"logits/rejected": 1.0245976448059082,
"logps/chosen": -544.492919921875,
"logps/rejected": -650.3825073242188,
"loss": 0.0003,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.1441433429718018,
"rewards/margins": 10.524991989135742,
"rewards/rejected": -9.38084888458252,
"step": 328
},
{
"epoch": 8.42,
"learning_rate": 0.00019273504273504272,
"logits/chosen": 0.9963136315345764,
"logits/rejected": 1.0162067413330078,
"logps/chosen": -543.5288696289062,
"logps/rejected": -677.895751953125,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.5060915946960449,
"rewards/margins": 10.86108684539795,
"rewards/rejected": -10.354994773864746,
"step": 329
},
{
"epoch": 8.45,
"learning_rate": 0.0001923076923076923,
"logits/chosen": 1.192360758781433,
"logits/rejected": 1.1079771518707275,
"logps/chosen": -512.4086303710938,
"logps/rejected": -630.524169921875,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.9825226068496704,
"rewards/margins": 11.417821884155273,
"rewards/rejected": -9.435300827026367,
"step": 330
},
{
"epoch": 8.47,
"learning_rate": 0.00019188034188034187,
"logits/chosen": 1.0749591588974,
"logits/rejected": 1.0543586015701294,
"logps/chosen": -502.4964599609375,
"logps/rejected": -628.3140869140625,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.1525578498840332,
"rewards/margins": 10.45914363861084,
"rewards/rejected": -9.306587219238281,
"step": 331
},
{
"epoch": 8.5,
"learning_rate": 0.00019145299145299142,
"logits/chosen": 1.1045258045196533,
"logits/rejected": 1.094617486000061,
"logps/chosen": -556.7658081054688,
"logps/rejected": -661.129638671875,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.5904799103736877,
"rewards/margins": 10.494197845458984,
"rewards/rejected": -9.903717994689941,
"step": 332
},
{
"epoch": 8.52,
"learning_rate": 0.00019102564102564098,
"logits/chosen": 1.0999786853790283,
"logits/rejected": 1.0287926197052002,
"logps/chosen": -559.324951171875,
"logps/rejected": -685.59716796875,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.8768333196640015,
"rewards/margins": 11.342018127441406,
"rewards/rejected": -10.465184211730957,
"step": 333
},
{
"epoch": 8.55,
"learning_rate": 0.0001905982905982906,
"logits/chosen": 1.2270386219024658,
"logits/rejected": 1.0695297718048096,
"logps/chosen": -555.893310546875,
"logps/rejected": -616.703125,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.512807846069336,
"rewards/margins": 11.095309257507324,
"rewards/rejected": -8.582502365112305,
"step": 334
},
{
"epoch": 8.58,
"learning_rate": 0.00019017094017094015,
"logits/chosen": 1.1461352109909058,
"logits/rejected": 1.0352705717086792,
"logps/chosen": -500.9500427246094,
"logps/rejected": -638.1965942382812,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.5767711400985718,
"rewards/margins": 10.275659561157227,
"rewards/rejected": -9.698890686035156,
"step": 335
},
{
"epoch": 8.6,
"learning_rate": 0.0001897435897435897,
"logits/chosen": 1.1884602308273315,
"logits/rejected": 0.9545145630836487,
"logps/chosen": -553.399658203125,
"logps/rejected": -571.7633056640625,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.724369764328003,
"rewards/margins": 9.780606269836426,
"rewards/rejected": -8.056236267089844,
"step": 336
},
{
"epoch": 8.63,
"learning_rate": 0.00018931623931623933,
"logits/chosen": 1.1067692041397095,
"logits/rejected": 1.0162923336029053,
"logps/chosen": -517.6556396484375,
"logps/rejected": -604.726318359375,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.7874058485031128,
"rewards/margins": 10.558215141296387,
"rewards/rejected": -8.770809173583984,
"step": 337
},
{
"epoch": 8.65,
"learning_rate": 0.00018888888888888888,
"logits/chosen": 1.217395544052124,
"logits/rejected": 1.084112286567688,
"logps/chosen": -542.150634765625,
"logps/rejected": -678.0106201171875,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.0376150608062744,
"rewards/margins": 10.951888084411621,
"rewards/rejected": -9.914274215698242,
"step": 338
},
{
"epoch": 8.68,
"learning_rate": 0.00018846153846153844,
"logits/chosen": 1.1382925510406494,
"logits/rejected": 1.0875835418701172,
"logps/chosen": -545.2322387695312,
"logps/rejected": -629.0198974609375,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.7386005520820618,
"rewards/margins": 10.408458709716797,
"rewards/rejected": -9.669858932495117,
"step": 339
},
{
"epoch": 8.7,
"learning_rate": 0.000188034188034188,
"logits/chosen": 1.115515947341919,
"logits/rejected": 1.066940426826477,
"logps/chosen": -523.6985473632812,
"logps/rejected": -574.7987670898438,
"loss": 0.0004,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.0293325185775757,
"rewards/margins": 9.218039512634277,
"rewards/rejected": -8.18870735168457,
"step": 340
},
{
"epoch": 8.73,
"learning_rate": 0.00018760683760683761,
"logits/chosen": 1.0013961791992188,
"logits/rejected": 1.0823533535003662,
"logps/chosen": -485.6407775878906,
"logps/rejected": -657.2274169921875,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.3279895782470703,
"rewards/margins": 10.64334774017334,
"rewards/rejected": -9.31535816192627,
"step": 341
},
{
"epoch": 8.76,
"learning_rate": 0.00018717948717948717,
"logits/chosen": 1.0347654819488525,
"logits/rejected": 1.0151424407958984,
"logps/chosen": -497.17730712890625,
"logps/rejected": -617.05029296875,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.067275881767273,
"rewards/margins": 9.780403137207031,
"rewards/rejected": -8.713126182556152,
"step": 342
},
{
"epoch": 8.78,
"learning_rate": 0.00018675213675213673,
"logits/chosen": 1.1175577640533447,
"logits/rejected": 1.0508739948272705,
"logps/chosen": -543.9364013671875,
"logps/rejected": -730.9745483398438,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.06183135509490967,
"rewards/margins": 12.12753963470459,
"rewards/rejected": -12.065709114074707,
"step": 343
},
{
"epoch": 8.81,
"learning_rate": 0.00018632478632478632,
"logits/chosen": 1.07142174243927,
"logits/rejected": 1.0519976615905762,
"logps/chosen": -506.71435546875,
"logps/rejected": -652.1842041015625,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.239646315574646,
"rewards/margins": 10.84978199005127,
"rewards/rejected": -9.61013412475586,
"step": 344
},
{
"epoch": 8.83,
"learning_rate": 0.00018589743589743588,
"logits/chosen": 1.1885634660720825,
"logits/rejected": 1.0062313079833984,
"logps/chosen": -569.5838623046875,
"logps/rejected": -602.7799072265625,
"loss": 0.0003,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.2498093843460083,
"rewards/margins": 9.729214668273926,
"rewards/rejected": -8.47940444946289,
"step": 345
},
{
"epoch": 8.86,
"learning_rate": 0.00018547008547008546,
"logits/chosen": 1.2486610412597656,
"logits/rejected": 0.9658511877059937,
"logps/chosen": -563.5016479492188,
"logps/rejected": -559.8602294921875,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.7746890783309937,
"rewards/margins": 10.259527206420898,
"rewards/rejected": -8.484838485717773,
"step": 346
},
{
"epoch": 8.88,
"learning_rate": 0.00018504273504273502,
"logits/chosen": 1.0988224744796753,
"logits/rejected": 1.032260775566101,
"logps/chosen": -590.3836669921875,
"logps/rejected": -605.6322021484375,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.1525167226791382,
"rewards/margins": 10.179304122924805,
"rewards/rejected": -9.026787757873535,
"step": 347
},
{
"epoch": 8.91,
"learning_rate": 0.0001846153846153846,
"logits/chosen": 1.07295823097229,
"logits/rejected": 0.9503864645957947,
"logps/chosen": -567.7842407226562,
"logps/rejected": -599.153564453125,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.1233148574829102,
"rewards/margins": 10.49275016784668,
"rewards/rejected": -9.369434356689453,
"step": 348
},
{
"epoch": 8.93,
"learning_rate": 0.00018418803418803416,
"logits/chosen": 1.034525752067566,
"logits/rejected": 1.0498316287994385,
"logps/chosen": -501.63323974609375,
"logps/rejected": -582.5183715820312,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.7673141956329346,
"rewards/margins": 10.9537353515625,
"rewards/rejected": -9.186420440673828,
"step": 349
},
{
"epoch": 8.96,
"learning_rate": 0.00018376068376068372,
"logits/chosen": 1.1317795515060425,
"logits/rejected": 1.0368475914001465,
"logps/chosen": -592.8737182617188,
"logps/rejected": -618.9924926757812,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.9373716115951538,
"rewards/margins": 10.743789672851562,
"rewards/rejected": -9.806417465209961,
"step": 350
},
{
"epoch": 8.99,
"learning_rate": 0.00018333333333333334,
"logits/chosen": 1.236878514289856,
"logits/rejected": 1.0824024677276611,
"logps/chosen": -592.6204223632812,
"logps/rejected": -678.1072387695312,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.1566966772079468,
"rewards/margins": 12.701655387878418,
"rewards/rejected": -11.544958114624023,
"step": 351
},
{
"epoch": 9.01,
"learning_rate": 0.0001829059829059829,
"logits/chosen": 1.0362117290496826,
"logits/rejected": 1.0344483852386475,
"logps/chosen": -514.698486328125,
"logps/rejected": -682.2589721679688,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.7533162832260132,
"rewards/margins": 10.993279457092285,
"rewards/rejected": -10.23996353149414,
"step": 352
},
{
"epoch": 9.04,
"learning_rate": 0.00018247863247863245,
"logits/chosen": 1.0642633438110352,
"logits/rejected": 0.9820662140846252,
"logps/chosen": -531.2005004882812,
"logps/rejected": -594.51171875,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.5738836526870728,
"rewards/margins": 11.221702575683594,
"rewards/rejected": -9.647819519042969,
"step": 353
},
{
"epoch": 9.06,
"learning_rate": 0.000182051282051282,
"logits/chosen": 1.1875312328338623,
"logits/rejected": 1.0477038621902466,
"logps/chosen": -519.474609375,
"logps/rejected": -641.5570678710938,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.4246861934661865,
"rewards/margins": 10.708345413208008,
"rewards/rejected": -9.283658981323242,
"step": 354
},
{
"epoch": 9.09,
"learning_rate": 0.00018162393162393162,
"logits/chosen": 1.300011157989502,
"logits/rejected": 1.1615049839019775,
"logps/chosen": -594.492919921875,
"logps/rejected": -714.0015258789062,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.3209400177001953,
"rewards/margins": 11.944135665893555,
"rewards/rejected": -10.62319564819336,
"step": 355
},
{
"epoch": 9.11,
"learning_rate": 0.00018119658119658118,
"logits/chosen": 1.2041311264038086,
"logits/rejected": 1.09273099899292,
"logps/chosen": -517.081787109375,
"logps/rejected": -623.0797729492188,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.7040494680404663,
"rewards/margins": 9.923104286193848,
"rewards/rejected": -8.21905517578125,
"step": 356
},
{
"epoch": 9.14,
"learning_rate": 0.00018076923076923074,
"logits/chosen": 1.1552023887634277,
"logits/rejected": 1.1544487476348877,
"logps/chosen": -527.7257080078125,
"logps/rejected": -654.458740234375,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.5392783880233765,
"rewards/margins": 10.782899856567383,
"rewards/rejected": -9.243619918823242,
"step": 357
},
{
"epoch": 9.16,
"learning_rate": 0.00018034188034188035,
"logits/chosen": 1.0705739259719849,
"logits/rejected": 1.0257068872451782,
"logps/chosen": -529.324951171875,
"logps/rejected": -572.942138671875,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.1362345218658447,
"rewards/margins": 10.217846870422363,
"rewards/rejected": -9.081612586975098,
"step": 358
},
{
"epoch": 9.19,
"learning_rate": 0.0001799145299145299,
"logits/chosen": 1.0554429292678833,
"logits/rejected": 0.9397602081298828,
"logps/chosen": -497.97955322265625,
"logps/rejected": -575.80810546875,
"loss": 0.0003,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.4867826700210571,
"rewards/margins": 11.263152122497559,
"rewards/rejected": -9.77637004852295,
"step": 359
},
{
"epoch": 9.22,
"learning_rate": 0.00017948717948717947,
"logits/chosen": 1.028283953666687,
"logits/rejected": 1.1357170343399048,
"logps/chosen": -486.7881164550781,
"logps/rejected": -638.11083984375,
"loss": 0.0003,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.6323189735412598,
"rewards/margins": 9.90245246887207,
"rewards/rejected": -8.270133972167969,
"step": 360
},
{
"epoch": 9.24,
"learning_rate": 0.00017905982905982903,
"logits/chosen": 1.140254259109497,
"logits/rejected": 0.9276759028434753,
"logps/chosen": -533.4109497070312,
"logps/rejected": -542.5346069335938,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.5379778146743774,
"rewards/margins": 10.987845420837402,
"rewards/rejected": -9.449868202209473,
"step": 361
},
{
"epoch": 9.27,
"learning_rate": 0.00017863247863247861,
"logits/chosen": 1.2161970138549805,
"logits/rejected": 1.1362658739089966,
"logps/chosen": -508.8780212402344,
"logps/rejected": -620.1040649414062,
"loss": 0.0003,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.2322115898132324,
"rewards/margins": 10.1491060256958,
"rewards/rejected": -8.91689395904541,
"step": 362
},
{
"epoch": 9.29,
"learning_rate": 0.0001782051282051282,
"logits/chosen": 1.075786828994751,
"logits/rejected": 1.056262731552124,
"logps/chosen": -545.98876953125,
"logps/rejected": -626.4110717773438,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.6408146023750305,
"rewards/margins": 11.502391815185547,
"rewards/rejected": -10.861577033996582,
"step": 363
},
{
"epoch": 9.32,
"learning_rate": 0.00017777777777777776,
"logits/chosen": 1.127282738685608,
"logits/rejected": 1.0822765827178955,
"logps/chosen": -518.8118286132812,
"logps/rejected": -693.237548828125,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.05411094054579735,
"rewards/margins": 11.227949142456055,
"rewards/rejected": -11.282060623168945,
"step": 364
},
{
"epoch": 9.34,
"learning_rate": 0.00017735042735042734,
"logits/chosen": 1.1585055589675903,
"logits/rejected": 1.031551718711853,
"logps/chosen": -549.4298706054688,
"logps/rejected": -594.13720703125,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.8158806562423706,
"rewards/margins": 10.627920150756836,
"rewards/rejected": -8.81203842163086,
"step": 365
},
{
"epoch": 9.37,
"learning_rate": 0.0001769230769230769,
"logits/chosen": 1.2148640155792236,
"logits/rejected": 1.1051499843597412,
"logps/chosen": -618.556884765625,
"logps/rejected": -655.5272827148438,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.7870438098907471,
"rewards/margins": 10.324081420898438,
"rewards/rejected": -9.537036895751953,
"step": 366
},
{
"epoch": 9.4,
"learning_rate": 0.00017649572649572646,
"logits/chosen": 1.0680745840072632,
"logits/rejected": 0.9647752046585083,
"logps/chosen": -520.0435791015625,
"logps/rejected": -650.8917236328125,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.3966693878173828,
"rewards/margins": 11.41585922241211,
"rewards/rejected": -10.019189834594727,
"step": 367
},
{
"epoch": 9.42,
"learning_rate": 0.00017606837606837605,
"logits/chosen": 0.974884033203125,
"logits/rejected": 0.9622063636779785,
"logps/chosen": -551.198486328125,
"logps/rejected": -553.2161865234375,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.7198470830917358,
"rewards/margins": 10.737578392028809,
"rewards/rejected": -9.017731666564941,
"step": 368
},
{
"epoch": 9.45,
"learning_rate": 0.00017564102564102563,
"logits/chosen": 1.1338629722595215,
"logits/rejected": 1.0500080585479736,
"logps/chosen": -519.1705322265625,
"logps/rejected": -645.0868530273438,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.2686916589736938,
"rewards/margins": 11.208242416381836,
"rewards/rejected": -9.939552307128906,
"step": 369
},
{
"epoch": 9.47,
"learning_rate": 0.0001752136752136752,
"logits/chosen": 1.1133869886398315,
"logits/rejected": 1.0957475900650024,
"logps/chosen": -509.5137939453125,
"logps/rejected": -634.0670776367188,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.1420785188674927,
"rewards/margins": 10.913748741149902,
"rewards/rejected": -9.771669387817383,
"step": 370
},
{
"epoch": 9.5,
"learning_rate": 0.00017478632478632475,
"logits/chosen": 1.0674755573272705,
"logits/rejected": 1.0430006980895996,
"logps/chosen": -549.5125732421875,
"logps/rejected": -615.5984497070312,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.9752965569496155,
"rewards/margins": 10.462637901306152,
"rewards/rejected": -9.487340927124023,
"step": 371
},
{
"epoch": 9.52,
"learning_rate": 0.00017435897435897436,
"logits/chosen": 1.0406773090362549,
"logits/rejected": 0.9974726438522339,
"logps/chosen": -554.4515380859375,
"logps/rejected": -671.939697265625,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.2197209894657135,
"rewards/margins": 10.627167701721191,
"rewards/rejected": -10.40744686126709,
"step": 372
},
{
"epoch": 9.55,
"learning_rate": 0.00017393162393162392,
"logits/chosen": 1.0458401441574097,
"logits/rejected": 0.9767919182777405,
"logps/chosen": -508.51275634765625,
"logps/rejected": -617.0277099609375,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.3107894659042358,
"rewards/margins": 11.249536514282227,
"rewards/rejected": -9.938748359680176,
"step": 373
},
{
"epoch": 9.57,
"learning_rate": 0.00017350427350427348,
"logits/chosen": 1.1613295078277588,
"logits/rejected": 1.067457914352417,
"logps/chosen": -583.6506958007812,
"logps/rejected": -683.9884033203125,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.4901715517044067,
"rewards/margins": 11.270017623901367,
"rewards/rejected": -9.779845237731934,
"step": 374
},
{
"epoch": 9.6,
"learning_rate": 0.00017307692307692304,
"logits/chosen": 1.0676989555358887,
"logits/rejected": 1.048395037651062,
"logps/chosen": -553.3953857421875,
"logps/rejected": -713.2011108398438,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.901606559753418,
"rewards/margins": 11.026727676391602,
"rewards/rejected": -10.125120162963867,
"step": 375
},
{
"epoch": 9.63,
"learning_rate": 0.00017264957264957265,
"logits/chosen": 1.196079969406128,
"logits/rejected": 1.0154253244400024,
"logps/chosen": -588.3291015625,
"logps/rejected": -622.0186767578125,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.1913900375366211,
"rewards/margins": 10.078536987304688,
"rewards/rejected": -9.887145042419434,
"step": 376
},
{
"epoch": 9.65,
"learning_rate": 0.0001722222222222222,
"logits/chosen": 1.0178331136703491,
"logits/rejected": 0.9668864011764526,
"logps/chosen": -546.368408203125,
"logps/rejected": -615.4050903320312,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.2646788954734802,
"rewards/margins": 10.304734230041504,
"rewards/rejected": -10.040055274963379,
"step": 377
},
{
"epoch": 9.68,
"learning_rate": 0.00017179487179487177,
"logits/chosen": 1.1032341718673706,
"logits/rejected": 0.9035695791244507,
"logps/chosen": -604.7924194335938,
"logps/rejected": -652.8895874023438,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.5645720958709717,
"rewards/margins": 11.598380088806152,
"rewards/rejected": -10.033807754516602,
"step": 378
},
{
"epoch": 9.7,
"learning_rate": 0.00017136752136752135,
"logits/chosen": 1.1787011623382568,
"logits/rejected": 0.9833663702011108,
"logps/chosen": -574.9915771484375,
"logps/rejected": -650.5631713867188,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.0073214769363403,
"rewards/margins": 10.402047157287598,
"rewards/rejected": -9.394725799560547,
"step": 379
},
{
"epoch": 9.73,
"learning_rate": 0.00017094017094017094,
"logits/chosen": 1.1996289491653442,
"logits/rejected": 1.039535403251648,
"logps/chosen": -556.2608642578125,
"logps/rejected": -642.0311279296875,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.216808557510376,
"rewards/margins": 11.361566543579102,
"rewards/rejected": -10.144757270812988,
"step": 380
},
{
"epoch": 9.75,
"learning_rate": 0.0001705128205128205,
"logits/chosen": 1.1434451341629028,
"logits/rejected": 1.051792860031128,
"logps/chosen": -493.62506103515625,
"logps/rejected": -622.7883911132812,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.9653478860855103,
"rewards/margins": 10.048986434936523,
"rewards/rejected": -9.083638191223145,
"step": 381
},
{
"epoch": 9.78,
"learning_rate": 0.00017008547008547006,
"logits/chosen": 1.2439961433410645,
"logits/rejected": 0.9698901772499084,
"logps/chosen": -590.2651977539062,
"logps/rejected": -601.9655151367188,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.7650651931762695,
"rewards/margins": 10.54482650756836,
"rewards/rejected": -8.779762268066406,
"step": 382
},
{
"epoch": 9.8,
"learning_rate": 0.00016965811965811964,
"logits/chosen": 1.0514246225357056,
"logits/rejected": 1.0069361925125122,
"logps/chosen": -528.1588745117188,
"logps/rejected": -668.963623046875,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.9377299547195435,
"rewards/margins": 11.108814239501953,
"rewards/rejected": -10.171082496643066,
"step": 383
},
{
"epoch": 9.83,
"learning_rate": 0.0001692307692307692,
"logits/chosen": 1.0299386978149414,
"logits/rejected": 0.9614180326461792,
"logps/chosen": -461.4678955078125,
"logps/rejected": -622.8182373046875,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.3360067903995514,
"rewards/margins": 11.178654670715332,
"rewards/rejected": -10.842646598815918,
"step": 384
},
{
"epoch": 9.86,
"learning_rate": 0.0001688034188034188,
"logits/chosen": 1.069563627243042,
"logits/rejected": 0.9999057054519653,
"logps/chosen": -532.8282470703125,
"logps/rejected": -611.0808715820312,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.6393276453018188,
"rewards/margins": 11.510586738586426,
"rewards/rejected": -9.871259689331055,
"step": 385
},
{
"epoch": 9.88,
"learning_rate": 0.00016837606837606837,
"logits/chosen": 1.0466080904006958,
"logits/rejected": 0.9922081232070923,
"logps/chosen": -516.3175048828125,
"logps/rejected": -622.3922729492188,
"loss": 0.0003,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.9020825028419495,
"rewards/margins": 11.032547950744629,
"rewards/rejected": -10.13046646118164,
"step": 386
},
{
"epoch": 9.91,
"learning_rate": 0.00016794871794871793,
"logits/chosen": 1.068638801574707,
"logits/rejected": 1.0634409189224243,
"logps/chosen": -518.2888793945312,
"logps/rejected": -700.1104736328125,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.8084597587585449,
"rewards/margins": 11.80289363861084,
"rewards/rejected": -10.994433403015137,
"step": 387
},
{
"epoch": 9.93,
"learning_rate": 0.0001675213675213675,
"logits/chosen": 1.252096176147461,
"logits/rejected": 1.12416672706604,
"logps/chosen": -559.9757080078125,
"logps/rejected": -640.3218994140625,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.250156283378601,
"rewards/margins": 10.136184692382812,
"rewards/rejected": -8.886027336120605,
"step": 388
},
{
"epoch": 9.96,
"learning_rate": 0.00016709401709401708,
"logits/chosen": 1.0118858814239502,
"logits/rejected": 1.0544030666351318,
"logps/chosen": -550.706298828125,
"logps/rejected": -651.165283203125,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.7763742208480835,
"rewards/margins": 11.286043167114258,
"rewards/rejected": -10.50966739654541,
"step": 389
},
{
"epoch": 9.98,
"learning_rate": 0.00016666666666666666,
"logits/chosen": 1.2127994298934937,
"logits/rejected": 0.9555975198745728,
"logps/chosen": -586.635009765625,
"logps/rejected": -619.5669555664062,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.7170848846435547,
"rewards/margins": 11.147682189941406,
"rewards/rejected": -9.430597305297852,
"step": 390
},
{
"epoch": 10.01,
"learning_rate": 0.00016623931623931622,
"logits/chosen": 0.9974936246871948,
"logits/rejected": 0.98292076587677,
"logps/chosen": -537.190185546875,
"logps/rejected": -637.2457275390625,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.7226758599281311,
"rewards/margins": 10.400409698486328,
"rewards/rejected": -9.677732467651367,
"step": 391
},
{
"epoch": 10.04,
"learning_rate": 0.00016581196581196578,
"logits/chosen": 0.9824466109275818,
"logits/rejected": 1.0442546606063843,
"logps/chosen": -524.7149047851562,
"logps/rejected": -685.2274169921875,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.7316232919692993,
"rewards/margins": 12.584775924682617,
"rewards/rejected": -10.85315227508545,
"step": 392
},
{
"epoch": 10.06,
"learning_rate": 0.0001653846153846154,
"logits/chosen": 1.0178762674331665,
"logits/rejected": 1.0314674377441406,
"logps/chosen": -530.5228271484375,
"logps/rejected": -601.8655395507812,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.4628024697303772,
"rewards/margins": 9.492361068725586,
"rewards/rejected": -9.029558181762695,
"step": 393
},
{
"epoch": 10.09,
"learning_rate": 0.00016495726495726495,
"logits/chosen": 1.1514480113983154,
"logits/rejected": 0.9793822765350342,
"logps/chosen": -569.4765625,
"logps/rejected": -589.162109375,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.2061307430267334,
"rewards/margins": 11.210150718688965,
"rewards/rejected": -9.004018783569336,
"step": 394
},
{
"epoch": 10.11,
"learning_rate": 0.0001645299145299145,
"logits/chosen": 1.008737564086914,
"logits/rejected": 0.9937309622764587,
"logps/chosen": -473.2950134277344,
"logps/rejected": -667.3652954101562,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.11462172865867615,
"rewards/margins": 11.188936233520508,
"rewards/rejected": -11.303558349609375,
"step": 395
},
{
"epoch": 10.14,
"learning_rate": 0.0001641025641025641,
"logits/chosen": 1.1840193271636963,
"logits/rejected": 0.9543963670730591,
"logps/chosen": -582.7732543945312,
"logps/rejected": -608.0531616210938,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.1386045217514038,
"rewards/margins": 10.291077613830566,
"rewards/rejected": -9.152473449707031,
"step": 396
},
{
"epoch": 10.16,
"learning_rate": 0.00016367521367521368,
"logits/chosen": 1.157091736793518,
"logits/rejected": 1.1324167251586914,
"logps/chosen": -568.5440063476562,
"logps/rejected": -697.5151977539062,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.1107374429702759,
"rewards/margins": 11.52519702911377,
"rewards/rejected": -10.414460182189941,
"step": 397
},
{
"epoch": 10.19,
"learning_rate": 0.00016324786324786324,
"logits/chosen": 1.0599353313446045,
"logits/rejected": 0.8371300101280212,
"logps/chosen": -523.8178100585938,
"logps/rejected": -568.3763427734375,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.7378228306770325,
"rewards/margins": 10.31213092803955,
"rewards/rejected": -9.574308395385742,
"step": 398
},
{
"epoch": 10.21,
"learning_rate": 0.0001628205128205128,
"logits/chosen": 1.1668627262115479,
"logits/rejected": 1.1840052604675293,
"logps/chosen": -513.0997314453125,
"logps/rejected": -679.572998046875,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.532392680644989,
"rewards/margins": 10.64517593383789,
"rewards/rejected": -10.112783432006836,
"step": 399
},
{
"epoch": 10.24,
"learning_rate": 0.00016239316239316238,
"logits/chosen": 1.058180570602417,
"logits/rejected": 0.9799319505691528,
"logps/chosen": -521.548095703125,
"logps/rejected": -572.1917724609375,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.7191563844680786,
"rewards/margins": 9.96951675415039,
"rewards/rejected": -8.250360488891602,
"step": 400
}
],
"logging_steps": 1,
"max_steps": 780,
"num_input_tokens_seen": 0,
"num_train_epochs": 20,
"save_steps": 50,
"total_flos": 0.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}