{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.6096631611034903, "eval_steps": 500, "global_step": 1000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0006096631611034903, "grad_norm": 0.0, "learning_rate": 0.0, "logits/chosen": 0.05287215858697891, "logits/rejected": 0.009399833157658577, "logps/chosen": -73.52249145507812, "logps/rejected": -51.21772003173828, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.0012193263222069807, "grad_norm": 0.0, "learning_rate": 0.0, "logits/chosen": -0.016249075531959534, "logits/rejected": 0.055124565958976746, "logps/chosen": -168.17079162597656, "logps/rejected": -134.3463592529297, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2 }, { "epoch": 0.001828989483310471, "grad_norm": 0.0, "learning_rate": 0.0, "logits/chosen": 0.039522528648376465, "logits/rejected": 0.035245977342128754, "logps/chosen": -83.67476654052734, "logps/rejected": -100.48914337158203, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 3 }, { "epoch": 0.0024386526444139613, "grad_norm": 0.0, "learning_rate": 0.0, "logits/chosen": 0.1155989021062851, "logits/rejected": 0.03127114474773407, "logps/chosen": -313.77618408203125, "logps/rejected": -281.04266357421875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 4 }, { "epoch": 0.0030483158055174516, "grad_norm": 0.0, "learning_rate": 0.0, "logits/chosen": -0.04632345587015152, "logits/rejected": 0.047024864703416824, "logps/chosen": -153.97320556640625, "logps/rejected": -97.37532043457031, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 5 }, { "epoch": 0.003657978966620942, "grad_norm": 68.37611427847516, "learning_rate": 4.390243902439024e-11, "logits/chosen": 0.35580092668533325, "logits/rejected": 0.34043076634407043, "logps/chosen": -150.33229064941406, "logps/rejected": -127.56856536865234, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 6 }, { "epoch": 0.004267642127724432, "grad_norm": 68.37611427847516, "learning_rate": 4.390243902439024e-11, "logits/chosen": 0.16691184043884277, "logits/rejected": 0.06875558197498322, "logps/chosen": -21.509504318237305, "logps/rejected": -56.21983337402344, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 7 }, { "epoch": 0.004877305288827923, "grad_norm": 74.25793363323966, "learning_rate": 8.780487804878048e-11, "logits/chosen": -0.29858696460723877, "logits/rejected": -0.16083328425884247, "logps/chosen": -211.10525512695312, "logps/rejected": -231.42721557617188, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 8 }, { "epoch": 0.0054869684499314125, "grad_norm": 70.63404747210473, "learning_rate": 1.3170731707317074e-10, "logits/chosen": 0.017641346901655197, "logits/rejected": 0.12449988722801208, "logps/chosen": -317.4058837890625, "logps/rejected": -266.26617431640625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 9 }, { "epoch": 0.006096631611034903, "grad_norm": 70.47359046304224, "learning_rate": 1.7560975609756095e-10, "logits/chosen": -0.17641407251358032, "logits/rejected": 0.1076519638299942, "logps/chosen": -291.4444580078125, "logps/rejected": -167.51348876953125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 10 }, { "epoch": 0.006706294772138394, "grad_norm": 71.24522499173389, "learning_rate": 2.1951219512195122e-10, "logits/chosen": 0.2724137306213379, "logits/rejected": 0.16814950108528137, "logps/chosen": -113.14727783203125, "logps/rejected": -174.35382080078125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 11 }, { "epoch": 0.007315957933241884, "grad_norm": 70.93965975944207, "learning_rate": 2.634146341463415e-10, "logits/chosen": 0.3719051778316498, "logits/rejected": 0.08294087648391724, "logps/chosen": -60.49693298339844, "logps/rejected": -129.46234130859375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 12 }, { "epoch": 0.007925621094345374, "grad_norm": 65.48114720101339, "learning_rate": 3.073170731707317e-10, "logits/chosen": -0.2717147469520569, "logits/rejected": -0.35204601287841797, "logps/chosen": -130.41958618164062, "logps/rejected": -90.1414794921875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 13 }, { "epoch": 0.008535284255448864, "grad_norm": 85.3419090068418, "learning_rate": 3.512195121951219e-10, "logits/chosen": 0.14461614191532135, "logits/rejected": 0.12076608836650848, "logps/chosen": -194.26065063476562, "logps/rejected": -214.17393493652344, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 14 }, { "epoch": 0.009144947416552356, "grad_norm": 62.78589973694846, "learning_rate": 3.9512195121951215e-10, "logits/chosen": 0.06305968761444092, "logits/rejected": -0.0633743405342102, "logps/chosen": -106.2256851196289, "logps/rejected": -165.22598266601562, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 15 }, { "epoch": 0.009754610577655845, "grad_norm": 69.97682774304194, "learning_rate": 4.3902439024390244e-10, "logits/chosen": 0.0744749903678894, "logits/rejected": 0.043965235352516174, "logps/chosen": -143.64990234375, "logps/rejected": -198.17315673828125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 16 }, { "epoch": 0.010364273738759335, "grad_norm": 75.24344073025307, "learning_rate": 4.829268292682926e-10, "logits/chosen": 0.27772170305252075, "logits/rejected": -0.03273066133260727, "logps/chosen": -175.71385192871094, "logps/rejected": -165.26828002929688, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 17 }, { "epoch": 0.010973936899862825, "grad_norm": 77.00834822090226, "learning_rate": 5.26829268292683e-10, "logits/chosen": 0.02788539230823517, "logits/rejected": 0.21624302864074707, "logps/chosen": -133.7073974609375, "logps/rejected": -54.169464111328125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 18 }, { "epoch": 0.011583600060966317, "grad_norm": 70.54084146607406, "learning_rate": 5.707317073170731e-10, "logits/chosen": -0.0591580867767334, "logits/rejected": 0.33370155096054077, "logps/chosen": -346.8431396484375, "logps/rejected": -155.22689819335938, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 19 }, { "epoch": 0.012193263222069806, "grad_norm": 67.3297470565744, "learning_rate": 6.146341463414634e-10, "logits/chosen": 0.191809743642807, "logits/rejected": 0.17111678421497345, "logps/chosen": -10.86440658569336, "logps/rejected": -36.462684631347656, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 20 }, { "epoch": 0.012802926383173296, "grad_norm": 67.3297470565744, "learning_rate": 6.146341463414634e-10, "logits/chosen": -0.1280287802219391, "logits/rejected": 0.1918884813785553, "logps/chosen": -325.1832275390625, "logps/rejected": -239.93368530273438, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 21 }, { "epoch": 0.013412589544276788, "grad_norm": 67.3297470565744, "learning_rate": 6.146341463414634e-10, "logits/chosen": 0.13415230810642242, "logits/rejected": 0.10988141596317291, "logps/chosen": -18.177507400512695, "logps/rejected": -82.06600952148438, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 22 }, { "epoch": 0.014022252705380277, "grad_norm": 93.08074695025091, "learning_rate": 6.585365853658536e-10, "logits/chosen": 0.22793379426002502, "logits/rejected": 0.19679906964302063, "logps/chosen": -124.44007873535156, "logps/rejected": -148.74542236328125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 23 }, { "epoch": 0.014631915866483767, "grad_norm": 93.24059814069281, "learning_rate": 7.024390243902438e-10, "logits/chosen": -0.2758808732032776, "logits/rejected": -0.08728434145450592, "logps/chosen": -591.4380493164062, "logps/rejected": -279.4563293457031, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 24 }, { "epoch": 0.015241579027587259, "grad_norm": 68.63554830592973, "learning_rate": 7.463414634146342e-10, "logits/chosen": 0.40160396695137024, "logits/rejected": 0.3144241273403168, "logps/chosen": -59.92793273925781, "logps/rejected": -72.62495422363281, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 25 }, { "epoch": 0.01585124218869075, "grad_norm": 76.72619070695632, "learning_rate": 7.902439024390243e-10, "logits/chosen": 0.06995319575071335, "logits/rejected": 0.0718432143330574, "logps/chosen": -21.1335506439209, "logps/rejected": -39.64347839355469, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 26 }, { "epoch": 0.01646090534979424, "grad_norm": 67.71348466576721, "learning_rate": 8.341463414634145e-10, "logits/chosen": 0.19081905484199524, "logits/rejected": 0.107520692050457, "logps/chosen": -74.09236145019531, "logps/rejected": -84.74115753173828, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 27 }, { "epoch": 0.017070568510897728, "grad_norm": 70.6623181277699, "learning_rate": 8.780487804878049e-10, "logits/chosen": 0.06893181055784225, "logits/rejected": -0.011756572872400284, "logps/chosen": -141.17626953125, "logps/rejected": -244.47691345214844, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 28 }, { "epoch": 0.01768023167200122, "grad_norm": 65.09736234964716, "learning_rate": 9.21951219512195e-10, "logits/chosen": -0.11326849460601807, "logits/rejected": 0.057974159717559814, "logps/chosen": -265.6029357910156, "logps/rejected": -153.57164001464844, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 29 }, { "epoch": 0.01828989483310471, "grad_norm": 62.02073777738137, "learning_rate": 9.658536585365852e-10, "logits/chosen": 0.08635324239730835, "logits/rejected": 0.07806559652090073, "logps/chosen": -14.0606689453125, "logps/rejected": -22.32585906982422, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 30 }, { "epoch": 0.0188995579942082, "grad_norm": 74.42992018793248, "learning_rate": 1.0097560975609755e-09, "logits/chosen": -0.038718074560165405, "logits/rejected": 0.3243560791015625, "logps/chosen": -75.48886108398438, "logps/rejected": -26.728788375854492, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 31 }, { "epoch": 0.01950922115531169, "grad_norm": 64.29375640734017, "learning_rate": 1.053658536585366e-09, "logits/chosen": -0.09937749058008194, "logits/rejected": -0.15032553672790527, "logps/chosen": -99.7437973022461, "logps/rejected": -160.94308471679688, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 32 }, { "epoch": 0.020118884316415182, "grad_norm": 70.93071036097813, "learning_rate": 1.097560975609756e-09, "logits/chosen": 0.19127793610095978, "logits/rejected": 0.15430431067943573, "logps/chosen": -160.15353393554688, "logps/rejected": -216.41665649414062, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 33 }, { "epoch": 0.02072854747751867, "grad_norm": 74.52915654091993, "learning_rate": 1.1414634146341462e-09, "logits/chosen": 0.13181781768798828, "logits/rejected": 0.2162623405456543, "logps/chosen": -119.47815704345703, "logps/rejected": -74.19168853759766, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 34 }, { "epoch": 0.021338210638622162, "grad_norm": 80.24022253894402, "learning_rate": 1.1853658536585366e-09, "logits/chosen": 0.11026953160762787, "logits/rejected": 0.05110342800617218, "logps/chosen": -156.3619384765625, "logps/rejected": -169.68821716308594, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 35 }, { "epoch": 0.02194787379972565, "grad_norm": 61.9848191612332, "learning_rate": 1.2292682926829269e-09, "logits/chosen": -0.05266339331865311, "logits/rejected": 0.08744192123413086, "logps/chosen": -178.451416015625, "logps/rejected": -166.0316619873047, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 36 }, { "epoch": 0.02255753696082914, "grad_norm": 78.38786858503308, "learning_rate": 1.273170731707317e-09, "logits/chosen": 0.14256571233272552, "logits/rejected": 0.4338546693325043, "logps/chosen": -300.1292419433594, "logps/rejected": -200.44442749023438, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 37 }, { "epoch": 0.023167200121932633, "grad_norm": 77.88261701115505, "learning_rate": 1.3170731707317072e-09, "logits/chosen": 0.22669938206672668, "logits/rejected": 0.2338542938232422, "logps/chosen": -47.191864013671875, "logps/rejected": -50.06071472167969, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 38 }, { "epoch": 0.02377686328303612, "grad_norm": 85.52421451349792, "learning_rate": 1.3609756097560974e-09, "logits/chosen": -0.032261237502098083, "logits/rejected": 0.23262041807174683, "logps/chosen": -409.99365234375, "logps/rejected": -253.15380859375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 39 }, { "epoch": 0.024386526444139613, "grad_norm": 79.15481772304013, "learning_rate": 1.4048780487804876e-09, "logits/chosen": 0.25447630882263184, "logits/rejected": 0.18663440644741058, "logps/chosen": -336.67388916015625, "logps/rejected": -161.332275390625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 40 }, { "epoch": 0.024996189605243104, "grad_norm": 71.22850423237752, "learning_rate": 1.448780487804878e-09, "logits/chosen": 0.2154097706079483, "logits/rejected": 0.2652926743030548, "logps/chosen": -132.73236083984375, "logps/rejected": -77.12788391113281, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 41 }, { "epoch": 0.025605852766346592, "grad_norm": 77.86620038216259, "learning_rate": 1.4926829268292683e-09, "logits/chosen": 0.1286822408437729, "logits/rejected": 0.3024751842021942, "logps/chosen": -298.9327392578125, "logps/rejected": -119.84310913085938, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 42 }, { "epoch": 0.026215515927450084, "grad_norm": 73.10788956685286, "learning_rate": 1.5365853658536586e-09, "logits/chosen": 0.20254218578338623, "logits/rejected": 0.5039613842964172, "logps/chosen": -116.90916442871094, "logps/rejected": -63.421058654785156, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 43 }, { "epoch": 0.026825179088553575, "grad_norm": 72.62025945728479, "learning_rate": 1.5804878048780486e-09, "logits/chosen": 0.08061984181404114, "logits/rejected": 0.05667334049940109, "logps/chosen": -122.43840026855469, "logps/rejected": -128.56314086914062, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 44 }, { "epoch": 0.027434842249657063, "grad_norm": 70.52926879204011, "learning_rate": 1.6243902439024388e-09, "logits/chosen": -0.02340932935476303, "logits/rejected": 0.043494515120983124, "logps/chosen": -127.13800811767578, "logps/rejected": -94.7784194946289, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 45 }, { "epoch": 0.028044505410760555, "grad_norm": 64.88774875207254, "learning_rate": 1.668292682926829e-09, "logits/chosen": 0.39714503288269043, "logits/rejected": 0.39462360739707947, "logps/chosen": -7.40703821182251, "logps/rejected": -14.890948295593262, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 46 }, { "epoch": 0.028654168571864046, "grad_norm": 68.79365680982743, "learning_rate": 1.7121951219512195e-09, "logits/chosen": 0.08758289366960526, "logits/rejected": 0.21223704516887665, "logps/chosen": -224.2176513671875, "logps/rejected": -224.02700805664062, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 47 }, { "epoch": 0.029263831732967534, "grad_norm": 71.49047996194585, "learning_rate": 1.7560975609756097e-09, "logits/chosen": 0.21929675340652466, "logits/rejected": 0.2699821889400482, "logps/chosen": -104.31961059570312, "logps/rejected": -60.335113525390625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 48 }, { "epoch": 0.029873494894071026, "grad_norm": 70.73794598984682, "learning_rate": 1.8e-09, "logits/chosen": -0.3800536096096039, "logits/rejected": -0.150841623544693, "logps/chosen": -220.71783447265625, "logps/rejected": -113.48838806152344, "loss": 0.6933, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 49 }, { "epoch": 0.030483158055174518, "grad_norm": 66.7889498717846, "learning_rate": 1.84390243902439e-09, "logits/chosen": -0.06314300000667572, "logits/rejected": 0.048857904970645905, "logps/chosen": -239.1514129638672, "logps/rejected": -155.99298095703125, "loss": 0.6928, "rewards/accuracies": 0.5, "rewards/chosen": -0.001027262187562883, "rewards/margins": -0.000668776105158031, "rewards/rejected": -0.00035848619882017374, "step": 50 }, { "epoch": 0.031092821216278006, "grad_norm": 70.63181999151213, "learning_rate": 1.8878048780487805e-09, "logits/chosen": 0.06969748437404633, "logits/rejected": 0.427978515625, "logps/chosen": -300.5170593261719, "logps/rejected": -128.99072265625, "loss": 0.6926, "rewards/accuracies": 0.25, "rewards/chosen": 0.0006778716924600303, "rewards/margins": -0.0008071899646893144, "rewards/rejected": 0.0014850615989416838, "step": 51 }, { "epoch": 0.0317024843773815, "grad_norm": 66.6174987991604, "learning_rate": 1.9317073170731705e-09, "logits/chosen": 0.28025972843170166, "logits/rejected": 0.19407892227172852, "logps/chosen": -57.48769760131836, "logps/rejected": -79.77017974853516, "loss": 0.6941, "rewards/accuracies": 0.25, "rewards/chosen": -0.0017806501127779484, "rewards/margins": -0.0018260792130604386, "rewards/rejected": 4.542919486993924e-05, "step": 52 }, { "epoch": 0.032312147538484985, "grad_norm": 81.19668841318229, "learning_rate": 1.975609756097561e-09, "logits/chosen": 0.20146635174751282, "logits/rejected": 0.13105042278766632, "logps/chosen": -192.02371215820312, "logps/rejected": -272.7171325683594, "loss": 0.6923, "rewards/accuracies": 0.25, "rewards/chosen": -0.0005275249131955206, "rewards/margins": -0.0012145042419433594, "rewards/rejected": 0.000686979154124856, "step": 53 }, { "epoch": 0.03292181069958848, "grad_norm": 66.9314671755827, "learning_rate": 2.019512195121951e-09, "logits/chosen": -0.07518087327480316, "logits/rejected": 0.33032354712486267, "logps/chosen": -210.52719116210938, "logps/rejected": -185.80984497070312, "loss": 0.6923, "rewards/accuracies": 0.5, "rewards/chosen": -0.0013447999954223633, "rewards/margins": -0.002561330795288086, "rewards/rejected": 0.0012165309162810445, "step": 54 }, { "epoch": 0.03353147386069197, "grad_norm": 70.91475343182961, "learning_rate": 2.0634146341463414e-09, "logits/chosen": 0.23357635736465454, "logits/rejected": 0.08638399094343185, "logps/chosen": -166.65553283691406, "logps/rejected": -243.26834106445312, "loss": 0.6938, "rewards/accuracies": 0.5, "rewards/chosen": -0.0004352211835794151, "rewards/margins": 0.002582198241725564, "rewards/rejected": -0.0030174197163432837, "step": 55 }, { "epoch": 0.034141137021795456, "grad_norm": 60.88045789418313, "learning_rate": 2.107317073170732e-09, "logits/chosen": 0.1865972876548767, "logits/rejected": 0.011848561465740204, "logps/chosen": -141.57260131835938, "logps/rejected": -143.6577911376953, "loss": 0.6945, "rewards/accuracies": 0.25, "rewards/chosen": -0.002753830049186945, "rewards/margins": -0.0051968577317893505, "rewards/rejected": 0.0024430276826024055, "step": 56 }, { "epoch": 0.03475080018289895, "grad_norm": 70.07864758335027, "learning_rate": 2.151219512195122e-09, "logits/chosen": 0.07055716216564178, "logits/rejected": 0.12593163549900055, "logps/chosen": -262.76190185546875, "logps/rejected": -185.82838439941406, "loss": 0.6939, "rewards/accuracies": 0.25, "rewards/chosen": 0.0021800040267407894, "rewards/margins": -0.0016491890419274569, "rewards/rejected": 0.00382919330149889, "step": 57 }, { "epoch": 0.03536046334400244, "grad_norm": 73.56352249882974, "learning_rate": 2.195121951219512e-09, "logits/chosen": 0.04468034580349922, "logits/rejected": 0.19408458471298218, "logps/chosen": -31.595966339111328, "logps/rejected": -31.431028366088867, "loss": 0.6924, "rewards/accuracies": 0.5, "rewards/chosen": 0.0004551530000753701, "rewards/margins": -0.0012581408955156803, "rewards/rejected": 0.0017132939537987113, "step": 58 }, { "epoch": 0.03597012650510593, "grad_norm": 80.33636599620638, "learning_rate": 2.2390243902439024e-09, "logits/chosen": 0.3012107312679291, "logits/rejected": 0.27375027537345886, "logps/chosen": -148.99156188964844, "logps/rejected": -32.328948974609375, "loss": 0.6933, "rewards/accuracies": 0.25, "rewards/chosen": -0.0038415552116930485, "rewards/margins": -0.003134638536721468, "rewards/rejected": -0.0007069166749715805, "step": 59 }, { "epoch": 0.03657978966620942, "grad_norm": 70.2703831832364, "learning_rate": 2.2829268292682924e-09, "logits/chosen": -0.024738460779190063, "logits/rejected": -0.07055769860744476, "logps/chosen": -167.5659942626953, "logps/rejected": -138.68312072753906, "loss": 0.6925, "rewards/accuracies": 0.5, "rewards/chosen": 0.0011559011181816459, "rewards/margins": -0.0006197362090460956, "rewards/rejected": 0.0017756373854354024, "step": 60 }, { "epoch": 0.03718945282731291, "grad_norm": 74.5492664117818, "learning_rate": 2.326829268292683e-09, "logits/chosen": -0.18852980434894562, "logits/rejected": 0.006448015570640564, "logps/chosen": -165.77377319335938, "logps/rejected": -177.71710205078125, "loss": 0.6927, "rewards/accuracies": 0.25, "rewards/chosen": -0.0011799321509897709, "rewards/margins": -0.004253004677593708, "rewards/rejected": 0.003073072526603937, "step": 61 }, { "epoch": 0.0377991159884164, "grad_norm": 63.99980911512165, "learning_rate": 2.3707317073170733e-09, "logits/chosen": -0.15407471358776093, "logits/rejected": 0.41999027132987976, "logps/chosen": -127.58901977539062, "logps/rejected": -43.56388473510742, "loss": 0.6917, "rewards/accuracies": 0.75, "rewards/chosen": 0.007107532117515802, "rewards/margins": 0.010301482863724232, "rewards/rejected": -0.0031939507462084293, "step": 62 }, { "epoch": 0.038408779149519894, "grad_norm": 82.13245010916935, "learning_rate": 2.4146341463414633e-09, "logits/chosen": -0.10190024226903915, "logits/rejected": 0.2237187922000885, "logps/chosen": -229.680908203125, "logps/rejected": -208.50599670410156, "loss": 0.6917, "rewards/accuracies": 0.75, "rewards/chosen": 0.0031118900515139103, "rewards/margins": 0.000753405736759305, "rewards/rejected": 0.0023584843147546053, "step": 63 }, { "epoch": 0.03901844231062338, "grad_norm": 70.61017230058009, "learning_rate": 2.4585365853658538e-09, "logits/chosen": 0.11584638804197311, "logits/rejected": 0.20095054805278778, "logps/chosen": -173.85516357421875, "logps/rejected": -126.08563995361328, "loss": 0.6931, "rewards/accuracies": 0.5, "rewards/chosen": 0.002786439610645175, "rewards/margins": 0.0041068256832659245, "rewards/rejected": -0.0013203859562054276, "step": 64 }, { "epoch": 0.03962810547172687, "grad_norm": 84.0050894833399, "learning_rate": 2.502439024390244e-09, "logits/chosen": 0.11885404586791992, "logits/rejected": 0.08198799192905426, "logps/chosen": -170.72412109375, "logps/rejected": -198.9031219482422, "loss": 0.6931, "rewards/accuracies": 1.0, "rewards/chosen": 0.003328704973682761, "rewards/margins": 0.009418916888535023, "rewards/rejected": -0.006090211682021618, "step": 65 }, { "epoch": 0.040237768632830365, "grad_norm": 70.46414310404583, "learning_rate": 2.546341463414634e-09, "logits/chosen": 0.2348729968070984, "logits/rejected": 0.05341774597764015, "logps/chosen": -185.4467315673828, "logps/rejected": -201.00570678710938, "loss": 0.6933, "rewards/accuracies": 0.75, "rewards/chosen": 0.003058910369873047, "rewards/margins": 0.006163597106933594, "rewards/rejected": -0.003104686737060547, "step": 66 }, { "epoch": 0.04084743179393385, "grad_norm": 84.39671887821069, "learning_rate": 2.590243902439024e-09, "logits/chosen": 0.1465904712677002, "logits/rejected": 0.15641841292381287, "logps/chosen": -25.62973403930664, "logps/rejected": -33.12474822998047, "loss": 0.6928, "rewards/accuracies": 0.5, "rewards/chosen": 0.0015766852302476764, "rewards/margins": 9.535928256809711e-06, "rewards/rejected": 0.0015671491855755448, "step": 67 }, { "epoch": 0.04145709495503734, "grad_norm": 80.88343791628121, "learning_rate": 2.6341463414634143e-09, "logits/chosen": 0.16637735068798065, "logits/rejected": 0.14293934404850006, "logps/chosen": -79.70658874511719, "logps/rejected": -96.13426208496094, "loss": 0.6933, "rewards/accuracies": 0.5, "rewards/chosen": 0.004015589132905006, "rewards/margins": 0.0017949821194633842, "rewards/rejected": 0.0022206068970263004, "step": 68 }, { "epoch": 0.04206675811614083, "grad_norm": 95.81105276337522, "learning_rate": 2.6780487804878048e-09, "logits/chosen": 0.1661253273487091, "logits/rejected": 0.3702969253063202, "logps/chosen": -137.08416748046875, "logps/rejected": -70.77066802978516, "loss": 0.6934, "rewards/accuracies": 0.0, "rewards/chosen": -0.005868458654731512, "rewards/margins": -0.004885214846581221, "rewards/rejected": -0.000983244157396257, "step": 69 }, { "epoch": 0.042676421277244324, "grad_norm": 85.50609013077252, "learning_rate": 2.7219512195121948e-09, "logits/chosen": 0.04000654071569443, "logits/rejected": 0.33832550048828125, "logps/chosen": -253.7117919921875, "logps/rejected": -155.51600646972656, "loss": 0.6935, "rewards/accuracies": 0.75, "rewards/chosen": -0.00038905144901946187, "rewards/margins": 0.004518482368439436, "rewards/rejected": -0.004907533526420593, "step": 70 }, { "epoch": 0.04328608443834781, "grad_norm": 80.53614050473402, "learning_rate": 2.7658536585365852e-09, "logits/chosen": 0.17947782576084137, "logits/rejected": 0.17558667063713074, "logps/chosen": -22.479354858398438, "logps/rejected": -23.31631088256836, "loss": 0.6921, "rewards/accuracies": 0.5, "rewards/chosen": 0.0022156001068651676, "rewards/margins": 0.0005453706253319979, "rewards/rejected": 0.001670229365117848, "step": 71 }, { "epoch": 0.0438957475994513, "grad_norm": 75.93425185505885, "learning_rate": 2.8097560975609753e-09, "logits/chosen": 0.15121379494667053, "logits/rejected": -0.04035184532403946, "logps/chosen": -91.0668716430664, "logps/rejected": -99.52243041992188, "loss": 0.6937, "rewards/accuracies": 0.75, "rewards/chosen": 0.0011352181900292635, "rewards/margins": 0.005007707979530096, "rewards/rejected": -0.0038724897895008326, "step": 72 }, { "epoch": 0.044505410760554795, "grad_norm": 64.74961747089739, "learning_rate": 2.8536585365853657e-09, "logits/chosen": 0.19988885521888733, "logits/rejected": 0.16584698855876923, "logps/chosen": -107.76717376708984, "logps/rejected": -147.28042602539062, "loss": 0.6938, "rewards/accuracies": 0.5, "rewards/chosen": 0.0007725238101556897, "rewards/margins": 0.001528930850327015, "rewards/rejected": -0.0007564069237560034, "step": 73 }, { "epoch": 0.04511507392165828, "grad_norm": 84.10029335113224, "learning_rate": 2.897560975609756e-09, "logits/chosen": -0.034904882311820984, "logits/rejected": 0.03536960482597351, "logps/chosen": -150.70921325683594, "logps/rejected": -80.46929931640625, "loss": 0.6914, "rewards/accuracies": 0.5, "rewards/chosen": 0.00169027887750417, "rewards/margins": -0.0003446043701842427, "rewards/rejected": 0.0020348832476884127, "step": 74 }, { "epoch": 0.04572473708276177, "grad_norm": 72.00711273151795, "learning_rate": 2.941463414634146e-09, "logits/chosen": 0.0597052201628685, "logits/rejected": 0.03572994843125343, "logps/chosen": -350.35992431640625, "logps/rejected": -322.17724609375, "loss": 0.6935, "rewards/accuracies": 0.0, "rewards/chosen": -0.00667121447622776, "rewards/margins": -0.00563659705221653, "rewards/rejected": -0.0010346174240112305, "step": 75 }, { "epoch": 0.046334400243865266, "grad_norm": 74.23989965163715, "learning_rate": 2.9853658536585366e-09, "logits/chosen": 0.31788504123687744, "logits/rejected": 0.3128117322921753, "logps/chosen": -7.529122352600098, "logps/rejected": -8.063995361328125, "loss": 0.6936, "rewards/accuracies": 0.75, "rewards/chosen": -0.000852310738991946, "rewards/margins": 0.00029422639636322856, "rewards/rejected": -0.0011465370189398527, "step": 76 }, { "epoch": 0.046944063404968754, "grad_norm": 72.2708304153655, "learning_rate": 3.0292682926829267e-09, "logits/chosen": 0.3397638201713562, "logits/rejected": 0.20202289521694183, "logps/chosen": -78.62361145019531, "logps/rejected": -112.6453628540039, "loss": 0.6936, "rewards/accuracies": 0.5, "rewards/chosen": -0.0005385488038882613, "rewards/margins": -0.0005318729672580957, "rewards/rejected": -6.67572021484375e-06, "step": 77 }, { "epoch": 0.04755372656607224, "grad_norm": 75.04661372709796, "learning_rate": 3.073170731707317e-09, "logits/chosen": 0.35530340671539307, "logits/rejected": 0.22375106811523438, "logps/chosen": -64.1614990234375, "logps/rejected": -93.27325439453125, "loss": 0.694, "rewards/accuracies": 0.75, "rewards/chosen": 0.0016937867039814591, "rewards/margins": 0.004260053858160973, "rewards/rejected": -0.002566267503425479, "step": 78 }, { "epoch": 0.04816338972717574, "grad_norm": 83.73910637650769, "learning_rate": 3.1170731707317067e-09, "logits/chosen": -0.10525587201118469, "logits/rejected": -0.14019280672073364, "logps/chosen": -54.87730407714844, "logps/rejected": -60.87992858886719, "loss": 0.6927, "rewards/accuracies": 0.5, "rewards/chosen": -0.00018210409325547516, "rewards/margins": 0.0017521620029583573, "rewards/rejected": -0.0019342661835253239, "step": 79 }, { "epoch": 0.048773052888279225, "grad_norm": 65.36609025123921, "learning_rate": 3.160975609756097e-09, "logits/chosen": 0.20115497708320618, "logits/rejected": 0.19292600452899933, "logps/chosen": -43.20243835449219, "logps/rejected": -28.736347198486328, "loss": 0.6924, "rewards/accuracies": 0.75, "rewards/chosen": -0.0015144406352192163, "rewards/margins": -0.001647680765017867, "rewards/rejected": 0.0001332402171101421, "step": 80 }, { "epoch": 0.04938271604938271, "grad_norm": 68.35723980447455, "learning_rate": 3.2048780487804876e-09, "logits/chosen": 0.06546074151992798, "logits/rejected": 0.06650267541408539, "logps/chosen": -45.10093307495117, "logps/rejected": -144.30990600585938, "loss": 0.6946, "rewards/accuracies": 0.25, "rewards/chosen": -0.0006906271446496248, "rewards/margins": -0.0065479036420583725, "rewards/rejected": 0.005857276730239391, "step": 81 }, { "epoch": 0.04999237921048621, "grad_norm": 76.28615406166055, "learning_rate": 3.2487804878048777e-09, "logits/chosen": -0.11237098276615143, "logits/rejected": 0.06433381885290146, "logps/chosen": -205.50204467773438, "logps/rejected": -182.95997619628906, "loss": 0.6935, "rewards/accuracies": 0.5, "rewards/chosen": -0.0007084847311489284, "rewards/margins": 0.003876638598740101, "rewards/rejected": -0.0045851231552660465, "step": 82 }, { "epoch": 0.050602042371589696, "grad_norm": 80.94256607086295, "learning_rate": 3.292682926829268e-09, "logits/chosen": 0.3187202513217926, "logits/rejected": 0.2065531611442566, "logps/chosen": -255.35821533203125, "logps/rejected": -217.7652587890625, "loss": 0.6927, "rewards/accuracies": 1.0, "rewards/chosen": 0.006643927656114101, "rewards/margins": 0.01088404655456543, "rewards/rejected": -0.004240119829773903, "step": 83 }, { "epoch": 0.051211705532693184, "grad_norm": 71.45394858065539, "learning_rate": 3.336585365853658e-09, "logits/chosen": -0.25007927417755127, "logits/rejected": 0.15381662547588348, "logps/chosen": -180.00770568847656, "logps/rejected": -132.6005096435547, "loss": 0.6931, "rewards/accuracies": 0.25, "rewards/chosen": 0.0001525879488326609, "rewards/margins": -0.0020342827774584293, "rewards/rejected": 0.0021868706680834293, "step": 84 }, { "epoch": 0.05182136869379668, "grad_norm": 72.06045896192381, "learning_rate": 3.3804878048780486e-09, "logits/chosen": 0.2230096161365509, "logits/rejected": 0.502068817615509, "logps/chosen": -215.84043884277344, "logps/rejected": -156.7900390625, "loss": 0.692, "rewards/accuracies": 0.75, "rewards/chosen": 0.004702424630522728, "rewards/margins": 0.007914667949080467, "rewards/rejected": -0.0032122433185577393, "step": 85 }, { "epoch": 0.05243103185490017, "grad_norm": 71.38400999771645, "learning_rate": 3.424390243902439e-09, "logits/chosen": 0.024465292692184448, "logits/rejected": 0.20754316449165344, "logps/chosen": -322.27496337890625, "logps/rejected": -150.3963623046875, "loss": 0.6921, "rewards/accuracies": 0.5, "rewards/chosen": 0.0038268952630460262, "rewards/margins": 0.0009583504870533943, "rewards/rejected": 0.0028685452416539192, "step": 86 }, { "epoch": 0.053040695016003656, "grad_norm": 74.51379013327342, "learning_rate": 3.468292682926829e-09, "logits/chosen": 0.16065523028373718, "logits/rejected": -0.08133505284786224, "logps/chosen": -104.59339904785156, "logps/rejected": -266.986572265625, "loss": 0.6925, "rewards/accuracies": 1.0, "rewards/chosen": 0.003070497652515769, "rewards/margins": 0.006459474563598633, "rewards/rejected": -0.0033889771439135075, "step": 87 }, { "epoch": 0.05365035817710715, "grad_norm": 64.65775651723335, "learning_rate": 3.5121951219512195e-09, "logits/chosen": 0.19086205959320068, "logits/rejected": 0.08758498728275299, "logps/chosen": -69.55133056640625, "logps/rejected": -49.74491882324219, "loss": 0.693, "rewards/accuracies": 0.25, "rewards/chosen": -0.0007361411117017269, "rewards/margins": -0.0010319054126739502, "rewards/rejected": 0.00029576424276456237, "step": 88 }, { "epoch": 0.05426002133821064, "grad_norm": 67.41490806453626, "learning_rate": 3.5560975609756095e-09, "logits/chosen": -0.3904249370098114, "logits/rejected": 0.40122270584106445, "logps/chosen": -295.4908142089844, "logps/rejected": -181.67678833007812, "loss": 0.6932, "rewards/accuracies": 0.5, "rewards/chosen": 0.00012779224198311567, "rewards/margins": 0.0018295288318768144, "rewards/rejected": -0.001701736357063055, "step": 89 }, { "epoch": 0.05486968449931413, "grad_norm": 69.7352013160028, "learning_rate": 3.6e-09, "logits/chosen": 0.02868320234119892, "logits/rejected": 0.023117877542972565, "logps/chosen": -123.01502990722656, "logps/rejected": -327.2979431152344, "loss": 0.6926, "rewards/accuracies": 0.75, "rewards/chosen": -0.0014316558372229338, "rewards/margins": 0.0007941245567053556, "rewards/rejected": -0.0022257803939282894, "step": 90 }, { "epoch": 0.05547934766041762, "grad_norm": 85.9010441341789, "learning_rate": 3.6439024390243904e-09, "logits/chosen": -0.07268917560577393, "logits/rejected": 0.03133467584848404, "logps/chosen": -108.12759399414062, "logps/rejected": -118.45396423339844, "loss": 0.6948, "rewards/accuracies": 0.5, "rewards/chosen": -0.0018654584418982267, "rewards/margins": -0.0015044212341308594, "rewards/rejected": -0.0003610372659750283, "step": 91 }, { "epoch": 0.05608901082152111, "grad_norm": 70.92085506184092, "learning_rate": 3.68780487804878e-09, "logits/chosen": 0.1769690215587616, "logits/rejected": 0.2651098072528839, "logps/chosen": -495.8620300292969, "logps/rejected": -185.98446655273438, "loss": 0.6921, "rewards/accuracies": 0.5, "rewards/chosen": 0.00516090402379632, "rewards/margins": 0.0031100749038159847, "rewards/rejected": 0.0020508291199803352, "step": 92 }, { "epoch": 0.0566986739826246, "grad_norm": 71.03428763196679, "learning_rate": 3.73170731707317e-09, "logits/chosen": 0.1058330312371254, "logits/rejected": 0.13351327180862427, "logps/chosen": -13.810348510742188, "logps/rejected": -17.113176345825195, "loss": 0.6944, "rewards/accuracies": 1.0, "rewards/chosen": 0.0006977797020226717, "rewards/margins": 0.0007781625026836991, "rewards/rejected": -8.038282976485789e-05, "step": 93 }, { "epoch": 0.05730833714372809, "grad_norm": 73.25497477824051, "learning_rate": 3.775609756097561e-09, "logits/chosen": 0.26408499479293823, "logits/rejected": 0.33262404799461365, "logps/chosen": -404.1988830566406, "logps/rejected": -239.35232543945312, "loss": 0.6933, "rewards/accuracies": 0.75, "rewards/chosen": 0.00642166193574667, "rewards/margins": 0.005939484108239412, "rewards/rejected": 0.00048217771109193563, "step": 94 }, { "epoch": 0.05791800030483158, "grad_norm": 67.09278397767419, "learning_rate": 3.819512195121951e-09, "logits/chosen": 0.15695643424987793, "logits/rejected": 0.18819648027420044, "logps/chosen": -119.06463623046875, "logps/rejected": -137.01116943359375, "loss": 0.6927, "rewards/accuracies": 0.75, "rewards/chosen": 0.00042524340096861124, "rewards/margins": 0.0015760839451104403, "rewards/rejected": -0.001150840544141829, "step": 95 }, { "epoch": 0.05852766346593507, "grad_norm": 75.0674412350952, "learning_rate": 3.863414634146341e-09, "logits/chosen": 0.09307468682527542, "logits/rejected": 0.08323965966701508, "logps/chosen": -261.7306823730469, "logps/rejected": -260.0284118652344, "loss": 0.6939, "rewards/accuracies": 0.75, "rewards/chosen": -0.0001418590545654297, "rewards/margins": -0.003254556329920888, "rewards/rejected": 0.003112697508186102, "step": 96 }, { "epoch": 0.059137326627038564, "grad_norm": 80.34872386837137, "learning_rate": 3.907317073170732e-09, "logits/chosen": 0.18482346832752228, "logits/rejected": 0.10585962235927582, "logps/chosen": -42.423362731933594, "logps/rejected": -63.49856948852539, "loss": 0.6924, "rewards/accuracies": 0.5, "rewards/chosen": -0.00029054880724288523, "rewards/margins": -0.001655351952649653, "rewards/rejected": 0.0013648034073412418, "step": 97 }, { "epoch": 0.05974698978814205, "grad_norm": 81.7468245081625, "learning_rate": 3.951219512195122e-09, "logits/chosen": -0.2540174722671509, "logits/rejected": -0.12836764752864838, "logps/chosen": -199.8457489013672, "logps/rejected": -181.54751586914062, "loss": 0.6937, "rewards/accuracies": 0.25, "rewards/chosen": -0.0015849112533032894, "rewards/margins": -0.0025141239166259766, "rewards/rejected": 0.0009292126633226871, "step": 98 }, { "epoch": 0.06035665294924554, "grad_norm": 65.7346035629931, "learning_rate": 3.995121951219512e-09, "logits/chosen": 0.2020762711763382, "logits/rejected": 0.04373517259955406, "logps/chosen": -17.618925094604492, "logps/rejected": -26.497638702392578, "loss": 0.6927, "rewards/accuracies": 0.5, "rewards/chosen": 0.00015217059990391135, "rewards/margins": -4.655128577724099e-05, "rewards/rejected": 0.00019872188568115234, "step": 99 }, { "epoch": 0.060966316110349035, "grad_norm": 72.7762663088043, "learning_rate": 4.039024390243902e-09, "logits/chosen": 0.027187101542949677, "logits/rejected": -0.13702692091464996, "logps/chosen": -240.42532348632812, "logps/rejected": -361.12823486328125, "loss": 0.6932, "rewards/accuracies": 0.25, "rewards/chosen": -0.0015394926303997636, "rewards/margins": -0.00020051910541951656, "rewards/rejected": -0.0013389736413955688, "step": 100 }, { "epoch": 0.06157597927145252, "grad_norm": 87.81366907788147, "learning_rate": 4.082926829268293e-09, "logits/chosen": -0.01530487835407257, "logits/rejected": 0.003859208896756172, "logps/chosen": -263.99493408203125, "logps/rejected": -353.2584533691406, "loss": 0.6919, "rewards/accuracies": 1.0, "rewards/chosen": 0.009844470769166946, "rewards/margins": 0.01946244202554226, "rewards/rejected": -0.009617972187697887, "step": 101 }, { "epoch": 0.06218564243255601, "grad_norm": 70.52878373104612, "learning_rate": 4.126829268292683e-09, "logits/chosen": 0.23338094353675842, "logits/rejected": 0.19701853394508362, "logps/chosen": -171.7310028076172, "logps/rejected": -102.3233413696289, "loss": 0.6942, "rewards/accuracies": 0.75, "rewards/chosen": 0.0015583753120154142, "rewards/margins": 0.0036748647689819336, "rewards/rejected": -0.0021164894569665194, "step": 102 }, { "epoch": 0.0627953055936595, "grad_norm": 65.56038043811691, "learning_rate": 4.170731707317073e-09, "logits/chosen": 0.48671942949295044, "logits/rejected": 0.4513268768787384, "logps/chosen": -30.95024871826172, "logps/rejected": -81.58662414550781, "loss": 0.6926, "rewards/accuracies": 0.25, "rewards/chosen": -0.0014354228042066097, "rewards/margins": 0.0016133070457726717, "rewards/rejected": -0.0030487298499792814, "step": 103 }, { "epoch": 0.063404968754763, "grad_norm": 64.36723338107112, "learning_rate": 4.214634146341464e-09, "logits/chosen": 0.19133959710597992, "logits/rejected": 0.2739216387271881, "logps/chosen": -125.99061584472656, "logps/rejected": -232.95196533203125, "loss": 0.6934, "rewards/accuracies": 0.5, "rewards/chosen": -0.0020974159706383944, "rewards/margins": -0.0030968666542321444, "rewards/rejected": 0.0009994508000090718, "step": 104 }, { "epoch": 0.06401463191586648, "grad_norm": 66.7082232964686, "learning_rate": 4.258536585365853e-09, "logits/chosen": 0.13941365480422974, "logits/rejected": 0.03759532794356346, "logps/chosen": -130.84054565429688, "logps/rejected": -240.2591094970703, "loss": 0.6922, "rewards/accuracies": 0.75, "rewards/chosen": -0.00239717960357666, "rewards/margins": 0.0012583851348608732, "rewards/rejected": -0.003655564971268177, "step": 105 }, { "epoch": 0.06462429507696997, "grad_norm": 77.77334705609563, "learning_rate": 4.302439024390244e-09, "logits/chosen": 0.3149589002132416, "logits/rejected": 0.12610098719596863, "logps/chosen": -70.48065185546875, "logps/rejected": -156.10791015625, "loss": 0.6926, "rewards/accuracies": 0.25, "rewards/chosen": -3.081554314121604e-05, "rewards/margins": 0.001992344856262207, "rewards/rejected": -0.002023160457611084, "step": 106 }, { "epoch": 0.06523395823807346, "grad_norm": 67.20989589144399, "learning_rate": 4.346341463414634e-09, "logits/chosen": 0.33753344416618347, "logits/rejected": 0.18962666392326355, "logps/chosen": -125.78367614746094, "logps/rejected": -190.9263916015625, "loss": 0.6929, "rewards/accuracies": 0.5, "rewards/chosen": -0.002459835959598422, "rewards/margins": 0.00020625581964850426, "rewards/rejected": -0.00266609201207757, "step": 107 }, { "epoch": 0.06584362139917696, "grad_norm": 71.6543977566246, "learning_rate": 4.390243902439024e-09, "logits/chosen": 0.1545097827911377, "logits/rejected": 0.16738542914390564, "logps/chosen": -42.83551788330078, "logps/rejected": -58.91304016113281, "loss": 0.6944, "rewards/accuracies": 0.25, "rewards/chosen": -0.002740550087764859, "rewards/margins": -0.007119977846741676, "rewards/rejected": 0.0043794275261461735, "step": 108 }, { "epoch": 0.06645328456028045, "grad_norm": 67.8633172345031, "learning_rate": 4.434146341463415e-09, "logits/chosen": 0.14236405491828918, "logits/rejected": 0.13578234612941742, "logps/chosen": -37.31237030029297, "logps/rejected": -42.171234130859375, "loss": 0.6926, "rewards/accuracies": 1.0, "rewards/chosen": 0.0012849331833422184, "rewards/margins": 0.002069628331810236, "rewards/rejected": -0.0007846951484680176, "step": 109 }, { "epoch": 0.06706294772138394, "grad_norm": 97.09783574054993, "learning_rate": 4.478048780487805e-09, "logits/chosen": 0.023844445124268532, "logits/rejected": 0.20791736245155334, "logps/chosen": -126.77980041503906, "logps/rejected": -124.97605895996094, "loss": 0.6938, "rewards/accuracies": 0.5, "rewards/chosen": 0.0018016814719885588, "rewards/margins": -0.00039469008333981037, "rewards/rejected": 0.002196371555328369, "step": 110 }, { "epoch": 0.06767261088248742, "grad_norm": 87.92196146928691, "learning_rate": 4.521951219512195e-09, "logits/chosen": 0.23804506659507751, "logits/rejected": 0.1031482145190239, "logps/chosen": -327.8410949707031, "logps/rejected": -213.29132080078125, "loss": 0.6923, "rewards/accuracies": 0.75, "rewards/chosen": 0.005219769664108753, "rewards/margins": 0.004817938432097435, "rewards/rejected": 0.00040183070814237, "step": 111 }, { "epoch": 0.06828227404359091, "grad_norm": 66.36502679238257, "learning_rate": 4.565853658536585e-09, "logits/chosen": 0.0046591609716415405, "logits/rejected": 0.016852840781211853, "logps/chosen": -365.377685546875, "logps/rejected": -377.4510498046875, "loss": 0.6908, "rewards/accuracies": 0.5, "rewards/chosen": 0.0113862045109272, "rewards/margins": 0.01930229738354683, "rewards/rejected": -0.007916092872619629, "step": 112 }, { "epoch": 0.0688919372046944, "grad_norm": 75.66303479659113, "learning_rate": 4.609756097560976e-09, "logits/chosen": 0.12720055878162384, "logits/rejected": -0.09696392714977264, "logps/chosen": -106.36075592041016, "logps/rejected": -150.9593505859375, "loss": 0.6931, "rewards/accuracies": 0.25, "rewards/chosen": -0.0023461461532860994, "rewards/margins": -0.008014858700335026, "rewards/rejected": 0.005668711848556995, "step": 113 }, { "epoch": 0.0695016003657979, "grad_norm": 67.06432628491909, "learning_rate": 4.653658536585366e-09, "logits/chosen": 0.08921600133180618, "logits/rejected": 0.09431330859661102, "logps/chosen": -202.0355987548828, "logps/rejected": -112.927734375, "loss": 0.6928, "rewards/accuracies": 0.5, "rewards/chosen": -0.001555365277454257, "rewards/margins": -0.0014966904418542981, "rewards/rejected": -5.86748355999589e-05, "step": 114 }, { "epoch": 0.07011126352690139, "grad_norm": 64.48741151262199, "learning_rate": 4.697560975609756e-09, "logits/chosen": -0.1840996891260147, "logits/rejected": 0.4118211567401886, "logps/chosen": -155.63963317871094, "logps/rejected": -120.42257690429688, "loss": 0.6925, "rewards/accuracies": 0.5, "rewards/chosen": 0.002329915761947632, "rewards/margins": -0.00041795382276177406, "rewards/rejected": 0.002747869584709406, "step": 115 }, { "epoch": 0.07072092668800488, "grad_norm": 80.92793766737928, "learning_rate": 4.741463414634147e-09, "logits/chosen": 0.08726423978805542, "logits/rejected": 0.03326858952641487, "logps/chosen": -66.10111236572266, "logps/rejected": -58.67345428466797, "loss": 0.6932, "rewards/accuracies": 0.5, "rewards/chosen": 0.0038454767782241106, "rewards/margins": 0.0025439260061830282, "rewards/rejected": 0.0013015507720410824, "step": 116 }, { "epoch": 0.07133058984910837, "grad_norm": 76.67413964771319, "learning_rate": 4.785365853658537e-09, "logits/chosen": 0.10839516669511795, "logits/rejected": 0.38721898198127747, "logps/chosen": -270.4200134277344, "logps/rejected": -179.22666931152344, "loss": 0.6933, "rewards/accuracies": 0.5, "rewards/chosen": 8.792884182184935e-05, "rewards/margins": 0.005356276407837868, "rewards/rejected": -0.0052683474496006966, "step": 117 }, { "epoch": 0.07194025301021185, "grad_norm": 73.66752467833105, "learning_rate": 4.829268292682927e-09, "logits/chosen": 0.2082112729549408, "logits/rejected": 0.151906818151474, "logps/chosen": -99.91865539550781, "logps/rejected": -81.91474914550781, "loss": 0.6928, "rewards/accuracies": 0.5, "rewards/chosen": 0.0017058372031897306, "rewards/margins": 0.0025429727975279093, "rewards/rejected": -0.000837135361507535, "step": 118 }, { "epoch": 0.07254991617131534, "grad_norm": 77.3103028937938, "learning_rate": 4.8731707317073175e-09, "logits/chosen": 0.08812177926301956, "logits/rejected": 0.5891497731208801, "logps/chosen": -364.0269470214844, "logps/rejected": -34.905548095703125, "loss": 0.6923, "rewards/accuracies": 0.25, "rewards/chosen": -0.00534291286021471, "rewards/margins": -0.004794662352651358, "rewards/rejected": -0.000548250915016979, "step": 119 }, { "epoch": 0.07315957933241884, "grad_norm": 77.47933217400289, "learning_rate": 4.9170731707317075e-09, "logits/chosen": 0.050755493342876434, "logits/rejected": 0.010417714715003967, "logps/chosen": -141.3895721435547, "logps/rejected": -132.2732391357422, "loss": 0.6932, "rewards/accuracies": 0.5, "rewards/chosen": -0.000584030116442591, "rewards/margins": -0.00037059775786474347, "rewards/rejected": -0.00021343230037018657, "step": 120 }, { "epoch": 0.07376924249352233, "grad_norm": 74.72684436968582, "learning_rate": 4.9609756097560976e-09, "logits/chosen": -0.2643606960773468, "logits/rejected": 0.16526812314987183, "logps/chosen": -260.1291198730469, "logps/rejected": -121.6513442993164, "loss": 0.6925, "rewards/accuracies": 1.0, "rewards/chosen": 0.004811191465705633, "rewards/margins": 0.006060314364731312, "rewards/rejected": -0.001249122666195035, "step": 121 }, { "epoch": 0.07437890565462582, "grad_norm": 82.1357479983591, "learning_rate": 5.004878048780488e-09, "logits/chosen": 0.14291734993457794, "logits/rejected": 0.057670608162879944, "logps/chosen": -197.93487548828125, "logps/rejected": -234.44915771484375, "loss": 0.6928, "rewards/accuracies": 0.5, "rewards/chosen": -0.0030743121169507504, "rewards/margins": -0.0021574492566287518, "rewards/rejected": -0.0009168625110760331, "step": 122 }, { "epoch": 0.07498856881572931, "grad_norm": 68.83068517187307, "learning_rate": 5.0487804878048785e-09, "logits/chosen": 0.2741588056087494, "logits/rejected": 0.31478196382522583, "logps/chosen": -116.69654083251953, "logps/rejected": -53.103763580322266, "loss": 0.6943, "rewards/accuracies": 0.75, "rewards/chosen": -0.0025397776626050472, "rewards/margins": -0.0028411983512341976, "rewards/rejected": 0.0003014207468368113, "step": 123 }, { "epoch": 0.0755982319768328, "grad_norm": 73.28815100450528, "learning_rate": 5.092682926829268e-09, "logits/chosen": 0.20874908566474915, "logits/rejected": 0.17765402793884277, "logps/chosen": -114.76847839355469, "logps/rejected": -147.07997131347656, "loss": 0.6928, "rewards/accuracies": 0.5, "rewards/chosen": 0.0049163103103637695, "rewards/margins": -0.0006764174322597682, "rewards/rejected": 0.005592728033661842, "step": 124 }, { "epoch": 0.07620789513793629, "grad_norm": 65.55323005492332, "learning_rate": 5.136585365853658e-09, "logits/chosen": 0.33819684386253357, "logits/rejected": 0.3356032371520996, "logps/chosen": -100.74069213867188, "logps/rejected": -189.4708709716797, "loss": 0.6937, "rewards/accuracies": 0.5, "rewards/chosen": 1.3112439773976803e-06, "rewards/margins": -0.0010233878856524825, "rewards/rejected": 0.001024699187837541, "step": 125 }, { "epoch": 0.07681755829903979, "grad_norm": 70.86724977666721, "learning_rate": 5.180487804878048e-09, "logits/chosen": 0.0392058864235878, "logits/rejected": 0.03747990354895592, "logps/chosen": -296.35845947265625, "logps/rejected": -205.3852081298828, "loss": 0.692, "rewards/accuracies": 0.5, "rewards/chosen": -0.0004379808669909835, "rewards/margins": -0.0004094182513654232, "rewards/rejected": -2.8562499210238457e-05, "step": 126 }, { "epoch": 0.07742722146014328, "grad_norm": 65.61722606001547, "learning_rate": 5.224390243902439e-09, "logits/chosen": 0.04799790680408478, "logits/rejected": 0.09802756458520889, "logps/chosen": -101.85343933105469, "logps/rejected": -75.31245422363281, "loss": 0.6929, "rewards/accuracies": 0.5, "rewards/chosen": -0.001424169517122209, "rewards/margins": -0.000865292502567172, "rewards/rejected": -0.0005588769563473761, "step": 127 }, { "epoch": 0.07803688462124676, "grad_norm": 68.60395561720807, "learning_rate": 5.268292682926829e-09, "logits/chosen": 0.017364241182804108, "logits/rejected": 0.03964319825172424, "logps/chosen": -135.31761169433594, "logps/rejected": -177.23977661132812, "loss": 0.6933, "rewards/accuracies": 0.25, "rewards/chosen": -0.002281582448631525, "rewards/margins": -0.0005889536114409566, "rewards/rejected": -0.0016926288371905684, "step": 128 }, { "epoch": 0.07864654778235025, "grad_norm": 65.65690014360091, "learning_rate": 5.312195121951219e-09, "logits/chosen": -0.0188586488366127, "logits/rejected": 0.1370946168899536, "logps/chosen": -174.27993774414062, "logps/rejected": -133.8212890625, "loss": 0.6928, "rewards/accuracies": 1.0, "rewards/chosen": 0.002814221428707242, "rewards/margins": 0.00708196172490716, "rewards/rejected": -0.004267740063369274, "step": 129 }, { "epoch": 0.07925621094345374, "grad_norm": 74.75775309701856, "learning_rate": 5.3560975609756095e-09, "logits/chosen": 0.14698414504528046, "logits/rejected": 0.13047370314598083, "logps/chosen": -182.12315368652344, "logps/rejected": -196.8042449951172, "loss": 0.693, "rewards/accuracies": 0.5, "rewards/chosen": 0.0017709494568407536, "rewards/margins": 0.0008408309076912701, "rewards/rejected": 0.0009301184909418225, "step": 130 }, { "epoch": 0.07986587410455723, "grad_norm": 72.45493881357304, "learning_rate": 5.3999999999999996e-09, "logits/chosen": 0.010124213993549347, "logits/rejected": 0.11211246252059937, "logps/chosen": -154.32940673828125, "logps/rejected": -111.28329467773438, "loss": 0.6927, "rewards/accuracies": 0.75, "rewards/chosen": -0.00022742748842574656, "rewards/margins": 0.0015665411483496428, "rewards/rejected": -0.0017939688405022025, "step": 131 }, { "epoch": 0.08047553726566073, "grad_norm": 82.5594881924206, "learning_rate": 5.4439024390243896e-09, "logits/chosen": 0.394523561000824, "logits/rejected": 0.3098101019859314, "logps/chosen": -58.86858367919922, "logps/rejected": -73.11603546142578, "loss": 0.6916, "rewards/accuracies": 0.25, "rewards/chosen": 0.0026440678630024195, "rewards/margins": 0.000991898705251515, "rewards/rejected": 0.0016521692741662264, "step": 132 }, { "epoch": 0.08108520042676422, "grad_norm": 73.74422964095989, "learning_rate": 5.4878048780487804e-09, "logits/chosen": -0.2440057098865509, "logits/rejected": 0.08008784800767899, "logps/chosen": -359.561279296875, "logps/rejected": -144.55764770507812, "loss": 0.6928, "rewards/accuracies": 0.5, "rewards/chosen": -0.0005802393425256014, "rewards/margins": -0.0014970778720453382, "rewards/rejected": 0.0009168386459350586, "step": 133 }, { "epoch": 0.0816948635878677, "grad_norm": 77.62821526350095, "learning_rate": 5.5317073170731705e-09, "logits/chosen": 0.06320123374462128, "logits/rejected": 0.08124424517154694, "logps/chosen": -185.04531860351562, "logps/rejected": -171.1767120361328, "loss": 0.6912, "rewards/accuracies": 0.5, "rewards/chosen": 0.004624283406883478, "rewards/margins": 0.00328406086191535, "rewards/rejected": 0.0013402223121374846, "step": 134 }, { "epoch": 0.0823045267489712, "grad_norm": 63.982325955375515, "learning_rate": 5.5756097560975605e-09, "logits/chosen": 0.12405374646186829, "logits/rejected": 0.23707516491413116, "logps/chosen": -147.10157775878906, "logps/rejected": -37.86238098144531, "loss": 0.6935, "rewards/accuracies": 0.75, "rewards/chosen": 0.0051132082007825375, "rewards/margins": 0.0039332155138254166, "rewards/rejected": 0.0011799931526184082, "step": 135 }, { "epoch": 0.08291418991007468, "grad_norm": 67.87599102837838, "learning_rate": 5.6195121951219505e-09, "logits/chosen": 0.3629865348339081, "logits/rejected": 0.41198331117630005, "logps/chosen": -37.38160705566406, "logps/rejected": -61.17850875854492, "loss": 0.6925, "rewards/accuracies": 0.25, "rewards/chosen": -0.0010821342002600431, "rewards/margins": -0.002665102481842041, "rewards/rejected": 0.001582968165166676, "step": 136 }, { "epoch": 0.08352385307117817, "grad_norm": 73.39680498164013, "learning_rate": 5.663414634146341e-09, "logits/chosen": -0.25749722123146057, "logits/rejected": -0.06635760515928268, "logps/chosen": -158.20542907714844, "logps/rejected": -147.95916748046875, "loss": 0.6926, "rewards/accuracies": 0.25, "rewards/chosen": -0.0005931377527303994, "rewards/margins": -0.0009785653091967106, "rewards/rejected": 0.0003854274982586503, "step": 137 }, { "epoch": 0.08413351623228166, "grad_norm": 74.89745405053004, "learning_rate": 5.7073170731707314e-09, "logits/chosen": 0.10234642773866653, "logits/rejected": 0.13057830929756165, "logps/chosen": -190.80194091796875, "logps/rejected": -184.14112854003906, "loss": 0.6931, "rewards/accuracies": 1.0, "rewards/chosen": 0.006725311279296875, "rewards/margins": 0.00970449484884739, "rewards/rejected": -0.002979183103889227, "step": 138 }, { "epoch": 0.08474317939338516, "grad_norm": 71.86528735607519, "learning_rate": 5.7512195121951215e-09, "logits/chosen": 0.18370205163955688, "logits/rejected": 0.4245617389678955, "logps/chosen": -107.59567260742188, "logps/rejected": -62.562522888183594, "loss": 0.6935, "rewards/accuracies": 0.5, "rewards/chosen": 0.0008191107772290707, "rewards/margins": -0.002900493098422885, "rewards/rejected": 0.0037196041084825993, "step": 139 }, { "epoch": 0.08535284255448865, "grad_norm": 67.80064759232623, "learning_rate": 5.795121951219512e-09, "logits/chosen": 0.1179521232843399, "logits/rejected": 0.17211908102035522, "logps/chosen": -76.15290832519531, "logps/rejected": -69.85437774658203, "loss": 0.6916, "rewards/accuracies": 0.5, "rewards/chosen": 0.00045137398410588503, "rewards/margins": 0.0019128681160509586, "rewards/rejected": -0.0014614940155297518, "step": 140 }, { "epoch": 0.08596250571559214, "grad_norm": 78.07775502978046, "learning_rate": 5.839024390243902e-09, "logits/chosen": 0.20504337549209595, "logits/rejected": -0.07019396126270294, "logps/chosen": -110.20990753173828, "logps/rejected": -154.95419311523438, "loss": 0.693, "rewards/accuracies": 0.75, "rewards/chosen": -0.0011904718121513724, "rewards/margins": 0.004687226377427578, "rewards/rejected": -0.005877697840332985, "step": 141 }, { "epoch": 0.08657216887669562, "grad_norm": 70.51385967841539, "learning_rate": 5.882926829268292e-09, "logits/chosen": 0.002869613468647003, "logits/rejected": -0.0603582039475441, "logps/chosen": -99.5188980102539, "logps/rejected": -124.32498931884766, "loss": 0.6922, "rewards/accuracies": 1.0, "rewards/chosen": 0.0024773627519607544, "rewards/margins": 0.003924766089767218, "rewards/rejected": -0.0014474033378064632, "step": 142 }, { "epoch": 0.08718183203779911, "grad_norm": 82.46505649910281, "learning_rate": 5.926829268292683e-09, "logits/chosen": 0.196449413895607, "logits/rejected": 0.10419797152280807, "logps/chosen": -182.15365600585938, "logps/rejected": -239.742919921875, "loss": 0.6928, "rewards/accuracies": 0.5, "rewards/chosen": -0.0011578560806810856, "rewards/margins": 0.0012749790912494063, "rewards/rejected": -0.0024328352883458138, "step": 143 }, { "epoch": 0.0877914951989026, "grad_norm": 69.5667664808787, "learning_rate": 5.970731707317073e-09, "logits/chosen": -0.04383482411503792, "logits/rejected": 0.2302844524383545, "logps/chosen": -173.1061553955078, "logps/rejected": -50.136810302734375, "loss": 0.6926, "rewards/accuracies": 0.25, "rewards/chosen": -0.0038678408600389957, "rewards/margins": -0.0033862111158668995, "rewards/rejected": -0.00048162939492613077, "step": 144 }, { "epoch": 0.0884011583600061, "grad_norm": 66.42758158814961, "learning_rate": 6.014634146341463e-09, "logits/chosen": 0.016252242028713226, "logits/rejected": 0.3309721052646637, "logps/chosen": -119.42570495605469, "logps/rejected": -68.9472885131836, "loss": 0.6928, "rewards/accuracies": 0.5, "rewards/chosen": 0.0029084503185003996, "rewards/margins": 0.0022307662293314934, "rewards/rejected": 0.0006776839727535844, "step": 145 }, { "epoch": 0.08901082152110959, "grad_norm": 70.44646364581894, "learning_rate": 6.058536585365853e-09, "logits/chosen": 0.46475750207901, "logits/rejected": 0.8950963616371155, "logps/chosen": -223.4705810546875, "logps/rejected": -428.9252624511719, "loss": 0.6933, "rewards/accuracies": 0.25, "rewards/chosen": -0.00443327147513628, "rewards/margins": -0.006467171013355255, "rewards/rejected": 0.0020338997710496187, "step": 146 }, { "epoch": 0.08962048468221308, "grad_norm": 58.04813314381115, "learning_rate": 6.102439024390244e-09, "logits/chosen": 0.05219703167676926, "logits/rejected": 0.3147392272949219, "logps/chosen": -242.02186584472656, "logps/rejected": -111.68846130371094, "loss": 0.6933, "rewards/accuracies": 0.5, "rewards/chosen": 0.001748537877574563, "rewards/margins": -0.0019744159653782845, "rewards/rejected": 0.003722954075783491, "step": 147 }, { "epoch": 0.09023014784331657, "grad_norm": 60.55357991712482, "learning_rate": 6.146341463414634e-09, "logits/chosen": 0.22557038068771362, "logits/rejected": 0.08623065799474716, "logps/chosen": -142.86192321777344, "logps/rejected": -235.4132080078125, "loss": 0.6924, "rewards/accuracies": 0.5, "rewards/chosen": 0.0005093335639685392, "rewards/margins": -0.0008229495724663138, "rewards/rejected": 0.0013322830200195312, "step": 148 }, { "epoch": 0.09083981100442005, "grad_norm": 72.04225145996718, "learning_rate": 6.1902439024390234e-09, "logits/chosen": 0.1499841958284378, "logits/rejected": 0.23211584985256195, "logps/chosen": -178.3646697998047, "logps/rejected": -113.13501739501953, "loss": 0.6925, "rewards/accuracies": 0.5, "rewards/chosen": 0.003059941343963146, "rewards/margins": 0.003623262047767639, "rewards/rejected": -0.0005633204709738493, "step": 149 }, { "epoch": 0.09144947416552354, "grad_norm": 78.80313031497876, "learning_rate": 6.2341463414634135e-09, "logits/chosen": -0.026065614074468613, "logits/rejected": -0.22232206165790558, "logps/chosen": -144.16940307617188, "logps/rejected": -271.5802917480469, "loss": 0.6944, "rewards/accuracies": 0.5, "rewards/chosen": -0.001899433322250843, "rewards/margins": -0.0005844119004905224, "rewards/rejected": -0.0013150214217603207, "step": 150 }, { "epoch": 0.09205913732662704, "grad_norm": 69.56302623595346, "learning_rate": 6.278048780487804e-09, "logits/chosen": 0.43501123785972595, "logits/rejected": 0.41309887170791626, "logps/chosen": -138.13449096679688, "logps/rejected": -198.89093017578125, "loss": 0.6915, "rewards/accuracies": 0.75, "rewards/chosen": 0.005390692036598921, "rewards/margins": 0.00937967374920845, "rewards/rejected": -0.003988981246948242, "step": 151 }, { "epoch": 0.09266880048773053, "grad_norm": 72.69477403333501, "learning_rate": 6.321951219512194e-09, "logits/chosen": 0.011581763625144958, "logits/rejected": 0.02895890176296234, "logps/chosen": -29.926578521728516, "logps/rejected": -13.040694236755371, "loss": 0.693, "rewards/accuracies": 0.5, "rewards/chosen": -0.004392695613205433, "rewards/margins": -0.004276537802070379, "rewards/rejected": -0.00011615746188908815, "step": 152 }, { "epoch": 0.09327846364883402, "grad_norm": 69.08178656192919, "learning_rate": 6.365853658536584e-09, "logits/chosen": -0.04695054888725281, "logits/rejected": -0.03760403022170067, "logps/chosen": -139.91473388671875, "logps/rejected": -96.32427215576172, "loss": 0.6914, "rewards/accuracies": 0.5, "rewards/chosen": 0.00232369895093143, "rewards/margins": -0.0003409802448004484, "rewards/rejected": 0.0026646791957318783, "step": 153 }, { "epoch": 0.09388812680993751, "grad_norm": 61.070245240120485, "learning_rate": 6.409756097560975e-09, "logits/chosen": 0.004546787589788437, "logits/rejected": -0.0730147734284401, "logps/chosen": -210.40216064453125, "logps/rejected": -171.3146209716797, "loss": 0.6939, "rewards/accuracies": 0.5, "rewards/chosen": 0.002398204756900668, "rewards/margins": -0.0008605123148299754, "rewards/rejected": 0.0032587170135229826, "step": 154 }, { "epoch": 0.094497789971041, "grad_norm": 99.83659075422874, "learning_rate": 6.453658536585365e-09, "logits/chosen": 0.11244277656078339, "logits/rejected": 0.44888627529144287, "logps/chosen": -33.60845184326172, "logps/rejected": -41.175819396972656, "loss": 0.6934, "rewards/accuracies": 0.5, "rewards/chosen": 0.0019771575462073088, "rewards/margins": -5.798344500362873e-05, "rewards/rejected": 0.0020351409912109375, "step": 155 }, { "epoch": 0.09510745313214448, "grad_norm": 62.447112753666794, "learning_rate": 6.497560975609755e-09, "logits/chosen": -0.13742254674434662, "logits/rejected": 0.10493803769350052, "logps/chosen": -252.29299926757812, "logps/rejected": -146.0027313232422, "loss": 0.6929, "rewards/accuracies": 0.25, "rewards/chosen": 0.0014365673996508121, "rewards/margins": -0.0029184818267822266, "rewards/rejected": 0.004355049226433039, "step": 156 }, { "epoch": 0.09571711629324799, "grad_norm": 70.99603983614955, "learning_rate": 6.541463414634146e-09, "logits/chosen": -0.005733788013458252, "logits/rejected": 0.255418598651886, "logps/chosen": -129.1565399169922, "logps/rejected": -92.6162109375, "loss": 0.6942, "rewards/accuracies": 0.5, "rewards/chosen": -0.000543901405762881, "rewards/margins": -0.0035296266432851553, "rewards/rejected": 0.0029857249464839697, "step": 157 }, { "epoch": 0.09632677945435147, "grad_norm": 74.24357293276846, "learning_rate": 6.585365853658536e-09, "logits/chosen": -0.08054069429636002, "logits/rejected": 0.06550734490156174, "logps/chosen": -320.4570007324219, "logps/rejected": -186.04795837402344, "loss": 0.6934, "rewards/accuracies": 0.25, "rewards/chosen": 0.002433204557746649, "rewards/margins": 0.0002149580977857113, "rewards/rejected": 0.0022182464599609375, "step": 158 }, { "epoch": 0.09693644261545496, "grad_norm": 81.33426491489708, "learning_rate": 6.629268292682926e-09, "logits/chosen": 0.1674249768257141, "logits/rejected": 0.13584084808826447, "logps/chosen": -269.9678955078125, "logps/rejected": -225.60061645507812, "loss": 0.6926, "rewards/accuracies": 0.5, "rewards/chosen": 0.007037067785859108, "rewards/margins": 0.006308174692094326, "rewards/rejected": 0.000728893093764782, "step": 159 }, { "epoch": 0.09754610577655845, "grad_norm": 59.87214960114537, "learning_rate": 6.673170731707316e-09, "logits/chosen": 0.1393468827009201, "logits/rejected": 0.12032058835029602, "logps/chosen": -60.78736877441406, "logps/rejected": -65.82872009277344, "loss": 0.6937, "rewards/accuracies": 0.75, "rewards/chosen": 0.002607786562293768, "rewards/margins": 0.0034523606300354004, "rewards/rejected": -0.0008445740677416325, "step": 160 }, { "epoch": 0.09815576893766194, "grad_norm": 85.93623654558834, "learning_rate": 6.717073170731707e-09, "logits/chosen": 0.19709917902946472, "logits/rejected": 0.25526177883148193, "logps/chosen": -57.56209182739258, "logps/rejected": -48.53240966796875, "loss": 0.6917, "rewards/accuracies": 0.25, "rewards/chosen": 0.0026808977127075195, "rewards/margins": -0.0017124293372035027, "rewards/rejected": 0.004393327049911022, "step": 161 }, { "epoch": 0.09876543209876543, "grad_norm": 67.99786555755102, "learning_rate": 6.760975609756097e-09, "logits/chosen": 0.028237100690603256, "logits/rejected": 0.14847011864185333, "logps/chosen": -177.81239318847656, "logps/rejected": -161.88641357421875, "loss": 0.6927, "rewards/accuracies": 0.75, "rewards/chosen": 0.001543379039503634, "rewards/margins": 0.0066079264506697655, "rewards/rejected": -0.005064547061920166, "step": 162 }, { "epoch": 0.09937509525986893, "grad_norm": 78.68252640793447, "learning_rate": 6.804878048780487e-09, "logits/chosen": 0.19014248251914978, "logits/rejected": 0.034527841955423355, "logps/chosen": -70.37176513671875, "logps/rejected": -87.45320892333984, "loss": 0.6906, "rewards/accuracies": 0.25, "rewards/chosen": 0.004232907667756081, "rewards/margins": 0.0012796358205378056, "rewards/rejected": 0.0029532716143876314, "step": 163 }, { "epoch": 0.09998475842097242, "grad_norm": 69.75392929489868, "learning_rate": 6.848780487804878e-09, "logits/chosen": 0.07758036255836487, "logits/rejected": 0.15910190343856812, "logps/chosen": -259.58648681640625, "logps/rejected": -281.25311279296875, "loss": 0.6933, "rewards/accuracies": 0.25, "rewards/chosen": 0.0008043288835324347, "rewards/margins": -0.007910936139523983, "rewards/rejected": 0.008715265430510044, "step": 164 }, { "epoch": 0.1005944215820759, "grad_norm": 80.0527512833749, "learning_rate": 6.892682926829268e-09, "logits/chosen": -0.204082190990448, "logits/rejected": 0.2399766445159912, "logps/chosen": -86.7333984375, "logps/rejected": -35.39960479736328, "loss": 0.6913, "rewards/accuracies": 0.75, "rewards/chosen": 0.001207971596159041, "rewards/margins": 0.0016857744194567204, "rewards/rejected": -0.0004778027650900185, "step": 165 }, { "epoch": 0.10120408474317939, "grad_norm": 79.6637012053017, "learning_rate": 6.936585365853658e-09, "logits/chosen": 0.17938564717769623, "logits/rejected": 0.24092160165309906, "logps/chosen": -99.06732177734375, "logps/rejected": -89.8432846069336, "loss": 0.6927, "rewards/accuracies": 0.75, "rewards/chosen": 0.001825571060180664, "rewards/margins": 0.0024854838848114014, "rewards/rejected": -0.0006599128828383982, "step": 166 }, { "epoch": 0.10181374790428288, "grad_norm": 72.41252861114882, "learning_rate": 6.980487804878049e-09, "logits/chosen": 0.023349490016698837, "logits/rejected": 0.1881551891565323, "logps/chosen": -118.31248474121094, "logps/rejected": -98.12593078613281, "loss": 0.6915, "rewards/accuracies": 0.5, "rewards/chosen": 0.002156746806576848, "rewards/margins": -0.0027242780197411776, "rewards/rejected": 0.004881024360656738, "step": 167 }, { "epoch": 0.10242341106538637, "grad_norm": 68.56858420299291, "learning_rate": 7.024390243902439e-09, "logits/chosen": 0.016032271087169647, "logits/rejected": 0.11249391734600067, "logps/chosen": -323.5757141113281, "logps/rejected": -123.6161880493164, "loss": 0.6923, "rewards/accuracies": 0.5, "rewards/chosen": -0.003967964556068182, "rewards/margins": -0.0001054612803272903, "rewards/rejected": -0.003862503683194518, "step": 168 }, { "epoch": 0.10303307422648987, "grad_norm": 81.05255215869212, "learning_rate": 7.068292682926829e-09, "logits/chosen": 0.08069111406803131, "logits/rejected": -0.07778602838516235, "logps/chosen": -168.92613220214844, "logps/rejected": -252.23606872558594, "loss": 0.6924, "rewards/accuracies": 1.0, "rewards/chosen": 0.0032866718247532845, "rewards/margins": 0.005971884820610285, "rewards/rejected": -0.0026852129958570004, "step": 169 }, { "epoch": 0.10364273738759336, "grad_norm": 81.7644402967527, "learning_rate": 7.112195121951219e-09, "logits/chosen": 0.31781864166259766, "logits/rejected": 0.14330296218395233, "logps/chosen": -367.62811279296875, "logps/rejected": -239.89463806152344, "loss": 0.6895, "rewards/accuracies": 1.0, "rewards/chosen": 0.013696718961000443, "rewards/margins": 0.01696225255727768, "rewards/rejected": -0.003265536157414317, "step": 170 }, { "epoch": 0.10425240054869685, "grad_norm": 67.91609096551838, "learning_rate": 7.15609756097561e-09, "logits/chosen": 0.18671970069408417, "logits/rejected": 0.16253669559955597, "logps/chosen": -6.201991081237793, "logps/rejected": -20.30820083618164, "loss": 0.6919, "rewards/accuracies": 0.25, "rewards/chosen": -0.002145099686458707, "rewards/margins": -0.0015753626357764006, "rewards/rejected": -0.0005697369924746454, "step": 171 }, { "epoch": 0.10486206370980033, "grad_norm": 78.16797384812574, "learning_rate": 7.2e-09, "logits/chosen": 0.176150843501091, "logits/rejected": 0.14651542901992798, "logps/chosen": -247.4420166015625, "logps/rejected": -293.2618713378906, "loss": 0.6919, "rewards/accuracies": 0.75, "rewards/chosen": 0.008729219436645508, "rewards/margins": 0.007733630947768688, "rewards/rejected": 0.000995588256046176, "step": 172 }, { "epoch": 0.10547172687090382, "grad_norm": 57.67765206609665, "learning_rate": 7.24390243902439e-09, "logits/chosen": 0.18694573640823364, "logits/rejected": 0.30927199125289917, "logps/chosen": -194.29551696777344, "logps/rejected": -198.8756561279297, "loss": 0.6924, "rewards/accuracies": 0.0, "rewards/chosen": -0.001628529978916049, "rewards/margins": -0.0044615985825657845, "rewards/rejected": 0.002833068370819092, "step": 173 }, { "epoch": 0.10608139003200731, "grad_norm": 80.57272427404622, "learning_rate": 7.287804878048781e-09, "logits/chosen": -0.45078766345977783, "logits/rejected": 0.4929141104221344, "logps/chosen": -458.40496826171875, "logps/rejected": -97.18772888183594, "loss": 0.6906, "rewards/accuracies": 0.5, "rewards/chosen": 0.008907699957489967, "rewards/margins": 0.009705161675810814, "rewards/rejected": -0.0007974625332280993, "step": 174 }, { "epoch": 0.10669105319311081, "grad_norm": 80.57272427404622, "learning_rate": 7.287804878048781e-09, "logits/chosen": 0.12999090552330017, "logits/rejected": -0.07178585976362228, "logps/chosen": -126.14864349365234, "logps/rejected": -138.47508239746094, "loss": 0.6942, "rewards/accuracies": 0.5, "rewards/chosen": 0.010566807352006435, "rewards/margins": 0.009783018380403519, "rewards/rejected": 0.0007837892626412213, "step": 175 }, { "epoch": 0.1073007163542143, "grad_norm": 82.43150494799481, "learning_rate": 7.33170731707317e-09, "logits/chosen": 0.1309414952993393, "logits/rejected": 0.15505224466323853, "logps/chosen": -155.169677734375, "logps/rejected": -143.5417022705078, "loss": 0.6909, "rewards/accuracies": 1.0, "rewards/chosen": 0.013601267710328102, "rewards/margins": 0.007817518897354603, "rewards/rejected": 0.005783748812973499, "step": 176 }, { "epoch": 0.10791037951531779, "grad_norm": 88.0310517626559, "learning_rate": 7.37560975609756e-09, "logits/chosen": 0.1322554349899292, "logits/rejected": 0.28939059376716614, "logps/chosen": -421.849609375, "logps/rejected": -345.86822509765625, "loss": 0.6923, "rewards/accuracies": 0.75, "rewards/chosen": 0.01242215745151043, "rewards/margins": 0.003427768126130104, "rewards/rejected": 0.00899438839405775, "step": 177 }, { "epoch": 0.10852004267642128, "grad_norm": 92.50555215779028, "learning_rate": 7.41951219512195e-09, "logits/chosen": 0.20377972722053528, "logits/rejected": 0.1981021612882614, "logps/chosen": -100.24634552001953, "logps/rejected": -227.90379333496094, "loss": 0.6928, "rewards/accuracies": 0.75, "rewards/chosen": 0.0025395273696631193, "rewards/margins": 0.005182635970413685, "rewards/rejected": -0.002643108367919922, "step": 178 }, { "epoch": 0.10912970583752477, "grad_norm": 70.63263622932259, "learning_rate": 7.46341463414634e-09, "logits/chosen": 0.09560803323984146, "logits/rejected": -0.16471828520298004, "logps/chosen": -184.5498046875, "logps/rejected": -421.5089111328125, "loss": 0.6932, "rewards/accuracies": 0.5, "rewards/chosen": 0.0006469726795330644, "rewards/margins": 0.002256846521049738, "rewards/rejected": -0.0016098737251013517, "step": 179 }, { "epoch": 0.10973936899862825, "grad_norm": 81.22199051486554, "learning_rate": 7.50731707317073e-09, "logits/chosen": 0.05242873355746269, "logits/rejected": 0.1468435525894165, "logps/chosen": -474.50384521484375, "logps/rejected": -392.1907043457031, "loss": 0.6911, "rewards/accuracies": 0.75, "rewards/chosen": 0.0184478759765625, "rewards/margins": 0.021483995020389557, "rewards/rejected": -0.003036117646843195, "step": 180 }, { "epoch": 0.11034903215973174, "grad_norm": 77.14185791783366, "learning_rate": 7.551219512195122e-09, "logits/chosen": 0.2575138807296753, "logits/rejected": 0.28635162115097046, "logps/chosen": -116.87586975097656, "logps/rejected": -69.42999267578125, "loss": 0.6917, "rewards/accuracies": 0.5, "rewards/chosen": 0.0032803399953991175, "rewards/margins": 0.0046702162362635136, "rewards/rejected": -0.0013898760080337524, "step": 181 }, { "epoch": 0.11095869532083524, "grad_norm": 61.14794911738017, "learning_rate": 7.595121951219512e-09, "logits/chosen": 0.11099124699831009, "logits/rejected": 0.13927927613258362, "logps/chosen": -31.17013931274414, "logps/rejected": -67.29956817626953, "loss": 0.6942, "rewards/accuracies": 0.25, "rewards/chosen": 0.0005793154705315828, "rewards/margins": -0.0017756938468664885, "rewards/rejected": 0.0023550093173980713, "step": 182 }, { "epoch": 0.11156835848193873, "grad_norm": 68.58252525337726, "learning_rate": 7.639024390243902e-09, "logits/chosen": 0.20379912853240967, "logits/rejected": 0.2304908186197281, "logps/chosen": -44.79044723510742, "logps/rejected": -54.05611038208008, "loss": 0.6914, "rewards/accuracies": 0.25, "rewards/chosen": -0.0029021562077105045, "rewards/margins": -0.004501253366470337, "rewards/rejected": 0.001599097391590476, "step": 183 }, { "epoch": 0.11217802164304222, "grad_norm": 79.38044931815365, "learning_rate": 7.682926829268292e-09, "logits/chosen": 0.0941167026758194, "logits/rejected": 0.15919072926044464, "logps/chosen": -247.39495849609375, "logps/rejected": -171.2981414794922, "loss": 0.692, "rewards/accuracies": 0.75, "rewards/chosen": 0.005813932977616787, "rewards/margins": 0.006898021325469017, "rewards/rejected": -0.0010840892791748047, "step": 184 }, { "epoch": 0.11278768480414571, "grad_norm": 76.2796090328514, "learning_rate": 7.726829268292682e-09, "logits/chosen": 0.05305987223982811, "logits/rejected": -0.08095654845237732, "logps/chosen": -229.75924682617188, "logps/rejected": -258.1326599121094, "loss": 0.6911, "rewards/accuracies": 0.75, "rewards/chosen": 0.01245895680040121, "rewards/margins": 0.009998410940170288, "rewards/rejected": 0.0024605453945696354, "step": 185 }, { "epoch": 0.1133973479652492, "grad_norm": 63.95901673960713, "learning_rate": 7.770731707317072e-09, "logits/chosen": 0.19521519541740417, "logits/rejected": 0.05974021553993225, "logps/chosen": -266.0050048828125, "logps/rejected": -517.4422607421875, "loss": 0.6928, "rewards/accuracies": 0.5, "rewards/chosen": 0.012186050415039062, "rewards/margins": 0.0059299468994140625, "rewards/rejected": 0.006256103515625, "step": 186 }, { "epoch": 0.11400701112635268, "grad_norm": 74.72782131859475, "learning_rate": 7.814634146341464e-09, "logits/chosen": -0.06481368094682693, "logits/rejected": -0.05777443200349808, "logps/chosen": -145.55892944335938, "logps/rejected": -40.200660705566406, "loss": 0.6903, "rewards/accuracies": 0.25, "rewards/chosen": 0.003788316622376442, "rewards/margins": 0.001457572216168046, "rewards/rejected": 0.002330744406208396, "step": 187 }, { "epoch": 0.11461667428745619, "grad_norm": 76.91636228373078, "learning_rate": 7.858536585365854e-09, "logits/chosen": 0.11966991424560547, "logits/rejected": 0.14306171238422394, "logps/chosen": -14.93128776550293, "logps/rejected": -19.36440086364746, "loss": 0.6906, "rewards/accuracies": 0.5, "rewards/chosen": -0.0010291219223290682, "rewards/margins": -0.0011190533405169845, "rewards/rejected": 8.99315346032381e-05, "step": 188 }, { "epoch": 0.11522633744855967, "grad_norm": 57.596654535367165, "learning_rate": 7.902439024390244e-09, "logits/chosen": -0.074323371052742, "logits/rejected": 0.05753961205482483, "logps/chosen": -77.96446228027344, "logps/rejected": -96.30802154541016, "loss": 0.6936, "rewards/accuracies": 0.25, "rewards/chosen": -0.002775573870167136, "rewards/margins": -0.004966557025909424, "rewards/rejected": 0.0021909833885729313, "step": 189 }, { "epoch": 0.11583600060966316, "grad_norm": 83.12182513412321, "learning_rate": 7.946341463414634e-09, "logits/chosen": 0.26109641790390015, "logits/rejected": 0.2574569284915924, "logps/chosen": -282.251953125, "logps/rejected": -298.4128723144531, "loss": 0.6912, "rewards/accuracies": 1.0, "rewards/chosen": 0.018349360674619675, "rewards/margins": 0.013233971782028675, "rewards/rejected": 0.0051153902895748615, "step": 190 }, { "epoch": 0.11644566377076665, "grad_norm": 77.9532709110735, "learning_rate": 7.990243902439024e-09, "logits/chosen": 0.3206827640533447, "logits/rejected": 0.2922167181968689, "logps/chosen": -110.56204223632812, "logps/rejected": -211.07745361328125, "loss": 0.6932, "rewards/accuracies": 0.5, "rewards/chosen": 0.0025666176807135344, "rewards/margins": 0.0027544558979570866, "rewards/rejected": -0.00018783810082823038, "step": 191 }, { "epoch": 0.11705532693187014, "grad_norm": 74.79594478387207, "learning_rate": 8.034146341463414e-09, "logits/chosen": 0.1410955786705017, "logits/rejected": -0.02253938466310501, "logps/chosen": -267.8140869140625, "logps/rejected": -266.12744140625, "loss": 0.6908, "rewards/accuracies": 0.75, "rewards/chosen": 0.012267494574189186, "rewards/margins": 0.013858987018465996, "rewards/rejected": -0.0015914919786155224, "step": 192 }, { "epoch": 0.11766499009297363, "grad_norm": 78.49673136598805, "learning_rate": 8.078048780487804e-09, "logits/chosen": 0.21787187457084656, "logits/rejected": 0.17934630811214447, "logps/chosen": -239.90481567382812, "logps/rejected": -274.919677734375, "loss": 0.6905, "rewards/accuracies": 0.75, "rewards/chosen": 0.01241397950798273, "rewards/margins": 0.008371520787477493, "rewards/rejected": 0.004042458720505238, "step": 193 }, { "epoch": 0.11827465325407713, "grad_norm": 71.74342228269752, "learning_rate": 8.121951219512196e-09, "logits/chosen": 0.30999499559402466, "logits/rejected": 0.3137747049331665, "logps/chosen": -115.93299865722656, "logps/rejected": -111.77952575683594, "loss": 0.6899, "rewards/accuracies": 1.0, "rewards/chosen": 0.011038887314498425, "rewards/margins": 0.012829184532165527, "rewards/rejected": -0.0017902969848364592, "step": 194 }, { "epoch": 0.11888431641518062, "grad_norm": 84.24460399939697, "learning_rate": 8.165853658536586e-09, "logits/chosen": -0.08364257961511612, "logits/rejected": 0.2403935045003891, "logps/chosen": -165.97039794921875, "logps/rejected": -117.39863586425781, "loss": 0.692, "rewards/accuracies": 0.25, "rewards/chosen": 0.0038569211028516293, "rewards/margins": 0.0012012722436338663, "rewards/rejected": 0.0026556490920484066, "step": 195 }, { "epoch": 0.1194939795762841, "grad_norm": 76.94193710343421, "learning_rate": 8.209756097560976e-09, "logits/chosen": -0.3118302822113037, "logits/rejected": -0.17941252887248993, "logps/chosen": -448.2669677734375, "logps/rejected": -407.0946044921875, "loss": 0.6923, "rewards/accuracies": 0.75, "rewards/chosen": 0.00807888526469469, "rewards/margins": -0.009395670145750046, "rewards/rejected": 0.01747455634176731, "step": 196 }, { "epoch": 0.12010364273738759, "grad_norm": 81.81840784374043, "learning_rate": 8.253658536585366e-09, "logits/chosen": -0.10127736628055573, "logits/rejected": 0.16854047775268555, "logps/chosen": -135.72747802734375, "logps/rejected": -125.63229370117188, "loss": 0.6913, "rewards/accuracies": 0.5, "rewards/chosen": 0.0014416694175451994, "rewards/margins": -0.0011126695899292827, "rewards/rejected": 0.002554339123889804, "step": 197 }, { "epoch": 0.12071330589849108, "grad_norm": 72.36687920211662, "learning_rate": 8.297560975609756e-09, "logits/chosen": 0.22871333360671997, "logits/rejected": 0.2734183371067047, "logps/chosen": -226.57386779785156, "logps/rejected": -166.44577026367188, "loss": 0.6891, "rewards/accuracies": 0.5, "rewards/chosen": 0.006101250648498535, "rewards/margins": 0.008862042799592018, "rewards/rejected": -0.0027607919182628393, "step": 198 }, { "epoch": 0.12132296905959457, "grad_norm": 82.22642889906784, "learning_rate": 8.341463414634146e-09, "logits/chosen": 0.3314269781112671, "logits/rejected": 0.36815717816352844, "logps/chosen": -349.48284912109375, "logps/rejected": -342.3211975097656, "loss": 0.6906, "rewards/accuracies": 1.0, "rewards/chosen": 0.019129181280732155, "rewards/margins": 0.01226501539349556, "rewards/rejected": 0.0068641663528978825, "step": 199 }, { "epoch": 0.12193263222069807, "grad_norm": 72.9650328696923, "learning_rate": 8.385365853658536e-09, "logits/chosen": 0.03564765304327011, "logits/rejected": 0.09759993851184845, "logps/chosen": -160.033935546875, "logps/rejected": -115.61434173583984, "loss": 0.6903, "rewards/accuracies": 0.75, "rewards/chosen": 0.011529779992997646, "rewards/margins": 0.011023235507309437, "rewards/rejected": 0.0005065440782345831, "step": 200 }, { "epoch": 0.12254229538180156, "grad_norm": 64.22348688374149, "learning_rate": 8.429268292682927e-09, "logits/chosen": 0.14622417092323303, "logits/rejected": 0.253497838973999, "logps/chosen": -186.90524291992188, "logps/rejected": -158.447265625, "loss": 0.6908, "rewards/accuracies": 1.0, "rewards/chosen": 0.0100333783775568, "rewards/margins": 0.013532849960029125, "rewards/rejected": -0.0034994720481336117, "step": 201 }, { "epoch": 0.12315195854290505, "grad_norm": 74.52374042440181, "learning_rate": 8.473170731707316e-09, "logits/chosen": 0.054381199181079865, "logits/rejected": -0.04007922112941742, "logps/chosen": -271.0592041015625, "logps/rejected": -200.66111755371094, "loss": 0.689, "rewards/accuracies": 0.5, "rewards/chosen": 0.008767509832978249, "rewards/margins": 0.008909749798476696, "rewards/rejected": -0.00014224054757505655, "step": 202 }, { "epoch": 0.12376162170400853, "grad_norm": 77.76424103790391, "learning_rate": 8.517073170731706e-09, "logits/chosen": -0.15442079305648804, "logits/rejected": -0.15553969144821167, "logps/chosen": -189.68994140625, "logps/rejected": -283.78009033203125, "loss": 0.6925, "rewards/accuracies": 0.5, "rewards/chosen": -0.0001678466214798391, "rewards/margins": -0.0070205689407885075, "rewards/rejected": 0.0068527222611010075, "step": 203 }, { "epoch": 0.12437128486511202, "grad_norm": 80.96444954317272, "learning_rate": 8.560975609756096e-09, "logits/chosen": 0.201796293258667, "logits/rejected": 0.5128307342529297, "logps/chosen": -192.67794799804688, "logps/rejected": -180.17579650878906, "loss": 0.6937, "rewards/accuracies": 1.0, "rewards/chosen": 0.012137127108871937, "rewards/margins": 0.012270909734070301, "rewards/rejected": -0.0001337828580290079, "step": 204 }, { "epoch": 0.12498094802621551, "grad_norm": 75.66251081128826, "learning_rate": 8.604878048780488e-09, "logits/chosen": -0.023175127804279327, "logits/rejected": 0.003958750516176224, "logps/chosen": -184.97940063476562, "logps/rejected": -67.05559539794922, "loss": 0.6908, "rewards/accuracies": 0.5, "rewards/chosen": 0.00750888604670763, "rewards/margins": 0.0070923687890172005, "rewards/rejected": 0.00041651714127510786, "step": 205 }, { "epoch": 0.125590611187319, "grad_norm": 74.05343130345244, "learning_rate": 8.648780487804878e-09, "logits/chosen": 0.23187266290187836, "logits/rejected": 0.21487364172935486, "logps/chosen": -16.77553939819336, "logps/rejected": -22.398555755615234, "loss": 0.6917, "rewards/accuracies": 0.25, "rewards/chosen": -0.00042227510130032897, "rewards/margins": -0.0017372013535350561, "rewards/rejected": 0.0013149260776117444, "step": 206 }, { "epoch": 0.1262002743484225, "grad_norm": 75.21374984890747, "learning_rate": 8.692682926829268e-09, "logits/chosen": -0.06297703087329865, "logits/rejected": 0.03414461761713028, "logps/chosen": -398.39837646484375, "logps/rejected": -352.8562927246094, "loss": 0.6888, "rewards/accuracies": 1.0, "rewards/chosen": 0.011148834601044655, "rewards/margins": 0.014732742682099342, "rewards/rejected": -0.003583908313885331, "step": 207 }, { "epoch": 0.126809937509526, "grad_norm": 64.04142645281772, "learning_rate": 8.736585365853658e-09, "logits/chosen": 0.2422681748867035, "logits/rejected": 0.12101413309574127, "logps/chosen": -161.3976593017578, "logps/rejected": -283.5793762207031, "loss": 0.691, "rewards/accuracies": 0.75, "rewards/chosen": 0.011684561148285866, "rewards/margins": 0.00624473113566637, "rewards/rejected": 0.005439830012619495, "step": 208 }, { "epoch": 0.1274196006706295, "grad_norm": 62.59370826680618, "learning_rate": 8.780487804878048e-09, "logits/chosen": 0.21639201045036316, "logits/rejected": 0.23975257575511932, "logps/chosen": -257.2457275390625, "logps/rejected": -187.59793090820312, "loss": 0.6911, "rewards/accuracies": 0.5, "rewards/chosen": 0.010793114081025124, "rewards/margins": 0.005155419930815697, "rewards/rejected": 0.005637693218886852, "step": 209 }, { "epoch": 0.12802926383173296, "grad_norm": 81.70767471490443, "learning_rate": 8.824390243902438e-09, "logits/chosen": -0.07190696895122528, "logits/rejected": -0.13351590931415558, "logps/chosen": -70.58195495605469, "logps/rejected": -158.46754455566406, "loss": 0.6899, "rewards/accuracies": 0.5, "rewards/chosen": -0.0009778023231774569, "rewards/margins": -0.0018841506680473685, "rewards/rejected": 0.0009063484612852335, "step": 210 }, { "epoch": 0.12863892699283647, "grad_norm": 74.5787702478486, "learning_rate": 8.86829268292683e-09, "logits/chosen": 0.25175967812538147, "logits/rejected": 0.2309994399547577, "logps/chosen": -26.840103149414062, "logps/rejected": -21.15106201171875, "loss": 0.6914, "rewards/accuracies": 0.75, "rewards/chosen": 0.005501949694007635, "rewards/margins": 0.005997526925057173, "rewards/rejected": -0.0004955768818035722, "step": 211 }, { "epoch": 0.12924859015393994, "grad_norm": 73.11146783164249, "learning_rate": 8.91219512195122e-09, "logits/chosen": 0.033776264637708664, "logits/rejected": 0.13960430026054382, "logps/chosen": -218.73910522460938, "logps/rejected": -81.34803771972656, "loss": 0.6916, "rewards/accuracies": 1.0, "rewards/chosen": 0.005093121901154518, "rewards/margins": 0.0035371482372283936, "rewards/rejected": 0.001555973314680159, "step": 212 }, { "epoch": 0.12985825331504344, "grad_norm": 74.17960029850262, "learning_rate": 8.95609756097561e-09, "logits/chosen": 0.32470473647117615, "logits/rejected": 0.3272430896759033, "logps/chosen": -5.314748287200928, "logps/rejected": -3.1400012969970703, "loss": 0.692, "rewards/accuracies": 0.75, "rewards/chosen": 0.0008516312809661031, "rewards/margins": 0.004156092181801796, "rewards/rejected": -0.0033044605515897274, "step": 213 }, { "epoch": 0.13046791647614692, "grad_norm": 72.54347247391986, "learning_rate": 9e-09, "logits/chosen": 0.2663123607635498, "logits/rejected": 0.16501428186893463, "logps/chosen": -150.55316162109375, "logps/rejected": -95.2711410522461, "loss": 0.6905, "rewards/accuracies": 0.5, "rewards/chosen": 0.005064946599304676, "rewards/margins": 0.001300519797950983, "rewards/rejected": 0.0037644270341843367, "step": 214 }, { "epoch": 0.13107757963725042, "grad_norm": 70.75458425042433, "learning_rate": 9.04390243902439e-09, "logits/chosen": 0.39246898889541626, "logits/rejected": -0.19269554316997528, "logps/chosen": -221.33596801757812, "logps/rejected": -500.8580627441406, "loss": 0.6915, "rewards/accuracies": 0.75, "rewards/chosen": 0.018619585782289505, "rewards/margins": 0.009124422445893288, "rewards/rejected": 0.009495163336396217, "step": 215 }, { "epoch": 0.13168724279835392, "grad_norm": 68.26030742798383, "learning_rate": 9.08780487804878e-09, "logits/chosen": -0.08046863973140717, "logits/rejected": 0.18038855493068695, "logps/chosen": -376.5609130859375, "logps/rejected": -144.64633178710938, "loss": 0.6928, "rewards/accuracies": 0.75, "rewards/chosen": 0.015356696210801601, "rewards/margins": 0.010357105173170567, "rewards/rejected": 0.00499959010630846, "step": 216 }, { "epoch": 0.1322969059594574, "grad_norm": 81.05019110481086, "learning_rate": 9.13170731707317e-09, "logits/chosen": 0.08664529770612717, "logits/rejected": 0.09168754518032074, "logps/chosen": -91.68215942382812, "logps/rejected": -56.12030029296875, "loss": 0.6907, "rewards/accuracies": 0.5, "rewards/chosen": 0.014924057759344578, "rewards/margins": 0.012504622340202332, "rewards/rejected": 0.0024194358848035336, "step": 217 }, { "epoch": 0.1329065691205609, "grad_norm": 70.44096712910866, "learning_rate": 9.175609756097561e-09, "logits/chosen": 0.26090556383132935, "logits/rejected": 0.33934906125068665, "logps/chosen": -170.63369750976562, "logps/rejected": -149.51715087890625, "loss": 0.6889, "rewards/accuracies": 0.75, "rewards/chosen": 0.010795742273330688, "rewards/margins": 0.005506640300154686, "rewards/rejected": 0.0052891019731760025, "step": 218 }, { "epoch": 0.13351623228166437, "grad_norm": 70.2700617741892, "learning_rate": 9.219512195121951e-09, "logits/chosen": -0.08490590751171112, "logits/rejected": -0.20087462663650513, "logps/chosen": -52.203590393066406, "logps/rejected": -76.68606567382812, "loss": 0.6888, "rewards/accuracies": 0.5, "rewards/chosen": 0.0032541872933506966, "rewards/margins": 0.0005172848468646407, "rewards/rejected": 0.002736902330070734, "step": 219 }, { "epoch": 0.13412589544276787, "grad_norm": 69.55984465687895, "learning_rate": 9.263414634146341e-09, "logits/chosen": 0.1535046100616455, "logits/rejected": 0.02677365019917488, "logps/chosen": -4.253844261169434, "logps/rejected": -14.466489791870117, "loss": 0.6896, "rewards/accuracies": 0.75, "rewards/chosen": -0.0004813492705579847, "rewards/margins": 0.005597323644906282, "rewards/rejected": -0.006078672595322132, "step": 220 }, { "epoch": 0.13473555860387137, "grad_norm": 85.92535947691584, "learning_rate": 9.307317073170731e-09, "logits/chosen": 0.30106112360954285, "logits/rejected": 0.1480739563703537, "logps/chosen": -85.90007019042969, "logps/rejected": -169.9725341796875, "loss": 0.688, "rewards/accuracies": 0.75, "rewards/chosen": 0.006270409096032381, "rewards/margins": 0.0010327575728297234, "rewards/rejected": 0.00523765105754137, "step": 221 }, { "epoch": 0.13534522176497485, "grad_norm": 119.21977123994778, "learning_rate": 9.351219512195121e-09, "logits/chosen": 0.11533677577972412, "logits/rejected": 0.13379010558128357, "logps/chosen": -149.5064697265625, "logps/rejected": -71.87149810791016, "loss": 0.6913, "rewards/accuracies": 0.25, "rewards/chosen": 0.0037560579366981983, "rewards/margins": -0.0026161072310060263, "rewards/rejected": 0.006372165400534868, "step": 222 }, { "epoch": 0.13595488492607835, "grad_norm": 70.29667315024419, "learning_rate": 9.395121951219511e-09, "logits/chosen": -0.346357524394989, "logits/rejected": 0.3720865845680237, "logps/chosen": -320.56134033203125, "logps/rejected": -285.1332702636719, "loss": 0.6914, "rewards/accuracies": 0.5, "rewards/chosen": 0.014269685372710228, "rewards/margins": -8.902512490749359e-05, "rewards/rejected": 0.014358710497617722, "step": 223 }, { "epoch": 0.13656454808718183, "grad_norm": 72.74974042510391, "learning_rate": 9.439024390243903e-09, "logits/chosen": 0.14322340488433838, "logits/rejected": 0.18459954857826233, "logps/chosen": -69.95736694335938, "logps/rejected": -56.18511199951172, "loss": 0.691, "rewards/accuracies": 0.75, "rewards/chosen": 0.01004111859947443, "rewards/margins": 0.007080426439642906, "rewards/rejected": 0.0029606912285089493, "step": 224 }, { "epoch": 0.13717421124828533, "grad_norm": 72.47288590044104, "learning_rate": 9.482926829268293e-09, "logits/chosen": 0.30410856008529663, "logits/rejected": -0.12342655658721924, "logps/chosen": -218.58901977539062, "logps/rejected": -450.06744384765625, "loss": 0.6915, "rewards/accuracies": 0.5, "rewards/chosen": 0.00634857127442956, "rewards/margins": -0.014498129487037659, "rewards/rejected": 0.02084670029580593, "step": 225 }, { "epoch": 0.1377838744093888, "grad_norm": 72.17461776109012, "learning_rate": 9.526829268292683e-09, "logits/chosen": 0.40045663714408875, "logits/rejected": 0.3636299967765808, "logps/chosen": -69.74336242675781, "logps/rejected": -81.21648406982422, "loss": 0.6902, "rewards/accuracies": 0.75, "rewards/chosen": -0.0014475822681561112, "rewards/margins": 0.00355433183722198, "rewards/rejected": -0.0050019146874547005, "step": 226 }, { "epoch": 0.1383935375704923, "grad_norm": 63.74354424455339, "learning_rate": 9.570731707317073e-09, "logits/chosen": -0.007155582308769226, "logits/rejected": 0.4101467728614807, "logps/chosen": -190.18185424804688, "logps/rejected": -129.58155822753906, "loss": 0.6899, "rewards/accuracies": 0.75, "rewards/chosen": 0.017444992437958717, "rewards/margins": 0.004689312539994717, "rewards/rejected": 0.012755680829286575, "step": 227 }, { "epoch": 0.1390032007315958, "grad_norm": 64.11977457576249, "learning_rate": 9.614634146341463e-09, "logits/chosen": 0.25982144474983215, "logits/rejected": 0.20755594968795776, "logps/chosen": -35.146148681640625, "logps/rejected": -54.77658462524414, "loss": 0.6896, "rewards/accuracies": 0.5, "rewards/chosen": 0.010507357306778431, "rewards/margins": 0.005585014820098877, "rewards/rejected": 0.004922342021018267, "step": 228 }, { "epoch": 0.13961286389269928, "grad_norm": 73.2959257159521, "learning_rate": 9.658536585365853e-09, "logits/chosen": 0.2829352617263794, "logits/rejected": 0.226519376039505, "logps/chosen": -189.8313446044922, "logps/rejected": -381.81146240234375, "loss": 0.69, "rewards/accuracies": 0.5, "rewards/chosen": 0.012041282840073109, "rewards/margins": 0.016209380701184273, "rewards/rejected": -0.004168099258095026, "step": 229 }, { "epoch": 0.14022252705380278, "grad_norm": 68.94961304303615, "learning_rate": 9.702439024390243e-09, "logits/chosen": -0.02386903390288353, "logits/rejected": 0.22455506026744843, "logps/chosen": -252.91159057617188, "logps/rejected": -205.3868408203125, "loss": 0.6891, "rewards/accuracies": 0.5, "rewards/chosen": 0.028557967394590378, "rewards/margins": 0.018563751131296158, "rewards/rejected": 0.009994215331971645, "step": 230 }, { "epoch": 0.14083219021490626, "grad_norm": 78.6604435432664, "learning_rate": 9.746341463414635e-09, "logits/chosen": 0.34106820821762085, "logits/rejected": 0.07739745080471039, "logps/chosen": -122.4163818359375, "logps/rejected": -258.68914794921875, "loss": 0.6894, "rewards/accuracies": 0.5, "rewards/chosen": 0.025600817054510117, "rewards/margins": 0.005098343826830387, "rewards/rejected": 0.020502472296357155, "step": 231 }, { "epoch": 0.14144185337600976, "grad_norm": 70.32022280924876, "learning_rate": 9.790243902439025e-09, "logits/chosen": 0.16174301505088806, "logits/rejected": 0.22027698159217834, "logps/chosen": -77.50466918945312, "logps/rejected": -116.17520141601562, "loss": 0.6897, "rewards/accuracies": 0.25, "rewards/chosen": 0.005060291383415461, "rewards/margins": -0.001702064648270607, "rewards/rejected": 0.00676235556602478, "step": 232 }, { "epoch": 0.14205151653711323, "grad_norm": 73.99688934574431, "learning_rate": 9.834146341463415e-09, "logits/chosen": 0.33195942640304565, "logits/rejected": -0.011777505278587341, "logps/chosen": -150.14395141601562, "logps/rejected": -177.5060272216797, "loss": 0.6888, "rewards/accuracies": 0.5, "rewards/chosen": 0.01235343236476183, "rewards/margins": 0.015944160521030426, "rewards/rejected": -0.0035907269921153784, "step": 233 }, { "epoch": 0.14266117969821673, "grad_norm": 63.849320564913384, "learning_rate": 9.878048780487805e-09, "logits/chosen": 0.10598531365394592, "logits/rejected": 0.12136658281087875, "logps/chosen": -181.38348388671875, "logps/rejected": -69.75995635986328, "loss": 0.6898, "rewards/accuracies": 1.0, "rewards/chosen": 0.020735835656523705, "rewards/margins": 0.015682529658079147, "rewards/rejected": 0.005053305998444557, "step": 234 }, { "epoch": 0.14327084285932024, "grad_norm": 75.28468551677496, "learning_rate": 9.921951219512195e-09, "logits/chosen": 0.35082173347473145, "logits/rejected": 0.3787878453731537, "logps/chosen": -117.42396545410156, "logps/rejected": -79.96060180664062, "loss": 0.6867, "rewards/accuracies": 1.0, "rewards/chosen": 0.0063343290239572525, "rewards/margins": 0.008362340740859509, "rewards/rejected": -0.002028012415394187, "step": 235 }, { "epoch": 0.1438805060204237, "grad_norm": 63.49933420829787, "learning_rate": 9.965853658536585e-09, "logits/chosen": -0.029095180332660675, "logits/rejected": -0.02358836866915226, "logps/chosen": -269.658935546875, "logps/rejected": -184.3779754638672, "loss": 0.6902, "rewards/accuracies": 0.75, "rewards/chosen": 0.028628159314393997, "rewards/margins": 0.009772539138793945, "rewards/rejected": 0.018855620175600052, "step": 236 }, { "epoch": 0.1444901691815272, "grad_norm": 75.31000642223354, "learning_rate": 1.0009756097560975e-08, "logits/chosen": -0.06486168503761292, "logits/rejected": 0.0071268510073423386, "logps/chosen": -299.6549072265625, "logps/rejected": -165.22476196289062, "loss": 0.6885, "rewards/accuracies": 0.75, "rewards/chosen": 0.032582368701696396, "rewards/margins": 0.019990747794508934, "rewards/rejected": 0.012591619044542313, "step": 237 }, { "epoch": 0.14509983234263069, "grad_norm": 68.11793891801905, "learning_rate": 1.0053658536585367e-08, "logits/chosen": -0.018292773514986038, "logits/rejected": 0.06804099678993225, "logps/chosen": -240.0941619873047, "logps/rejected": -300.457275390625, "loss": 0.6896, "rewards/accuracies": 0.5, "rewards/chosen": 0.02616003155708313, "rewards/margins": -0.0042310659773647785, "rewards/rejected": 0.03039109706878662, "step": 238 }, { "epoch": 0.1457094955037342, "grad_norm": 63.56920454825996, "learning_rate": 1.0097560975609757e-08, "logits/chosen": 0.20902913808822632, "logits/rejected": 0.20397305488586426, "logps/chosen": -198.93582153320312, "logps/rejected": -207.8833770751953, "loss": 0.6878, "rewards/accuracies": 1.0, "rewards/chosen": 0.03262896463274956, "rewards/margins": 0.042327940464019775, "rewards/rejected": -0.009698974899947643, "step": 239 }, { "epoch": 0.1463191586648377, "grad_norm": 74.6828087626269, "learning_rate": 1.0141463414634145e-08, "logits/chosen": 0.16607873141765594, "logits/rejected": 0.19343027472496033, "logps/chosen": -11.94176959991455, "logps/rejected": -13.455016136169434, "loss": 0.6869, "rewards/accuracies": 0.0, "rewards/chosen": -0.0027127116918563843, "rewards/margins": -0.006608984433114529, "rewards/rejected": 0.003896272275596857, "step": 240 }, { "epoch": 0.14692882182594116, "grad_norm": 67.57369790544568, "learning_rate": 1.0185365853658535e-08, "logits/chosen": -0.14159193634986877, "logits/rejected": -0.01214917004108429, "logps/chosen": -162.12225341796875, "logps/rejected": -126.66780090332031, "loss": 0.6879, "rewards/accuracies": 0.5, "rewards/chosen": 0.013150770217180252, "rewards/margins": 0.011054106056690216, "rewards/rejected": 0.0020966639276593924, "step": 241 }, { "epoch": 0.14753848498704467, "grad_norm": 69.09487800609341, "learning_rate": 1.0229268292682925e-08, "logits/chosen": 0.05865873396396637, "logits/rejected": -0.021177947521209717, "logps/chosen": -193.3614501953125, "logps/rejected": -167.36978149414062, "loss": 0.6876, "rewards/accuracies": 0.25, "rewards/chosen": 0.005057024769484997, "rewards/margins": -0.004579547327011824, "rewards/rejected": 0.009636571630835533, "step": 242 }, { "epoch": 0.14814814814814814, "grad_norm": 62.3242029486047, "learning_rate": 1.0273170731707315e-08, "logits/chosen": 0.09503556787967682, "logits/rejected": 0.02194279432296753, "logps/chosen": -92.28962707519531, "logps/rejected": -85.16290283203125, "loss": 0.691, "rewards/accuracies": 1.0, "rewards/chosen": 0.010578328743577003, "rewards/margins": 0.01028031762689352, "rewards/rejected": 0.0002980114077217877, "step": 243 }, { "epoch": 0.14875781130925164, "grad_norm": 75.66652761452913, "learning_rate": 1.0317073170731705e-08, "logits/chosen": 0.20517027378082275, "logits/rejected": 0.15875345468521118, "logps/chosen": -113.72001647949219, "logps/rejected": -49.26511001586914, "loss": 0.6871, "rewards/accuracies": 0.5, "rewards/chosen": 0.004640038590878248, "rewards/margins": 0.0027486036997288465, "rewards/rejected": 0.0018914344254881144, "step": 244 }, { "epoch": 0.14936747447035512, "grad_norm": 89.69620229693227, "learning_rate": 1.0360975609756095e-08, "logits/chosen": 0.11705029010772705, "logits/rejected": 0.26808589696884155, "logps/chosen": -245.8486328125, "logps/rejected": -245.96533203125, "loss": 0.6877, "rewards/accuracies": 0.5, "rewards/chosen": 0.016329145058989525, "rewards/margins": 0.019762611016631126, "rewards/rejected": -0.0034334659576416016, "step": 245 }, { "epoch": 0.14997713763145862, "grad_norm": 67.96453824857987, "learning_rate": 1.0404878048780487e-08, "logits/chosen": 0.20623037219047546, "logits/rejected": 0.1848663091659546, "logps/chosen": -14.415666580200195, "logps/rejected": -7.248907566070557, "loss": 0.6898, "rewards/accuracies": 0.5, "rewards/chosen": -0.00188540224917233, "rewards/margins": -0.0007506401743739843, "rewards/rejected": -0.0011347620747983456, "step": 246 }, { "epoch": 0.15058680079256212, "grad_norm": 70.56113761845653, "learning_rate": 1.0448780487804877e-08, "logits/chosen": 0.0934453085064888, "logits/rejected": 0.12496070563793182, "logps/chosen": -85.32705688476562, "logps/rejected": -78.3795166015625, "loss": 0.6859, "rewards/accuracies": 0.5, "rewards/chosen": 0.0045030489563941956, "rewards/margins": 0.005073298700153828, "rewards/rejected": -0.0005702495109289885, "step": 247 }, { "epoch": 0.1511964639536656, "grad_norm": 76.9638268372275, "learning_rate": 1.0492682926829267e-08, "logits/chosen": 0.20455703139305115, "logits/rejected": 0.18343859910964966, "logps/chosen": -199.44857788085938, "logps/rejected": -103.60641479492188, "loss": 0.6867, "rewards/accuracies": 1.0, "rewards/chosen": 0.010823631659150124, "rewards/margins": 0.013277888298034668, "rewards/rejected": -0.0024542571045458317, "step": 248 }, { "epoch": 0.1518061271147691, "grad_norm": 75.07135689944852, "learning_rate": 1.0536585365853657e-08, "logits/chosen": 0.3327924907207489, "logits/rejected": 0.36023616790771484, "logps/chosen": -185.33592224121094, "logps/rejected": -29.10883903503418, "loss": 0.6854, "rewards/accuracies": 0.25, "rewards/chosen": 0.003931510262191296, "rewards/margins": 0.0014942025300115347, "rewards/rejected": 0.002437308430671692, "step": 249 }, { "epoch": 0.15241579027587257, "grad_norm": 75.58232939333541, "learning_rate": 1.0580487804878047e-08, "logits/chosen": 0.11740530282258987, "logits/rejected": 0.18833479285240173, "logps/chosen": -424.054931640625, "logps/rejected": -343.39404296875, "loss": 0.689, "rewards/accuracies": 0.5, "rewards/chosen": 0.05939657613635063, "rewards/margins": 0.03401017189025879, "rewards/rejected": 0.025386402383446693, "step": 250 }, { "epoch": 0.15302545343697607, "grad_norm": 73.0173068286647, "learning_rate": 1.0624390243902437e-08, "logits/chosen": 0.27976128458976746, "logits/rejected": 0.25798600912094116, "logps/chosen": -70.59455108642578, "logps/rejected": -91.9207534790039, "loss": 0.6894, "rewards/accuracies": 0.25, "rewards/chosen": -0.005557769909501076, "rewards/margins": -0.011985952034592628, "rewards/rejected": 0.006428182125091553, "step": 251 }, { "epoch": 0.15363511659807957, "grad_norm": 74.34648231743009, "learning_rate": 1.0668292682926829e-08, "logits/chosen": 0.09387435019016266, "logits/rejected": -0.007215626537799835, "logps/chosen": -219.23934936523438, "logps/rejected": -250.1908416748047, "loss": 0.6892, "rewards/accuracies": 0.25, "rewards/chosen": 0.01466522179543972, "rewards/margins": 0.009566032327711582, "rewards/rejected": 0.005099189467728138, "step": 252 }, { "epoch": 0.15424477975918305, "grad_norm": 64.34807060967165, "learning_rate": 1.0712195121951219e-08, "logits/chosen": 0.02402116358280182, "logits/rejected": 0.019050151109695435, "logps/chosen": -90.4888687133789, "logps/rejected": -148.08981323242188, "loss": 0.6896, "rewards/accuracies": 0.75, "rewards/chosen": 0.011893940158188343, "rewards/margins": 0.012876272201538086, "rewards/rejected": -0.0009823321597650647, "step": 253 }, { "epoch": 0.15485444292028655, "grad_norm": 64.79662928731439, "learning_rate": 1.0756097560975609e-08, "logits/chosen": 0.22467710077762604, "logits/rejected": 0.0950518324971199, "logps/chosen": -61.54901885986328, "logps/rejected": -61.83803176879883, "loss": 0.6907, "rewards/accuracies": 0.75, "rewards/chosen": 0.01443713903427124, "rewards/margins": 0.014978361316025257, "rewards/rejected": -0.000541222165338695, "step": 254 }, { "epoch": 0.15546410608139002, "grad_norm": 59.93707097217542, "learning_rate": 1.0799999999999999e-08, "logits/chosen": 0.29136666655540466, "logits/rejected": 0.11373914033174515, "logps/chosen": -75.9327392578125, "logps/rejected": -126.5439224243164, "loss": 0.6905, "rewards/accuracies": 0.5, "rewards/chosen": 0.011048637330532074, "rewards/margins": 0.006899937056005001, "rewards/rejected": 0.004148700274527073, "step": 255 }, { "epoch": 0.15607376924249353, "grad_norm": 71.95254740944107, "learning_rate": 1.0843902439024389e-08, "logits/chosen": 0.02697262167930603, "logits/rejected": 0.18303431570529938, "logps/chosen": -187.94979858398438, "logps/rejected": -67.05463409423828, "loss": 0.6857, "rewards/accuracies": 0.25, "rewards/chosen": 0.02066936530172825, "rewards/margins": 0.011891448870301247, "rewards/rejected": 0.008777916431427002, "step": 256 }, { "epoch": 0.156683432403597, "grad_norm": 66.16923012937521, "learning_rate": 1.0887804878048779e-08, "logits/chosen": -0.10520759224891663, "logits/rejected": 0.1884341984987259, "logps/chosen": -299.9891357421875, "logps/rejected": -119.1136703491211, "loss": 0.6929, "rewards/accuracies": 0.5, "rewards/chosen": 0.011963725090026855, "rewards/margins": 0.002015376463532448, "rewards/rejected": 0.009948348626494408, "step": 257 }, { "epoch": 0.1572930955647005, "grad_norm": 66.51771221207379, "learning_rate": 1.093170731707317e-08, "logits/chosen": 0.039739660918712616, "logits/rejected": -0.05868508294224739, "logps/chosen": -41.552310943603516, "logps/rejected": -61.76737594604492, "loss": 0.6902, "rewards/accuracies": 0.5, "rewards/chosen": 0.004486262798309326, "rewards/margins": -2.5379355065524578e-05, "rewards/rejected": 0.004511642269790173, "step": 258 }, { "epoch": 0.157902758725804, "grad_norm": 73.0958015664315, "learning_rate": 1.0975609756097561e-08, "logits/chosen": 0.16631904244422913, "logits/rejected": 0.15885590016841888, "logps/chosen": -105.40766143798828, "logps/rejected": -167.66253662109375, "loss": 0.6882, "rewards/accuracies": 0.25, "rewards/chosen": 0.00017958879470825195, "rewards/margins": 0.0016276472015306354, "rewards/rejected": -0.0014480589888989925, "step": 259 }, { "epoch": 0.15851242188690748, "grad_norm": 70.25652911635771, "learning_rate": 1.1019512195121951e-08, "logits/chosen": 0.2429775893688202, "logits/rejected": 0.1486252248287201, "logps/chosen": -106.29670715332031, "logps/rejected": -162.82992553710938, "loss": 0.6849, "rewards/accuracies": 0.5, "rewards/chosen": 0.028435135260224342, "rewards/margins": 0.029486199840903282, "rewards/rejected": -0.0010510622523725033, "step": 260 }, { "epoch": 0.15912208504801098, "grad_norm": 72.10049120828978, "learning_rate": 1.1063414634146341e-08, "logits/chosen": -0.11289437860250473, "logits/rejected": 0.297699511051178, "logps/chosen": -347.2435607910156, "logps/rejected": -196.04190063476562, "loss": 0.6873, "rewards/accuracies": 1.0, "rewards/chosen": 0.031672000885009766, "rewards/margins": 0.013463189825415611, "rewards/rejected": 0.018208812922239304, "step": 261 }, { "epoch": 0.15973174820911445, "grad_norm": 80.93364378898619, "learning_rate": 1.1107317073170731e-08, "logits/chosen": 0.0010141544044017792, "logits/rejected": 0.2935892641544342, "logps/chosen": -498.2570495605469, "logps/rejected": -312.3238220214844, "loss": 0.6825, "rewards/accuracies": 1.0, "rewards/chosen": 0.08853988349437714, "rewards/margins": 0.06164970621466637, "rewards/rejected": 0.026890181005001068, "step": 262 }, { "epoch": 0.16034141137021796, "grad_norm": 102.5062733225681, "learning_rate": 1.1151219512195121e-08, "logits/chosen": -0.16654887795448303, "logits/rejected": -0.14394907653331757, "logps/chosen": -193.4143524169922, "logps/rejected": -228.5217742919922, "loss": 0.6834, "rewards/accuracies": 0.75, "rewards/chosen": 0.019860554486513138, "rewards/margins": 0.008599311113357544, "rewards/rejected": 0.01126124244183302, "step": 263 }, { "epoch": 0.16095107453132146, "grad_norm": 81.75798142518528, "learning_rate": 1.1195121951219511e-08, "logits/chosen": -0.009945273399353027, "logits/rejected": 0.0056189000606536865, "logps/chosen": -587.51708984375, "logps/rejected": -362.59234619140625, "loss": 0.6882, "rewards/accuracies": 0.5, "rewards/chosen": 0.03963165730237961, "rewards/margins": 0.030398894101381302, "rewards/rejected": 0.009232759475708008, "step": 264 }, { "epoch": 0.16156073769242493, "grad_norm": 68.03395581723177, "learning_rate": 1.1239024390243901e-08, "logits/chosen": 0.18979834020137787, "logits/rejected": 0.20110829174518585, "logps/chosen": -38.780731201171875, "logps/rejected": -47.85240173339844, "loss": 0.6843, "rewards/accuracies": 0.75, "rewards/chosen": 0.008783669210970402, "rewards/margins": 0.005118245724588633, "rewards/rejected": 0.0036654232535511255, "step": 265 }, { "epoch": 0.16217040085352843, "grad_norm": 67.12938115410965, "learning_rate": 1.1282926829268293e-08, "logits/chosen": 0.14108797907829285, "logits/rejected": 0.0863712877035141, "logps/chosen": -65.01193237304688, "logps/rejected": -197.5635986328125, "loss": 0.6935, "rewards/accuracies": 0.5, "rewards/chosen": 0.008278220891952515, "rewards/margins": -0.0024371203035116196, "rewards/rejected": 0.010715341195464134, "step": 266 }, { "epoch": 0.1627800640146319, "grad_norm": 76.95114992952817, "learning_rate": 1.1326829268292683e-08, "logits/chosen": 0.31340616941452026, "logits/rejected": 0.29575198888778687, "logps/chosen": -147.78167724609375, "logps/rejected": -129.01556396484375, "loss": 0.6884, "rewards/accuracies": 0.5, "rewards/chosen": 0.028375498950481415, "rewards/margins": -0.008279353380203247, "rewards/rejected": 0.03665485233068466, "step": 267 }, { "epoch": 0.1633897271757354, "grad_norm": 61.97636980432929, "learning_rate": 1.1370731707317073e-08, "logits/chosen": 0.3050193190574646, "logits/rejected": 0.0612892247736454, "logps/chosen": -178.6527557373047, "logps/rejected": -183.0557861328125, "loss": 0.6888, "rewards/accuracies": 0.75, "rewards/chosen": 0.021247386932373047, "rewards/margins": 0.011964130215346813, "rewards/rejected": 0.009283257648348808, "step": 268 }, { "epoch": 0.16399939033683889, "grad_norm": 59.77971279845762, "learning_rate": 1.1414634146341463e-08, "logits/chosen": -0.049220163375139236, "logits/rejected": -0.018158867955207825, "logps/chosen": -127.37361145019531, "logps/rejected": -90.01386260986328, "loss": 0.6894, "rewards/accuracies": 1.0, "rewards/chosen": 0.02707068994641304, "rewards/margins": 0.02435753308236599, "rewards/rejected": 0.002713155932724476, "step": 269 }, { "epoch": 0.1646090534979424, "grad_norm": 67.97088968853843, "learning_rate": 1.1458536585365853e-08, "logits/chosen": -0.22824090719223022, "logits/rejected": -0.23402655124664307, "logps/chosen": -62.05147933959961, "logps/rejected": -91.66260528564453, "loss": 0.6872, "rewards/accuracies": 0.5, "rewards/chosen": 0.012143706902861595, "rewards/margins": -0.007138777989894152, "rewards/rejected": 0.01928248628973961, "step": 270 }, { "epoch": 0.1652187166590459, "grad_norm": 72.07185018278172, "learning_rate": 1.1502439024390243e-08, "logits/chosen": -0.20504449307918549, "logits/rejected": 0.2059239149093628, "logps/chosen": -128.02024841308594, "logps/rejected": -153.56642150878906, "loss": 0.6847, "rewards/accuracies": 0.5, "rewards/chosen": 0.02709498442709446, "rewards/margins": 0.004441403783857822, "rewards/rejected": 0.022653579711914062, "step": 271 }, { "epoch": 0.16582837982014936, "grad_norm": 70.08052151086115, "learning_rate": 1.1546341463414635e-08, "logits/chosen": 0.2460930347442627, "logits/rejected": 0.6110115051269531, "logps/chosen": -157.7004852294922, "logps/rejected": -103.66472625732422, "loss": 0.6896, "rewards/accuracies": 0.75, "rewards/chosen": 0.0354963093996048, "rewards/margins": 0.013490723446011543, "rewards/rejected": 0.022005582228302956, "step": 272 }, { "epoch": 0.16643804298125287, "grad_norm": 75.28336099467903, "learning_rate": 1.1590243902439025e-08, "logits/chosen": -0.06624182313680649, "logits/rejected": -0.19460135698318481, "logps/chosen": -88.95880889892578, "logps/rejected": -237.4802703857422, "loss": 0.6896, "rewards/accuracies": 0.25, "rewards/chosen": 0.010710209608078003, "rewards/margins": -0.024119380861520767, "rewards/rejected": 0.03482959419488907, "step": 273 }, { "epoch": 0.16704770614235634, "grad_norm": 70.70930457013314, "learning_rate": 1.1634146341463415e-08, "logits/chosen": 0.004238814115524292, "logits/rejected": -0.008775483816862106, "logps/chosen": -47.78416442871094, "logps/rejected": -53.0060920715332, "loss": 0.6863, "rewards/accuracies": 0.25, "rewards/chosen": -0.011054693721234798, "rewards/margins": -0.01163354143500328, "rewards/rejected": 0.0005788472481071949, "step": 274 }, { "epoch": 0.16765736930345984, "grad_norm": 61.65440882497993, "learning_rate": 1.1678048780487805e-08, "logits/chosen": 0.04496167600154877, "logits/rejected": 0.2716519832611084, "logps/chosen": -127.30840301513672, "logps/rejected": -81.77163696289062, "loss": 0.6855, "rewards/accuracies": 0.5, "rewards/chosen": 0.020088758319616318, "rewards/margins": 0.005410086363554001, "rewards/rejected": 0.014678669162094593, "step": 275 }, { "epoch": 0.16826703246456332, "grad_norm": 74.55600716443797, "learning_rate": 1.1721951219512195e-08, "logits/chosen": 0.059458762407302856, "logits/rejected": 0.0097731314599514, "logps/chosen": -144.34033203125, "logps/rejected": -183.9947967529297, "loss": 0.6872, "rewards/accuracies": 0.5, "rewards/chosen": 0.03361999988555908, "rewards/margins": 0.01005165558308363, "rewards/rejected": 0.023568345233798027, "step": 276 }, { "epoch": 0.16887669562566682, "grad_norm": 72.79481905339401, "learning_rate": 1.1765853658536585e-08, "logits/chosen": 0.24767087399959564, "logits/rejected": -0.08833365142345428, "logps/chosen": -212.47438049316406, "logps/rejected": -296.18572998046875, "loss": 0.6888, "rewards/accuracies": 0.25, "rewards/chosen": 0.026554489508271217, "rewards/margins": -0.012244321405887604, "rewards/rejected": 0.03879880905151367, "step": 277 }, { "epoch": 0.16948635878677032, "grad_norm": 71.71878768094764, "learning_rate": 1.1809756097560975e-08, "logits/chosen": -0.09843967109918594, "logits/rejected": 0.2668130695819855, "logps/chosen": -254.5621795654297, "logps/rejected": -192.59725952148438, "loss": 0.6862, "rewards/accuracies": 0.75, "rewards/chosen": 0.02968626096844673, "rewards/margins": 0.011512184515595436, "rewards/rejected": 0.018174076452851295, "step": 278 }, { "epoch": 0.1700960219478738, "grad_norm": 64.6045703248563, "learning_rate": 1.1853658536585366e-08, "logits/chosen": 0.12652164697647095, "logits/rejected": 0.17770615220069885, "logps/chosen": -112.51658630371094, "logps/rejected": -144.6085205078125, "loss": 0.6942, "rewards/accuracies": 0.5, "rewards/chosen": 0.02689737267792225, "rewards/margins": -0.0017916793003678322, "rewards/rejected": 0.028689051046967506, "step": 279 }, { "epoch": 0.1707056851089773, "grad_norm": 72.4789900569941, "learning_rate": 1.1897560975609757e-08, "logits/chosen": -0.054657019674777985, "logits/rejected": 0.34233659505844116, "logps/chosen": -216.79249572753906, "logps/rejected": -68.52703857421875, "loss": 0.6877, "rewards/accuracies": 0.75, "rewards/chosen": 0.03096923977136612, "rewards/margins": 0.024797670543193817, "rewards/rejected": 0.0061715710908174515, "step": 280 }, { "epoch": 0.17131534827008077, "grad_norm": 72.46230500292192, "learning_rate": 1.1941463414634147e-08, "logits/chosen": 0.10445204377174377, "logits/rejected": 0.11657428741455078, "logps/chosen": -321.2290954589844, "logps/rejected": -278.23028564453125, "loss": 0.6853, "rewards/accuracies": 0.75, "rewards/chosen": 0.05811662971973419, "rewards/margins": 0.0354214683175087, "rewards/rejected": 0.022695159539580345, "step": 281 }, { "epoch": 0.17192501143118427, "grad_norm": 64.7679771023373, "learning_rate": 1.1985365853658537e-08, "logits/chosen": -0.13013964891433716, "logits/rejected": 0.009020913392305374, "logps/chosen": -222.4367218017578, "logps/rejected": -148.70559692382812, "loss": 0.6845, "rewards/accuracies": 0.5, "rewards/chosen": 0.024588415399193764, "rewards/margins": 0.02192620374262333, "rewards/rejected": 0.0026622116565704346, "step": 282 }, { "epoch": 0.17253467459228777, "grad_norm": 76.10500673143999, "learning_rate": 1.2029268292682927e-08, "logits/chosen": -0.01958797127008438, "logits/rejected": 0.08497656136751175, "logps/chosen": -311.93658447265625, "logps/rejected": -220.24310302734375, "loss": 0.6856, "rewards/accuracies": 0.75, "rewards/chosen": 0.061127856373786926, "rewards/margins": 0.051468826830387115, "rewards/rejected": 0.009659028612077236, "step": 283 }, { "epoch": 0.17314433775339125, "grad_norm": 67.20260473782929, "learning_rate": 1.2073170731707317e-08, "logits/chosen": -0.1200600266456604, "logits/rejected": 0.16886617243289948, "logps/chosen": -113.31182098388672, "logps/rejected": -54.280303955078125, "loss": 0.6876, "rewards/accuracies": 1.0, "rewards/chosen": 0.0186814796179533, "rewards/margins": 0.017820002511143684, "rewards/rejected": 0.0008614779217168689, "step": 284 }, { "epoch": 0.17375400091449475, "grad_norm": 68.2214791236598, "learning_rate": 1.2117073170731707e-08, "logits/chosen": 0.2490437924861908, "logits/rejected": 0.19444838166236877, "logps/chosen": -96.3344497680664, "logps/rejected": -105.00188446044922, "loss": 0.6823, "rewards/accuracies": 0.75, "rewards/chosen": 0.013741038739681244, "rewards/margins": 0.02261732891201973, "rewards/rejected": -0.008876289241015911, "step": 285 }, { "epoch": 0.17436366407559822, "grad_norm": 68.96159344811562, "learning_rate": 1.2160975609756098e-08, "logits/chosen": 0.15206699073314667, "logits/rejected": 0.19348430633544922, "logps/chosen": -23.09684944152832, "logps/rejected": -57.77210998535156, "loss": 0.6922, "rewards/accuracies": 0.5, "rewards/chosen": -0.0002287685638293624, "rewards/margins": -0.008274602703750134, "rewards/rejected": 0.008045833557844162, "step": 286 }, { "epoch": 0.17497332723670173, "grad_norm": 75.95314882506473, "learning_rate": 1.2204878048780488e-08, "logits/chosen": 0.07631039619445801, "logits/rejected": 0.05917757749557495, "logps/chosen": -182.9187774658203, "logps/rejected": -146.29830932617188, "loss": 0.6892, "rewards/accuracies": 0.5, "rewards/chosen": 0.04305421561002731, "rewards/margins": -0.0070860725827515125, "rewards/rejected": 0.05014029145240784, "step": 287 }, { "epoch": 0.1755829903978052, "grad_norm": 63.03047471572614, "learning_rate": 1.2248780487804878e-08, "logits/chosen": 0.06928090751171112, "logits/rejected": 0.05114259943366051, "logps/chosen": -9.679801940917969, "logps/rejected": -46.23088455200195, "loss": 0.6893, "rewards/accuracies": 0.25, "rewards/chosen": -0.0027698874473571777, "rewards/margins": -0.0036961734294891357, "rewards/rejected": 0.0009262859239242971, "step": 288 }, { "epoch": 0.1761926535589087, "grad_norm": 77.659425328325, "learning_rate": 1.2292682926829268e-08, "logits/chosen": 0.1752196103334427, "logits/rejected": 0.009080037474632263, "logps/chosen": -176.6894989013672, "logps/rejected": -155.2613525390625, "loss": 0.6938, "rewards/accuracies": 0.5, "rewards/chosen": 0.02636120282113552, "rewards/margins": 0.0025505602825433016, "rewards/rejected": 0.023810643702745438, "step": 289 }, { "epoch": 0.1768023167200122, "grad_norm": 62.44123666007119, "learning_rate": 1.2336585365853658e-08, "logits/chosen": 0.11948700249195099, "logits/rejected": 0.13347646594047546, "logps/chosen": -255.56439208984375, "logps/rejected": -272.15509033203125, "loss": 0.6915, "rewards/accuracies": 0.5, "rewards/chosen": 0.046774283051490784, "rewards/margins": 0.003010498359799385, "rewards/rejected": 0.04376378282904625, "step": 290 }, { "epoch": 0.17741197988111568, "grad_norm": 80.2087763882322, "learning_rate": 1.2380487804878047e-08, "logits/chosen": -0.143706813454628, "logits/rejected": 0.03876394033432007, "logps/chosen": -327.98394775390625, "logps/rejected": -221.12884521484375, "loss": 0.6822, "rewards/accuracies": 0.5, "rewards/chosen": 0.05700650438666344, "rewards/margins": 0.05423087626695633, "rewards/rejected": 0.0027756216004490852, "step": 291 }, { "epoch": 0.17802164304221918, "grad_norm": 67.0077788494056, "learning_rate": 1.2424390243902437e-08, "logits/chosen": 0.11543978005647659, "logits/rejected": 0.08386662602424622, "logps/chosen": -142.90765380859375, "logps/rejected": -203.89308166503906, "loss": 0.6917, "rewards/accuracies": 0.5, "rewards/chosen": 0.023183250799775124, "rewards/margins": -0.020256493240594864, "rewards/rejected": 0.04343974590301514, "step": 292 }, { "epoch": 0.17863130620332265, "grad_norm": 70.08502232129598, "learning_rate": 1.2468292682926827e-08, "logits/chosen": 0.3591795861721039, "logits/rejected": 0.5780224800109863, "logps/chosen": -200.40618896484375, "logps/rejected": -150.89456176757812, "loss": 0.6838, "rewards/accuracies": 1.0, "rewards/chosen": 0.04847842827439308, "rewards/margins": 0.064790278673172, "rewards/rejected": -0.016311855986714363, "step": 293 }, { "epoch": 0.17924096936442616, "grad_norm": 80.72254859228765, "learning_rate": 1.2512195121951219e-08, "logits/chosen": 0.21753554046154022, "logits/rejected": 0.3264102041721344, "logps/chosen": -210.21556091308594, "logps/rejected": -255.1538848876953, "loss": 0.6801, "rewards/accuracies": 0.5, "rewards/chosen": 0.022591782733798027, "rewards/margins": 0.005800206214189529, "rewards/rejected": 0.016791576519608498, "step": 294 }, { "epoch": 0.17985063252552966, "grad_norm": 66.42495940270162, "learning_rate": 1.2556097560975609e-08, "logits/chosen": -0.036476314067840576, "logits/rejected": 0.001236744225025177, "logps/chosen": -38.627227783203125, "logps/rejected": -24.439422607421875, "loss": 0.6871, "rewards/accuracies": 1.0, "rewards/chosen": 0.002048587892204523, "rewards/margins": 0.012927819043397903, "rewards/rejected": -0.010879230685532093, "step": 295 }, { "epoch": 0.18046029568663313, "grad_norm": 68.23030718617511, "learning_rate": 1.2599999999999999e-08, "logits/chosen": 0.033884622156620026, "logits/rejected": -0.24851661920547485, "logps/chosen": -100.02306365966797, "logps/rejected": -188.92852783203125, "loss": 0.6831, "rewards/accuracies": 0.5, "rewards/chosen": 0.012595081701874733, "rewards/margins": -0.019318008795380592, "rewards/rejected": 0.031913090497255325, "step": 296 }, { "epoch": 0.18106995884773663, "grad_norm": 60.34466285564362, "learning_rate": 1.2643902439024389e-08, "logits/chosen": 0.28695887327194214, "logits/rejected": 0.2805330753326416, "logps/chosen": -53.780399322509766, "logps/rejected": -55.31425476074219, "loss": 0.691, "rewards/accuracies": 0.5, "rewards/chosen": 0.011322993785142899, "rewards/margins": 0.004818183369934559, "rewards/rejected": 0.00650481041520834, "step": 297 }, { "epoch": 0.1816796220088401, "grad_norm": 65.27088798439672, "learning_rate": 1.2687804878048779e-08, "logits/chosen": -0.3651633560657501, "logits/rejected": -0.26293641328811646, "logps/chosen": -83.63407135009766, "logps/rejected": -109.81761169433594, "loss": 0.6795, "rewards/accuracies": 0.5, "rewards/chosen": 0.004503482487052679, "rewards/margins": 0.004652255214750767, "rewards/rejected": -0.0001487727276980877, "step": 298 }, { "epoch": 0.1822892851699436, "grad_norm": 70.8428190288745, "learning_rate": 1.2731707317073169e-08, "logits/chosen": 0.11984477937221527, "logits/rejected": 0.1091717779636383, "logps/chosen": -97.25994873046875, "logps/rejected": -83.18009185791016, "loss": 0.6838, "rewards/accuracies": 0.75, "rewards/chosen": 0.01965179480612278, "rewards/margins": 0.005422592628747225, "rewards/rejected": 0.014229202643036842, "step": 299 }, { "epoch": 0.18289894833104708, "grad_norm": 78.14090515623363, "learning_rate": 1.277560975609756e-08, "logits/chosen": 0.18486709892749786, "logits/rejected": 0.23836563527584076, "logps/chosen": -279.04827880859375, "logps/rejected": -162.093994140625, "loss": 0.685, "rewards/accuracies": 0.75, "rewards/chosen": 0.03765735402703285, "rewards/margins": -0.012346451170742512, "rewards/rejected": 0.05000380426645279, "step": 300 }, { "epoch": 0.1835086114921506, "grad_norm": 67.7939086879178, "learning_rate": 1.281951219512195e-08, "logits/chosen": -0.051534123718738556, "logits/rejected": 0.009366922080516815, "logps/chosen": -140.4922637939453, "logps/rejected": -180.57522583007812, "loss": 0.6892, "rewards/accuracies": 0.0, "rewards/chosen": 0.01847749948501587, "rewards/margins": -0.007998285815119743, "rewards/rejected": 0.02647578716278076, "step": 301 }, { "epoch": 0.1841182746532541, "grad_norm": 66.52604228882412, "learning_rate": 1.286341463414634e-08, "logits/chosen": 0.17128783464431763, "logits/rejected": 0.08553063124418259, "logps/chosen": -63.9774055480957, "logps/rejected": -86.0840072631836, "loss": 0.6826, "rewards/accuracies": 0.25, "rewards/chosen": 0.0008003080729395151, "rewards/margins": -0.004530087113380432, "rewards/rejected": 0.005330395884811878, "step": 302 }, { "epoch": 0.18472793781435756, "grad_norm": 57.95653169447476, "learning_rate": 1.290731707317073e-08, "logits/chosen": 0.10138653963804245, "logits/rejected": 0.04099968075752258, "logps/chosen": -57.205345153808594, "logps/rejected": -158.7127685546875, "loss": 0.6826, "rewards/accuracies": 0.5, "rewards/chosen": 0.012483743950724602, "rewards/margins": -0.013382421806454659, "rewards/rejected": 0.02586616389453411, "step": 303 }, { "epoch": 0.18533760097546106, "grad_norm": 77.2152688293654, "learning_rate": 1.295121951219512e-08, "logits/chosen": -0.09431920945644379, "logits/rejected": 0.21408668160438538, "logps/chosen": -187.88401794433594, "logps/rejected": -175.33287048339844, "loss": 0.684, "rewards/accuracies": 0.5, "rewards/chosen": 0.03187720850110054, "rewards/margins": 0.010566890239715576, "rewards/rejected": 0.021310318261384964, "step": 304 }, { "epoch": 0.18594726413656454, "grad_norm": 64.46126122037869, "learning_rate": 1.299512195121951e-08, "logits/chosen": -0.0165112167596817, "logits/rejected": 0.04795960336923599, "logps/chosen": -21.955730438232422, "logps/rejected": -40.482303619384766, "loss": 0.6897, "rewards/accuracies": 0.75, "rewards/chosen": 0.004565143957734108, "rewards/margins": -0.0029786108061671257, "rewards/rejected": 0.007543754298239946, "step": 305 }, { "epoch": 0.18655692729766804, "grad_norm": 70.30427168183434, "learning_rate": 1.30390243902439e-08, "logits/chosen": 0.11046487092971802, "logits/rejected": -0.20083540678024292, "logps/chosen": -312.3200378417969, "logps/rejected": -483.58050537109375, "loss": 0.6834, "rewards/accuracies": 0.25, "rewards/chosen": 0.03268561512231827, "rewards/margins": -0.02073187753558159, "rewards/rejected": 0.05341749265789986, "step": 306 }, { "epoch": 0.18716659045877154, "grad_norm": 68.57658949942703, "learning_rate": 1.3082926829268292e-08, "logits/chosen": 0.32596519589424133, "logits/rejected": 0.2826036810874939, "logps/chosen": -173.72314453125, "logps/rejected": -270.8544616699219, "loss": 0.6777, "rewards/accuracies": 0.5, "rewards/chosen": 0.049483247101306915, "rewards/margins": 0.002615373581647873, "rewards/rejected": 0.04686787351965904, "step": 307 }, { "epoch": 0.18777625361987502, "grad_norm": 69.75985299125284, "learning_rate": 1.3126829268292682e-08, "logits/chosen": -0.0608794242143631, "logits/rejected": 0.05051223561167717, "logps/chosen": -182.84506225585938, "logps/rejected": -154.138916015625, "loss": 0.6888, "rewards/accuracies": 0.5, "rewards/chosen": 0.018271446228027344, "rewards/margins": -0.0005187001079320908, "rewards/rejected": 0.018790146335959435, "step": 308 }, { "epoch": 0.18838591678097852, "grad_norm": 76.91526853893619, "learning_rate": 1.3170731707317072e-08, "logits/chosen": 0.08850069344043732, "logits/rejected": -0.018443353474140167, "logps/chosen": -160.97219848632812, "logps/rejected": -243.20228576660156, "loss": 0.6774, "rewards/accuracies": 0.75, "rewards/chosen": 0.07890434563159943, "rewards/margins": 0.08282683789730072, "rewards/rejected": -0.003922486677765846, "step": 309 }, { "epoch": 0.188995579942082, "grad_norm": 90.68328105332195, "learning_rate": 1.3214634146341462e-08, "logits/chosen": -0.008595839142799377, "logits/rejected": -0.034652434289455414, "logps/chosen": -120.25025939941406, "logps/rejected": -113.72771453857422, "loss": 0.6698, "rewards/accuracies": 0.75, "rewards/chosen": 0.02222449705004692, "rewards/margins": 0.01967179775238037, "rewards/rejected": 0.0025526999961584806, "step": 310 }, { "epoch": 0.1896052431031855, "grad_norm": 68.07711194958355, "learning_rate": 1.3258536585365852e-08, "logits/chosen": 0.19305960834026337, "logits/rejected": 0.031466081738471985, "logps/chosen": -190.48495483398438, "logps/rejected": -322.8329162597656, "loss": 0.6854, "rewards/accuracies": 0.5, "rewards/chosen": 0.06905165314674377, "rewards/margins": 0.01939660869538784, "rewards/rejected": 0.04965504631400108, "step": 311 }, { "epoch": 0.19021490626428897, "grad_norm": 79.53778607030513, "learning_rate": 1.3302439024390242e-08, "logits/chosen": 0.21615885198116302, "logits/rejected": 0.06739786267280579, "logps/chosen": -12.077869415283203, "logps/rejected": -36.77394485473633, "loss": 0.6821, "rewards/accuracies": 0.5, "rewards/chosen": -0.00834993738681078, "rewards/margins": 0.015117020346224308, "rewards/rejected": -0.023466957733035088, "step": 312 }, { "epoch": 0.19082456942539247, "grad_norm": 64.73371427411682, "learning_rate": 1.3346341463414633e-08, "logits/chosen": -0.018608778715133667, "logits/rejected": 0.34300583600997925, "logps/chosen": -267.38226318359375, "logps/rejected": -72.22630310058594, "loss": 0.68, "rewards/accuracies": 0.75, "rewards/chosen": 0.07769179344177246, "rewards/margins": 0.0658668652176857, "rewards/rejected": 0.011824929155409336, "step": 313 }, { "epoch": 0.19143423258649597, "grad_norm": 75.58955343976855, "learning_rate": 1.3390243902439024e-08, "logits/chosen": 0.06820555776357651, "logits/rejected": 0.38175755739212036, "logps/chosen": -137.0054931640625, "logps/rejected": -179.29209899902344, "loss": 0.6891, "rewards/accuracies": 0.5, "rewards/chosen": 0.01276245154440403, "rewards/margins": -0.00345882261171937, "rewards/rejected": 0.016221273690462112, "step": 314 }, { "epoch": 0.19204389574759945, "grad_norm": 68.68931860822275, "learning_rate": 1.3434146341463414e-08, "logits/chosen": -0.2746092677116394, "logits/rejected": 0.3636752963066101, "logps/chosen": -537.5502319335938, "logps/rejected": -327.2066650390625, "loss": 0.6806, "rewards/accuracies": 0.75, "rewards/chosen": 0.08374825119972229, "rewards/margins": 0.04939880222082138, "rewards/rejected": 0.03434944525361061, "step": 315 }, { "epoch": 0.19265355890870295, "grad_norm": 74.7544819583531, "learning_rate": 1.3478048780487804e-08, "logits/chosen": 0.12123498320579529, "logits/rejected": 0.12799996137619019, "logps/chosen": -269.18017578125, "logps/rejected": -345.50787353515625, "loss": 0.6754, "rewards/accuracies": 0.75, "rewards/chosen": 0.0619237907230854, "rewards/margins": 0.019133759662508965, "rewards/rejected": 0.04279003292322159, "step": 316 }, { "epoch": 0.19326322206980642, "grad_norm": 75.88764727172456, "learning_rate": 1.3521951219512194e-08, "logits/chosen": 0.2825852036476135, "logits/rejected": 0.195020854473114, "logps/chosen": -279.4342041015625, "logps/rejected": -225.8255157470703, "loss": 0.6774, "rewards/accuracies": 0.5, "rewards/chosen": 0.036413855850696564, "rewards/margins": 0.019771214574575424, "rewards/rejected": 0.01664264127612114, "step": 317 }, { "epoch": 0.19387288523090992, "grad_norm": 80.27254116091218, "learning_rate": 1.3565853658536584e-08, "logits/chosen": 0.06519882380962372, "logits/rejected": 0.16340479254722595, "logps/chosen": -272.77972412109375, "logps/rejected": -296.5497741699219, "loss": 0.6805, "rewards/accuracies": 0.75, "rewards/chosen": 0.02005164511501789, "rewards/margins": 0.011397051624953747, "rewards/rejected": 0.008654594421386719, "step": 318 }, { "epoch": 0.1944825483920134, "grad_norm": 78.19214046578293, "learning_rate": 1.3609756097560974e-08, "logits/chosen": 0.22997507452964783, "logits/rejected": 0.22382646799087524, "logps/chosen": -15.212729454040527, "logps/rejected": -20.76936149597168, "loss": 0.6839, "rewards/accuracies": 0.75, "rewards/chosen": -0.007372564170509577, "rewards/margins": 0.0029362740460783243, "rewards/rejected": -0.010308838449418545, "step": 319 }, { "epoch": 0.1950922115531169, "grad_norm": 67.95675168668035, "learning_rate": 1.3653658536585366e-08, "logits/chosen": 0.14689771831035614, "logits/rejected": 0.2453615814447403, "logps/chosen": -315.5559387207031, "logps/rejected": -197.97909545898438, "loss": 0.6879, "rewards/accuracies": 0.75, "rewards/chosen": 0.061474286019802094, "rewards/margins": 0.02385631576180458, "rewards/rejected": 0.037617966532707214, "step": 320 }, { "epoch": 0.1957018747142204, "grad_norm": 65.1681457046982, "learning_rate": 1.3697560975609756e-08, "logits/chosen": -0.10886508971452713, "logits/rejected": 0.0993279367685318, "logps/chosen": -172.10069274902344, "logps/rejected": -123.24795532226562, "loss": 0.6776, "rewards/accuracies": 0.75, "rewards/chosen": 0.04928499832749367, "rewards/margins": 0.041656363755464554, "rewards/rejected": 0.00762863177806139, "step": 321 }, { "epoch": 0.19631153787532388, "grad_norm": 85.13184974325452, "learning_rate": 1.3741463414634146e-08, "logits/chosen": 0.04807595908641815, "logits/rejected": 0.10156890004873276, "logps/chosen": -83.60809326171875, "logps/rejected": -70.86627197265625, "loss": 0.6901, "rewards/accuracies": 0.25, "rewards/chosen": -0.004450934939086437, "rewards/margins": -0.01871207356452942, "rewards/rejected": 0.014261138625442982, "step": 322 }, { "epoch": 0.19692120103642738, "grad_norm": 71.36768546843327, "learning_rate": 1.3785365853658536e-08, "logits/chosen": 0.39577725529670715, "logits/rejected": 0.2465072125196457, "logps/chosen": -86.98997497558594, "logps/rejected": -94.35633850097656, "loss": 0.6814, "rewards/accuracies": 0.5, "rewards/chosen": 0.023127174004912376, "rewards/margins": 0.002038024365901947, "rewards/rejected": 0.02108914777636528, "step": 323 }, { "epoch": 0.19753086419753085, "grad_norm": 81.6959801621272, "learning_rate": 1.3829268292682926e-08, "logits/chosen": 0.3274226784706116, "logits/rejected": 0.3019973933696747, "logps/chosen": -12.285454750061035, "logps/rejected": -12.830500602722168, "loss": 0.6868, "rewards/accuracies": 0.75, "rewards/chosen": 0.002893734024837613, "rewards/margins": 0.008286512456834316, "rewards/rejected": -0.005392777733504772, "step": 324 }, { "epoch": 0.19814052735863436, "grad_norm": 71.60153689924304, "learning_rate": 1.3873170731707316e-08, "logits/chosen": 0.13020280003547668, "logits/rejected": 0.11316104233264923, "logps/chosen": -21.833646774291992, "logps/rejected": -22.383956909179688, "loss": 0.6768, "rewards/accuracies": 0.25, "rewards/chosen": -0.004084374755620956, "rewards/margins": -0.003979433327913284, "rewards/rejected": -0.00010494131129235029, "step": 325 }, { "epoch": 0.19875019051973786, "grad_norm": 72.5793081914242, "learning_rate": 1.3917073170731706e-08, "logits/chosen": -0.010539315640926361, "logits/rejected": 0.02804122120141983, "logps/chosen": -74.72570037841797, "logps/rejected": -19.162845611572266, "loss": 0.6776, "rewards/accuracies": 0.5, "rewards/chosen": -0.005877655930817127, "rewards/margins": -0.005930660292506218, "rewards/rejected": 5.300401244312525e-05, "step": 326 }, { "epoch": 0.19935985368084133, "grad_norm": 61.15732466814767, "learning_rate": 1.3960975609756098e-08, "logits/chosen": 0.05574619770050049, "logits/rejected": 0.010447800159454346, "logps/chosen": -17.385906219482422, "logps/rejected": -33.139347076416016, "loss": 0.68, "rewards/accuracies": 0.75, "rewards/chosen": 0.005576098337769508, "rewards/margins": 0.011889881454408169, "rewards/rejected": -0.00631378311663866, "step": 327 }, { "epoch": 0.19996951684194483, "grad_norm": 65.92631336914816, "learning_rate": 1.4004878048780488e-08, "logits/chosen": -0.010778829455375671, "logits/rejected": 0.045100219547748566, "logps/chosen": -166.8217010498047, "logps/rejected": -171.8801727294922, "loss": 0.6884, "rewards/accuracies": 0.5, "rewards/chosen": 0.03109455294907093, "rewards/margins": -0.006510566920042038, "rewards/rejected": 0.03760511800646782, "step": 328 }, { "epoch": 0.2005791800030483, "grad_norm": 73.21614905742888, "learning_rate": 1.4048780487804878e-08, "logits/chosen": 0.11447135359048843, "logits/rejected": 0.30053094029426575, "logps/chosen": -71.59620666503906, "logps/rejected": -47.92333984375, "loss": 0.6885, "rewards/accuracies": 0.5, "rewards/chosen": 0.02008110284805298, "rewards/margins": -0.012820337899029255, "rewards/rejected": 0.03290143981575966, "step": 329 }, { "epoch": 0.2011888431641518, "grad_norm": 69.45047129413042, "learning_rate": 1.4092682926829268e-08, "logits/chosen": 0.1477673351764679, "logits/rejected": -0.25245657563209534, "logps/chosen": -78.40091705322266, "logps/rejected": -154.0237274169922, "loss": 0.6773, "rewards/accuracies": 0.75, "rewards/chosen": 0.018941640853881836, "rewards/margins": -0.003910040948539972, "rewards/rejected": 0.02285168133676052, "step": 330 }, { "epoch": 0.20179850632525528, "grad_norm": 74.12584253721137, "learning_rate": 1.4136585365853658e-08, "logits/chosen": 0.025876976549625397, "logits/rejected": 0.1491180956363678, "logps/chosen": -270.4678649902344, "logps/rejected": -162.21859741210938, "loss": 0.677, "rewards/accuracies": 0.5, "rewards/chosen": 0.07905292510986328, "rewards/margins": 0.05511059612035751, "rewards/rejected": 0.02394232712686062, "step": 331 }, { "epoch": 0.20240816948635879, "grad_norm": 70.77365253302148, "learning_rate": 1.4180487804878048e-08, "logits/chosen": 0.18511934578418732, "logits/rejected": 0.1744193583726883, "logps/chosen": -166.81033325195312, "logps/rejected": -178.5686492919922, "loss": 0.6844, "rewards/accuracies": 1.0, "rewards/chosen": 0.060819078236818314, "rewards/margins": 0.059952981770038605, "rewards/rejected": 0.0008660973981022835, "step": 332 }, { "epoch": 0.2030178326474623, "grad_norm": 72.78538549799997, "learning_rate": 1.4224390243902438e-08, "logits/chosen": 0.23981404304504395, "logits/rejected": -0.009058898314833641, "logps/chosen": -42.26893997192383, "logps/rejected": -107.62962341308594, "loss": 0.6897, "rewards/accuracies": 0.25, "rewards/chosen": -0.0003601315547712147, "rewards/margins": -0.027908090502023697, "rewards/rejected": 0.027547962963581085, "step": 333 }, { "epoch": 0.20362749580856576, "grad_norm": 81.6015116007124, "learning_rate": 1.426829268292683e-08, "logits/chosen": 0.21915912628173828, "logits/rejected": 0.12471703439950943, "logps/chosen": -93.21151733398438, "logps/rejected": -82.82246398925781, "loss": 0.6794, "rewards/accuracies": 0.5, "rewards/chosen": 0.018854821100831032, "rewards/margins": -0.0010149849113076925, "rewards/rejected": 0.01986980438232422, "step": 334 }, { "epoch": 0.20423715896966926, "grad_norm": 70.07969810368989, "learning_rate": 1.431219512195122e-08, "logits/chosen": -0.07455149292945862, "logits/rejected": 0.16817906498908997, "logps/chosen": -255.03790283203125, "logps/rejected": -149.12513732910156, "loss": 0.6914, "rewards/accuracies": 0.75, "rewards/chosen": 0.0851854756474495, "rewards/margins": 0.06319474428892136, "rewards/rejected": 0.021990716457366943, "step": 335 }, { "epoch": 0.20484682213077274, "grad_norm": 67.79996961976595, "learning_rate": 1.435609756097561e-08, "logits/chosen": 0.30746638774871826, "logits/rejected": 0.26823362708091736, "logps/chosen": -22.284900665283203, "logps/rejected": -12.613653182983398, "loss": 0.6869, "rewards/accuracies": 0.75, "rewards/chosen": -0.015383643098175526, "rewards/margins": 0.0022090603597462177, "rewards/rejected": -0.01759270392358303, "step": 336 }, { "epoch": 0.20545648529187624, "grad_norm": 71.5858654272021, "learning_rate": 1.44e-08, "logits/chosen": -0.04005648195743561, "logits/rejected": 0.006720844656229019, "logps/chosen": -45.05376434326172, "logps/rejected": -51.12355422973633, "loss": 0.6811, "rewards/accuracies": 0.25, "rewards/chosen": 0.006362485699355602, "rewards/margins": 0.001839661505073309, "rewards/rejected": 0.004522824659943581, "step": 337 }, { "epoch": 0.20606614845297974, "grad_norm": 60.123978078064084, "learning_rate": 1.444390243902439e-08, "logits/chosen": 0.18723182380199432, "logits/rejected": 0.05063885822892189, "logps/chosen": -242.62884521484375, "logps/rejected": -243.73561096191406, "loss": 0.6873, "rewards/accuracies": 0.5, "rewards/chosen": 0.04879581928253174, "rewards/margins": 0.027438664808869362, "rewards/rejected": 0.021357156336307526, "step": 338 }, { "epoch": 0.20667581161408322, "grad_norm": 73.62335435629042, "learning_rate": 1.448780487804878e-08, "logits/chosen": 0.24362905323505402, "logits/rejected": 0.13164113461971283, "logps/chosen": -291.8893737792969, "logps/rejected": -349.28289794921875, "loss": 0.6749, "rewards/accuracies": 0.75, "rewards/chosen": 0.06466102600097656, "rewards/margins": 0.020573971793055534, "rewards/rejected": 0.04408705234527588, "step": 339 }, { "epoch": 0.20728547477518672, "grad_norm": 69.46968495310414, "learning_rate": 1.4531707317073172e-08, "logits/chosen": -0.10782186686992645, "logits/rejected": 0.057648010551929474, "logps/chosen": -382.67706298828125, "logps/rejected": -227.41738891601562, "loss": 0.6807, "rewards/accuracies": 0.75, "rewards/chosen": 0.11021074652671814, "rewards/margins": 0.05586683005094528, "rewards/rejected": 0.05434391647577286, "step": 340 }, { "epoch": 0.2078951379362902, "grad_norm": 105.32468726144603, "learning_rate": 1.4575609756097562e-08, "logits/chosen": -0.10040725767612457, "logits/rejected": 0.22399954497814178, "logps/chosen": -131.145263671875, "logps/rejected": -87.96915435791016, "loss": 0.6862, "rewards/accuracies": 0.75, "rewards/chosen": 0.009819794446229935, "rewards/margins": 0.006483984645456076, "rewards/rejected": 0.0033358102664351463, "step": 341 }, { "epoch": 0.2085048010973937, "grad_norm": 68.16982295009444, "learning_rate": 1.4619512195121952e-08, "logits/chosen": -0.03772008419036865, "logits/rejected": -0.08728692680597305, "logps/chosen": -88.0912857055664, "logps/rejected": -116.21687316894531, "loss": 0.6763, "rewards/accuracies": 0.25, "rewards/chosen": 0.0516599640250206, "rewards/margins": -0.0033350931480526924, "rewards/rejected": 0.054995059967041016, "step": 342 }, { "epoch": 0.20911446425849717, "grad_norm": 71.28049059029516, "learning_rate": 1.466341463414634e-08, "logits/chosen": 0.22303928434848785, "logits/rejected": 0.286670982837677, "logps/chosen": -142.04025268554688, "logps/rejected": -63.7384033203125, "loss": 0.6719, "rewards/accuracies": 0.5, "rewards/chosen": 0.04323422163724899, "rewards/margins": 0.029265914112329483, "rewards/rejected": 0.013968306593596935, "step": 343 }, { "epoch": 0.20972412741960067, "grad_norm": 66.75179875324964, "learning_rate": 1.470731707317073e-08, "logits/chosen": 0.08236043155193329, "logits/rejected": 0.10761090368032455, "logps/chosen": -262.4383544921875, "logps/rejected": -132.79490661621094, "loss": 0.6843, "rewards/accuracies": 1.0, "rewards/chosen": 0.019755559042096138, "rewards/margins": 0.054811589419841766, "rewards/rejected": -0.03505603224039078, "step": 344 }, { "epoch": 0.21033379058070417, "grad_norm": 67.62567899403905, "learning_rate": 1.475121951219512e-08, "logits/chosen": 0.3225303590297699, "logits/rejected": 0.08760485053062439, "logps/chosen": -76.64276123046875, "logps/rejected": -275.00244140625, "loss": 0.6791, "rewards/accuracies": 1.0, "rewards/chosen": -0.0005946287419646978, "rewards/margins": 0.036113083362579346, "rewards/rejected": -0.03670771047472954, "step": 345 }, { "epoch": 0.21094345374180765, "grad_norm": 71.01903716429102, "learning_rate": 1.479512195121951e-08, "logits/chosen": 0.31444257497787476, "logits/rejected": 0.10715361684560776, "logps/chosen": -62.70277786254883, "logps/rejected": -155.2872314453125, "loss": 0.679, "rewards/accuracies": 0.75, "rewards/chosen": 0.039180826395750046, "rewards/margins": 0.01031193882226944, "rewards/rejected": 0.028868889436125755, "step": 346 }, { "epoch": 0.21155311690291115, "grad_norm": 66.59902373176311, "learning_rate": 1.48390243902439e-08, "logits/chosen": 0.06511352956295013, "logits/rejected": 0.040359482169151306, "logps/chosen": -101.69937896728516, "logps/rejected": -85.10595703125, "loss": 0.6853, "rewards/accuracies": 0.75, "rewards/chosen": 0.0786714106798172, "rewards/margins": 0.017423998564481735, "rewards/rejected": 0.06124741584062576, "step": 347 }, { "epoch": 0.21216278006401462, "grad_norm": 77.56987130285738, "learning_rate": 1.4882926829268292e-08, "logits/chosen": 0.3418797254562378, "logits/rejected": 0.35460710525512695, "logps/chosen": -8.70235824584961, "logps/rejected": -13.812934875488281, "loss": 0.6747, "rewards/accuracies": 0.25, "rewards/chosen": -0.009386222809553146, "rewards/margins": -0.0036907976027578115, "rewards/rejected": -0.005695425905287266, "step": 348 }, { "epoch": 0.21277244322511812, "grad_norm": 73.92616252372007, "learning_rate": 1.492682926829268e-08, "logits/chosen": 0.14430879056453705, "logits/rejected": 0.04682805389165878, "logps/chosen": -126.25191497802734, "logps/rejected": -186.4823760986328, "loss": 0.6746, "rewards/accuracies": 0.75, "rewards/chosen": 0.015251537784934044, "rewards/margins": -0.0009545590728521347, "rewards/rejected": 0.016206098720431328, "step": 349 }, { "epoch": 0.21338210638622163, "grad_norm": 74.28393494263427, "learning_rate": 1.4970731707317072e-08, "logits/chosen": 0.0976111888885498, "logits/rejected": 0.3321661353111267, "logps/chosen": -271.572265625, "logps/rejected": -187.7233123779297, "loss": 0.677, "rewards/accuracies": 1.0, "rewards/chosen": 0.08827683329582214, "rewards/margins": 0.02552950382232666, "rewards/rejected": 0.06274733692407608, "step": 350 }, { "epoch": 0.2139917695473251, "grad_norm": 69.05233519059429, "learning_rate": 1.501463414634146e-08, "logits/chosen": 0.2835656702518463, "logits/rejected": 0.24562017619609833, "logps/chosen": -10.603927612304688, "logps/rejected": -20.90781021118164, "loss": 0.6745, "rewards/accuracies": 0.75, "rewards/chosen": 0.0011128070764243603, "rewards/margins": 0.0070378538221120834, "rewards/rejected": -0.005925048142671585, "step": 351 }, { "epoch": 0.2146014327084286, "grad_norm": 59.37878833199465, "learning_rate": 1.5058536585365852e-08, "logits/chosen": 0.16556383669376373, "logits/rejected": 0.16473199427127838, "logps/chosen": -15.063851356506348, "logps/rejected": -8.009671211242676, "loss": 0.6867, "rewards/accuracies": 0.5, "rewards/chosen": -0.003202187828719616, "rewards/margins": 0.008544718846678734, "rewards/rejected": -0.01174690667539835, "step": 352 }, { "epoch": 0.21521109586953208, "grad_norm": 85.20699487189376, "learning_rate": 1.5102439024390244e-08, "logits/chosen": -0.09299083054065704, "logits/rejected": -0.07927870750427246, "logps/chosen": -249.5418243408203, "logps/rejected": -218.46571350097656, "loss": 0.6927, "rewards/accuracies": 0.5, "rewards/chosen": 0.06072034686803818, "rewards/margins": 0.00193779356777668, "rewards/rejected": 0.05878255516290665, "step": 353 }, { "epoch": 0.21582075903063558, "grad_norm": 69.44085781934815, "learning_rate": 1.5146341463414632e-08, "logits/chosen": 0.20161965489387512, "logits/rejected": 0.27443933486938477, "logps/chosen": -60.126609802246094, "logps/rejected": -105.94583129882812, "loss": 0.6863, "rewards/accuracies": 0.75, "rewards/chosen": 0.032499201595783234, "rewards/margins": 0.0038094576448202133, "rewards/rejected": 0.02868974395096302, "step": 354 }, { "epoch": 0.21643042219173905, "grad_norm": 86.77175274631634, "learning_rate": 1.5190243902439024e-08, "logits/chosen": 0.3397376239299774, "logits/rejected": 0.49842071533203125, "logps/chosen": -145.3336944580078, "logps/rejected": -133.59646606445312, "loss": 0.6798, "rewards/accuracies": 1.0, "rewards/chosen": 0.05712239816784859, "rewards/margins": 0.03735244274139404, "rewards/rejected": 0.019769955426454544, "step": 355 }, { "epoch": 0.21704008535284255, "grad_norm": 70.30705893545047, "learning_rate": 1.5234146341463412e-08, "logits/chosen": 0.3698701858520508, "logits/rejected": 0.37715011835098267, "logps/chosen": -201.96597290039062, "logps/rejected": -180.94497680664062, "loss": 0.676, "rewards/accuracies": 1.0, "rewards/chosen": 0.129371777176857, "rewards/margins": 0.12676672637462616, "rewards/rejected": 0.002605058252811432, "step": 356 }, { "epoch": 0.21764974851394606, "grad_norm": 72.03748108737564, "learning_rate": 1.5278048780487804e-08, "logits/chosen": -0.01042770966887474, "logits/rejected": -0.040296003222465515, "logps/chosen": -85.13925170898438, "logps/rejected": -87.65917205810547, "loss": 0.6762, "rewards/accuracies": 0.25, "rewards/chosen": 0.018222618848085403, "rewards/margins": 0.006572199985384941, "rewards/rejected": 0.011650418862700462, "step": 357 }, { "epoch": 0.21825941167504953, "grad_norm": 71.02458108213027, "learning_rate": 1.5321951219512196e-08, "logits/chosen": 0.37640199065208435, "logits/rejected": 0.3643328547477722, "logps/chosen": -110.40522003173828, "logps/rejected": -125.43128204345703, "loss": 0.6783, "rewards/accuracies": 1.0, "rewards/chosen": 0.04483679682016373, "rewards/margins": 0.037366390228271484, "rewards/rejected": 0.0074704051949083805, "step": 358 }, { "epoch": 0.21886907483615303, "grad_norm": 72.27060883103734, "learning_rate": 1.5365853658536584e-08, "logits/chosen": -0.029051396995782852, "logits/rejected": 0.003680701367557049, "logps/chosen": -196.9068603515625, "logps/rejected": -92.90815734863281, "loss": 0.6823, "rewards/accuracies": 0.75, "rewards/chosen": 0.07214992493391037, "rewards/margins": 0.01639285311102867, "rewards/rejected": 0.0557570680975914, "step": 359 }, { "epoch": 0.2194787379972565, "grad_norm": 75.53555099638058, "learning_rate": 1.5409756097560976e-08, "logits/chosen": 0.1217944324016571, "logits/rejected": 0.14078305661678314, "logps/chosen": -194.9487762451172, "logps/rejected": -117.2947769165039, "loss": 0.6762, "rewards/accuracies": 0.75, "rewards/chosen": 0.06772525608539581, "rewards/margins": 0.05223900079727173, "rewards/rejected": 0.015486260876059532, "step": 360 }, { "epoch": 0.22008840115836, "grad_norm": 71.75572742756489, "learning_rate": 1.5453658536585364e-08, "logits/chosen": 0.26062798500061035, "logits/rejected": 0.041195519268512726, "logps/chosen": -79.91947937011719, "logps/rejected": -109.20831298828125, "loss": 0.6946, "rewards/accuracies": 0.75, "rewards/chosen": 0.061104319989681244, "rewards/margins": 0.05685047060251236, "rewards/rejected": 0.004253853112459183, "step": 361 }, { "epoch": 0.22069806431946348, "grad_norm": 62.98288199759064, "learning_rate": 1.5497560975609756e-08, "logits/chosen": 0.16532793641090393, "logits/rejected": 0.15597744286060333, "logps/chosen": -42.087825775146484, "logps/rejected": -80.29241180419922, "loss": 0.6775, "rewards/accuracies": 0.5, "rewards/chosen": 0.029199182987213135, "rewards/margins": 0.02532096952199936, "rewards/rejected": 0.0038782134652137756, "step": 362 }, { "epoch": 0.22130772748056698, "grad_norm": 75.18737218514192, "learning_rate": 1.5541463414634144e-08, "logits/chosen": 0.09273873269557953, "logits/rejected": 0.1297970712184906, "logps/chosen": -81.83040618896484, "logps/rejected": -37.91920852661133, "loss": 0.6743, "rewards/accuracies": 0.5, "rewards/chosen": 0.03654135391116142, "rewards/margins": 0.022258350625634193, "rewards/rejected": 0.014283007010817528, "step": 363 }, { "epoch": 0.2219173906416705, "grad_norm": 76.11977534284549, "learning_rate": 1.5585365853658536e-08, "logits/chosen": 0.16321827471256256, "logits/rejected": 0.343872994184494, "logps/chosen": -384.1506042480469, "logps/rejected": -154.103515625, "loss": 0.664, "rewards/accuracies": 0.75, "rewards/chosen": 0.16939716041088104, "rewards/margins": 0.1677771955728531, "rewards/rejected": 0.001619958784431219, "step": 364 }, { "epoch": 0.22252705380277396, "grad_norm": 71.43880381699847, "learning_rate": 1.5629268292682927e-08, "logits/chosen": 0.12354715168476105, "logits/rejected": 0.18932867050170898, "logps/chosen": -381.6833801269531, "logps/rejected": -319.3189392089844, "loss": 0.665, "rewards/accuracies": 1.0, "rewards/chosen": 0.15169011056423187, "rewards/margins": 0.1057649627327919, "rewards/rejected": 0.045925140380859375, "step": 365 }, { "epoch": 0.22313671696387746, "grad_norm": 70.59839888869874, "learning_rate": 1.5673170731707316e-08, "logits/chosen": 0.3457193374633789, "logits/rejected": 0.35894274711608887, "logps/chosen": -77.30677032470703, "logps/rejected": -94.93476104736328, "loss": 0.6809, "rewards/accuracies": 0.25, "rewards/chosen": 0.052020955830812454, "rewards/margins": 0.010834289714694023, "rewards/rejected": 0.04118666425347328, "step": 366 }, { "epoch": 0.22374638012498094, "grad_norm": 66.61954489334738, "learning_rate": 1.5717073170731707e-08, "logits/chosen": -0.16386066377162933, "logits/rejected": 0.1188196912407875, "logps/chosen": -337.9201354980469, "logps/rejected": -212.218017578125, "loss": 0.6802, "rewards/accuracies": 0.75, "rewards/chosen": 0.0663379654288292, "rewards/margins": 0.06414184719324112, "rewards/rejected": 0.002196121495217085, "step": 367 }, { "epoch": 0.22435604328608444, "grad_norm": 66.95436799361724, "learning_rate": 1.5760975609756096e-08, "logits/chosen": 0.2824529707431793, "logits/rejected": -0.034459665417671204, "logps/chosen": -44.80812072753906, "logps/rejected": -98.14501190185547, "loss": 0.6842, "rewards/accuracies": 0.25, "rewards/chosen": -0.013245273381471634, "rewards/margins": -0.02863171324133873, "rewards/rejected": 0.015386438928544521, "step": 368 }, { "epoch": 0.22496570644718794, "grad_norm": 73.92873087284248, "learning_rate": 1.5804878048780488e-08, "logits/chosen": 0.2049051970243454, "logits/rejected": 0.2036842703819275, "logps/chosen": -23.559926986694336, "logps/rejected": -19.191795349121094, "loss": 0.6805, "rewards/accuracies": 0.75, "rewards/chosen": -0.00024777118233032525, "rewards/margins": 0.0016170772723853588, "rewards/rejected": -0.001864847494289279, "step": 369 }, { "epoch": 0.22557536960829142, "grad_norm": 76.3516607294514, "learning_rate": 1.5848780487804876e-08, "logits/chosen": -0.022610221058130264, "logits/rejected": 0.06078797206282616, "logps/chosen": -238.5109100341797, "logps/rejected": -150.67947387695312, "loss": 0.6752, "rewards/accuracies": 1.0, "rewards/chosen": 0.06348590552806854, "rewards/margins": 0.072443388402462, "rewards/rejected": -0.00895748008042574, "step": 370 }, { "epoch": 0.22618503276939492, "grad_norm": 71.07269983204453, "learning_rate": 1.5892682926829268e-08, "logits/chosen": -0.17673306167125702, "logits/rejected": -0.04492383077740669, "logps/chosen": -212.25999450683594, "logps/rejected": -166.0389862060547, "loss": 0.677, "rewards/accuracies": 0.75, "rewards/chosen": 0.0878465548157692, "rewards/margins": 0.058984603732824326, "rewards/rejected": 0.028861945495009422, "step": 371 }, { "epoch": 0.2267946959304984, "grad_norm": 62.117189746721714, "learning_rate": 1.593658536585366e-08, "logits/chosen": 0.2229243814945221, "logits/rejected": 0.19021424651145935, "logps/chosen": -104.9860610961914, "logps/rejected": -148.7133026123047, "loss": 0.6891, "rewards/accuracies": 0.75, "rewards/chosen": 0.06309252977371216, "rewards/margins": 0.04245776683092117, "rewards/rejected": 0.020634770393371582, "step": 372 }, { "epoch": 0.2274043590916019, "grad_norm": 71.22364328481646, "learning_rate": 1.5980487804878048e-08, "logits/chosen": 0.1851186901330948, "logits/rejected": 0.1108449250459671, "logps/chosen": -229.87127685546875, "logps/rejected": -267.2801818847656, "loss": 0.6803, "rewards/accuracies": 0.75, "rewards/chosen": 0.12824177742004395, "rewards/margins": 0.07881484925746918, "rewards/rejected": 0.049426935613155365, "step": 373 }, { "epoch": 0.22801402225270537, "grad_norm": 64.10057772002011, "learning_rate": 1.602439024390244e-08, "logits/chosen": 0.41635462641716003, "logits/rejected": 0.4206591546535492, "logps/chosen": -9.9085054397583, "logps/rejected": -3.9580512046813965, "loss": 0.6759, "rewards/accuracies": 0.75, "rewards/chosen": -0.008948644623160362, "rewards/margins": 0.01165156438946724, "rewards/rejected": -0.02060021087527275, "step": 374 }, { "epoch": 0.22862368541380887, "grad_norm": 67.90557059778888, "learning_rate": 1.6068292682926828e-08, "logits/chosen": 0.10314898192882538, "logits/rejected": 0.0605747364461422, "logps/chosen": -203.1617889404297, "logps/rejected": -161.07518005371094, "loss": 0.678, "rewards/accuracies": 0.25, "rewards/chosen": 0.1218641996383667, "rewards/margins": 0.07133705914020538, "rewards/rejected": 0.050527140498161316, "step": 375 }, { "epoch": 0.22923334857491237, "grad_norm": 76.57097598943874, "learning_rate": 1.611219512195122e-08, "logits/chosen": -0.10780695080757141, "logits/rejected": 0.17631042003631592, "logps/chosen": -318.8275451660156, "logps/rejected": -158.26197814941406, "loss": 0.6662, "rewards/accuracies": 0.5, "rewards/chosen": 0.0654771700501442, "rewards/margins": 0.05782622471451759, "rewards/rejected": 0.0076509444043040276, "step": 376 }, { "epoch": 0.22984301173601585, "grad_norm": 64.66974887707583, "learning_rate": 1.6156097560975608e-08, "logits/chosen": 0.2233721911907196, "logits/rejected": 0.25299790501594543, "logps/chosen": -89.06605529785156, "logps/rejected": -87.20704650878906, "loss": 0.6798, "rewards/accuracies": 0.75, "rewards/chosen": 0.043851472437381744, "rewards/margins": 0.0646313801407814, "rewards/rejected": -0.020779911428689957, "step": 377 }, { "epoch": 0.23045267489711935, "grad_norm": 68.70294004326945, "learning_rate": 1.62e-08, "logits/chosen": 0.015518620610237122, "logits/rejected": 0.04770684242248535, "logps/chosen": -122.88348388671875, "logps/rejected": -90.29814910888672, "loss": 0.6755, "rewards/accuracies": 0.25, "rewards/chosen": 0.042027123272418976, "rewards/margins": -0.0038461170624941587, "rewards/rejected": 0.045873235911130905, "step": 378 }, { "epoch": 0.23106233805822282, "grad_norm": 69.59769440231553, "learning_rate": 1.624390243902439e-08, "logits/chosen": 0.13944903016090393, "logits/rejected": 0.08162405341863632, "logps/chosen": -22.848159790039062, "logps/rejected": -26.46254539489746, "loss": 0.6773, "rewards/accuracies": 0.75, "rewards/chosen": 0.0193781740963459, "rewards/margins": 0.031845081597566605, "rewards/rejected": -0.012466907501220703, "step": 379 }, { "epoch": 0.23167200121932632, "grad_norm": 86.15023573147026, "learning_rate": 1.628780487804878e-08, "logits/chosen": 0.01964738965034485, "logits/rejected": 0.08549747616052628, "logps/chosen": -315.42120361328125, "logps/rejected": -250.3840789794922, "loss": 0.6834, "rewards/accuracies": 0.5, "rewards/chosen": 0.1753004491329193, "rewards/margins": 0.042230166494846344, "rewards/rejected": 0.13307029008865356, "step": 380 }, { "epoch": 0.23228166438042983, "grad_norm": 73.90575796165558, "learning_rate": 1.633170731707317e-08, "logits/chosen": 0.2518499493598938, "logits/rejected": 0.20044390857219696, "logps/chosen": -282.0683898925781, "logps/rejected": -402.50018310546875, "loss": 0.6604, "rewards/accuracies": 1.0, "rewards/chosen": 0.2078959345817566, "rewards/margins": 0.07857249677181244, "rewards/rejected": 0.12932342290878296, "step": 381 }, { "epoch": 0.2328913275415333, "grad_norm": 65.02101991761026, "learning_rate": 1.637560975609756e-08, "logits/chosen": 0.12615638971328735, "logits/rejected": -0.0017370134592056274, "logps/chosen": -144.21067810058594, "logps/rejected": -175.3563690185547, "loss": 0.6883, "rewards/accuracies": 0.5, "rewards/chosen": 0.021349143236875534, "rewards/margins": -0.027446461841464043, "rewards/rejected": 0.048795606940984726, "step": 382 }, { "epoch": 0.2335009907026368, "grad_norm": 84.86151834266377, "learning_rate": 1.641951219512195e-08, "logits/chosen": 0.08340620249509811, "logits/rejected": 0.09593676775693893, "logps/chosen": -173.072021484375, "logps/rejected": -201.46633911132812, "loss": 0.6543, "rewards/accuracies": 1.0, "rewards/chosen": 0.15591296553611755, "rewards/margins": 0.13049039244651794, "rewards/rejected": 0.025422584265470505, "step": 383 }, { "epoch": 0.23411065386374028, "grad_norm": 66.10244136306407, "learning_rate": 1.646341463414634e-08, "logits/chosen": -0.07609017938375473, "logits/rejected": -0.06766972690820694, "logps/chosen": -85.554931640625, "logps/rejected": -83.42675018310547, "loss": 0.6744, "rewards/accuracies": 0.5, "rewards/chosen": 0.0532434917986393, "rewards/margins": 0.0021083245519548655, "rewards/rejected": 0.05113516375422478, "step": 384 }, { "epoch": 0.23472031702484378, "grad_norm": 67.2842157885265, "learning_rate": 1.650731707317073e-08, "logits/chosen": 0.044176261872053146, "logits/rejected": 0.2498132288455963, "logps/chosen": -93.33272552490234, "logps/rejected": -56.10679244995117, "loss": 0.6712, "rewards/accuracies": 0.5, "rewards/chosen": 0.06030142307281494, "rewards/margins": 0.03469962999224663, "rewards/rejected": 0.025601793080568314, "step": 385 }, { "epoch": 0.23532998018594725, "grad_norm": 62.52062745782673, "learning_rate": 1.6551219512195123e-08, "logits/chosen": 0.17597870528697968, "logits/rejected": 0.23001301288604736, "logps/chosen": -23.035144805908203, "logps/rejected": -38.43086242675781, "loss": 0.6969, "rewards/accuracies": 0.0, "rewards/chosen": -0.021736489608883858, "rewards/margins": -0.03424207493662834, "rewards/rejected": 0.01250558439642191, "step": 386 }, { "epoch": 0.23593964334705075, "grad_norm": 75.00696126073979, "learning_rate": 1.659512195121951e-08, "logits/chosen": 0.054178833961486816, "logits/rejected": 0.8317170739173889, "logps/chosen": -294.10821533203125, "logps/rejected": -207.20391845703125, "loss": 0.6686, "rewards/accuracies": 0.75, "rewards/chosen": 0.02296428754925728, "rewards/margins": 0.02375636249780655, "rewards/rejected": -0.0007920744828879833, "step": 387 }, { "epoch": 0.23654930650815426, "grad_norm": 69.94571772040307, "learning_rate": 1.6639024390243903e-08, "logits/chosen": 0.13065457344055176, "logits/rejected": 0.1328984498977661, "logps/chosen": -162.05563354492188, "logps/rejected": -255.76258850097656, "loss": 0.6675, "rewards/accuracies": 0.75, "rewards/chosen": 0.08216194808483124, "rewards/margins": 0.043298520147800446, "rewards/rejected": 0.038863420486450195, "step": 388 }, { "epoch": 0.23715896966925773, "grad_norm": 69.62954136859975, "learning_rate": 1.668292682926829e-08, "logits/chosen": 0.2529556155204773, "logits/rejected": 0.1152164563536644, "logps/chosen": -268.89447021484375, "logps/rejected": -566.7929077148438, "loss": 0.6693, "rewards/accuracies": 0.25, "rewards/chosen": 0.11476411670446396, "rewards/margins": 0.05900774151086807, "rewards/rejected": 0.05575638264417648, "step": 389 }, { "epoch": 0.23776863283036123, "grad_norm": 70.59570631057309, "learning_rate": 1.6726829268292683e-08, "logits/chosen": 0.21214380860328674, "logits/rejected": 0.23615749180316925, "logps/chosen": -27.17583465576172, "logps/rejected": -41.58104705810547, "loss": 0.6761, "rewards/accuracies": 0.25, "rewards/chosen": 0.024480272084474564, "rewards/margins": -0.029336892068386078, "rewards/rejected": 0.05381716042757034, "step": 390 }, { "epoch": 0.2383782959914647, "grad_norm": 62.53230766531854, "learning_rate": 1.677073170731707e-08, "logits/chosen": 0.04014543443918228, "logits/rejected": 0.1318899691104889, "logps/chosen": -296.1917724609375, "logps/rejected": -224.27108764648438, "loss": 0.6781, "rewards/accuracies": 1.0, "rewards/chosen": 0.14381027221679688, "rewards/margins": 0.09685342758893967, "rewards/rejected": 0.046956852078437805, "step": 391 }, { "epoch": 0.2389879591525682, "grad_norm": 66.55683699515184, "learning_rate": 1.6814634146341463e-08, "logits/chosen": 0.32010552287101746, "logits/rejected": 0.2512872517108917, "logps/chosen": -69.6078872680664, "logps/rejected": -126.6366195678711, "loss": 0.663, "rewards/accuracies": 0.75, "rewards/chosen": 0.012041234411299229, "rewards/margins": 0.0030512893572449684, "rewards/rejected": 0.008989945985376835, "step": 392 }, { "epoch": 0.2395976223136717, "grad_norm": 69.45333612941845, "learning_rate": 1.6858536585365855e-08, "logits/chosen": -0.005228646099567413, "logits/rejected": 0.503351092338562, "logps/chosen": -258.2673645019531, "logps/rejected": -256.77728271484375, "loss": 0.6708, "rewards/accuracies": 1.0, "rewards/chosen": 0.11970634758472443, "rewards/margins": 0.04186611622571945, "rewards/rejected": 0.07784023135900497, "step": 393 }, { "epoch": 0.24020728547477518, "grad_norm": 56.83502852210459, "learning_rate": 1.6902439024390243e-08, "logits/chosen": -0.14765405654907227, "logits/rejected": -0.25918394327163696, "logps/chosen": -96.93010711669922, "logps/rejected": -176.19113159179688, "loss": 0.6733, "rewards/accuracies": 1.0, "rewards/chosen": 0.04271545261144638, "rewards/margins": 0.043752267956733704, "rewards/rejected": -0.0010368103394284844, "step": 394 }, { "epoch": 0.24081694863587869, "grad_norm": 77.91111017133723, "learning_rate": 1.6946341463414632e-08, "logits/chosen": 0.30944663286209106, "logits/rejected": 0.21470560133457184, "logps/chosen": -419.9107666015625, "logps/rejected": -390.0615234375, "loss": 0.6809, "rewards/accuracies": 0.5, "rewards/chosen": 0.09270159155130386, "rewards/margins": 0.015946976840496063, "rewards/rejected": 0.0767546147108078, "step": 395 }, { "epoch": 0.24142661179698216, "grad_norm": 68.24931704160569, "learning_rate": 1.6990243902439023e-08, "logits/chosen": 0.1575380265712738, "logits/rejected": 0.1915324479341507, "logps/chosen": -26.391529083251953, "logps/rejected": -25.220199584960938, "loss": 0.6854, "rewards/accuracies": 0.5, "rewards/chosen": -0.016043771058321, "rewards/margins": 0.008305154740810394, "rewards/rejected": -0.024348925799131393, "step": 396 }, { "epoch": 0.24203627495808566, "grad_norm": 58.7737804792128, "learning_rate": 1.7034146341463412e-08, "logits/chosen": 0.28481295704841614, "logits/rejected": 0.21050994098186493, "logps/chosen": -72.94969940185547, "logps/rejected": -151.31069946289062, "loss": 0.6659, "rewards/accuracies": 1.0, "rewards/chosen": 0.04740132391452789, "rewards/margins": 0.06847492605447769, "rewards/rejected": -0.021073605865240097, "step": 397 }, { "epoch": 0.24264593811918914, "grad_norm": 75.02976906364937, "learning_rate": 1.7078048780487803e-08, "logits/chosen": 0.16953998804092407, "logits/rejected": 0.38476526737213135, "logps/chosen": -152.58050537109375, "logps/rejected": -50.43113708496094, "loss": 0.6726, "rewards/accuracies": 0.75, "rewards/chosen": 0.05665000155568123, "rewards/margins": 0.06928949803113937, "rewards/rejected": -0.012639498338103294, "step": 398 }, { "epoch": 0.24325560128029264, "grad_norm": 75.03590741770022, "learning_rate": 1.7121951219512192e-08, "logits/chosen": 0.15704068541526794, "logits/rejected": 0.20390085875988007, "logps/chosen": -290.34130859375, "logps/rejected": -163.56857299804688, "loss": 0.6644, "rewards/accuracies": 0.75, "rewards/chosen": 0.11936527490615845, "rewards/margins": 0.08607812225818634, "rewards/rejected": 0.03328714519739151, "step": 399 }, { "epoch": 0.24386526444139614, "grad_norm": 63.37300742176534, "learning_rate": 1.7165853658536583e-08, "logits/chosen": 0.04996849596500397, "logits/rejected": 0.44465917348861694, "logps/chosen": -398.497802734375, "logps/rejected": -296.1901550292969, "loss": 0.6832, "rewards/accuracies": 0.75, "rewards/chosen": 0.18099498748779297, "rewards/margins": 0.09929068386554718, "rewards/rejected": 0.08170431107282639, "step": 400 }, { "epoch": 0.24447492760249961, "grad_norm": 66.90625226276222, "learning_rate": 1.7209756097560975e-08, "logits/chosen": 0.09004481881856918, "logits/rejected": 0.13244619965553284, "logps/chosen": -120.15802001953125, "logps/rejected": -104.40956115722656, "loss": 0.6586, "rewards/accuracies": 0.75, "rewards/chosen": 0.13799571990966797, "rewards/margins": 0.06407812237739563, "rewards/rejected": 0.07391760498285294, "step": 401 }, { "epoch": 0.24508459076360312, "grad_norm": 57.48359214891604, "learning_rate": 1.7253658536585364e-08, "logits/chosen": 0.0856451690196991, "logits/rejected": 0.1392892301082611, "logps/chosen": -9.377010345458984, "logps/rejected": -14.292369842529297, "loss": 0.6821, "rewards/accuracies": 0.5, "rewards/chosen": 0.0027606389485299587, "rewards/margins": -0.01972278766334057, "rewards/rejected": 0.022483427077531815, "step": 402 }, { "epoch": 0.2456942539247066, "grad_norm": 66.60343709678001, "learning_rate": 1.7297560975609755e-08, "logits/chosen": 0.3190247714519501, "logits/rejected": 0.0962214469909668, "logps/chosen": -95.74803161621094, "logps/rejected": -177.26490783691406, "loss": 0.6713, "rewards/accuracies": 0.25, "rewards/chosen": -0.011794280260801315, "rewards/margins": -0.022473935037851334, "rewards/rejected": 0.010679653845727444, "step": 403 }, { "epoch": 0.2463039170858101, "grad_norm": 69.51953007957174, "learning_rate": 1.7341463414634144e-08, "logits/chosen": 0.2589119076728821, "logits/rejected": 0.4490640461444855, "logps/chosen": -448.5097351074219, "logps/rejected": -340.63507080078125, "loss": 0.6583, "rewards/accuracies": 1.0, "rewards/chosen": 0.17544841766357422, "rewards/margins": 0.13995066285133362, "rewards/rejected": 0.0354977622628212, "step": 404 }, { "epoch": 0.24691358024691357, "grad_norm": 83.31258597050335, "learning_rate": 1.7385365853658535e-08, "logits/chosen": -0.07003955543041229, "logits/rejected": 0.3138444721698761, "logps/chosen": -391.0108642578125, "logps/rejected": -156.393310546875, "loss": 0.6746, "rewards/accuracies": 1.0, "rewards/chosen": 0.1309162825345993, "rewards/margins": 0.12338415533304214, "rewards/rejected": 0.007532132789492607, "step": 405 }, { "epoch": 0.24752324340801707, "grad_norm": 66.03022550267046, "learning_rate": 1.7429268292682927e-08, "logits/chosen": 0.07335321605205536, "logits/rejected": 0.08494667708873749, "logps/chosen": -99.77315521240234, "logps/rejected": -66.1137466430664, "loss": 0.6663, "rewards/accuracies": 1.0, "rewards/chosen": 0.046557843685150146, "rewards/margins": 0.09241703897714615, "rewards/rejected": -0.045859195291996, "step": 406 }, { "epoch": 0.24813290656912057, "grad_norm": 74.775088673753, "learning_rate": 1.7473170731707315e-08, "logits/chosen": 0.24731683731079102, "logits/rejected": 0.03129180520772934, "logps/chosen": -211.49945068359375, "logps/rejected": -364.1927490234375, "loss": 0.6771, "rewards/accuracies": 1.0, "rewards/chosen": 0.1337459683418274, "rewards/margins": 0.1884436011314392, "rewards/rejected": -0.05469761788845062, "step": 407 }, { "epoch": 0.24874256973022404, "grad_norm": 63.2436985259422, "learning_rate": 1.7517073170731707e-08, "logits/chosen": 0.2944685220718384, "logits/rejected": 0.16938583552837372, "logps/chosen": -60.469932556152344, "logps/rejected": -226.4971466064453, "loss": 0.6697, "rewards/accuracies": 0.25, "rewards/chosen": 0.01250808872282505, "rewards/margins": -0.03638029098510742, "rewards/rejected": 0.048888374119997025, "step": 408 }, { "epoch": 0.24935223289132755, "grad_norm": 64.69668791721564, "learning_rate": 1.7560975609756095e-08, "logits/chosen": 0.19941283762454987, "logits/rejected": 0.2230873554944992, "logps/chosen": -187.40347290039062, "logps/rejected": -127.78536224365234, "loss": 0.6603, "rewards/accuracies": 0.75, "rewards/chosen": 0.035814762115478516, "rewards/margins": 0.028313683345913887, "rewards/rejected": 0.007501076906919479, "step": 409 }, { "epoch": 0.24996189605243102, "grad_norm": 64.72994759177746, "learning_rate": 1.7604878048780487e-08, "logits/chosen": 0.09028814733028412, "logits/rejected": 0.12085546553134918, "logps/chosen": -79.18751525878906, "logps/rejected": -212.34779357910156, "loss": 0.6641, "rewards/accuracies": 1.0, "rewards/chosen": 0.02626047283411026, "rewards/margins": 0.050155192613601685, "rewards/rejected": -0.023894723504781723, "step": 410 }, { "epoch": 0.2505715592135345, "grad_norm": 80.73701959991998, "learning_rate": 1.7648780487804875e-08, "logits/chosen": 0.21164226531982422, "logits/rejected": 0.12578725814819336, "logps/chosen": -24.69331932067871, "logps/rejected": -62.57676696777344, "loss": 0.6619, "rewards/accuracies": 0.5, "rewards/chosen": 0.017589306458830833, "rewards/margins": 0.0014315862208604813, "rewards/rejected": 0.0161577221006155, "step": 411 }, { "epoch": 0.251181222374638, "grad_norm": 66.00444890188646, "learning_rate": 1.7692682926829267e-08, "logits/chosen": 0.07404673099517822, "logits/rejected": -0.03831756114959717, "logps/chosen": -107.595947265625, "logps/rejected": -138.1403350830078, "loss": 0.6817, "rewards/accuracies": 0.75, "rewards/chosen": 0.07385444641113281, "rewards/margins": 0.013393595814704895, "rewards/rejected": 0.060460854321718216, "step": 412 }, { "epoch": 0.2517908855357415, "grad_norm": 64.14888621824818, "learning_rate": 1.773658536585366e-08, "logits/chosen": -0.05230114609003067, "logits/rejected": 0.33488669991493225, "logps/chosen": -127.23094177246094, "logps/rejected": -57.00074005126953, "loss": 0.673, "rewards/accuracies": 0.75, "rewards/chosen": 0.038622189313173294, "rewards/margins": 0.03555377945303917, "rewards/rejected": 0.0030684114899486303, "step": 413 }, { "epoch": 0.252400548696845, "grad_norm": 64.3480813371628, "learning_rate": 1.7780487804878047e-08, "logits/chosen": -0.13973070681095123, "logits/rejected": 0.19591814279556274, "logps/chosen": -174.74899291992188, "logps/rejected": -186.95013427734375, "loss": 0.6711, "rewards/accuracies": 0.75, "rewards/chosen": 0.031192690134048462, "rewards/margins": -0.034893058240413666, "rewards/rejected": 0.06608574837446213, "step": 414 }, { "epoch": 0.2530102118579485, "grad_norm": 73.86086587703944, "learning_rate": 1.782439024390244e-08, "logits/chosen": -0.09877456724643707, "logits/rejected": 0.1697251796722412, "logps/chosen": -358.82391357421875, "logps/rejected": -161.81329345703125, "loss": 0.6589, "rewards/accuracies": 0.75, "rewards/chosen": 0.1746138632297516, "rewards/margins": 0.12531642615795135, "rewards/rejected": 0.049297429621219635, "step": 415 }, { "epoch": 0.253619875019052, "grad_norm": 61.51659876037952, "learning_rate": 1.7868292682926827e-08, "logits/chosen": -0.07529552280902863, "logits/rejected": 0.021304693073034286, "logps/chosen": -74.10441589355469, "logps/rejected": -82.88666534423828, "loss": 0.6735, "rewards/accuracies": 0.5, "rewards/chosen": 0.015579151920974255, "rewards/margins": -0.00527801550924778, "rewards/rejected": 0.02085716649889946, "step": 416 }, { "epoch": 0.2542295381801555, "grad_norm": 75.4546435785866, "learning_rate": 1.791219512195122e-08, "logits/chosen": -0.0038693025708198547, "logits/rejected": 0.17084497213363647, "logps/chosen": -217.5841064453125, "logps/rejected": -302.18634033203125, "loss": 0.6868, "rewards/accuracies": 0.0, "rewards/chosen": 0.09169816970825195, "rewards/margins": -0.04251394420862198, "rewards/rejected": 0.13421212136745453, "step": 417 }, { "epoch": 0.254839201341259, "grad_norm": 66.82138942843116, "learning_rate": 1.7956097560975607e-08, "logits/chosen": 0.3456932306289673, "logits/rejected": 0.2908247113227844, "logps/chosen": -112.54121398925781, "logps/rejected": -72.51783752441406, "loss": 0.6716, "rewards/accuracies": 0.75, "rewards/chosen": 0.14502708613872528, "rewards/margins": 0.122799813747406, "rewards/rejected": 0.022227276116609573, "step": 418 }, { "epoch": 0.2554488645023624, "grad_norm": 72.92019462743787, "learning_rate": 1.8e-08, "logits/chosen": 0.12400247156620026, "logits/rejected": 0.3096410930156708, "logps/chosen": -290.733642578125, "logps/rejected": -235.09213256835938, "loss": 0.6684, "rewards/accuracies": 0.5, "rewards/chosen": 0.10874108970165253, "rewards/margins": 0.025292323902249336, "rewards/rejected": 0.08344876766204834, "step": 419 }, { "epoch": 0.25605852766346593, "grad_norm": 70.71468075252082, "learning_rate": 1.804390243902439e-08, "logits/chosen": -0.012874945998191833, "logits/rejected": -0.034830302000045776, "logps/chosen": -273.19000244140625, "logps/rejected": -224.76705932617188, "loss": 0.6732, "rewards/accuracies": 0.5, "rewards/chosen": 0.2105557769536972, "rewards/margins": 0.23022204637527466, "rewards/rejected": -0.019666265696287155, "step": 420 }, { "epoch": 0.25666819082456943, "grad_norm": 74.03103411357598, "learning_rate": 1.808780487804878e-08, "logits/chosen": 0.25390613079071045, "logits/rejected": 0.24167406558990479, "logps/chosen": -94.19416046142578, "logps/rejected": -97.90225982666016, "loss": 0.6549, "rewards/accuracies": 0.75, "rewards/chosen": 0.032872818410396576, "rewards/margins": 0.006419522687792778, "rewards/rejected": 0.026453299447894096, "step": 421 }, { "epoch": 0.25727785398567293, "grad_norm": 70.28787189866513, "learning_rate": 1.813170731707317e-08, "logits/chosen": -0.1942972093820572, "logits/rejected": -0.15210787951946259, "logps/chosen": -132.333984375, "logps/rejected": -113.71392059326172, "loss": 0.6773, "rewards/accuracies": 0.75, "rewards/chosen": 0.02886814996600151, "rewards/margins": 0.03950446844100952, "rewards/rejected": -0.010636317543685436, "step": 422 }, { "epoch": 0.2578875171467764, "grad_norm": 79.69632773166165, "learning_rate": 1.817560975609756e-08, "logits/chosen": 0.2247251272201538, "logits/rejected": 0.31595107913017273, "logps/chosen": -108.94129943847656, "logps/rejected": -365.61224365234375, "loss": 0.6546, "rewards/accuracies": 0.75, "rewards/chosen": 0.038946785032749176, "rewards/margins": 0.04372507333755493, "rewards/rejected": -0.004778290167450905, "step": 423 }, { "epoch": 0.2584971803078799, "grad_norm": 63.99624156445586, "learning_rate": 1.821951219512195e-08, "logits/chosen": 0.11942100524902344, "logits/rejected": 0.06516227126121521, "logps/chosen": -32.335792541503906, "logps/rejected": -79.2452392578125, "loss": 0.6578, "rewards/accuracies": 0.25, "rewards/chosen": 0.007939141243696213, "rewards/margins": -0.01192130520939827, "rewards/rejected": 0.019860446453094482, "step": 424 }, { "epoch": 0.2591068434689834, "grad_norm": 69.47093923455672, "learning_rate": 1.826341463414634e-08, "logits/chosen": 0.06378969550132751, "logits/rejected": 0.34526288509368896, "logps/chosen": -209.16851806640625, "logps/rejected": -113.60540771484375, "loss": 0.6595, "rewards/accuracies": 0.75, "rewards/chosen": 0.12819691002368927, "rewards/margins": 0.13993854820728302, "rewards/rejected": -0.01174163818359375, "step": 425 }, { "epoch": 0.2597165066300869, "grad_norm": 68.55476273419893, "learning_rate": 1.830731707317073e-08, "logits/chosen": 0.14273546636104584, "logits/rejected": 0.4567739963531494, "logps/chosen": -133.89926147460938, "logps/rejected": -53.5922966003418, "loss": 0.6645, "rewards/accuracies": 0.75, "rewards/chosen": 0.04424731805920601, "rewards/margins": 0.09210419654846191, "rewards/rejected": -0.047856878489255905, "step": 426 }, { "epoch": 0.2603261697911904, "grad_norm": 69.5881777755876, "learning_rate": 1.8351219512195123e-08, "logits/chosen": -0.07335227727890015, "logits/rejected": -0.1167667955160141, "logps/chosen": -277.70880126953125, "logps/rejected": -338.48626708984375, "loss": 0.67, "rewards/accuracies": 0.75, "rewards/chosen": 0.17319995164871216, "rewards/margins": 0.042127128690481186, "rewards/rejected": 0.13107281923294067, "step": 427 }, { "epoch": 0.26093583295229383, "grad_norm": 67.29761472890893, "learning_rate": 1.839512195121951e-08, "logits/chosen": 0.4153785705566406, "logits/rejected": 0.3026084899902344, "logps/chosen": -182.0535888671875, "logps/rejected": -243.2552490234375, "loss": 0.6388, "rewards/accuracies": 0.5, "rewards/chosen": 0.13951362669467926, "rewards/margins": 0.31202802062034607, "rewards/rejected": -0.17251437902450562, "step": 428 }, { "epoch": 0.26154549611339734, "grad_norm": 64.93182010188912, "learning_rate": 1.8439024390243903e-08, "logits/chosen": -0.04005971923470497, "logits/rejected": -0.051459453999996185, "logps/chosen": -34.133052825927734, "logps/rejected": -48.049068450927734, "loss": 0.6726, "rewards/accuracies": 0.5, "rewards/chosen": 0.01578432321548462, "rewards/margins": -0.005022215656936169, "rewards/rejected": 0.020806537941098213, "step": 429 }, { "epoch": 0.26215515927450084, "grad_norm": 66.63677062463813, "learning_rate": 1.848292682926829e-08, "logits/chosen": 0.14951828122138977, "logits/rejected": 0.12362108379602432, "logps/chosen": -304.57733154296875, "logps/rejected": -325.12017822265625, "loss": 0.6634, "rewards/accuracies": 1.0, "rewards/chosen": 0.29309120774269104, "rewards/margins": 0.20215153694152832, "rewards/rejected": 0.09093966335058212, "step": 430 }, { "epoch": 0.26276482243560434, "grad_norm": 68.94318829626472, "learning_rate": 1.8526829268292683e-08, "logits/chosen": 0.11069389432668686, "logits/rejected": 0.1824926733970642, "logps/chosen": -103.77944946289062, "logps/rejected": -68.4943618774414, "loss": 0.6689, "rewards/accuracies": 0.25, "rewards/chosen": 0.038526106625795364, "rewards/margins": -0.055515188723802567, "rewards/rejected": 0.09404130280017853, "step": 431 }, { "epoch": 0.26337448559670784, "grad_norm": 61.669867746373285, "learning_rate": 1.857073170731707e-08, "logits/chosen": 0.17164219915866852, "logits/rejected": 0.1094513013958931, "logps/chosen": -98.68592071533203, "logps/rejected": -111.42066955566406, "loss": 0.6673, "rewards/accuracies": 0.75, "rewards/chosen": 0.06978795677423477, "rewards/margins": 0.08880459517240524, "rewards/rejected": -0.019016636535525322, "step": 432 }, { "epoch": 0.2639841487578113, "grad_norm": 67.73293276351028, "learning_rate": 1.8614634146341463e-08, "logits/chosen": 0.06384044885635376, "logits/rejected": 0.18351881206035614, "logps/chosen": -55.75654983520508, "logps/rejected": -14.070140838623047, "loss": 0.6698, "rewards/accuracies": 0.5, "rewards/chosen": -0.031082313507795334, "rewards/margins": 0.011718548834323883, "rewards/rejected": -0.04280085861682892, "step": 433 }, { "epoch": 0.2645938119189148, "grad_norm": 74.82178146757374, "learning_rate": 1.8658536585365854e-08, "logits/chosen": 0.21451379358768463, "logits/rejected": 0.2656710743904114, "logps/chosen": -252.65444946289062, "logps/rejected": -249.56112670898438, "loss": 0.6592, "rewards/accuracies": 0.5, "rewards/chosen": 0.16140216588974, "rewards/margins": 0.07049685716629028, "rewards/rejected": 0.09090529382228851, "step": 434 }, { "epoch": 0.2652034750800183, "grad_norm": 68.59328455137637, "learning_rate": 1.8702439024390243e-08, "logits/chosen": 0.08416593074798584, "logits/rejected": -0.025433972477912903, "logps/chosen": -126.26637268066406, "logps/rejected": -389.1315002441406, "loss": 0.6566, "rewards/accuracies": 0.5, "rewards/chosen": 0.026880884543061256, "rewards/margins": 0.05392398685216904, "rewards/rejected": -0.02704310230910778, "step": 435 }, { "epoch": 0.2658131382411218, "grad_norm": 67.76205030806439, "learning_rate": 1.8746341463414635e-08, "logits/chosen": -0.12446662783622742, "logits/rejected": 0.14436742663383484, "logps/chosen": -296.7868957519531, "logps/rejected": -198.11489868164062, "loss": 0.6793, "rewards/accuracies": 0.75, "rewards/chosen": 0.07260856032371521, "rewards/margins": 0.05034847557544708, "rewards/rejected": 0.022260094061493874, "step": 436 }, { "epoch": 0.2664228014022253, "grad_norm": 56.44823636494468, "learning_rate": 1.8790243902439023e-08, "logits/chosen": 0.2524694502353668, "logits/rejected": 0.31131431460380554, "logps/chosen": -205.24276733398438, "logps/rejected": -92.745849609375, "loss": 0.6648, "rewards/accuracies": 1.0, "rewards/chosen": 0.15956419706344604, "rewards/margins": 0.17301282286643982, "rewards/rejected": -0.013448631390929222, "step": 437 }, { "epoch": 0.26703246456332874, "grad_norm": 94.67153857744482, "learning_rate": 1.8834146341463415e-08, "logits/chosen": 0.2888627350330353, "logits/rejected": 0.4149096608161926, "logps/chosen": -194.0454559326172, "logps/rejected": -111.11502838134766, "loss": 0.6869, "rewards/accuracies": 0.75, "rewards/chosen": 0.14418041706085205, "rewards/margins": 0.13503269851207733, "rewards/rejected": 0.009147727862000465, "step": 438 }, { "epoch": 0.26764212772443224, "grad_norm": 71.36579622673247, "learning_rate": 1.8878048780487806e-08, "logits/chosen": -0.11654005944728851, "logits/rejected": 0.27451732754707336, "logps/chosen": -329.767822265625, "logps/rejected": -143.21238708496094, "loss": 0.6615, "rewards/accuracies": 0.5, "rewards/chosen": 0.19159859418869019, "rewards/margins": 0.10668019950389862, "rewards/rejected": 0.08491840958595276, "step": 439 }, { "epoch": 0.26825179088553575, "grad_norm": 66.68799340256919, "learning_rate": 1.8921951219512195e-08, "logits/chosen": 0.08359631896018982, "logits/rejected": 0.05886491760611534, "logps/chosen": -59.129661560058594, "logps/rejected": -121.8931884765625, "loss": 0.6586, "rewards/accuracies": 0.25, "rewards/chosen": 0.009189892560243607, "rewards/margins": -0.0016784649342298508, "rewards/rejected": 0.010868358425796032, "step": 440 }, { "epoch": 0.26886145404663925, "grad_norm": 78.97967669661037, "learning_rate": 1.8965853658536586e-08, "logits/chosen": -0.0766112208366394, "logits/rejected": 0.24237942695617676, "logps/chosen": -144.33291625976562, "logps/rejected": -142.45741271972656, "loss": 0.6706, "rewards/accuracies": 0.25, "rewards/chosen": 0.055260419845581055, "rewards/margins": -0.01594863086938858, "rewards/rejected": 0.07120904326438904, "step": 441 }, { "epoch": 0.26947111720774275, "grad_norm": 88.67666461641667, "learning_rate": 1.9009756097560975e-08, "logits/chosen": -0.1746220588684082, "logits/rejected": 0.1630934476852417, "logps/chosen": -135.13832092285156, "logps/rejected": -153.19435119628906, "loss": 0.6835, "rewards/accuracies": 0.0, "rewards/chosen": -0.01004643552005291, "rewards/margins": -0.08069337159395218, "rewards/rejected": 0.07064693421125412, "step": 442 }, { "epoch": 0.2700807803688462, "grad_norm": 60.63270295307191, "learning_rate": 1.9053658536585366e-08, "logits/chosen": 0.14319084584712982, "logits/rejected": 0.1383337825536728, "logps/chosen": -39.036170959472656, "logps/rejected": -64.15113830566406, "loss": 0.6846, "rewards/accuracies": 0.5, "rewards/chosen": 0.004637223668396473, "rewards/margins": -0.01796860620379448, "rewards/rejected": 0.022605828940868378, "step": 443 }, { "epoch": 0.2706904435299497, "grad_norm": 59.29665474045625, "learning_rate": 1.9097560975609755e-08, "logits/chosen": -0.1462271511554718, "logits/rejected": -0.19049985706806183, "logps/chosen": -37.981868743896484, "logps/rejected": -66.49958801269531, "loss": 0.6768, "rewards/accuracies": 0.75, "rewards/chosen": 0.03529193997383118, "rewards/margins": -0.0017494894564151764, "rewards/rejected": 0.03704142943024635, "step": 444 }, { "epoch": 0.2713001066910532, "grad_norm": 75.98124784362513, "learning_rate": 1.9141463414634146e-08, "logits/chosen": 0.3471428155899048, "logits/rejected": 0.19750745594501495, "logps/chosen": -215.55172729492188, "logps/rejected": -281.86029052734375, "loss": 0.6387, "rewards/accuracies": 1.0, "rewards/chosen": 0.19224663078784943, "rewards/margins": 0.24797995388507843, "rewards/rejected": -0.055733323097229004, "step": 445 }, { "epoch": 0.2719097698521567, "grad_norm": 73.1487317087868, "learning_rate": 1.9185365853658538e-08, "logits/chosen": 0.06400503218173981, "logits/rejected": -0.08633884787559509, "logps/chosen": -156.72970581054688, "logps/rejected": -203.78372192382812, "loss": 0.6654, "rewards/accuracies": 0.75, "rewards/chosen": 0.07347560673952103, "rewards/margins": 0.017678987234830856, "rewards/rejected": 0.05579661950469017, "step": 446 }, { "epoch": 0.27251943301326015, "grad_norm": 69.84496644271896, "learning_rate": 1.9229268292682927e-08, "logits/chosen": 0.032298870384693146, "logits/rejected": 0.2304471880197525, "logps/chosen": -155.3098602294922, "logps/rejected": -86.67618560791016, "loss": 0.6769, "rewards/accuracies": 0.75, "rewards/chosen": 0.02124052867293358, "rewards/margins": 0.008774537593126297, "rewards/rejected": 0.01246599294245243, "step": 447 }, { "epoch": 0.27312909617436365, "grad_norm": 60.17134465077584, "learning_rate": 1.9273170731707318e-08, "logits/chosen": 0.3111189901828766, "logits/rejected": 0.2346172034740448, "logps/chosen": -184.82394409179688, "logps/rejected": -142.02420043945312, "loss": 0.6522, "rewards/accuracies": 0.75, "rewards/chosen": 0.12719501554965973, "rewards/margins": 0.08638709783554077, "rewards/rejected": 0.04080791771411896, "step": 448 }, { "epoch": 0.27373875933546715, "grad_norm": 68.92332057326264, "learning_rate": 1.9317073170731707e-08, "logits/chosen": 0.1487908959388733, "logits/rejected": 0.4169542193412781, "logps/chosen": -278.8198547363281, "logps/rejected": -315.25506591796875, "loss": 0.6517, "rewards/accuracies": 1.0, "rewards/chosen": 0.2707786560058594, "rewards/margins": 0.28594255447387695, "rewards/rejected": -0.015163922682404518, "step": 449 }, { "epoch": 0.27434842249657065, "grad_norm": 64.30204877730138, "learning_rate": 1.9360975609756098e-08, "logits/chosen": -0.09532143175601959, "logits/rejected": 0.07058137655258179, "logps/chosen": -135.10035705566406, "logps/rejected": -127.02887725830078, "loss": 0.6586, "rewards/accuracies": 0.5, "rewards/chosen": 0.04978819191455841, "rewards/margins": 0.04082760959863663, "rewards/rejected": 0.00896057952195406, "step": 450 }, { "epoch": 0.27495808565767416, "grad_norm": 57.44123209699844, "learning_rate": 1.9404878048780487e-08, "logits/chosen": 0.28252366185188293, "logits/rejected": 0.29748132824897766, "logps/chosen": -127.99627685546875, "logps/rejected": -151.4498291015625, "loss": 0.6555, "rewards/accuracies": 0.75, "rewards/chosen": 0.13848179578781128, "rewards/margins": 0.18644532561302185, "rewards/rejected": -0.04796352982521057, "step": 451 }, { "epoch": 0.2755677488187776, "grad_norm": 86.40538860877604, "learning_rate": 1.944878048780488e-08, "logits/chosen": 0.10914494842290878, "logits/rejected": 0.16794736683368683, "logps/chosen": -52.78816223144531, "logps/rejected": -59.555259704589844, "loss": 0.6253, "rewards/accuracies": 0.5, "rewards/chosen": 0.02861565351486206, "rewards/margins": -0.04731559008359909, "rewards/rejected": 0.07593125104904175, "step": 452 }, { "epoch": 0.2761774119798811, "grad_norm": 63.60873431451276, "learning_rate": 1.949268292682927e-08, "logits/chosen": 0.03662291169166565, "logits/rejected": 0.5540964007377625, "logps/chosen": -319.99407958984375, "logps/rejected": -136.2207489013672, "loss": 0.6653, "rewards/accuracies": 0.75, "rewards/chosen": -0.009144261479377747, "rewards/margins": 0.0010973364114761353, "rewards/rejected": -0.010241604410111904, "step": 453 }, { "epoch": 0.2767870751409846, "grad_norm": 63.343343518333135, "learning_rate": 1.953658536585366e-08, "logits/chosen": 0.21227677166461945, "logits/rejected": 0.1923665702342987, "logps/chosen": -12.800304412841797, "logps/rejected": -34.73240661621094, "loss": 0.6839, "rewards/accuracies": 0.5, "rewards/chosen": -0.012887751683592796, "rewards/margins": 0.014210665598511696, "rewards/rejected": -0.027098417282104492, "step": 454 }, { "epoch": 0.2773967383020881, "grad_norm": 63.29144790106512, "learning_rate": 1.958048780487805e-08, "logits/chosen": 0.3340422511100769, "logits/rejected": 0.20748065412044525, "logps/chosen": -132.12779235839844, "logps/rejected": -201.5473175048828, "loss": 0.6634, "rewards/accuracies": 0.25, "rewards/chosen": 0.04269295185804367, "rewards/margins": -0.011317897588014603, "rewards/rejected": 0.05401084944605827, "step": 455 }, { "epoch": 0.2780064014631916, "grad_norm": 74.57964819213488, "learning_rate": 1.962439024390244e-08, "logits/chosen": 0.07027482241392136, "logits/rejected": -0.019699640572071075, "logps/chosen": -161.471923828125, "logps/rejected": -185.7052001953125, "loss": 0.653, "rewards/accuracies": 0.75, "rewards/chosen": 0.059368353337049484, "rewards/margins": 0.10203978419303894, "rewards/rejected": -0.042671434581279755, "step": 456 }, { "epoch": 0.27861606462429506, "grad_norm": 67.61667659665781, "learning_rate": 1.966829268292683e-08, "logits/chosen": 0.20456407964229584, "logits/rejected": 0.18659041821956635, "logps/chosen": -123.71971130371094, "logps/rejected": -143.08709716796875, "loss": 0.6727, "rewards/accuracies": 0.75, "rewards/chosen": 0.044207461178302765, "rewards/margins": 0.03395986557006836, "rewards/rejected": 0.010247595608234406, "step": 457 }, { "epoch": 0.27922572778539856, "grad_norm": 69.41472807818889, "learning_rate": 1.971219512195122e-08, "logits/chosen": 0.48095908761024475, "logits/rejected": 0.4224778413772583, "logps/chosen": -266.6629943847656, "logps/rejected": -149.05494689941406, "loss": 0.6443, "rewards/accuracies": 0.75, "rewards/chosen": 0.1881321519613266, "rewards/margins": 0.13835391402244568, "rewards/rejected": 0.04977824538946152, "step": 458 }, { "epoch": 0.27983539094650206, "grad_norm": 72.22389428627339, "learning_rate": 1.975609756097561e-08, "logits/chosen": -0.3132532835006714, "logits/rejected": -0.18809807300567627, "logps/chosen": -119.96725463867188, "logps/rejected": -111.36251068115234, "loss": 0.6584, "rewards/accuracies": 0.25, "rewards/chosen": -0.00742526026442647, "rewards/margins": -0.05493436008691788, "rewards/rejected": 0.047509100288152695, "step": 459 }, { "epoch": 0.28044505410760556, "grad_norm": 68.39678394859612, "learning_rate": 1.9800000000000002e-08, "logits/chosen": 0.1794874668121338, "logits/rejected": 0.10086102783679962, "logps/chosen": -127.66497802734375, "logps/rejected": -135.0052490234375, "loss": 0.6475, "rewards/accuracies": 0.5, "rewards/chosen": 0.08427328616380692, "rewards/margins": 0.06482025980949402, "rewards/rejected": 0.019453026354312897, "step": 460 }, { "epoch": 0.28105471726870906, "grad_norm": 68.2361183005348, "learning_rate": 1.984390243902439e-08, "logits/chosen": 0.07122084498405457, "logits/rejected": 0.031408704817295074, "logps/chosen": -110.41446685791016, "logps/rejected": -122.53499603271484, "loss": 0.6785, "rewards/accuracies": 0.25, "rewards/chosen": -0.03592252731323242, "rewards/margins": 0.07255663722753525, "rewards/rejected": -0.10847915709018707, "step": 461 }, { "epoch": 0.2816643804298125, "grad_norm": 69.020638385843, "learning_rate": 1.9887804878048782e-08, "logits/chosen": 0.14568686485290527, "logits/rejected": 0.26515433192253113, "logps/chosen": -209.664306640625, "logps/rejected": -171.8083953857422, "loss": 0.6472, "rewards/accuracies": 0.75, "rewards/chosen": 0.21378999948501587, "rewards/margins": 0.15649661421775818, "rewards/rejected": 0.05729339271783829, "step": 462 }, { "epoch": 0.282274043590916, "grad_norm": 73.05756135117628, "learning_rate": 1.993170731707317e-08, "logits/chosen": 0.09534000605344772, "logits/rejected": 0.02491987869143486, "logps/chosen": -219.74197387695312, "logps/rejected": -208.32476806640625, "loss": 0.661, "rewards/accuracies": 0.5, "rewards/chosen": 0.07846088707447052, "rewards/margins": 0.06449966132640839, "rewards/rejected": 0.013961220160126686, "step": 463 }, { "epoch": 0.2828837067520195, "grad_norm": 69.79106803917409, "learning_rate": 1.9975609756097562e-08, "logits/chosen": -0.06213008612394333, "logits/rejected": -0.23186719417572021, "logps/chosen": -153.84791564941406, "logps/rejected": -151.7867431640625, "loss": 0.6721, "rewards/accuracies": 0.75, "rewards/chosen": 0.10366620123386383, "rewards/margins": 0.07202887535095215, "rewards/rejected": 0.03163733333349228, "step": 464 }, { "epoch": 0.283493369913123, "grad_norm": 61.283147710449725, "learning_rate": 2.001951219512195e-08, "logits/chosen": 0.21901625394821167, "logits/rejected": 0.20077916979789734, "logps/chosen": -7.487462997436523, "logps/rejected": -7.173701286315918, "loss": 0.678, "rewards/accuracies": 0.5, "rewards/chosen": 0.006824742071330547, "rewards/margins": 0.01945982500910759, "rewards/rejected": -0.012635082937777042, "step": 465 }, { "epoch": 0.28410303307422646, "grad_norm": 62.43884619620277, "learning_rate": 2.0063414634146342e-08, "logits/chosen": 0.29454317688941956, "logits/rejected": 0.2875673770904541, "logps/chosen": -146.99061584472656, "logps/rejected": -144.388671875, "loss": 0.6507, "rewards/accuracies": 0.75, "rewards/chosen": 0.21434439718723297, "rewards/margins": 0.20665687322616577, "rewards/rejected": 0.0076875220984220505, "step": 466 }, { "epoch": 0.28471269623532997, "grad_norm": 79.08259419954352, "learning_rate": 2.0107317073170734e-08, "logits/chosen": 0.522144079208374, "logits/rejected": 0.1844376027584076, "logps/chosen": -106.72376251220703, "logps/rejected": -123.9677505493164, "loss": 0.6678, "rewards/accuracies": 0.75, "rewards/chosen": 0.04608582705259323, "rewards/margins": 0.05327960103750229, "rewards/rejected": -0.007193779572844505, "step": 467 }, { "epoch": 0.28532235939643347, "grad_norm": 72.96261350010558, "learning_rate": 2.0151219512195122e-08, "logits/chosen": 0.08090576529502869, "logits/rejected": 0.14955011010169983, "logps/chosen": -181.6560821533203, "logps/rejected": -222.9476318359375, "loss": 0.6544, "rewards/accuracies": 0.75, "rewards/chosen": 0.08407467603683472, "rewards/margins": 0.15113595128059387, "rewards/rejected": -0.06706128269433975, "step": 468 }, { "epoch": 0.28593202255753697, "grad_norm": 77.14306159327754, "learning_rate": 2.0195121951219514e-08, "logits/chosen": -0.27836892008781433, "logits/rejected": 0.03967548906803131, "logps/chosen": -338.9379577636719, "logps/rejected": -126.48583984375, "loss": 0.6544, "rewards/accuracies": 0.75, "rewards/chosen": 0.4431110620498657, "rewards/margins": 0.39103490114212036, "rewards/rejected": 0.052076149731874466, "step": 469 }, { "epoch": 0.28654168571864047, "grad_norm": 61.034460109343556, "learning_rate": 2.0239024390243902e-08, "logits/chosen": 0.23134541511535645, "logits/rejected": 0.22853803634643555, "logps/chosen": -31.802772521972656, "logps/rejected": -12.457967758178711, "loss": 0.6617, "rewards/accuracies": 1.0, "rewards/chosen": 0.018924130126833916, "rewards/margins": 0.1008741557598114, "rewards/rejected": -0.08195002377033234, "step": 470 }, { "epoch": 0.2871513488797439, "grad_norm": 68.55916541041393, "learning_rate": 2.028292682926829e-08, "logits/chosen": -0.09053162485361099, "logits/rejected": 0.25271421670913696, "logps/chosen": -321.14013671875, "logps/rejected": -105.07038879394531, "loss": 0.6581, "rewards/accuracies": 0.5, "rewards/chosen": 0.18394261598587036, "rewards/margins": 0.1811237931251526, "rewards/rejected": 0.002818802371621132, "step": 471 }, { "epoch": 0.2877610120408474, "grad_norm": 82.15582208471778, "learning_rate": 2.032682926829268e-08, "logits/chosen": 0.2014414221048355, "logits/rejected": -0.21294736862182617, "logps/chosen": -173.34231567382812, "logps/rejected": -726.0479736328125, "loss": 0.6744, "rewards/accuracies": 0.0, "rewards/chosen": 0.008779885247349739, "rewards/margins": -0.19394630193710327, "rewards/rejected": 0.20272618532180786, "step": 472 }, { "epoch": 0.2883706752019509, "grad_norm": 56.49253641977703, "learning_rate": 2.037073170731707e-08, "logits/chosen": 0.3929775059223175, "logits/rejected": 0.10422532260417938, "logps/chosen": -12.824280738830566, "logps/rejected": -29.03037452697754, "loss": 0.6727, "rewards/accuracies": 0.25, "rewards/chosen": -0.013209665194153786, "rewards/margins": -0.004942881874740124, "rewards/rejected": -0.008266782388091087, "step": 473 }, { "epoch": 0.2889803383630544, "grad_norm": 69.26657882016576, "learning_rate": 2.041463414634146e-08, "logits/chosen": 0.1593598872423172, "logits/rejected": 0.19768331944942474, "logps/chosen": -210.8773193359375, "logps/rejected": -146.36282348632812, "loss": 0.6278, "rewards/accuracies": 1.0, "rewards/chosen": 0.23647665977478027, "rewards/margins": 0.2512618601322174, "rewards/rejected": -0.014785194769501686, "step": 474 }, { "epoch": 0.2895900015241579, "grad_norm": 64.71109478449996, "learning_rate": 2.045853658536585e-08, "logits/chosen": -0.18643224239349365, "logits/rejected": 0.19490055739879608, "logps/chosen": -214.46121215820312, "logps/rejected": -137.38888549804688, "loss": 0.6483, "rewards/accuracies": 1.0, "rewards/chosen": 0.114950992166996, "rewards/margins": 0.140077143907547, "rewards/rejected": -0.02512616105377674, "step": 475 }, { "epoch": 0.29019966468526137, "grad_norm": 66.42831607463249, "learning_rate": 2.0502439024390242e-08, "logits/chosen": 0.032998643815517426, "logits/rejected": -0.06949509680271149, "logps/chosen": -20.744892120361328, "logps/rejected": -38.61202621459961, "loss": 0.6738, "rewards/accuracies": 0.25, "rewards/chosen": -0.005787133239209652, "rewards/margins": -0.038832686841487885, "rewards/rejected": 0.033045556396245956, "step": 476 }, { "epoch": 0.2908093278463649, "grad_norm": 66.66597610193936, "learning_rate": 2.054634146341463e-08, "logits/chosen": 0.2674751877784729, "logits/rejected": 0.32825157046318054, "logps/chosen": -374.01495361328125, "logps/rejected": -170.8480987548828, "loss": 0.6634, "rewards/accuracies": 0.75, "rewards/chosen": 0.13175562024116516, "rewards/margins": 0.16506260633468628, "rewards/rejected": -0.03330698236823082, "step": 477 }, { "epoch": 0.2914189910074684, "grad_norm": 64.16360845160342, "learning_rate": 2.0590243902439023e-08, "logits/chosen": 0.04146502912044525, "logits/rejected": 0.177695631980896, "logps/chosen": -215.79151916503906, "logps/rejected": -265.5438232421875, "loss": 0.6816, "rewards/accuracies": 0.75, "rewards/chosen": 0.07885445654392242, "rewards/margins": -0.013722196221351624, "rewards/rejected": 0.09257666021585464, "step": 478 }, { "epoch": 0.2920286541685719, "grad_norm": 58.67004279648168, "learning_rate": 2.063414634146341e-08, "logits/chosen": 0.11150379478931427, "logits/rejected": 0.21114593744277954, "logps/chosen": -55.13713836669922, "logps/rejected": -42.53518295288086, "loss": 0.6733, "rewards/accuracies": 0.75, "rewards/chosen": 0.07448139041662216, "rewards/margins": 0.0382765531539917, "rewards/rejected": 0.03620484098792076, "step": 479 }, { "epoch": 0.2926383173296754, "grad_norm": 59.23975018464548, "learning_rate": 2.0678048780487803e-08, "logits/chosen": 0.271587610244751, "logits/rejected": 0.3958515524864197, "logps/chosen": -154.6184844970703, "logps/rejected": -168.01158142089844, "loss": 0.65, "rewards/accuracies": 0.75, "rewards/chosen": 0.2721033990383148, "rewards/margins": 0.2399720847606659, "rewards/rejected": 0.03213133662939072, "step": 480 }, { "epoch": 0.2932479804907788, "grad_norm": 76.54887162726922, "learning_rate": 2.072195121951219e-08, "logits/chosen": -0.045136116445064545, "logits/rejected": 0.20272579789161682, "logps/chosen": -180.22154235839844, "logps/rejected": -48.23255157470703, "loss": 0.6418, "rewards/accuracies": 0.5, "rewards/chosen": 0.1005738377571106, "rewards/margins": 0.06824728101491928, "rewards/rejected": 0.03232654929161072, "step": 481 }, { "epoch": 0.2938576436518823, "grad_norm": 66.95281779745834, "learning_rate": 2.0765853658536583e-08, "logits/chosen": 0.20636984705924988, "logits/rejected": 0.22792255878448486, "logps/chosen": -78.6353988647461, "logps/rejected": -67.68612670898438, "loss": 0.6323, "rewards/accuracies": 0.5, "rewards/chosen": 0.10261476784944534, "rewards/margins": 0.1378861665725708, "rewards/rejected": -0.03527141734957695, "step": 482 }, { "epoch": 0.29446730681298583, "grad_norm": 60.80437736239082, "learning_rate": 2.0809756097560974e-08, "logits/chosen": 0.027224496006965637, "logits/rejected": 0.17680969834327698, "logps/chosen": -102.2612075805664, "logps/rejected": -51.69792175292969, "loss": 0.6685, "rewards/accuracies": 0.5, "rewards/chosen": 0.11472471058368683, "rewards/margins": 0.05192282423377037, "rewards/rejected": 0.06280189007520676, "step": 483 }, { "epoch": 0.29507696997408933, "grad_norm": 62.56218317882643, "learning_rate": 2.0853658536585363e-08, "logits/chosen": 0.08629345893859863, "logits/rejected": 0.07546723634004593, "logps/chosen": -208.2896270751953, "logps/rejected": -216.16351318359375, "loss": 0.6228, "rewards/accuracies": 1.0, "rewards/chosen": 0.19518309831619263, "rewards/margins": 0.28668278455734253, "rewards/rejected": -0.0914997085928917, "step": 484 }, { "epoch": 0.29568663313519283, "grad_norm": 65.77568573255537, "learning_rate": 2.0897560975609754e-08, "logits/chosen": 0.2876260280609131, "logits/rejected": 0.16935913264751434, "logps/chosen": -144.30734252929688, "logps/rejected": -227.69224548339844, "loss": 0.6375, "rewards/accuracies": 0.75, "rewards/chosen": 0.16153675317764282, "rewards/margins": 0.0677390769124031, "rewards/rejected": 0.09379769116640091, "step": 485 }, { "epoch": 0.2962962962962963, "grad_norm": 59.660312881733, "learning_rate": 2.0941463414634143e-08, "logits/chosen": 0.05226050317287445, "logits/rejected": 0.29561787843704224, "logps/chosen": -224.3292694091797, "logps/rejected": -93.36639404296875, "loss": 0.6362, "rewards/accuracies": 0.75, "rewards/chosen": 0.0905165821313858, "rewards/margins": 0.09735140949487686, "rewards/rejected": -0.006834829226136208, "step": 486 }, { "epoch": 0.2969059594573998, "grad_norm": 71.22731953559592, "learning_rate": 2.0985365853658534e-08, "logits/chosen": 0.16766256093978882, "logits/rejected": 0.23183496296405792, "logps/chosen": -97.92478942871094, "logps/rejected": -118.4116439819336, "loss": 0.6206, "rewards/accuracies": 0.75, "rewards/chosen": 0.1377440094947815, "rewards/margins": 0.1952911913394928, "rewards/rejected": -0.05754717439413071, "step": 487 }, { "epoch": 0.2975156226185033, "grad_norm": 64.11624827435091, "learning_rate": 2.1029268292682923e-08, "logits/chosen": -0.0726562887430191, "logits/rejected": 0.1790645569562912, "logps/chosen": -81.5301742553711, "logps/rejected": -37.156185150146484, "loss": 0.6551, "rewards/accuracies": 1.0, "rewards/chosen": 0.07962194085121155, "rewards/margins": 0.08470235764980316, "rewards/rejected": -0.005080413073301315, "step": 488 }, { "epoch": 0.2981252857796068, "grad_norm": 73.46932744002982, "learning_rate": 2.1073170731707315e-08, "logits/chosen": 0.18244323134422302, "logits/rejected": 0.29160937666893005, "logps/chosen": -217.43319702148438, "logps/rejected": -187.8267364501953, "loss": 0.6529, "rewards/accuracies": 0.75, "rewards/chosen": 0.14290522038936615, "rewards/margins": 0.05252870172262192, "rewards/rejected": 0.09037652611732483, "step": 489 }, { "epoch": 0.29873494894071023, "grad_norm": 67.30755185649637, "learning_rate": 2.1117073170731706e-08, "logits/chosen": 0.10757433623075485, "logits/rejected": 0.23386675119400024, "logps/chosen": -245.03697204589844, "logps/rejected": -123.92584228515625, "loss": 0.6577, "rewards/accuracies": 0.5, "rewards/chosen": 0.2160666435956955, "rewards/margins": 0.07086057960987091, "rewards/rejected": 0.14520607888698578, "step": 490 }, { "epoch": 0.29934461210181373, "grad_norm": 68.91968012122946, "learning_rate": 2.1160975609756095e-08, "logits/chosen": -0.05915515869855881, "logits/rejected": 0.0703243836760521, "logps/chosen": -400.88525390625, "logps/rejected": -329.91278076171875, "loss": 0.659, "rewards/accuracies": 1.0, "rewards/chosen": 0.3326101303100586, "rewards/margins": 0.3214416801929474, "rewards/rejected": 0.011168431490659714, "step": 491 }, { "epoch": 0.29995427526291724, "grad_norm": 66.67348569851553, "learning_rate": 2.1204878048780486e-08, "logits/chosen": 0.21445180475711823, "logits/rejected": 0.17440001666545868, "logps/chosen": -259.8293151855469, "logps/rejected": -273.2416076660156, "loss": 0.6422, "rewards/accuracies": 0.75, "rewards/chosen": 0.17863722145557404, "rewards/margins": 0.11631821095943451, "rewards/rejected": 0.06231900304555893, "step": 492 }, { "epoch": 0.30056393842402074, "grad_norm": 75.67037991925316, "learning_rate": 2.1248780487804875e-08, "logits/chosen": 0.2672348618507385, "logits/rejected": 0.24317914247512817, "logps/chosen": -19.860942840576172, "logps/rejected": -30.22275161743164, "loss": 0.6498, "rewards/accuracies": 0.5, "rewards/chosen": -0.05794403329491615, "rewards/margins": -0.011715312488377094, "rewards/rejected": -0.04622872173786163, "step": 493 }, { "epoch": 0.30117360158512424, "grad_norm": 68.92658229754335, "learning_rate": 2.1292682926829266e-08, "logits/chosen": -0.319587767124176, "logits/rejected": 0.1461396962404251, "logps/chosen": -361.044921875, "logps/rejected": -213.08692932128906, "loss": 0.6352, "rewards/accuracies": 0.5, "rewards/chosen": 0.23673886060714722, "rewards/margins": 0.19788327813148499, "rewards/rejected": 0.038855601102113724, "step": 494 }, { "epoch": 0.3017832647462277, "grad_norm": 71.12561791125117, "learning_rate": 2.1336585365853658e-08, "logits/chosen": 0.24701619148254395, "logits/rejected": 0.24567127227783203, "logps/chosen": -120.77401733398438, "logps/rejected": -37.010032653808594, "loss": 0.6856, "rewards/accuracies": 1.0, "rewards/chosen": 0.1529422104358673, "rewards/margins": 0.18868255615234375, "rewards/rejected": -0.03574035316705704, "step": 495 }, { "epoch": 0.3023929279073312, "grad_norm": 73.09061978854592, "learning_rate": 2.1380487804878046e-08, "logits/chosen": 0.030435562133789062, "logits/rejected": -0.13487783074378967, "logps/chosen": -59.882545471191406, "logps/rejected": -77.14221954345703, "loss": 0.664, "rewards/accuracies": 0.0, "rewards/chosen": -0.24862869083881378, "rewards/margins": -0.03769826889038086, "rewards/rejected": -0.2109304517507553, "step": 496 }, { "epoch": 0.3030025910684347, "grad_norm": 76.35528849301075, "learning_rate": 2.1424390243902438e-08, "logits/chosen": 0.03241553157567978, "logits/rejected": 0.4508446156978607, "logps/chosen": -304.6023254394531, "logps/rejected": -99.01864624023438, "loss": 0.6397, "rewards/accuracies": 1.0, "rewards/chosen": 0.2151942253112793, "rewards/margins": 0.18481206893920898, "rewards/rejected": 0.030382152646780014, "step": 497 }, { "epoch": 0.3036122542295382, "grad_norm": 69.77074908699126, "learning_rate": 2.1468292682926826e-08, "logits/chosen": 0.2624257802963257, "logits/rejected": 0.2517472505569458, "logps/chosen": -111.65718078613281, "logps/rejected": -158.25135803222656, "loss": 0.6524, "rewards/accuracies": 0.75, "rewards/chosen": 0.11233830451965332, "rewards/margins": 0.2487785518169403, "rewards/rejected": -0.1364402323961258, "step": 498 }, { "epoch": 0.3042219173906417, "grad_norm": 69.85888827015366, "learning_rate": 2.1512195121951218e-08, "logits/chosen": 0.5841522216796875, "logits/rejected": 0.40747517347335815, "logps/chosen": -68.72338104248047, "logps/rejected": -371.1597595214844, "loss": 0.6572, "rewards/accuracies": 0.25, "rewards/chosen": 0.03852539137005806, "rewards/margins": -0.04676118120551109, "rewards/rejected": 0.08528657257556915, "step": 499 }, { "epoch": 0.30483158055174514, "grad_norm": 60.87334931589632, "learning_rate": 2.1556097560975607e-08, "logits/chosen": 0.09502071887254715, "logits/rejected": 0.3352612853050232, "logps/chosen": -258.5905456542969, "logps/rejected": -231.02549743652344, "loss": 0.6487, "rewards/accuracies": 0.25, "rewards/chosen": 0.059263478964567184, "rewards/margins": 0.0008697099983692169, "rewards/rejected": 0.05839376524090767, "step": 500 }, { "epoch": 0.30483158055174514, "eval_logits/chosen": 0.05119064450263977, "eval_logits/rejected": 0.1229129433631897, "eval_logps/chosen": -161.55780029296875, "eval_logps/rejected": -108.90592193603516, "eval_loss": 0.6521755456924438, "eval_rewards/accuracies": 0.6060606241226196, "eval_rewards/chosen": 0.10773464292287827, "eval_rewards/margins": 0.07899720221757889, "eval_rewards/rejected": 0.028737450018525124, "eval_runtime": 39.2311, "eval_samples_per_second": 6.729, "eval_steps_per_second": 0.841, "step": 500 }, { "epoch": 0.30544124371284864, "grad_norm": 54.512994578320736, "learning_rate": 2.1599999999999998e-08, "logits/chosen": -0.19323980808258057, "logits/rejected": 0.19141802191734314, "logps/chosen": -122.07552337646484, "logps/rejected": -51.705169677734375, "loss": 0.6675, "rewards/accuracies": 0.75, "rewards/chosen": -0.00966480653733015, "rewards/margins": 0.031769122928380966, "rewards/rejected": -0.04143393039703369, "step": 501 }, { "epoch": 0.30605090687395214, "grad_norm": 66.01792835999346, "learning_rate": 2.164390243902439e-08, "logits/chosen": 0.2453068345785141, "logits/rejected": 0.23593895137310028, "logps/chosen": -256.44146728515625, "logps/rejected": -243.70411682128906, "loss": 0.6598, "rewards/accuracies": 0.5, "rewards/chosen": 0.18942949175834656, "rewards/margins": 0.08905204385519028, "rewards/rejected": 0.10037745535373688, "step": 502 }, { "epoch": 0.30666057003505565, "grad_norm": 58.327484640099456, "learning_rate": 2.1687804878048778e-08, "logits/chosen": 0.13334240019321442, "logits/rejected": 0.1443691849708557, "logps/chosen": -109.47007751464844, "logps/rejected": -176.93238830566406, "loss": 0.6313, "rewards/accuracies": 0.5, "rewards/chosen": 0.040549665689468384, "rewards/margins": 0.02135545201599598, "rewards/rejected": 0.019194208085536957, "step": 503 }, { "epoch": 0.30727023319615915, "grad_norm": 64.57842170183204, "learning_rate": 2.173170731707317e-08, "logits/chosen": 0.0882779061794281, "logits/rejected": 0.019421786069869995, "logps/chosen": -98.05096435546875, "logps/rejected": -113.34808349609375, "loss": 0.6554, "rewards/accuracies": 0.75, "rewards/chosen": 0.11694498360157013, "rewards/margins": 0.12198923528194427, "rewards/rejected": -0.005044247955083847, "step": 504 }, { "epoch": 0.3078798963572626, "grad_norm": 60.71924137109685, "learning_rate": 2.1775609756097558e-08, "logits/chosen": -0.025103464722633362, "logits/rejected": -0.01396152377128601, "logps/chosen": -132.69093322753906, "logps/rejected": -148.00677490234375, "loss": 0.6409, "rewards/accuracies": 0.5, "rewards/chosen": 0.10391964763402939, "rewards/margins": 0.02545750141143799, "rewards/rejected": 0.0784621462225914, "step": 505 }, { "epoch": 0.3084895595183661, "grad_norm": 64.21484896317976, "learning_rate": 2.181951219512195e-08, "logits/chosen": 0.14935000240802765, "logits/rejected": 0.26964497566223145, "logps/chosen": -89.99710083007812, "logps/rejected": -61.493988037109375, "loss": 0.651, "rewards/accuracies": 0.75, "rewards/chosen": 0.172231987118721, "rewards/margins": 0.16800904273986816, "rewards/rejected": 0.0042229327373206615, "step": 506 }, { "epoch": 0.3090992226794696, "grad_norm": 68.52761020448763, "learning_rate": 2.186341463414634e-08, "logits/chosen": 0.1626947522163391, "logits/rejected": 0.300977498292923, "logps/chosen": -204.31988525390625, "logps/rejected": -131.5135040283203, "loss": 0.6806, "rewards/accuracies": 0.25, "rewards/chosen": 0.20247884094715118, "rewards/margins": 0.03694174811244011, "rewards/rejected": 0.16553710401058197, "step": 507 }, { "epoch": 0.3097088858405731, "grad_norm": 75.73589330780636, "learning_rate": 2.190731707317073e-08, "logits/chosen": -0.043320655822753906, "logits/rejected": -0.17623624205589294, "logps/chosen": -141.2119598388672, "logps/rejected": -193.40248107910156, "loss": 0.6823, "rewards/accuracies": 0.0, "rewards/chosen": 0.029924681410193443, "rewards/margins": -0.10780796408653259, "rewards/rejected": 0.13773265480995178, "step": 508 }, { "epoch": 0.31031854900167655, "grad_norm": 73.67312848325095, "learning_rate": 2.1951219512195122e-08, "logits/chosen": 0.09232969582080841, "logits/rejected": 0.07798745483160019, "logps/chosen": -214.9461212158203, "logps/rejected": -155.3734893798828, "loss": 0.6579, "rewards/accuracies": 0.75, "rewards/chosen": 0.10752177238464355, "rewards/margins": 0.11612154543399811, "rewards/rejected": -0.008599769324064255, "step": 509 }, { "epoch": 0.31092821216278005, "grad_norm": 69.67169645034262, "learning_rate": 2.199512195121951e-08, "logits/chosen": 0.17757156491279602, "logits/rejected": 0.14873629808425903, "logps/chosen": -140.59393310546875, "logps/rejected": -138.13308715820312, "loss": 0.6346, "rewards/accuracies": 0.5, "rewards/chosen": 0.07352197170257568, "rewards/margins": 0.045384474098682404, "rewards/rejected": 0.02813749387860298, "step": 510 }, { "epoch": 0.31153787532388355, "grad_norm": 66.8733357522459, "learning_rate": 2.2039024390243902e-08, "logits/chosen": 0.129908949136734, "logits/rejected": 0.20046426355838776, "logps/chosen": -265.88330078125, "logps/rejected": -248.80580139160156, "loss": 0.6551, "rewards/accuracies": 1.0, "rewards/chosen": 0.21615388989448547, "rewards/margins": 0.056390319019556046, "rewards/rejected": 0.15976357460021973, "step": 511 }, { "epoch": 0.31214753848498705, "grad_norm": 66.97302911001606, "learning_rate": 2.208292682926829e-08, "logits/chosen": 0.19567690789699554, "logits/rejected": 0.42208293080329895, "logps/chosen": -201.44737243652344, "logps/rejected": -131.23837280273438, "loss": 0.6529, "rewards/accuracies": 0.5, "rewards/chosen": 0.14756140112876892, "rewards/margins": 0.18925122916698456, "rewards/rejected": -0.041689835488796234, "step": 512 }, { "epoch": 0.31275720164609055, "grad_norm": 67.92666175878672, "learning_rate": 2.2126829268292682e-08, "logits/chosen": 0.00613509863615036, "logits/rejected": 0.0940328985452652, "logps/chosen": -113.25761413574219, "logps/rejected": -125.87974548339844, "loss": 0.6497, "rewards/accuracies": 0.5, "rewards/chosen": 0.10114751756191254, "rewards/margins": 0.04731915518641472, "rewards/rejected": 0.05382836237549782, "step": 513 }, { "epoch": 0.313366864807194, "grad_norm": 59.70995852571872, "learning_rate": 2.217073170731707e-08, "logits/chosen": -0.24690371751785278, "logits/rejected": -0.135573610663414, "logps/chosen": -265.9849548339844, "logps/rejected": -128.41067504882812, "loss": 0.6592, "rewards/accuracies": 1.0, "rewards/chosen": 0.32507163286209106, "rewards/margins": 0.31498491764068604, "rewards/rejected": 0.010086726397275925, "step": 514 }, { "epoch": 0.3139765279682975, "grad_norm": 70.7049647088918, "learning_rate": 2.2214634146341462e-08, "logits/chosen": 0.19668368995189667, "logits/rejected": 0.18600206077098846, "logps/chosen": -104.42039489746094, "logps/rejected": -74.9419174194336, "loss": 0.6313, "rewards/accuracies": 1.0, "rewards/chosen": 0.17575618624687195, "rewards/margins": 0.18259717524051666, "rewards/rejected": -0.006841002032160759, "step": 515 }, { "epoch": 0.314586191129401, "grad_norm": 63.080128882078014, "learning_rate": 2.2258536585365854e-08, "logits/chosen": 0.08212514966726303, "logits/rejected": -0.007908239960670471, "logps/chosen": -23.616920471191406, "logps/rejected": -70.23422241210938, "loss": 0.6324, "rewards/accuracies": 0.75, "rewards/chosen": -0.06444795429706573, "rewards/margins": 0.04576684534549713, "rewards/rejected": -0.11021479964256287, "step": 516 }, { "epoch": 0.3151958542905045, "grad_norm": 60.7181333066584, "learning_rate": 2.2302439024390242e-08, "logits/chosen": -0.053847894072532654, "logits/rejected": 0.23323538899421692, "logps/chosen": -251.07766723632812, "logps/rejected": -144.61624145507812, "loss": 0.6492, "rewards/accuracies": 1.0, "rewards/chosen": 0.1936260312795639, "rewards/margins": 0.14233648777008057, "rewards/rejected": 0.051289550960063934, "step": 517 }, { "epoch": 0.315805517451608, "grad_norm": 61.019751894456455, "learning_rate": 2.2346341463414634e-08, "logits/chosen": -0.14775434136390686, "logits/rejected": 0.1770121157169342, "logps/chosen": -433.21710205078125, "logps/rejected": -132.9249267578125, "loss": 0.6587, "rewards/accuracies": 1.0, "rewards/chosen": 0.33931484818458557, "rewards/margins": 0.3179498314857483, "rewards/rejected": 0.021365046501159668, "step": 518 }, { "epoch": 0.31641518061271146, "grad_norm": 82.2978366121133, "learning_rate": 2.2390243902439022e-08, "logits/chosen": 0.1245865672826767, "logits/rejected": 0.052594467997550964, "logps/chosen": -242.2100830078125, "logps/rejected": -366.5093688964844, "loss": 0.6718, "rewards/accuracies": 0.5, "rewards/chosen": -0.040619898587465286, "rewards/margins": -0.1416400671005249, "rewards/rejected": 0.10102016478776932, "step": 519 }, { "epoch": 0.31702484377381496, "grad_norm": 66.87296880096541, "learning_rate": 2.2434146341463414e-08, "logits/chosen": 0.37499451637268066, "logits/rejected": 0.380936861038208, "logps/chosen": -191.12860107421875, "logps/rejected": -161.9185791015625, "loss": 0.6564, "rewards/accuracies": 1.0, "rewards/chosen": 0.04185749590396881, "rewards/margins": 0.08646363019943237, "rewards/rejected": -0.04460614174604416, "step": 520 }, { "epoch": 0.31763450693491846, "grad_norm": 72.23456407773578, "learning_rate": 2.2478048780487802e-08, "logits/chosen": 0.27072739601135254, "logits/rejected": 0.36028990149497986, "logps/chosen": -156.28427124023438, "logps/rejected": -105.19752502441406, "loss": 0.6365, "rewards/accuracies": 0.25, "rewards/chosen": 0.00997886061668396, "rewards/margins": -0.12803566455841064, "rewards/rejected": 0.1380145400762558, "step": 521 }, { "epoch": 0.31824417009602196, "grad_norm": 66.88803954158226, "learning_rate": 2.2521951219512194e-08, "logits/chosen": 0.30075281858444214, "logits/rejected": 0.1766529381275177, "logps/chosen": -33.087581634521484, "logps/rejected": -53.23389434814453, "loss": 0.6681, "rewards/accuracies": 1.0, "rewards/chosen": 0.05227477476000786, "rewards/margins": 0.07364573329687119, "rewards/rejected": -0.021370958536863327, "step": 522 }, { "epoch": 0.31885383325712546, "grad_norm": 63.14207407142708, "learning_rate": 2.2565853658536586e-08, "logits/chosen": 0.1084495335817337, "logits/rejected": 0.05239188298583031, "logps/chosen": -147.90431213378906, "logps/rejected": -254.87730407714844, "loss": 0.6116, "rewards/accuracies": 0.75, "rewards/chosen": 0.018667571246623993, "rewards/margins": 0.12227742373943329, "rewards/rejected": -0.1036098524928093, "step": 523 }, { "epoch": 0.3194634964182289, "grad_norm": 53.073831628957116, "learning_rate": 2.2609756097560974e-08, "logits/chosen": 0.16721415519714355, "logits/rejected": 0.18646599352359772, "logps/chosen": -146.1288299560547, "logps/rejected": -74.22055053710938, "loss": 0.6414, "rewards/accuracies": 0.5, "rewards/chosen": 0.2748778462409973, "rewards/margins": 0.17356166243553162, "rewards/rejected": 0.10131621360778809, "step": 524 }, { "epoch": 0.3200731595793324, "grad_norm": 65.1383885375395, "learning_rate": 2.2653658536585366e-08, "logits/chosen": 0.24064935743808746, "logits/rejected": 0.2412213236093521, "logps/chosen": -24.251789093017578, "logps/rejected": -13.947285652160645, "loss": 0.6341, "rewards/accuracies": 0.5, "rewards/chosen": -0.002303923014551401, "rewards/margins": 0.0070059895515441895, "rewards/rejected": -0.009309912100434303, "step": 525 }, { "epoch": 0.3206828227404359, "grad_norm": 65.56934674682218, "learning_rate": 2.2697560975609754e-08, "logits/chosen": 0.3520042300224304, "logits/rejected": 0.0972924530506134, "logps/chosen": -71.07540893554688, "logps/rejected": -154.98980712890625, "loss": 0.6716, "rewards/accuracies": 0.25, "rewards/chosen": -0.018268859013915062, "rewards/margins": -0.175251767039299, "rewards/rejected": 0.1569828987121582, "step": 526 }, { "epoch": 0.3212924859015394, "grad_norm": 68.21147111224226, "learning_rate": 2.2741463414634146e-08, "logits/chosen": 0.06844624876976013, "logits/rejected": 0.07510246336460114, "logps/chosen": -305.83209228515625, "logps/rejected": -314.4248352050781, "loss": 0.6482, "rewards/accuracies": 0.75, "rewards/chosen": 0.3196808099746704, "rewards/margins": 0.11593285948038101, "rewards/rejected": 0.2037479281425476, "step": 527 }, { "epoch": 0.3219021490626429, "grad_norm": 67.98983586043427, "learning_rate": 2.2785365853658534e-08, "logits/chosen": -0.11653149127960205, "logits/rejected": -0.13821932673454285, "logps/chosen": -87.24510955810547, "logps/rejected": -108.01617431640625, "loss": 0.6435, "rewards/accuracies": 0.75, "rewards/chosen": 0.13828030228614807, "rewards/margins": 0.0777205377817154, "rewards/rejected": 0.06055976822972298, "step": 528 }, { "epoch": 0.32251181222374636, "grad_norm": 78.38926868013422, "learning_rate": 2.2829268292682926e-08, "logits/chosen": 0.1056259423494339, "logits/rejected": 0.052796632051467896, "logps/chosen": -328.3974609375, "logps/rejected": -224.63218688964844, "loss": 0.6294, "rewards/accuracies": 1.0, "rewards/chosen": 0.1256667822599411, "rewards/margins": 0.1184113547205925, "rewards/rejected": 0.007255414500832558, "step": 529 }, { "epoch": 0.32312147538484987, "grad_norm": 67.09081275699884, "learning_rate": 2.2873170731707317e-08, "logits/chosen": 0.18698683381080627, "logits/rejected": -0.042787060141563416, "logps/chosen": -15.280387878417969, "logps/rejected": -164.86746215820312, "loss": 0.6887, "rewards/accuracies": 0.25, "rewards/chosen": -0.09855636209249496, "rewards/margins": -0.0753563866019249, "rewards/rejected": -0.023199977353215218, "step": 530 }, { "epoch": 0.32373113854595337, "grad_norm": 68.6934870738876, "learning_rate": 2.2917073170731706e-08, "logits/chosen": 0.08309216797351837, "logits/rejected": 0.07431484758853912, "logps/chosen": -52.191993713378906, "logps/rejected": -73.80946350097656, "loss": 0.6592, "rewards/accuracies": 0.5, "rewards/chosen": 0.04072005674242973, "rewards/margins": -0.010257862508296967, "rewards/rejected": 0.0509779192507267, "step": 531 }, { "epoch": 0.32434080170705687, "grad_norm": 71.91083212544031, "learning_rate": 2.2960975609756097e-08, "logits/chosen": 0.20694595575332642, "logits/rejected": 0.2705208659172058, "logps/chosen": -229.86709594726562, "logps/rejected": -244.8079833984375, "loss": 0.6034, "rewards/accuracies": 0.75, "rewards/chosen": 0.36465075612068176, "rewards/margins": 0.37818261981010437, "rewards/rejected": -0.013531871140003204, "step": 532 }, { "epoch": 0.3249504648681603, "grad_norm": 60.6253480805561, "learning_rate": 2.3004878048780486e-08, "logits/chosen": 0.06808324158191681, "logits/rejected": 0.08612547814846039, "logps/chosen": -123.4905776977539, "logps/rejected": -94.32249450683594, "loss": 0.6406, "rewards/accuracies": 0.75, "rewards/chosen": 0.16934238374233246, "rewards/margins": 0.1628831923007965, "rewards/rejected": 0.006459187716245651, "step": 533 }, { "epoch": 0.3255601280292638, "grad_norm": 61.22671591917024, "learning_rate": 2.3048780487804878e-08, "logits/chosen": 0.16294969618320465, "logits/rejected": 0.35293275117874146, "logps/chosen": -184.82269287109375, "logps/rejected": -99.59314727783203, "loss": 0.6244, "rewards/accuracies": 0.5, "rewards/chosen": 0.1992354691028595, "rewards/margins": 0.0191505029797554, "rewards/rejected": 0.1800849735736847, "step": 534 }, { "epoch": 0.3261697911903673, "grad_norm": 62.82742071842365, "learning_rate": 2.309268292682927e-08, "logits/chosen": 0.06455646455287933, "logits/rejected": 0.20298981666564941, "logps/chosen": -290.6690673828125, "logps/rejected": -134.99990844726562, "loss": 0.6624, "rewards/accuracies": 0.5, "rewards/chosen": 0.21653535962104797, "rewards/margins": 0.08200506120920181, "rewards/rejected": 0.13453030586242676, "step": 535 }, { "epoch": 0.3267794543514708, "grad_norm": 65.34483004409513, "learning_rate": 2.3136585365853658e-08, "logits/chosen": 0.08539235591888428, "logits/rejected": 0.01889946684241295, "logps/chosen": -30.40401840209961, "logps/rejected": -52.76930236816406, "loss": 0.6115, "rewards/accuracies": 0.75, "rewards/chosen": 0.05695812776684761, "rewards/margins": 0.10740622878074646, "rewards/rejected": -0.05044809728860855, "step": 536 }, { "epoch": 0.3273891175125743, "grad_norm": 65.11274057229112, "learning_rate": 2.318048780487805e-08, "logits/chosen": 0.03304370492696762, "logits/rejected": 0.12191274762153625, "logps/chosen": -131.54193115234375, "logps/rejected": -61.98631286621094, "loss": 0.6131, "rewards/accuracies": 0.5, "rewards/chosen": 0.2209971845149994, "rewards/margins": 0.14555290341377258, "rewards/rejected": 0.07544426620006561, "step": 537 }, { "epoch": 0.32799878067367777, "grad_norm": 59.33708176810461, "learning_rate": 2.3224390243902438e-08, "logits/chosen": 0.3022095859050751, "logits/rejected": 0.24946600198745728, "logps/chosen": -273.614501953125, "logps/rejected": -171.8970947265625, "loss": 0.6172, "rewards/accuracies": 1.0, "rewards/chosen": 0.2901607155799866, "rewards/margins": 0.3553035855293274, "rewards/rejected": -0.06514289230108261, "step": 538 }, { "epoch": 0.32860844383478127, "grad_norm": 65.66457853221078, "learning_rate": 2.326829268292683e-08, "logits/chosen": -0.1431131213903427, "logits/rejected": 0.35114675760269165, "logps/chosen": -268.68603515625, "logps/rejected": -232.76162719726562, "loss": 0.6342, "rewards/accuracies": 0.75, "rewards/chosen": 0.17569391429424286, "rewards/margins": 0.11183520406484604, "rewards/rejected": 0.06385871767997742, "step": 539 }, { "epoch": 0.3292181069958848, "grad_norm": 60.60200061745704, "learning_rate": 2.3312195121951218e-08, "logits/chosen": -0.08310922980308533, "logits/rejected": 0.19544613361358643, "logps/chosen": -364.769287109375, "logps/rejected": -413.97247314453125, "loss": 0.6362, "rewards/accuracies": 0.75, "rewards/chosen": 0.4459804594516754, "rewards/margins": -0.013006415218114853, "rewards/rejected": 0.45898687839508057, "step": 540 }, { "epoch": 0.3298277701569883, "grad_norm": 59.5024423401539, "learning_rate": 2.335609756097561e-08, "logits/chosen": -0.17287477850914001, "logits/rejected": -0.12527330219745636, "logps/chosen": -251.46896362304688, "logps/rejected": -162.64642333984375, "loss": 0.6182, "rewards/accuracies": 1.0, "rewards/chosen": 0.31934016942977905, "rewards/margins": 0.1540745496749878, "rewards/rejected": 0.16526560485363007, "step": 541 }, { "epoch": 0.3304374333180918, "grad_norm": 66.87614981420792, "learning_rate": 2.34e-08, "logits/chosen": 0.156075119972229, "logits/rejected": 0.4485046863555908, "logps/chosen": -497.4200439453125, "logps/rejected": -308.34283447265625, "loss": 0.6309, "rewards/accuracies": 0.75, "rewards/chosen": 0.3739921748638153, "rewards/margins": 0.38053667545318604, "rewards/rejected": -0.006544498726725578, "step": 542 }, { "epoch": 0.3310470964791952, "grad_norm": 65.61149390934, "learning_rate": 2.344390243902439e-08, "logits/chosen": 0.06245467811822891, "logits/rejected": 0.21399304270744324, "logps/chosen": -416.5924987792969, "logps/rejected": -322.1553649902344, "loss": 0.6671, "rewards/accuracies": 0.5, "rewards/chosen": 0.22536620497703552, "rewards/margins": 0.2942011058330536, "rewards/rejected": -0.06883488595485687, "step": 543 }, { "epoch": 0.3316567596402987, "grad_norm": 67.07083115943713, "learning_rate": 2.348780487804878e-08, "logits/chosen": -0.0005963281728327274, "logits/rejected": 0.07341817021369934, "logps/chosen": -107.6330337524414, "logps/rejected": -145.70623779296875, "loss": 0.6575, "rewards/accuracies": 0.5, "rewards/chosen": -0.001352405408397317, "rewards/margins": -0.07420498877763748, "rewards/rejected": 0.07285258173942566, "step": 544 }, { "epoch": 0.33226642280140223, "grad_norm": 81.63252931962576, "learning_rate": 2.353170731707317e-08, "logits/chosen": 0.16309864819049835, "logits/rejected": 0.42371779680252075, "logps/chosen": -288.0001525878906, "logps/rejected": -364.6177978515625, "loss": 0.6596, "rewards/accuracies": 0.0, "rewards/chosen": 0.04256870597600937, "rewards/margins": -0.13648854196071625, "rewards/rejected": 0.17905724048614502, "step": 545 }, { "epoch": 0.33287608596250573, "grad_norm": 75.45374995454429, "learning_rate": 2.357560975609756e-08, "logits/chosen": 0.16709935665130615, "logits/rejected": 0.05223012715578079, "logps/chosen": -65.19556427001953, "logps/rejected": -85.7304916381836, "loss": 0.6486, "rewards/accuracies": 0.75, "rewards/chosen": 0.07292532920837402, "rewards/margins": 0.1342860907316208, "rewards/rejected": -0.061360765248537064, "step": 546 }, { "epoch": 0.33348574912360923, "grad_norm": 72.95823694610114, "learning_rate": 2.361951219512195e-08, "logits/chosen": -0.14129552245140076, "logits/rejected": 0.22636649012565613, "logps/chosen": -236.18309020996094, "logps/rejected": -157.30853271484375, "loss": 0.6344, "rewards/accuracies": 0.75, "rewards/chosen": 0.3008248209953308, "rewards/margins": 0.20586901903152466, "rewards/rejected": 0.09495582431554794, "step": 547 }, { "epoch": 0.3340954122847127, "grad_norm": 71.19061168092637, "learning_rate": 2.366341463414634e-08, "logits/chosen": 0.19440124928951263, "logits/rejected": 0.21193870902061462, "logps/chosen": -179.04261779785156, "logps/rejected": -165.11044311523438, "loss": 0.622, "rewards/accuracies": 1.0, "rewards/chosen": 0.15733900666236877, "rewards/margins": 0.35409873723983765, "rewards/rejected": -0.19675976037979126, "step": 548 }, { "epoch": 0.3347050754458162, "grad_norm": 63.34199159951003, "learning_rate": 2.3707317073170733e-08, "logits/chosen": 0.249933123588562, "logits/rejected": 0.23116062581539154, "logps/chosen": -184.37347412109375, "logps/rejected": -255.3083038330078, "loss": 0.5696, "rewards/accuracies": 1.0, "rewards/chosen": 0.2683231234550476, "rewards/margins": 0.49700772762298584, "rewards/rejected": -0.22868461906909943, "step": 549 }, { "epoch": 0.3353147386069197, "grad_norm": 55.812426895975854, "learning_rate": 2.375121951219512e-08, "logits/chosen": 0.1399931013584137, "logits/rejected": 0.1378995031118393, "logps/chosen": -126.45514678955078, "logps/rejected": -93.26830291748047, "loss": 0.6243, "rewards/accuracies": 0.5, "rewards/chosen": 0.318451464176178, "rewards/margins": 0.1935044676065445, "rewards/rejected": 0.12494699656963348, "step": 550 }, { "epoch": 0.3359244017680232, "grad_norm": 63.73615632089044, "learning_rate": 2.3795121951219513e-08, "logits/chosen": -0.05238550901412964, "logits/rejected": -0.06300175189971924, "logps/chosen": -62.070892333984375, "logps/rejected": -65.6217041015625, "loss": 0.6442, "rewards/accuracies": 0.25, "rewards/chosen": 0.06361308693885803, "rewards/margins": -0.04511312395334244, "rewards/rejected": 0.10872621089220047, "step": 551 }, { "epoch": 0.33653406492912663, "grad_norm": 60.98861820658699, "learning_rate": 2.38390243902439e-08, "logits/chosen": 0.00033330172300338745, "logits/rejected": 0.49101191759109497, "logps/chosen": -167.7310791015625, "logps/rejected": -69.52841186523438, "loss": 0.6688, "rewards/accuracies": 0.25, "rewards/chosen": 0.1323298215866089, "rewards/margins": -0.07170376926660538, "rewards/rejected": 0.20403359830379486, "step": 552 }, { "epoch": 0.33714372809023013, "grad_norm": 62.068599127478905, "learning_rate": 2.3882926829268293e-08, "logits/chosen": 0.07346828281879425, "logits/rejected": 0.02286439761519432, "logps/chosen": -62.5282096862793, "logps/rejected": -20.32370376586914, "loss": 0.6152, "rewards/accuracies": 1.0, "rewards/chosen": 0.07491831481456757, "rewards/margins": 0.1205805242061615, "rewards/rejected": -0.045662201941013336, "step": 553 }, { "epoch": 0.33775339125133363, "grad_norm": 71.62768706552319, "learning_rate": 2.392682926829268e-08, "logits/chosen": 0.12269076704978943, "logits/rejected": 0.061230845749378204, "logps/chosen": -83.42967224121094, "logps/rejected": -191.16082763671875, "loss": 0.6555, "rewards/accuracies": 0.5, "rewards/chosen": 0.0845014825463295, "rewards/margins": -0.11915123462677002, "rewards/rejected": 0.20365272462368011, "step": 554 }, { "epoch": 0.33836305441243714, "grad_norm": 80.26393332917321, "learning_rate": 2.3970731707317073e-08, "logits/chosen": 0.30099964141845703, "logits/rejected": 0.3035103380680084, "logps/chosen": -101.5709228515625, "logps/rejected": -15.632425308227539, "loss": 0.6439, "rewards/accuracies": 0.75, "rewards/chosen": 0.0711577907204628, "rewards/margins": 0.25126728415489197, "rewards/rejected": -0.18010951578617096, "step": 555 }, { "epoch": 0.33897271757354064, "grad_norm": 71.05295871031151, "learning_rate": 2.4014634146341465e-08, "logits/chosen": -0.012122197076678276, "logits/rejected": 0.025360623374581337, "logps/chosen": -188.68231201171875, "logps/rejected": -147.80508422851562, "loss": 0.6335, "rewards/accuracies": 0.5, "rewards/chosen": 0.04689235985279083, "rewards/margins": -0.04648181423544884, "rewards/rejected": 0.09337417036294937, "step": 556 }, { "epoch": 0.3395823807346441, "grad_norm": 59.3051777736367, "learning_rate": 2.4058536585365853e-08, "logits/chosen": 0.1398051679134369, "logits/rejected": 0.09347712993621826, "logps/chosen": -144.3924102783203, "logps/rejected": -133.54249572753906, "loss": 0.6568, "rewards/accuracies": 0.75, "rewards/chosen": 0.14586153626441956, "rewards/margins": 0.03632477670907974, "rewards/rejected": 0.10953675210475922, "step": 557 }, { "epoch": 0.3401920438957476, "grad_norm": 89.3280318886537, "learning_rate": 2.4102439024390245e-08, "logits/chosen": 0.26811841130256653, "logits/rejected": 0.23905262351036072, "logps/chosen": -41.126434326171875, "logps/rejected": -100.11540222167969, "loss": 0.6867, "rewards/accuracies": 0.5, "rewards/chosen": 0.055931802839040756, "rewards/margins": 0.059824489057064056, "rewards/rejected": -0.003892684355378151, "step": 558 }, { "epoch": 0.3408017070568511, "grad_norm": 71.71546246082835, "learning_rate": 2.4146341463414633e-08, "logits/chosen": -0.025946272537112236, "logits/rejected": -0.0661325678229332, "logps/chosen": -365.43255615234375, "logps/rejected": -279.7004089355469, "loss": 0.6458, "rewards/accuracies": 0.5, "rewards/chosen": 0.09921663254499435, "rewards/margins": 0.05765338987112045, "rewards/rejected": 0.0415632463991642, "step": 559 }, { "epoch": 0.3414113702179546, "grad_norm": 60.206821993863926, "learning_rate": 2.4190243902439025e-08, "logits/chosen": -0.03434105962514877, "logits/rejected": -0.031055085361003876, "logps/chosen": -103.45355224609375, "logps/rejected": -94.31544494628906, "loss": 0.6573, "rewards/accuracies": 0.75, "rewards/chosen": 0.12553460896015167, "rewards/margins": 0.10331953316926956, "rewards/rejected": 0.022215068340301514, "step": 560 }, { "epoch": 0.3420210333790581, "grad_norm": 61.698290186140916, "learning_rate": 2.4234146341463413e-08, "logits/chosen": 0.38516178727149963, "logits/rejected": 0.11057097464799881, "logps/chosen": -31.34721565246582, "logps/rejected": -58.38990783691406, "loss": 0.6082, "rewards/accuracies": 1.0, "rewards/chosen": 0.02334180474281311, "rewards/margins": 0.1805194914340973, "rewards/rejected": -0.15717768669128418, "step": 561 }, { "epoch": 0.34263069654016154, "grad_norm": 68.31080004404484, "learning_rate": 2.4278048780487805e-08, "logits/chosen": 0.1655484139919281, "logits/rejected": 0.1072499006986618, "logps/chosen": -163.44290161132812, "logps/rejected": -214.94918823242188, "loss": 0.6252, "rewards/accuracies": 1.0, "rewards/chosen": 0.4357205927371979, "rewards/margins": 0.15322071313858032, "rewards/rejected": 0.28249990940093994, "step": 562 }, { "epoch": 0.34324035970126504, "grad_norm": 63.549523457687265, "learning_rate": 2.4321951219512197e-08, "logits/chosen": 0.17718505859375, "logits/rejected": 0.32033926248550415, "logps/chosen": -126.55470275878906, "logps/rejected": -42.37413024902344, "loss": 0.6359, "rewards/accuracies": 0.75, "rewards/chosen": 0.2075459212064743, "rewards/margins": 0.22927585244178772, "rewards/rejected": -0.02172992192208767, "step": 563 }, { "epoch": 0.34385002286236854, "grad_norm": 69.94560030516728, "learning_rate": 2.4365853658536585e-08, "logits/chosen": 0.04947260022163391, "logits/rejected": 0.13389050960540771, "logps/chosen": -206.68463134765625, "logps/rejected": -107.59349822998047, "loss": 0.6131, "rewards/accuracies": 1.0, "rewards/chosen": 0.33965569734573364, "rewards/margins": 0.27065369486808777, "rewards/rejected": 0.06900197267532349, "step": 564 }, { "epoch": 0.34445968602347204, "grad_norm": 63.260715615579315, "learning_rate": 2.4409756097560977e-08, "logits/chosen": 0.0563773475587368, "logits/rejected": 0.27324387431144714, "logps/chosen": -266.93096923828125, "logps/rejected": -228.7593994140625, "loss": 0.6155, "rewards/accuracies": 0.75, "rewards/chosen": 0.17397937178611755, "rewards/margins": 0.08989344537258148, "rewards/rejected": 0.08408594131469727, "step": 565 }, { "epoch": 0.34506934918457555, "grad_norm": 59.0380784420323, "learning_rate": 2.4453658536585365e-08, "logits/chosen": -0.053584471344947815, "logits/rejected": 0.08549386262893677, "logps/chosen": -189.98947143554688, "logps/rejected": -146.95396423339844, "loss": 0.6506, "rewards/accuracies": 0.75, "rewards/chosen": 0.2520523965358734, "rewards/margins": 0.15931786596775055, "rewards/rejected": 0.09273453056812286, "step": 566 }, { "epoch": 0.345679012345679, "grad_norm": 62.21306142356516, "learning_rate": 2.4497560975609757e-08, "logits/chosen": 0.22712036967277527, "logits/rejected": 0.28604844212532043, "logps/chosen": -157.5207061767578, "logps/rejected": -165.08287048339844, "loss": 0.6307, "rewards/accuracies": 1.0, "rewards/chosen": 0.14640095829963684, "rewards/margins": 0.2731837034225464, "rewards/rejected": -0.12678274512290955, "step": 567 }, { "epoch": 0.3462886755067825, "grad_norm": 63.807436263494004, "learning_rate": 2.4541463414634145e-08, "logits/chosen": -0.3708324432373047, "logits/rejected": -0.13896696269512177, "logps/chosen": -153.66146850585938, "logps/rejected": -145.76602172851562, "loss": 0.6455, "rewards/accuracies": 0.75, "rewards/chosen": 0.08404551446437836, "rewards/margins": 0.06497307866811752, "rewards/rejected": 0.019072428345680237, "step": 568 }, { "epoch": 0.346898338667886, "grad_norm": 72.64496190421848, "learning_rate": 2.4585365853658537e-08, "logits/chosen": 0.250077486038208, "logits/rejected": 0.24369224905967712, "logps/chosen": -26.746715545654297, "logps/rejected": -8.553413391113281, "loss": 0.6595, "rewards/accuracies": 0.75, "rewards/chosen": -0.04704287275671959, "rewards/margins": 0.07402165234088898, "rewards/rejected": -0.12106452882289886, "step": 569 }, { "epoch": 0.3475080018289895, "grad_norm": 66.27267922513238, "learning_rate": 2.462926829268293e-08, "logits/chosen": 0.16420923173427582, "logits/rejected": 0.26274797320365906, "logps/chosen": -132.50790405273438, "logps/rejected": -165.24411010742188, "loss": 0.6342, "rewards/accuracies": 1.0, "rewards/chosen": 0.3622353971004486, "rewards/margins": 0.5996338129043579, "rewards/rejected": -0.2373984009027481, "step": 570 }, { "epoch": 0.348117664990093, "grad_norm": 66.13326481370741, "learning_rate": 2.4673170731707317e-08, "logits/chosen": 0.20698092877864838, "logits/rejected": 0.1994612067937851, "logps/chosen": -127.6097412109375, "logps/rejected": -132.74673461914062, "loss": 0.6664, "rewards/accuracies": 0.75, "rewards/chosen": 0.24468494951725006, "rewards/margins": 0.17810288071632385, "rewards/rejected": 0.06658206135034561, "step": 571 }, { "epoch": 0.34872732815119645, "grad_norm": 57.7065467173085, "learning_rate": 2.471707317073171e-08, "logits/chosen": 0.3937755823135376, "logits/rejected": 0.3192245662212372, "logps/chosen": -169.89508056640625, "logps/rejected": -196.94338989257812, "loss": 0.6427, "rewards/accuracies": 0.5, "rewards/chosen": 0.28047704696655273, "rewards/margins": 0.1190960705280304, "rewards/rejected": 0.16138097643852234, "step": 572 }, { "epoch": 0.34933699131229995, "grad_norm": 61.772205677062786, "learning_rate": 2.4760975609756094e-08, "logits/chosen": 0.03855567425489426, "logits/rejected": 0.19162461161613464, "logps/chosen": -150.66961669921875, "logps/rejected": -136.82655334472656, "loss": 0.6092, "rewards/accuracies": 1.0, "rewards/chosen": 0.3261014521121979, "rewards/margins": 0.2646014392375946, "rewards/rejected": 0.061499979346990585, "step": 573 }, { "epoch": 0.34994665447340345, "grad_norm": 79.087034481911, "learning_rate": 2.4804878048780485e-08, "logits/chosen": -0.046487804502248764, "logits/rejected": -0.0954296812415123, "logps/chosen": -118.04635620117188, "logps/rejected": -134.03256225585938, "loss": 0.6336, "rewards/accuracies": 0.25, "rewards/chosen": 0.09429363906383514, "rewards/margins": -0.03607354685664177, "rewards/rejected": 0.1303671896457672, "step": 574 }, { "epoch": 0.35055631763450695, "grad_norm": 61.058163521118374, "learning_rate": 2.4848780487804874e-08, "logits/chosen": 0.3075591027736664, "logits/rejected": 0.25359150767326355, "logps/chosen": -80.47167205810547, "logps/rejected": -81.86988067626953, "loss": 0.6882, "rewards/accuracies": 0.5, "rewards/chosen": 0.029255548492074013, "rewards/margins": 0.03396584466099739, "rewards/rejected": -0.004710295237600803, "step": 575 }, { "epoch": 0.3511659807956104, "grad_norm": 68.89227737141198, "learning_rate": 2.4892682926829265e-08, "logits/chosen": -0.1473444551229477, "logits/rejected": 0.18843889236450195, "logps/chosen": -91.84371185302734, "logps/rejected": -104.91816711425781, "loss": 0.6436, "rewards/accuracies": 0.0, "rewards/chosen": -0.044121529906988144, "rewards/margins": -0.1411334127187729, "rewards/rejected": 0.09701188653707504, "step": 576 }, { "epoch": 0.3517756439567139, "grad_norm": 64.52521815440315, "learning_rate": 2.4936585365853654e-08, "logits/chosen": 0.2751694619655609, "logits/rejected": 0.1315845549106598, "logps/chosen": -6.20741605758667, "logps/rejected": -28.119037628173828, "loss": 0.6317, "rewards/accuracies": 0.5, "rewards/chosen": 0.004677858203649521, "rewards/margins": -0.004088805988430977, "rewards/rejected": 0.008766662329435349, "step": 577 }, { "epoch": 0.3523853071178174, "grad_norm": 61.48646710998834, "learning_rate": 2.4980487804878046e-08, "logits/chosen": 0.10936979949474335, "logits/rejected": 0.008752191439270973, "logps/chosen": -75.4015884399414, "logps/rejected": -107.59015655517578, "loss": 0.6482, "rewards/accuracies": 0.75, "rewards/chosen": 0.06906801462173462, "rewards/margins": 0.146926149725914, "rewards/rejected": -0.07785812765359879, "step": 578 }, { "epoch": 0.3529949702789209, "grad_norm": 63.98283651957225, "learning_rate": 2.5024390243902437e-08, "logits/chosen": 0.39335334300994873, "logits/rejected": 0.28164124488830566, "logps/chosen": -45.73773956298828, "logps/rejected": -87.71354675292969, "loss": 0.6305, "rewards/accuracies": 0.75, "rewards/chosen": -0.043201033025979996, "rewards/margins": 0.014671443030238152, "rewards/rejected": -0.0578724704682827, "step": 579 }, { "epoch": 0.3536046334400244, "grad_norm": 72.69712815816003, "learning_rate": 2.5068292682926826e-08, "logits/chosen": -0.1359988898038864, "logits/rejected": -0.0037320032715797424, "logps/chosen": -393.5069885253906, "logps/rejected": -309.19940185546875, "loss": 0.6311, "rewards/accuracies": 0.25, "rewards/chosen": 0.3374473452568054, "rewards/margins": -0.012655630707740784, "rewards/rejected": 0.3501029908657074, "step": 580 }, { "epoch": 0.35421429660112785, "grad_norm": 70.34952103315145, "learning_rate": 2.5112195121951217e-08, "logits/chosen": -0.2626177668571472, "logits/rejected": -0.10716196149587631, "logps/chosen": -189.8940887451172, "logps/rejected": -88.66100311279297, "loss": 0.6163, "rewards/accuracies": 0.75, "rewards/chosen": 0.2558242678642273, "rewards/margins": 0.19720950722694397, "rewards/rejected": 0.058614760637283325, "step": 581 }, { "epoch": 0.35482395976223136, "grad_norm": 60.92662912150131, "learning_rate": 2.5156097560975606e-08, "logits/chosen": 0.17372733354568481, "logits/rejected": 0.09753230959177017, "logps/chosen": -40.65727996826172, "logps/rejected": -53.03999710083008, "loss": 0.6426, "rewards/accuracies": 0.75, "rewards/chosen": 0.01947101578116417, "rewards/margins": 0.16377048194408417, "rewards/rejected": -0.1442994624376297, "step": 582 }, { "epoch": 0.35543362292333486, "grad_norm": 65.82111635184187, "learning_rate": 2.5199999999999997e-08, "logits/chosen": 0.15699997544288635, "logits/rejected": 0.1702466607093811, "logps/chosen": -108.42682647705078, "logps/rejected": -67.42084503173828, "loss": 0.633, "rewards/accuracies": 1.0, "rewards/chosen": 0.08658311516046524, "rewards/margins": 0.3346436321735382, "rewards/rejected": -0.24806050956249237, "step": 583 }, { "epoch": 0.35604328608443836, "grad_norm": 61.589723908941714, "learning_rate": 2.5243902439024386e-08, "logits/chosen": 0.24087904393672943, "logits/rejected": 0.4224730134010315, "logps/chosen": -124.71540832519531, "logps/rejected": -80.59169006347656, "loss": 0.6058, "rewards/accuracies": 1.0, "rewards/chosen": 0.06363040953874588, "rewards/margins": 0.09655527770519257, "rewards/rejected": -0.032924868166446686, "step": 584 }, { "epoch": 0.35665294924554186, "grad_norm": 69.94307749141346, "learning_rate": 2.5287804878048777e-08, "logits/chosen": -0.1330254077911377, "logits/rejected": 0.2797281742095947, "logps/chosen": -322.4261169433594, "logps/rejected": -200.09893798828125, "loss": 0.6805, "rewards/accuracies": 0.75, "rewards/chosen": 0.354610800743103, "rewards/margins": 0.17412516474723816, "rewards/rejected": 0.18048560619354248, "step": 585 }, { "epoch": 0.3572626124066453, "grad_norm": 60.674530083099555, "learning_rate": 2.533170731707317e-08, "logits/chosen": 0.15709838271141052, "logits/rejected": 0.09432573616504669, "logps/chosen": -201.58251953125, "logps/rejected": -287.25811767578125, "loss": 0.6596, "rewards/accuracies": 0.75, "rewards/chosen": 0.22978299856185913, "rewards/margins": 0.34129127860069275, "rewards/rejected": -0.11150828003883362, "step": 586 }, { "epoch": 0.3578722755677488, "grad_norm": 57.70496811338579, "learning_rate": 2.5375609756097557e-08, "logits/chosen": 0.13312861323356628, "logits/rejected": 0.14644429087638855, "logps/chosen": -26.847915649414062, "logps/rejected": -12.667652130126953, "loss": 0.6401, "rewards/accuracies": 0.5, "rewards/chosen": -0.05336686596274376, "rewards/margins": 0.060440223664045334, "rewards/rejected": -0.11380708962678909, "step": 587 }, { "epoch": 0.3584819387288523, "grad_norm": 66.64224583434539, "learning_rate": 2.541951219512195e-08, "logits/chosen": 0.07735345512628555, "logits/rejected": -0.014005718752741814, "logps/chosen": -289.8581237792969, "logps/rejected": -347.80267333984375, "loss": 0.6078, "rewards/accuracies": 1.0, "rewards/chosen": 0.4404403567314148, "rewards/margins": 0.3935664892196655, "rewards/rejected": 0.04687386006116867, "step": 588 }, { "epoch": 0.3590916018899558, "grad_norm": 70.91864831755105, "learning_rate": 2.5463414634146338e-08, "logits/chosen": -0.07984452694654465, "logits/rejected": -0.11352888494729996, "logps/chosen": -50.47224044799805, "logps/rejected": -70.70494842529297, "loss": 0.6122, "rewards/accuracies": 0.75, "rewards/chosen": 0.0006060954183340073, "rewards/margins": 0.10464400053024292, "rewards/rejected": -0.10403790324926376, "step": 589 }, { "epoch": 0.3597012650510593, "grad_norm": 58.66353988250958, "learning_rate": 2.550731707317073e-08, "logits/chosen": 0.1725425124168396, "logits/rejected": 0.13134923577308655, "logps/chosen": -162.28570556640625, "logps/rejected": -213.63504028320312, "loss": 0.6036, "rewards/accuracies": 1.0, "rewards/chosen": 0.2620026767253876, "rewards/margins": 0.5470747947692871, "rewards/rejected": -0.2850721478462219, "step": 590 }, { "epoch": 0.36031092821216276, "grad_norm": 67.7212065027267, "learning_rate": 2.555121951219512e-08, "logits/chosen": 0.3118616044521332, "logits/rejected": 0.37517768144607544, "logps/chosen": -32.91007995605469, "logps/rejected": -36.30499267578125, "loss": 0.6835, "rewards/accuracies": 0.5, "rewards/chosen": 0.0029077772051095963, "rewards/margins": -0.015037847682833672, "rewards/rejected": 0.017945624887943268, "step": 591 }, { "epoch": 0.36092059137326626, "grad_norm": 61.84826057047049, "learning_rate": 2.559512195121951e-08, "logits/chosen": 0.09155456721782684, "logits/rejected": 0.17857593297958374, "logps/chosen": -71.72034454345703, "logps/rejected": -34.7586669921875, "loss": 0.6194, "rewards/accuracies": 0.75, "rewards/chosen": -0.006021538749337196, "rewards/margins": 0.09854938089847565, "rewards/rejected": -0.10457092523574829, "step": 592 }, { "epoch": 0.36153025453436977, "grad_norm": 59.44271924749285, "learning_rate": 2.56390243902439e-08, "logits/chosen": 0.14848221838474274, "logits/rejected": 0.13061706721782684, "logps/chosen": -45.13452911376953, "logps/rejected": -103.86456298828125, "loss": 0.5884, "rewards/accuracies": 0.5, "rewards/chosen": -0.042713914066553116, "rewards/margins": 0.01090623065829277, "rewards/rejected": -0.053620144724845886, "step": 593 }, { "epoch": 0.36213991769547327, "grad_norm": 55.28440428018266, "learning_rate": 2.568292682926829e-08, "logits/chosen": 0.007122687995433807, "logits/rejected": 0.0011376794427633286, "logps/chosen": -131.5517120361328, "logps/rejected": -125.87528991699219, "loss": 0.6337, "rewards/accuracies": 0.75, "rewards/chosen": 0.07693450897932053, "rewards/margins": 0.0759023129940033, "rewards/rejected": 0.0010321959853172302, "step": 594 }, { "epoch": 0.3627495808565767, "grad_norm": 57.442289571363595, "learning_rate": 2.572682926829268e-08, "logits/chosen": 0.26929086446762085, "logits/rejected": 0.06947465986013412, "logps/chosen": -281.8733215332031, "logps/rejected": -255.4256134033203, "loss": 0.6305, "rewards/accuracies": 0.75, "rewards/chosen": 0.21762491762638092, "rewards/margins": 0.6367673277854919, "rewards/rejected": -0.41914239525794983, "step": 595 }, { "epoch": 0.3633592440176802, "grad_norm": 60.725394018101916, "learning_rate": 2.577073170731707e-08, "logits/chosen": 0.13580162823200226, "logits/rejected": 0.1205705925822258, "logps/chosen": -12.920661926269531, "logps/rejected": -41.98260498046875, "loss": 0.5966, "rewards/accuracies": 0.5, "rewards/chosen": -0.025257421657443047, "rewards/margins": 0.2696884274482727, "rewards/rejected": -0.2949458658695221, "step": 596 }, { "epoch": 0.3639689071787837, "grad_norm": 61.11923315504998, "learning_rate": 2.581463414634146e-08, "logits/chosen": 0.2698628902435303, "logits/rejected": 0.24275220930576324, "logps/chosen": -144.9392547607422, "logps/rejected": -102.06745910644531, "loss": 0.5998, "rewards/accuracies": 0.5, "rewards/chosen": 0.3213733732700348, "rewards/margins": 0.4839481711387634, "rewards/rejected": -0.16257481276988983, "step": 597 }, { "epoch": 0.3645785703398872, "grad_norm": 78.61711851550798, "learning_rate": 2.5858536585365853e-08, "logits/chosen": 0.050785765051841736, "logits/rejected": 0.19618116319179535, "logps/chosen": -277.87274169921875, "logps/rejected": -258.81658935546875, "loss": 0.6219, "rewards/accuracies": 0.5, "rewards/chosen": 0.07663650810718536, "rewards/margins": 0.04727335274219513, "rewards/rejected": 0.029363155364990234, "step": 598 }, { "epoch": 0.3651882335009907, "grad_norm": 54.04566101972143, "learning_rate": 2.590243902439024e-08, "logits/chosen": 0.11316078901290894, "logits/rejected": 0.1296045333147049, "logps/chosen": -11.70112419128418, "logps/rejected": -17.967309951782227, "loss": 0.6094, "rewards/accuracies": 0.75, "rewards/chosen": -0.22652791440486908, "rewards/margins": 0.0768495574593544, "rewards/rejected": -0.3033774495124817, "step": 599 }, { "epoch": 0.36579789666209417, "grad_norm": 82.02657260746992, "learning_rate": 2.5946341463414633e-08, "logits/chosen": 0.0012502595782279968, "logits/rejected": -0.0025876015424728394, "logps/chosen": -196.02114868164062, "logps/rejected": -176.1319580078125, "loss": 0.6134, "rewards/accuracies": 0.75, "rewards/chosen": 0.20056456327438354, "rewards/margins": 0.05789215862751007, "rewards/rejected": 0.14267241954803467, "step": 600 }, { "epoch": 0.36640755982319767, "grad_norm": 73.01668902389659, "learning_rate": 2.599024390243902e-08, "logits/chosen": 0.08000324666500092, "logits/rejected": 0.10965539515018463, "logps/chosen": -262.96356201171875, "logps/rejected": -286.09930419921875, "loss": 0.6877, "rewards/accuracies": 1.0, "rewards/chosen": 0.32981452345848083, "rewards/margins": 0.4419025182723999, "rewards/rejected": -0.11208801716566086, "step": 601 }, { "epoch": 0.3670172229843012, "grad_norm": 62.326988624410966, "learning_rate": 2.6034146341463413e-08, "logits/chosen": 0.21029147505760193, "logits/rejected": 0.18066349625587463, "logps/chosen": -10.674514770507812, "logps/rejected": -10.799851417541504, "loss": 0.6504, "rewards/accuracies": 0.75, "rewards/chosen": -0.0390445850789547, "rewards/margins": 0.20681487023830414, "rewards/rejected": -0.24585945904254913, "step": 602 }, { "epoch": 0.3676268861454047, "grad_norm": 88.21455280476565, "learning_rate": 2.60780487804878e-08, "logits/chosen": 0.22360165417194366, "logits/rejected": 0.3238444924354553, "logps/chosen": -282.7586669921875, "logps/rejected": -458.24884033203125, "loss": 0.6415, "rewards/accuracies": 1.0, "rewards/chosen": 0.21803241968154907, "rewards/margins": 0.10435838997364044, "rewards/rejected": 0.11367402970790863, "step": 603 }, { "epoch": 0.3682365493065082, "grad_norm": 74.28301102381702, "learning_rate": 2.6121951219512193e-08, "logits/chosen": 0.09303002059459686, "logits/rejected": 0.0715787410736084, "logps/chosen": -138.5469512939453, "logps/rejected": -226.74530029296875, "loss": 0.5986, "rewards/accuracies": 0.25, "rewards/chosen": 0.2518017590045929, "rewards/margins": 0.0004120338708162308, "rewards/rejected": 0.2513897120952606, "step": 604 }, { "epoch": 0.3688462124676116, "grad_norm": 64.03420564684374, "learning_rate": 2.6165853658536585e-08, "logits/chosen": 0.04821214824914932, "logits/rejected": 0.02449846640229225, "logps/chosen": -17.56277847290039, "logps/rejected": -19.87277603149414, "loss": 0.6, "rewards/accuracies": 0.5, "rewards/chosen": -0.028018198907375336, "rewards/margins": 0.03261035308241844, "rewards/rejected": -0.060628555715084076, "step": 605 }, { "epoch": 0.3694558756287151, "grad_norm": 85.19802946627, "learning_rate": 2.6209756097560973e-08, "logits/chosen": 0.04915054887533188, "logits/rejected": 0.17558442056179047, "logps/chosen": -292.1290588378906, "logps/rejected": -250.21385192871094, "loss": 0.6789, "rewards/accuracies": 0.75, "rewards/chosen": 0.2755224406719208, "rewards/margins": 0.2095935046672821, "rewards/rejected": 0.06592892855405807, "step": 606 }, { "epoch": 0.3700655387898186, "grad_norm": 67.86329763868036, "learning_rate": 2.6253658536585365e-08, "logits/chosen": 0.10586974769830704, "logits/rejected": 0.2383757382631302, "logps/chosen": -30.485618591308594, "logps/rejected": -30.937406539916992, "loss": 0.6889, "rewards/accuracies": 0.25, "rewards/chosen": -0.23874562978744507, "rewards/margins": -0.07639502733945847, "rewards/rejected": -0.162350594997406, "step": 607 }, { "epoch": 0.37067520195092213, "grad_norm": 65.64589668684825, "learning_rate": 2.6297560975609753e-08, "logits/chosen": 0.4129244089126587, "logits/rejected": 0.43831074237823486, "logps/chosen": -120.45503234863281, "logps/rejected": -67.74662780761719, "loss": 0.6077, "rewards/accuracies": 1.0, "rewards/chosen": 0.21287314593791962, "rewards/margins": 0.41677525639533997, "rewards/rejected": -0.20390209555625916, "step": 608 }, { "epoch": 0.37128486511202563, "grad_norm": 66.54793285466044, "learning_rate": 2.6341463414634145e-08, "logits/chosen": -0.08084888756275177, "logits/rejected": 0.12200430035591125, "logps/chosen": -184.31590270996094, "logps/rejected": -74.2082748413086, "loss": 0.649, "rewards/accuracies": 0.75, "rewards/chosen": -0.030466752126812935, "rewards/margins": 0.13831934332847595, "rewards/rejected": -0.16878609359264374, "step": 609 }, { "epoch": 0.3718945282731291, "grad_norm": 65.99674430249515, "learning_rate": 2.6385365853658533e-08, "logits/chosen": -0.0379466786980629, "logits/rejected": -0.013483867049217224, "logps/chosen": -125.67252349853516, "logps/rejected": -161.0394287109375, "loss": 0.6256, "rewards/accuracies": 1.0, "rewards/chosen": 0.09288941323757172, "rewards/margins": 0.2684478461742401, "rewards/rejected": -0.1755584180355072, "step": 610 }, { "epoch": 0.3725041914342326, "grad_norm": 76.62280528901553, "learning_rate": 2.6429268292682925e-08, "logits/chosen": 0.07567193359136581, "logits/rejected": 0.07395648211240768, "logps/chosen": -209.67477416992188, "logps/rejected": -286.802001953125, "loss": 0.5782, "rewards/accuracies": 0.25, "rewards/chosen": 0.17806631326675415, "rewards/margins": -0.10854325443506241, "rewards/rejected": 0.28660956025123596, "step": 611 }, { "epoch": 0.3731138545953361, "grad_norm": 62.54991344920887, "learning_rate": 2.6473170731707317e-08, "logits/chosen": 0.3633056879043579, "logits/rejected": 0.004547901451587677, "logps/chosen": -409.0431213378906, "logps/rejected": -439.3016662597656, "loss": 0.5697, "rewards/accuracies": 0.75, "rewards/chosen": 0.4457567632198334, "rewards/margins": 0.5710694193840027, "rewards/rejected": -0.12531261146068573, "step": 612 }, { "epoch": 0.3737235177564396, "grad_norm": 66.9626481658841, "learning_rate": 2.6517073170731705e-08, "logits/chosen": -0.004068553447723389, "logits/rejected": 0.010545983910560608, "logps/chosen": -14.627309799194336, "logps/rejected": -17.238407135009766, "loss": 0.6377, "rewards/accuracies": 0.5, "rewards/chosen": -0.005788658745586872, "rewards/margins": 0.06385204941034317, "rewards/rejected": -0.06964070349931717, "step": 613 }, { "epoch": 0.3743331809175431, "grad_norm": 74.93035243234006, "learning_rate": 2.6560975609756097e-08, "logits/chosen": 0.08081544935703278, "logits/rejected": 0.40592074394226074, "logps/chosen": -182.25711059570312, "logps/rejected": -106.70711517333984, "loss": 0.6102, "rewards/accuracies": 0.5, "rewards/chosen": 0.11156254261732101, "rewards/margins": 0.03907126933336258, "rewards/rejected": 0.07249126583337784, "step": 614 }, { "epoch": 0.37494284407864653, "grad_norm": 61.75363896667214, "learning_rate": 2.6604878048780485e-08, "logits/chosen": 0.2579991817474365, "logits/rejected": 0.18988223373889923, "logps/chosen": -255.18344116210938, "logps/rejected": -276.07342529296875, "loss": 0.599, "rewards/accuracies": 0.75, "rewards/chosen": 0.6108578443527222, "rewards/margins": 1.0364336967468262, "rewards/rejected": -0.42557579278945923, "step": 615 }, { "epoch": 0.37555250723975003, "grad_norm": 63.01762850157472, "learning_rate": 2.6648780487804877e-08, "logits/chosen": 0.2430221140384674, "logits/rejected": 0.2056267410516739, "logps/chosen": -56.53331756591797, "logps/rejected": -109.94190216064453, "loss": 0.632, "rewards/accuracies": 0.75, "rewards/chosen": -0.002829909324645996, "rewards/margins": 0.08583585917949677, "rewards/rejected": -0.08866576850414276, "step": 616 }, { "epoch": 0.37616217040085353, "grad_norm": 61.26783468198263, "learning_rate": 2.6692682926829265e-08, "logits/chosen": 0.05079840123653412, "logits/rejected": 0.24230441451072693, "logps/chosen": -239.391845703125, "logps/rejected": -182.6611785888672, "loss": 0.5432, "rewards/accuracies": 0.75, "rewards/chosen": 0.3134889006614685, "rewards/margins": 0.33168116211891174, "rewards/rejected": -0.01819225214421749, "step": 617 }, { "epoch": 0.37677183356195704, "grad_norm": 72.05342352266527, "learning_rate": 2.6736585365853657e-08, "logits/chosen": 0.03983582556247711, "logits/rejected": 0.2674873173236847, "logps/chosen": -188.82066345214844, "logps/rejected": -243.94949340820312, "loss": 0.6583, "rewards/accuracies": 0.75, "rewards/chosen": 0.36443835496902466, "rewards/margins": 0.336212694644928, "rewards/rejected": 0.02822566032409668, "step": 618 }, { "epoch": 0.3773814967230605, "grad_norm": 53.29340159227129, "learning_rate": 2.678048780487805e-08, "logits/chosen": 0.08060707151889801, "logits/rejected": 0.2136414647102356, "logps/chosen": -106.3077392578125, "logps/rejected": -167.1398468017578, "loss": 0.6813, "rewards/accuracies": 0.5, "rewards/chosen": 0.12663131952285767, "rewards/margins": -0.055943287909030914, "rewards/rejected": 0.18257459998130798, "step": 619 }, { "epoch": 0.377991159884164, "grad_norm": 66.32848817785538, "learning_rate": 2.6824390243902437e-08, "logits/chosen": 0.1777549833059311, "logits/rejected": 0.20667846500873566, "logps/chosen": -265.8216857910156, "logps/rejected": -93.6600341796875, "loss": 0.6218, "rewards/accuracies": 1.0, "rewards/chosen": 0.17918093502521515, "rewards/margins": 0.2043815553188324, "rewards/rejected": -0.0252006184309721, "step": 620 }, { "epoch": 0.3786008230452675, "grad_norm": 64.159257404677, "learning_rate": 2.686829268292683e-08, "logits/chosen": 0.028806494548916817, "logits/rejected": 0.1387287676334381, "logps/chosen": -235.42311096191406, "logps/rejected": -163.48081970214844, "loss": 0.604, "rewards/accuracies": 0.5, "rewards/chosen": 0.31116387248039246, "rewards/margins": 0.46241557598114014, "rewards/rejected": -0.15125170350074768, "step": 621 }, { "epoch": 0.379210486206371, "grad_norm": 61.34920112663299, "learning_rate": 2.6912195121951217e-08, "logits/chosen": 0.2551673650741577, "logits/rejected": 0.2296556830406189, "logps/chosen": -73.38729095458984, "logps/rejected": -97.46251678466797, "loss": 0.6072, "rewards/accuracies": 0.5, "rewards/chosen": 0.06867722421884537, "rewards/margins": 0.24628911912441254, "rewards/rejected": -0.17761187255382538, "step": 622 }, { "epoch": 0.3798201493674745, "grad_norm": 64.58048001676416, "learning_rate": 2.695609756097561e-08, "logits/chosen": 0.16222016513347626, "logits/rejected": 0.4692246913909912, "logps/chosen": -123.88907623291016, "logps/rejected": -281.60882568359375, "loss": 0.5987, "rewards/accuracies": 1.0, "rewards/chosen": 0.24188756942749023, "rewards/margins": 0.4141005575656891, "rewards/rejected": -0.17221297323703766, "step": 623 }, { "epoch": 0.38042981252857794, "grad_norm": 64.59196605516046, "learning_rate": 2.6999999999999997e-08, "logits/chosen": 0.2751791477203369, "logits/rejected": 0.020809587091207504, "logps/chosen": -121.01636505126953, "logps/rejected": -124.5457992553711, "loss": 0.6112, "rewards/accuracies": 0.75, "rewards/chosen": 0.2425137460231781, "rewards/margins": 0.3062474727630615, "rewards/rejected": -0.06373371928930283, "step": 624 }, { "epoch": 0.38103947568968144, "grad_norm": 76.26461460607403, "learning_rate": 2.704390243902439e-08, "logits/chosen": 0.3228560984134674, "logits/rejected": 0.26304954290390015, "logps/chosen": -377.4947509765625, "logps/rejected": -228.72023010253906, "loss": 0.6185, "rewards/accuracies": 0.5, "rewards/chosen": 0.9364129900932312, "rewards/margins": 0.6026533842086792, "rewards/rejected": 0.333759605884552, "step": 625 }, { "epoch": 0.38164913885078494, "grad_norm": 61.474582734685946, "learning_rate": 2.708780487804878e-08, "logits/chosen": 0.04169579595327377, "logits/rejected": -0.024029143154621124, "logps/chosen": -77.93717193603516, "logps/rejected": -43.741764068603516, "loss": 0.604, "rewards/accuracies": 1.0, "rewards/chosen": 0.03765379264950752, "rewards/margins": 0.2079656720161438, "rewards/rejected": -0.17031188309192657, "step": 626 }, { "epoch": 0.38225880201188844, "grad_norm": 78.40463661409733, "learning_rate": 2.713170731707317e-08, "logits/chosen": 0.18412914872169495, "logits/rejected": 0.25874099135398865, "logps/chosen": -205.51585388183594, "logps/rejected": -260.1470642089844, "loss": 0.6656, "rewards/accuracies": 0.0, "rewards/chosen": -0.02864876016974449, "rewards/margins": -0.2759316861629486, "rewards/rejected": 0.24728290736675262, "step": 627 }, { "epoch": 0.38286846517299195, "grad_norm": 62.60707704672063, "learning_rate": 2.717560975609756e-08, "logits/chosen": 0.15112635493278503, "logits/rejected": 0.19681477546691895, "logps/chosen": -54.30095672607422, "logps/rejected": -87.17314147949219, "loss": 0.6241, "rewards/accuracies": 0.5, "rewards/chosen": 0.03811212629079819, "rewards/margins": -0.048415981233119965, "rewards/rejected": 0.08652810752391815, "step": 628 }, { "epoch": 0.3834781283340954, "grad_norm": 58.090678691824294, "learning_rate": 2.721951219512195e-08, "logits/chosen": -0.14929108321666718, "logits/rejected": -0.1105942651629448, "logps/chosen": -111.02405548095703, "logps/rejected": -58.99564743041992, "loss": 0.5989, "rewards/accuracies": 0.75, "rewards/chosen": 0.041006699204444885, "rewards/margins": 0.2893308401107788, "rewards/rejected": -0.24832415580749512, "step": 629 }, { "epoch": 0.3840877914951989, "grad_norm": 72.43955388155143, "learning_rate": 2.726341463414634e-08, "logits/chosen": 0.35203802585601807, "logits/rejected": 0.2940349876880646, "logps/chosen": -56.03721618652344, "logps/rejected": -87.06900024414062, "loss": 0.6378, "rewards/accuracies": 0.75, "rewards/chosen": 0.26018720865249634, "rewards/margins": 0.35242322087287903, "rewards/rejected": -0.0922359973192215, "step": 630 }, { "epoch": 0.3846974546563024, "grad_norm": 77.05555555464018, "learning_rate": 2.7307317073170732e-08, "logits/chosen": 0.02849944308400154, "logits/rejected": 0.06376279145479202, "logps/chosen": -233.94810485839844, "logps/rejected": -178.75323486328125, "loss": 0.6211, "rewards/accuracies": 0.25, "rewards/chosen": 0.23587554693222046, "rewards/margins": 0.2200503647327423, "rewards/rejected": 0.01582520082592964, "step": 631 }, { "epoch": 0.3853071178174059, "grad_norm": 67.95815795868499, "learning_rate": 2.735121951219512e-08, "logits/chosen": 0.16180747747421265, "logits/rejected": 0.1460152268409729, "logps/chosen": -79.08220672607422, "logps/rejected": -144.6344757080078, "loss": 0.7137, "rewards/accuracies": 0.5, "rewards/chosen": 0.15169088542461395, "rewards/margins": 0.2004600316286087, "rewards/rejected": -0.04876915365457535, "step": 632 }, { "epoch": 0.3859167809785094, "grad_norm": 66.57615580150977, "learning_rate": 2.7395121951219512e-08, "logits/chosen": 0.05967415124177933, "logits/rejected": 0.04310440272092819, "logps/chosen": -130.2741241455078, "logps/rejected": -153.22393798828125, "loss": 0.6184, "rewards/accuracies": 0.5, "rewards/chosen": -0.07257428020238876, "rewards/margins": 0.13157668709754944, "rewards/rejected": -0.2041509598493576, "step": 633 }, { "epoch": 0.38652644413961285, "grad_norm": 70.27046849210986, "learning_rate": 2.74390243902439e-08, "logits/chosen": 0.07667946815490723, "logits/rejected": 0.1487220972776413, "logps/chosen": -220.3509979248047, "logps/rejected": -176.93246459960938, "loss": 0.6569, "rewards/accuracies": 0.75, "rewards/chosen": 0.22415469586849213, "rewards/margins": 0.02412354201078415, "rewards/rejected": 0.20003116130828857, "step": 634 }, { "epoch": 0.38713610730071635, "grad_norm": 77.25348241348446, "learning_rate": 2.7482926829268292e-08, "logits/chosen": 0.197032630443573, "logits/rejected": 0.2097669243812561, "logps/chosen": -259.7798156738281, "logps/rejected": -191.10232543945312, "loss": 0.6192, "rewards/accuracies": 1.0, "rewards/chosen": 0.2788991630077362, "rewards/margins": 0.4779600501060486, "rewards/rejected": -0.19906088709831238, "step": 635 }, { "epoch": 0.38774577046181985, "grad_norm": 88.06831080556853, "learning_rate": 2.752682926829268e-08, "logits/chosen": -0.2233126014471054, "logits/rejected": 0.20519399642944336, "logps/chosen": -324.0554504394531, "logps/rejected": -229.15579223632812, "loss": 0.6627, "rewards/accuracies": 0.75, "rewards/chosen": 0.42614394426345825, "rewards/margins": 0.22994571924209595, "rewards/rejected": 0.1961982250213623, "step": 636 }, { "epoch": 0.38835543362292335, "grad_norm": 59.53481307899062, "learning_rate": 2.7570731707317072e-08, "logits/chosen": 0.2308369278907776, "logits/rejected": 0.1919964998960495, "logps/chosen": -342.44024658203125, "logps/rejected": -247.9741668701172, "loss": 0.6123, "rewards/accuracies": 0.25, "rewards/chosen": 0.4173545241355896, "rewards/margins": 0.3117733597755432, "rewards/rejected": 0.10558116436004639, "step": 637 }, { "epoch": 0.3889650967840268, "grad_norm": 72.4663834849674, "learning_rate": 2.7614634146341464e-08, "logits/chosen": -0.15110087394714355, "logits/rejected": -0.034562867134809494, "logps/chosen": -423.80523681640625, "logps/rejected": -349.52862548828125, "loss": 0.6287, "rewards/accuracies": 0.5, "rewards/chosen": 0.10292831063270569, "rewards/margins": 0.027959585189819336, "rewards/rejected": 0.07496871054172516, "step": 638 }, { "epoch": 0.3895747599451303, "grad_norm": 64.31086506734529, "learning_rate": 2.7658536585365852e-08, "logits/chosen": 0.28114038705825806, "logits/rejected": 0.18702393770217896, "logps/chosen": -143.953857421875, "logps/rejected": -242.09315490722656, "loss": 0.6291, "rewards/accuracies": 1.0, "rewards/chosen": 0.38662075996398926, "rewards/margins": 0.841455340385437, "rewards/rejected": -0.45483461022377014, "step": 639 }, { "epoch": 0.3901844231062338, "grad_norm": 74.62046947229135, "learning_rate": 2.7702439024390244e-08, "logits/chosen": 0.09527795016765594, "logits/rejected": 0.19550162553787231, "logps/chosen": -236.70750427246094, "logps/rejected": -175.36721801757812, "loss": 0.6556, "rewards/accuracies": 0.75, "rewards/chosen": 0.3368799388408661, "rewards/margins": 0.3067728579044342, "rewards/rejected": 0.03010706976056099, "step": 640 }, { "epoch": 0.3907940862673373, "grad_norm": 58.84896249669811, "learning_rate": 2.7746341463414632e-08, "logits/chosen": 0.21856433153152466, "logits/rejected": 0.5366150140762329, "logps/chosen": -138.56478881835938, "logps/rejected": -122.19781494140625, "loss": 0.6536, "rewards/accuracies": 0.25, "rewards/chosen": -0.12763060629367828, "rewards/margins": -0.14063982665538788, "rewards/rejected": 0.013009227812290192, "step": 641 }, { "epoch": 0.3914037494284408, "grad_norm": 74.98145898545785, "learning_rate": 2.7790243902439024e-08, "logits/chosen": 0.009811624884605408, "logits/rejected": -0.10907647758722305, "logps/chosen": -280.00177001953125, "logps/rejected": -388.580078125, "loss": 0.5937, "rewards/accuracies": 0.5, "rewards/chosen": 0.1680523157119751, "rewards/margins": -0.2368372678756714, "rewards/rejected": 0.4048895835876465, "step": 642 }, { "epoch": 0.39201341258954425, "grad_norm": 63.40548124910095, "learning_rate": 2.7834146341463412e-08, "logits/chosen": 0.13475245237350464, "logits/rejected": 0.07800167798995972, "logps/chosen": -103.65362548828125, "logps/rejected": -132.5025634765625, "loss": 0.6457, "rewards/accuracies": 0.75, "rewards/chosen": 0.22668324410915375, "rewards/margins": 0.20280678570270538, "rewards/rejected": 0.023876458406448364, "step": 643 }, { "epoch": 0.39262307575064775, "grad_norm": 59.45460211674044, "learning_rate": 2.7878048780487804e-08, "logits/chosen": 0.05927290767431259, "logits/rejected": 0.15233726799488068, "logps/chosen": -156.35134887695312, "logps/rejected": -108.80286407470703, "loss": 0.6305, "rewards/accuracies": 0.5, "rewards/chosen": 0.2818257808685303, "rewards/margins": 0.09953179210424423, "rewards/rejected": 0.18229399621486664, "step": 644 }, { "epoch": 0.39323273891175126, "grad_norm": 74.82741469493133, "learning_rate": 2.7921951219512196e-08, "logits/chosen": -0.12207479774951935, "logits/rejected": -0.033077970147132874, "logps/chosen": -167.2114715576172, "logps/rejected": -316.16888427734375, "loss": 0.6721, "rewards/accuracies": 0.5, "rewards/chosen": 0.09867286682128906, "rewards/margins": -0.18251752853393555, "rewards/rejected": 0.2811903953552246, "step": 645 }, { "epoch": 0.39384240207285476, "grad_norm": 58.8966000715914, "learning_rate": 2.7965853658536584e-08, "logits/chosen": 0.1431388258934021, "logits/rejected": 0.15884003043174744, "logps/chosen": -183.662353515625, "logps/rejected": -136.40989685058594, "loss": 0.584, "rewards/accuracies": 0.75, "rewards/chosen": 0.3585079312324524, "rewards/margins": 0.18456687033176422, "rewards/rejected": 0.17394104599952698, "step": 646 }, { "epoch": 0.39445206523395826, "grad_norm": 66.56144323024465, "learning_rate": 2.8009756097560976e-08, "logits/chosen": -0.05217306315898895, "logits/rejected": 0.31576934456825256, "logps/chosen": -491.3421630859375, "logps/rejected": -256.726806640625, "loss": 0.6575, "rewards/accuracies": 0.5, "rewards/chosen": 0.40523847937583923, "rewards/margins": 0.09070847928524017, "rewards/rejected": 0.31453001499176025, "step": 647 }, { "epoch": 0.3950617283950617, "grad_norm": 65.05690657077261, "learning_rate": 2.8053658536585364e-08, "logits/chosen": 0.10608386993408203, "logits/rejected": 0.3763849437236786, "logps/chosen": -301.9100036621094, "logps/rejected": -167.71836853027344, "loss": 0.5863, "rewards/accuracies": 1.0, "rewards/chosen": 0.43449002504348755, "rewards/margins": 0.6286348700523376, "rewards/rejected": -0.1941448450088501, "step": 648 }, { "epoch": 0.3956713915561652, "grad_norm": 55.662315019410975, "learning_rate": 2.8097560975609756e-08, "logits/chosen": 0.08954723179340363, "logits/rejected": 0.25629645586013794, "logps/chosen": -235.14169311523438, "logps/rejected": -162.6898956298828, "loss": 0.5806, "rewards/accuracies": 1.0, "rewards/chosen": 0.31468138098716736, "rewards/margins": 0.3700261116027832, "rewards/rejected": -0.05534471571445465, "step": 649 }, { "epoch": 0.3962810547172687, "grad_norm": 54.99441906497402, "learning_rate": 2.8141463414634144e-08, "logits/chosen": 0.379417359828949, "logits/rejected": 0.3932171165943146, "logps/chosen": -134.65982055664062, "logps/rejected": -186.13771057128906, "loss": 0.5903, "rewards/accuracies": 0.75, "rewards/chosen": 0.26799216866493225, "rewards/margins": 0.41732701659202576, "rewards/rejected": -0.1493348479270935, "step": 650 }, { "epoch": 0.3968907178783722, "grad_norm": 63.65642017121965, "learning_rate": 2.8185365853658536e-08, "logits/chosen": 0.18639619648456573, "logits/rejected": 0.11451603472232819, "logps/chosen": -37.6251220703125, "logps/rejected": -46.764347076416016, "loss": 0.6263, "rewards/accuracies": 0.0, "rewards/chosen": -0.009578034281730652, "rewards/margins": -0.06655454635620117, "rewards/rejected": 0.05697651207447052, "step": 651 }, { "epoch": 0.3975003810394757, "grad_norm": 61.55800631767217, "learning_rate": 2.8229268292682928e-08, "logits/chosen": 0.21169736981391907, "logits/rejected": 0.054191380739212036, "logps/chosen": -64.55624389648438, "logps/rejected": -203.20297241210938, "loss": 0.5654, "rewards/accuracies": 1.0, "rewards/chosen": 0.36324232816696167, "rewards/margins": 0.8434259295463562, "rewards/rejected": -0.4801836311817169, "step": 652 }, { "epoch": 0.39811004420057916, "grad_norm": 67.02171009729318, "learning_rate": 2.8273170731707316e-08, "logits/chosen": 0.3522152304649353, "logits/rejected": 0.22334620356559753, "logps/chosen": -42.5975456237793, "logps/rejected": -94.68328094482422, "loss": 0.5757, "rewards/accuracies": 1.0, "rewards/chosen": 0.35991886258125305, "rewards/margins": 0.660916268825531, "rewards/rejected": -0.30099746584892273, "step": 653 }, { "epoch": 0.39871970736168266, "grad_norm": 60.33990850522962, "learning_rate": 2.8317073170731708e-08, "logits/chosen": -0.06947705149650574, "logits/rejected": -0.008604679256677628, "logps/chosen": -624.090576171875, "logps/rejected": -350.2554931640625, "loss": 0.6184, "rewards/accuracies": 1.0, "rewards/chosen": 0.5592214465141296, "rewards/margins": 0.21749423444271088, "rewards/rejected": 0.3417271375656128, "step": 654 }, { "epoch": 0.39932937052278616, "grad_norm": 72.4216369967682, "learning_rate": 2.8360975609756096e-08, "logits/chosen": 0.08260777592658997, "logits/rejected": 0.41911986470222473, "logps/chosen": -200.58506774902344, "logps/rejected": -147.6679229736328, "loss": 0.6381, "rewards/accuracies": 0.0, "rewards/chosen": -0.11888381093740463, "rewards/margins": -0.4000794291496277, "rewards/rejected": 0.28119561076164246, "step": 655 }, { "epoch": 0.39993903368388967, "grad_norm": 59.37134808479706, "learning_rate": 2.8404878048780488e-08, "logits/chosen": 0.09101929515600204, "logits/rejected": -0.015520691871643066, "logps/chosen": -25.448081970214844, "logps/rejected": -34.50359344482422, "loss": 0.6068, "rewards/accuracies": 1.0, "rewards/chosen": -0.1263624131679535, "rewards/margins": 0.1127481684088707, "rewards/rejected": -0.23911058902740479, "step": 656 }, { "epoch": 0.40054869684499317, "grad_norm": 73.76492939555055, "learning_rate": 2.8448780487804876e-08, "logits/chosen": -0.08089803159236908, "logits/rejected": -0.1267295479774475, "logps/chosen": -129.6185302734375, "logps/rejected": -112.44964599609375, "loss": 0.5982, "rewards/accuracies": 0.5, "rewards/chosen": -0.33473771810531616, "rewards/margins": 0.12635132670402527, "rewards/rejected": -0.46108904480934143, "step": 657 }, { "epoch": 0.4011583600060966, "grad_norm": 68.49005292734218, "learning_rate": 2.8492682926829268e-08, "logits/chosen": 0.049766190350055695, "logits/rejected": 0.5010601282119751, "logps/chosen": -229.0572967529297, "logps/rejected": -115.39363861083984, "loss": 0.6345, "rewards/accuracies": 1.0, "rewards/chosen": 0.2862285077571869, "rewards/margins": 0.4368448853492737, "rewards/rejected": -0.1506163626909256, "step": 658 }, { "epoch": 0.4017680231672001, "grad_norm": 57.899060720807014, "learning_rate": 2.853658536585366e-08, "logits/chosen": -0.002681232988834381, "logits/rejected": 0.18251019716262817, "logps/chosen": -274.3082275390625, "logps/rejected": -143.87628173828125, "loss": 0.6054, "rewards/accuracies": 0.75, "rewards/chosen": 0.4599049389362335, "rewards/margins": 0.3964844346046448, "rewards/rejected": 0.06342048943042755, "step": 659 }, { "epoch": 0.4023776863283036, "grad_norm": 58.124639074578916, "learning_rate": 2.8580487804878048e-08, "logits/chosen": -0.04075242951512337, "logits/rejected": -0.0851912871003151, "logps/chosen": -100.02337646484375, "logps/rejected": -100.20783233642578, "loss": 0.6088, "rewards/accuracies": 0.75, "rewards/chosen": 0.2120712846517563, "rewards/margins": 0.08371191471815109, "rewards/rejected": 0.1283593773841858, "step": 660 }, { "epoch": 0.4029873494894071, "grad_norm": 56.53324777516352, "learning_rate": 2.862439024390244e-08, "logits/chosen": 0.26801446080207825, "logits/rejected": 0.19303756952285767, "logps/chosen": -99.32714080810547, "logps/rejected": -147.7371063232422, "loss": 0.6217, "rewards/accuracies": 0.5, "rewards/chosen": 0.012360095977783203, "rewards/margins": 0.24427659809589386, "rewards/rejected": -0.23191648721694946, "step": 661 }, { "epoch": 0.40359701265051057, "grad_norm": 64.62602867593822, "learning_rate": 2.8668292682926828e-08, "logits/chosen": 0.39362233877182007, "logits/rejected": 0.3923282027244568, "logps/chosen": -154.65005493164062, "logps/rejected": -84.58635711669922, "loss": 0.6036, "rewards/accuracies": 0.5, "rewards/chosen": 0.1573656052350998, "rewards/margins": 0.14125274121761322, "rewards/rejected": 0.016112878918647766, "step": 662 }, { "epoch": 0.40420667581161407, "grad_norm": 67.35757001483378, "learning_rate": 2.871219512195122e-08, "logits/chosen": 0.08849135041236877, "logits/rejected": 0.013642445206642151, "logps/chosen": -12.219024658203125, "logps/rejected": -46.77790069580078, "loss": 0.6035, "rewards/accuracies": 0.5, "rewards/chosen": -0.00918569229543209, "rewards/margins": 0.018711738288402557, "rewards/rejected": -0.027897430583834648, "step": 663 }, { "epoch": 0.40481633897271757, "grad_norm": 66.45965976248175, "learning_rate": 2.875609756097561e-08, "logits/chosen": 0.10113909840583801, "logits/rejected": 0.3675948977470398, "logps/chosen": -184.7261199951172, "logps/rejected": -80.58538055419922, "loss": 0.6096, "rewards/accuracies": 0.5, "rewards/chosen": 0.2339383214712143, "rewards/margins": 0.21842002868652344, "rewards/rejected": 0.015518282540142536, "step": 664 }, { "epoch": 0.4054260021338211, "grad_norm": 67.09031137593702, "learning_rate": 2.88e-08, "logits/chosen": 0.06466561555862427, "logits/rejected": 0.1334514021873474, "logps/chosen": -119.71806335449219, "logps/rejected": -60.799110412597656, "loss": 0.6022, "rewards/accuracies": 0.75, "rewards/chosen": 0.189555361866951, "rewards/margins": 0.3266139030456543, "rewards/rejected": -0.1370585411787033, "step": 665 }, { "epoch": 0.4060356652949246, "grad_norm": 68.8168633458203, "learning_rate": 2.884390243902439e-08, "logits/chosen": 0.17734110355377197, "logits/rejected": 0.3334428071975708, "logps/chosen": -156.93003845214844, "logps/rejected": -160.6712188720703, "loss": 0.5605, "rewards/accuracies": 1.0, "rewards/chosen": 0.3025633692741394, "rewards/margins": 0.23625826835632324, "rewards/rejected": 0.06630511581897736, "step": 666 }, { "epoch": 0.406645328456028, "grad_norm": 55.28685825452296, "learning_rate": 2.888780487804878e-08, "logits/chosen": 0.2348455935716629, "logits/rejected": 0.2635599374771118, "logps/chosen": -31.70298957824707, "logps/rejected": -49.15739440917969, "loss": 0.5776, "rewards/accuracies": 0.75, "rewards/chosen": -0.13425521552562714, "rewards/margins": 0.05660948157310486, "rewards/rejected": -0.1908647119998932, "step": 667 }, { "epoch": 0.4072549916171315, "grad_norm": 65.06682601829672, "learning_rate": 2.893170731707317e-08, "logits/chosen": 0.08870015293359756, "logits/rejected": -0.01683107763528824, "logps/chosen": -234.7543182373047, "logps/rejected": -288.1865539550781, "loss": 0.5969, "rewards/accuracies": 1.0, "rewards/chosen": -0.09488782286643982, "rewards/margins": 0.6672009229660034, "rewards/rejected": -0.7620887160301208, "step": 668 }, { "epoch": 0.407864654778235, "grad_norm": 69.1607193783598, "learning_rate": 2.897560975609756e-08, "logits/chosen": 0.18325266242027283, "logits/rejected": 0.15346255898475647, "logps/chosen": -9.335733413696289, "logps/rejected": -34.26789093017578, "loss": 0.6066, "rewards/accuracies": 0.25, "rewards/chosen": -0.21492107212543488, "rewards/margins": -0.04538650065660477, "rewards/rejected": -0.1695345640182495, "step": 669 }, { "epoch": 0.4084743179393385, "grad_norm": 61.33412935798923, "learning_rate": 2.901951219512195e-08, "logits/chosen": 0.20572735369205475, "logits/rejected": 0.2354370504617691, "logps/chosen": -287.7242736816406, "logps/rejected": -196.66990661621094, "loss": 0.6019, "rewards/accuracies": 1.0, "rewards/chosen": 0.4633364975452423, "rewards/margins": 0.35491088032722473, "rewards/rejected": 0.10842561721801758, "step": 670 }, { "epoch": 0.40908398110044203, "grad_norm": 63.76018813216311, "learning_rate": 2.9063414634146343e-08, "logits/chosen": 0.3203410506248474, "logits/rejected": 0.2411998212337494, "logps/chosen": -88.51042938232422, "logps/rejected": -224.8445587158203, "loss": 0.6055, "rewards/accuracies": 0.75, "rewards/chosen": 0.17546065151691437, "rewards/margins": 0.21716642379760742, "rewards/rejected": -0.041705772280693054, "step": 671 }, { "epoch": 0.4096936442615455, "grad_norm": 59.626468760791596, "learning_rate": 2.9107317073170732e-08, "logits/chosen": 0.18035395443439484, "logits/rejected": 0.0916692316532135, "logps/chosen": -27.960620880126953, "logps/rejected": -65.0535888671875, "loss": 0.6099, "rewards/accuracies": 0.75, "rewards/chosen": -0.0217488631606102, "rewards/margins": 0.33223849534988403, "rewards/rejected": -0.35398733615875244, "step": 672 }, { "epoch": 0.410303307422649, "grad_norm": 60.57315382535163, "learning_rate": 2.9151219512195123e-08, "logits/chosen": -0.18432225286960602, "logits/rejected": 0.004740915726870298, "logps/chosen": -175.02383422851562, "logps/rejected": -133.32498168945312, "loss": 0.5955, "rewards/accuracies": 0.75, "rewards/chosen": 0.11643826961517334, "rewards/margins": 0.003228917717933655, "rewards/rejected": 0.11320935189723969, "step": 673 }, { "epoch": 0.4109129705837525, "grad_norm": 62.77459814493296, "learning_rate": 2.9195121951219512e-08, "logits/chosen": 0.16996510326862335, "logits/rejected": 0.2245965600013733, "logps/chosen": -117.73829650878906, "logps/rejected": -110.91796112060547, "loss": 0.5902, "rewards/accuracies": 1.0, "rewards/chosen": 0.2513304054737091, "rewards/margins": 0.13217899203300476, "rewards/rejected": 0.11915141344070435, "step": 674 }, { "epoch": 0.411522633744856, "grad_norm": 59.23703862462656, "learning_rate": 2.9239024390243903e-08, "logits/chosen": 0.006302252411842346, "logits/rejected": 0.4283897578716278, "logps/chosen": -104.32003784179688, "logps/rejected": -91.69224548339844, "loss": 0.6458, "rewards/accuracies": 0.5, "rewards/chosen": -0.0008779680356383324, "rewards/margins": -0.0031674085184931755, "rewards/rejected": 0.002289438620209694, "step": 675 }, { "epoch": 0.4121322969059595, "grad_norm": 67.56419466850504, "learning_rate": 2.928292682926829e-08, "logits/chosen": 0.04366625100374222, "logits/rejected": 0.001990571618080139, "logps/chosen": -24.013965606689453, "logps/rejected": -29.999656677246094, "loss": 0.6031, "rewards/accuracies": 0.75, "rewards/chosen": -0.020657753571867943, "rewards/margins": 0.2397357076406479, "rewards/rejected": -0.2603934705257416, "step": 676 }, { "epoch": 0.41274196006706293, "grad_norm": 66.70018573927544, "learning_rate": 2.932682926829268e-08, "logits/chosen": 0.13844181597232819, "logits/rejected": 0.11456754058599472, "logps/chosen": -126.58256530761719, "logps/rejected": -96.23551940917969, "loss": 0.6074, "rewards/accuracies": 0.5, "rewards/chosen": 0.02874419093132019, "rewards/margins": 0.3369070887565613, "rewards/rejected": -0.3081628680229187, "step": 677 }, { "epoch": 0.41335162322816643, "grad_norm": 73.67012423016767, "learning_rate": 2.937073170731707e-08, "logits/chosen": 0.22111207246780396, "logits/rejected": 0.0935177430510521, "logps/chosen": -200.04986572265625, "logps/rejected": -174.7672882080078, "loss": 0.595, "rewards/accuracies": 0.75, "rewards/chosen": 0.3721103072166443, "rewards/margins": 0.1897158920764923, "rewards/rejected": 0.18239441514015198, "step": 678 }, { "epoch": 0.41396128638926993, "grad_norm": 69.02246934189627, "learning_rate": 2.941463414634146e-08, "logits/chosen": 0.16653214395046234, "logits/rejected": 0.27699148654937744, "logps/chosen": -318.82958984375, "logps/rejected": -242.1185302734375, "loss": 0.5718, "rewards/accuracies": 0.75, "rewards/chosen": 0.32409214973449707, "rewards/margins": 0.6411502957344055, "rewards/rejected": -0.31705817580223083, "step": 679 }, { "epoch": 0.41457094955037344, "grad_norm": 54.36427580713745, "learning_rate": 2.9458536585365852e-08, "logits/chosen": 0.031883664429187775, "logits/rejected": 0.08884549885988235, "logps/chosen": -35.28754425048828, "logps/rejected": -49.85274124145508, "loss": 0.5745, "rewards/accuracies": 1.0, "rewards/chosen": -0.031313348561525345, "rewards/margins": 0.12778198719024658, "rewards/rejected": -0.15909533202648163, "step": 680 }, { "epoch": 0.4151806127114769, "grad_norm": 70.23710329187017, "learning_rate": 2.950243902439024e-08, "logits/chosen": 0.40195852518081665, "logits/rejected": 0.1879790872335434, "logps/chosen": -227.4771728515625, "logps/rejected": -247.98159790039062, "loss": 0.6005, "rewards/accuracies": 1.0, "rewards/chosen": 0.17011643946170807, "rewards/margins": 0.34443551301956177, "rewards/rejected": -0.1743190735578537, "step": 681 }, { "epoch": 0.4157902758725804, "grad_norm": 66.99963384126403, "learning_rate": 2.9546341463414632e-08, "logits/chosen": 0.11015382409095764, "logits/rejected": 0.12308009713888168, "logps/chosen": -291.40081787109375, "logps/rejected": -224.32606506347656, "loss": 0.603, "rewards/accuracies": 0.25, "rewards/chosen": 0.16739743947982788, "rewards/margins": 0.07820190489292145, "rewards/rejected": 0.08919551968574524, "step": 682 }, { "epoch": 0.4163999390336839, "grad_norm": 64.95623940593457, "learning_rate": 2.959024390243902e-08, "logits/chosen": 0.33314457535743713, "logits/rejected": 0.2583604156970978, "logps/chosen": -101.11124420166016, "logps/rejected": -135.29530334472656, "loss": 0.5881, "rewards/accuracies": 0.5, "rewards/chosen": -0.22422949969768524, "rewards/margins": 0.09729447960853577, "rewards/rejected": -0.3215239644050598, "step": 683 }, { "epoch": 0.4170096021947874, "grad_norm": 58.67034444294828, "learning_rate": 2.9634146341463412e-08, "logits/chosen": 0.06277957558631897, "logits/rejected": -0.015074517577886581, "logps/chosen": -152.89991760253906, "logps/rejected": -185.5033721923828, "loss": 0.5628, "rewards/accuracies": 0.75, "rewards/chosen": 0.2418362945318222, "rewards/margins": 0.06912834942340851, "rewards/rejected": 0.1727079302072525, "step": 684 }, { "epoch": 0.4176192653558909, "grad_norm": 50.70518998130048, "learning_rate": 2.96780487804878e-08, "logits/chosen": 0.02843397855758667, "logits/rejected": -0.058416157960891724, "logps/chosen": -117.25051879882812, "logps/rejected": -99.71661376953125, "loss": 0.5401, "rewards/accuracies": 1.0, "rewards/chosen": 0.39249762892723083, "rewards/margins": 0.43272140622138977, "rewards/rejected": -0.040223799645900726, "step": 685 }, { "epoch": 0.41822892851699434, "grad_norm": 74.10719820300136, "learning_rate": 2.9721951219512192e-08, "logits/chosen": 0.22975647449493408, "logits/rejected": -0.20629757642745972, "logps/chosen": -252.9994659423828, "logps/rejected": -247.56211853027344, "loss": 0.6202, "rewards/accuracies": 0.25, "rewards/chosen": 0.09769769012928009, "rewards/margins": -0.06985299289226532, "rewards/rejected": 0.1675506830215454, "step": 686 }, { "epoch": 0.41883859167809784, "grad_norm": 71.0633430218036, "learning_rate": 2.9765853658536584e-08, "logits/chosen": 0.011537957936525345, "logits/rejected": -0.014044620096683502, "logps/chosen": -163.05384826660156, "logps/rejected": -109.02774047851562, "loss": 0.5604, "rewards/accuracies": 0.75, "rewards/chosen": 0.21511390805244446, "rewards/margins": 0.24831253290176392, "rewards/rejected": -0.03319861367344856, "step": 687 }, { "epoch": 0.41944825483920134, "grad_norm": 60.030546752215145, "learning_rate": 2.980975609756097e-08, "logits/chosen": 0.03469804674386978, "logits/rejected": 0.09261640161275864, "logps/chosen": -114.98950958251953, "logps/rejected": -114.08073425292969, "loss": 0.5694, "rewards/accuracies": 0.75, "rewards/chosen": 0.0984640121459961, "rewards/margins": 0.2803994119167328, "rewards/rejected": -0.1819353997707367, "step": 688 }, { "epoch": 0.42005791800030484, "grad_norm": 67.9177216292696, "learning_rate": 2.985365853658536e-08, "logits/chosen": 0.1416039764881134, "logits/rejected": 0.2830074429512024, "logps/chosen": -146.13885498046875, "logps/rejected": -79.72393798828125, "loss": 0.5778, "rewards/accuracies": 0.75, "rewards/chosen": 0.18982058763504028, "rewards/margins": 0.2407340556383133, "rewards/rejected": -0.050913479179143906, "step": 689 }, { "epoch": 0.42066758116140834, "grad_norm": 58.77329784263347, "learning_rate": 2.9897560975609756e-08, "logits/chosen": 0.19391655921936035, "logits/rejected": 0.008816692978143692, "logps/chosen": -16.752595901489258, "logps/rejected": -24.750043869018555, "loss": 0.5741, "rewards/accuracies": 0.5, "rewards/chosen": -0.014958436600863934, "rewards/margins": 0.28227686882019043, "rewards/rejected": -0.2972353398799896, "step": 690 }, { "epoch": 0.4212772443225118, "grad_norm": 67.38783502416696, "learning_rate": 2.9941463414634144e-08, "logits/chosen": 0.06132356822490692, "logits/rejected": 0.1812308132648468, "logps/chosen": -61.43677520751953, "logps/rejected": -55.48193359375, "loss": 0.6256, "rewards/accuracies": 0.75, "rewards/chosen": -0.05021069198846817, "rewards/margins": 0.13919878005981445, "rewards/rejected": -0.18940944969654083, "step": 691 }, { "epoch": 0.4218869074836153, "grad_norm": 57.59018466854295, "learning_rate": 2.998536585365853e-08, "logits/chosen": 0.22436045110225677, "logits/rejected": 0.09355799108743668, "logps/chosen": -144.72726440429688, "logps/rejected": -72.65602111816406, "loss": 0.5584, "rewards/accuracies": 1.0, "rewards/chosen": 0.25349390506744385, "rewards/margins": 0.3784920275211334, "rewards/rejected": -0.12499812245368958, "step": 692 }, { "epoch": 0.4224965706447188, "grad_norm": 54.43371138409108, "learning_rate": 3.002926829268292e-08, "logits/chosen": 0.09764564037322998, "logits/rejected": 0.07500110566616058, "logps/chosen": -63.83013153076172, "logps/rejected": -87.51184844970703, "loss": 0.5458, "rewards/accuracies": 0.75, "rewards/chosen": 0.21895256638526917, "rewards/margins": 0.5135568976402283, "rewards/rejected": -0.2946043312549591, "step": 693 }, { "epoch": 0.4231062338058223, "grad_norm": 63.66155495833237, "learning_rate": 3.0073170731707316e-08, "logits/chosen": 0.11742034554481506, "logits/rejected": 0.1333976536989212, "logps/chosen": -63.16908264160156, "logps/rejected": -106.1064224243164, "loss": 0.5838, "rewards/accuracies": 0.25, "rewards/chosen": -0.26999402046203613, "rewards/margins": -0.02053636871278286, "rewards/rejected": -0.24945765733718872, "step": 694 }, { "epoch": 0.4237158969669258, "grad_norm": 72.38315149284581, "learning_rate": 3.0117073170731704e-08, "logits/chosen": 0.26569753885269165, "logits/rejected": -0.08579610288143158, "logps/chosen": -88.88272094726562, "logps/rejected": -156.12924194335938, "loss": 0.6408, "rewards/accuracies": 0.5, "rewards/chosen": -0.042749255895614624, "rewards/margins": -0.1455920785665512, "rewards/rejected": 0.1028428003191948, "step": 695 }, { "epoch": 0.42432556012802924, "grad_norm": 56.84359588093456, "learning_rate": 3.016097560975609e-08, "logits/chosen": -0.04833687096834183, "logits/rejected": 0.22585663199424744, "logps/chosen": -249.55300903320312, "logps/rejected": -227.1507110595703, "loss": 0.608, "rewards/accuracies": 1.0, "rewards/chosen": 0.2435014843940735, "rewards/margins": 0.49496057629585266, "rewards/rejected": -0.2514590919017792, "step": 696 }, { "epoch": 0.42493522328913275, "grad_norm": 69.98617860582188, "learning_rate": 3.020487804878049e-08, "logits/chosen": 0.12984676659107208, "logits/rejected": 0.15866082906723022, "logps/chosen": -122.64289093017578, "logps/rejected": -97.21748352050781, "loss": 0.6185, "rewards/accuracies": 0.5, "rewards/chosen": 0.0028444305062294006, "rewards/margins": -0.02080116793513298, "rewards/rejected": 0.02364560216665268, "step": 697 }, { "epoch": 0.42554488645023625, "grad_norm": 70.86364125078781, "learning_rate": 3.0248780487804876e-08, "logits/chosen": 0.22259321808815002, "logits/rejected": 0.601961076259613, "logps/chosen": -230.63555908203125, "logps/rejected": -80.44770050048828, "loss": 0.6267, "rewards/accuracies": 0.75, "rewards/chosen": 0.2698615491390228, "rewards/margins": 0.30879026651382446, "rewards/rejected": -0.03892870992422104, "step": 698 }, { "epoch": 0.42615454961133975, "grad_norm": 77.52768160366813, "learning_rate": 3.0292682926829264e-08, "logits/chosen": 0.2709192633628845, "logits/rejected": 0.26073017716407776, "logps/chosen": -533.0921020507812, "logps/rejected": -514.8524169921875, "loss": 0.6034, "rewards/accuracies": 0.75, "rewards/chosen": 0.5236976742744446, "rewards/margins": 0.33411556482315063, "rewards/rejected": 0.18958207964897156, "step": 699 }, { "epoch": 0.42676421277244325, "grad_norm": 62.52940453105315, "learning_rate": 3.033658536585366e-08, "logits/chosen": -0.030992362648248672, "logits/rejected": -0.010659024119377136, "logps/chosen": -117.72844696044922, "logps/rejected": -120.53231048583984, "loss": 0.5939, "rewards/accuracies": 0.5, "rewards/chosen": 0.11818039417266846, "rewards/margins": 0.2509658634662628, "rewards/rejected": -0.13278548419475555, "step": 700 }, { "epoch": 0.4273738759335467, "grad_norm": 52.743191936731016, "learning_rate": 3.038048780487805e-08, "logits/chosen": 0.3419884443283081, "logits/rejected": 0.3479151725769043, "logps/chosen": -28.024581909179688, "logps/rejected": -8.396501541137695, "loss": 0.5528, "rewards/accuracies": 0.75, "rewards/chosen": -0.19714289903640747, "rewards/margins": 0.2800540626049042, "rewards/rejected": -0.47719693183898926, "step": 701 }, { "epoch": 0.4279835390946502, "grad_norm": 67.83314550822708, "learning_rate": 3.0424390243902436e-08, "logits/chosen": 0.16552375257015228, "logits/rejected": 0.2708529531955719, "logps/chosen": -193.01100158691406, "logps/rejected": -141.35247802734375, "loss": 0.6163, "rewards/accuracies": 0.5, "rewards/chosen": 0.1269087791442871, "rewards/margins": 0.08467017859220505, "rewards/rejected": 0.04223859682679176, "step": 702 }, { "epoch": 0.4285932022557537, "grad_norm": 72.33856499657284, "learning_rate": 3.0468292682926824e-08, "logits/chosen": 0.16995982825756073, "logits/rejected": 0.3033416271209717, "logps/chosen": -49.045066833496094, "logps/rejected": -54.02178955078125, "loss": 0.6279, "rewards/accuracies": 0.75, "rewards/chosen": -0.08551420271396637, "rewards/margins": 0.43687137961387634, "rewards/rejected": -0.5223855972290039, "step": 703 }, { "epoch": 0.4292028654168572, "grad_norm": 69.2924140296883, "learning_rate": 3.051219512195122e-08, "logits/chosen": 0.09492413699626923, "logits/rejected": 0.39009159803390503, "logps/chosen": -131.39935302734375, "logps/rejected": -21.53441047668457, "loss": 0.5929, "rewards/accuracies": 0.25, "rewards/chosen": -0.35798388719558716, "rewards/margins": -0.26453471183776855, "rewards/rejected": -0.0934491753578186, "step": 704 }, { "epoch": 0.42981252857796065, "grad_norm": 61.93779746328726, "learning_rate": 3.055609756097561e-08, "logits/chosen": 0.252395898103714, "logits/rejected": 0.07689037919044495, "logps/chosen": -129.8458709716797, "logps/rejected": -188.48562622070312, "loss": 0.578, "rewards/accuracies": 0.75, "rewards/chosen": 0.34194260835647583, "rewards/margins": 0.27869632840156555, "rewards/rejected": 0.06324627995491028, "step": 705 }, { "epoch": 0.43042219173906415, "grad_norm": 71.25144144882567, "learning_rate": 3.0599999999999996e-08, "logits/chosen": -0.12025362253189087, "logits/rejected": -0.15497341752052307, "logps/chosen": -54.242740631103516, "logps/rejected": -93.62307739257812, "loss": 0.6163, "rewards/accuracies": 0.5, "rewards/chosen": -0.23059195280075073, "rewards/margins": -0.023295823484659195, "rewards/rejected": -0.20729614794254303, "step": 706 }, { "epoch": 0.43103185490016765, "grad_norm": 49.678861831397526, "learning_rate": 3.064390243902439e-08, "logits/chosen": 0.13708041608333588, "logits/rejected": 0.11697007715702057, "logps/chosen": -133.09588623046875, "logps/rejected": -121.47759246826172, "loss": 0.544, "rewards/accuracies": 0.75, "rewards/chosen": 0.4279193580150604, "rewards/margins": 0.4899780750274658, "rewards/rejected": -0.06205863878130913, "step": 707 }, { "epoch": 0.43164151806127116, "grad_norm": 62.799527935945854, "learning_rate": 3.068780487804878e-08, "logits/chosen": 0.08546426892280579, "logits/rejected": 0.35718727111816406, "logps/chosen": -164.70767211914062, "logps/rejected": -302.91119384765625, "loss": 0.6283, "rewards/accuracies": 0.25, "rewards/chosen": -0.013271257281303406, "rewards/margins": -0.034261077642440796, "rewards/rejected": 0.020989812910556793, "step": 708 }, { "epoch": 0.43225118122237466, "grad_norm": 64.40455509713055, "learning_rate": 3.073170731707317e-08, "logits/chosen": 0.1983512043952942, "logits/rejected": 0.155623197555542, "logps/chosen": -110.73567199707031, "logps/rejected": -110.76133728027344, "loss": 0.5777, "rewards/accuracies": 0.75, "rewards/chosen": 0.2301923781633377, "rewards/margins": 0.6113991737365723, "rewards/rejected": -0.38120681047439575, "step": 709 }, { "epoch": 0.4328608443834781, "grad_norm": 69.52137292022668, "learning_rate": 3.0775609756097556e-08, "logits/chosen": 0.3022085428237915, "logits/rejected": 0.28429025411605835, "logps/chosen": -240.02157592773438, "logps/rejected": -189.4278106689453, "loss": 0.6023, "rewards/accuracies": 0.75, "rewards/chosen": 0.41948139667510986, "rewards/margins": 0.3640276789665222, "rewards/rejected": 0.05545370653271675, "step": 710 }, { "epoch": 0.4334705075445816, "grad_norm": 61.579931805538884, "learning_rate": 3.081951219512195e-08, "logits/chosen": 0.04821467399597168, "logits/rejected": 0.01756604015827179, "logps/chosen": -68.94324493408203, "logps/rejected": -69.59725952148438, "loss": 0.5596, "rewards/accuracies": 0.75, "rewards/chosen": -0.04010574519634247, "rewards/margins": 0.022175416350364685, "rewards/rejected": -0.06228116154670715, "step": 711 }, { "epoch": 0.4340801707056851, "grad_norm": 58.74671687754795, "learning_rate": 3.086341463414634e-08, "logits/chosen": -0.02244146168231964, "logits/rejected": 0.3006266951560974, "logps/chosen": -388.98675537109375, "logps/rejected": -187.48379516601562, "loss": 0.4945, "rewards/accuracies": 0.75, "rewards/chosen": 0.9284478425979614, "rewards/margins": 0.9021939635276794, "rewards/rejected": 0.026253893971443176, "step": 712 }, { "epoch": 0.4346898338667886, "grad_norm": 66.28226054586283, "learning_rate": 3.090731707317073e-08, "logits/chosen": 0.3966588079929352, "logits/rejected": 0.44131675362586975, "logps/chosen": -29.034849166870117, "logps/rejected": -33.7619514465332, "loss": 0.5545, "rewards/accuracies": 0.75, "rewards/chosen": 0.01138560101389885, "rewards/margins": 0.20000213384628296, "rewards/rejected": -0.18861651420593262, "step": 713 }, { "epoch": 0.4352994970278921, "grad_norm": 62.41945677588309, "learning_rate": 3.095121951219512e-08, "logits/chosen": 0.10338619351387024, "logits/rejected": 0.2881770431995392, "logps/chosen": -201.3583526611328, "logps/rejected": -242.2822723388672, "loss": 0.575, "rewards/accuracies": 0.75, "rewards/chosen": 0.3309534788131714, "rewards/margins": 0.6583471298217773, "rewards/rejected": -0.32739365100860596, "step": 714 }, { "epoch": 0.43590916018899556, "grad_norm": 61.40612035038867, "learning_rate": 3.099512195121951e-08, "logits/chosen": 0.10096565634012222, "logits/rejected": 0.09104756265878677, "logps/chosen": -163.83914184570312, "logps/rejected": -104.75081634521484, "loss": 0.6446, "rewards/accuracies": 0.75, "rewards/chosen": 0.284401535987854, "rewards/margins": 0.19382619857788086, "rewards/rejected": 0.09057533740997314, "step": 715 }, { "epoch": 0.43651882335009906, "grad_norm": 65.1815971155587, "learning_rate": 3.10390243902439e-08, "logits/chosen": 0.13695839047431946, "logits/rejected": 0.049626439809799194, "logps/chosen": -129.7420196533203, "logps/rejected": -202.95199584960938, "loss": 0.6115, "rewards/accuracies": 0.0, "rewards/chosen": -0.12976928055286407, "rewards/margins": -0.125985786318779, "rewards/rejected": -0.0037834858521819115, "step": 716 }, { "epoch": 0.43712848651120256, "grad_norm": 62.22586883955595, "learning_rate": 3.108292682926829e-08, "logits/chosen": -0.07620099186897278, "logits/rejected": -0.012126855552196503, "logps/chosen": -160.33538818359375, "logps/rejected": -143.08425903320312, "loss": 0.6466, "rewards/accuracies": 1.0, "rewards/chosen": -0.0032648593187332153, "rewards/margins": 0.24996376037597656, "rewards/rejected": -0.25322863459587097, "step": 717 }, { "epoch": 0.43773814967230606, "grad_norm": 54.70946843812598, "learning_rate": 3.112682926829268e-08, "logits/chosen": 0.11618568748235703, "logits/rejected": 0.07170344144105911, "logps/chosen": -225.55813598632812, "logps/rejected": -188.16357421875, "loss": 0.4905, "rewards/accuracies": 1.0, "rewards/chosen": 0.5569877028465271, "rewards/margins": 0.6170770525932312, "rewards/rejected": -0.060089416801929474, "step": 718 }, { "epoch": 0.43834781283340957, "grad_norm": 73.89704569216683, "learning_rate": 3.117073170731707e-08, "logits/chosen": 0.005682835355401039, "logits/rejected": 0.06556177884340286, "logps/chosen": -310.00128173828125, "logps/rejected": -245.61236572265625, "loss": 0.6426, "rewards/accuracies": 0.75, "rewards/chosen": 0.6474918723106384, "rewards/margins": 0.668273389339447, "rewards/rejected": -0.020781513303518295, "step": 719 }, { "epoch": 0.438957475994513, "grad_norm": 69.77210354800309, "learning_rate": 3.121463414634146e-08, "logits/chosen": -0.1030096784234047, "logits/rejected": 0.20871181786060333, "logps/chosen": -204.392578125, "logps/rejected": -116.71359252929688, "loss": 0.5929, "rewards/accuracies": 1.0, "rewards/chosen": 0.3954564034938812, "rewards/margins": 0.3550819456577301, "rewards/rejected": 0.04037447273731232, "step": 720 }, { "epoch": 0.4395671391556165, "grad_norm": 56.44675678828737, "learning_rate": 3.1258536585365855e-08, "logits/chosen": -0.025892585515975952, "logits/rejected": 0.22776633501052856, "logps/chosen": -143.7974395751953, "logps/rejected": -73.87933349609375, "loss": 0.6297, "rewards/accuracies": 1.0, "rewards/chosen": -0.024740634486079216, "rewards/margins": 0.2636898159980774, "rewards/rejected": -0.28843045234680176, "step": 721 }, { "epoch": 0.44017680231672, "grad_norm": 71.2787091506665, "learning_rate": 3.130243902439024e-08, "logits/chosen": 0.23216727375984192, "logits/rejected": 0.16670744121074677, "logps/chosen": -52.42512130737305, "logps/rejected": -98.56019592285156, "loss": 0.6078, "rewards/accuracies": 0.75, "rewards/chosen": 0.2387477457523346, "rewards/margins": 0.1126733347773552, "rewards/rejected": 0.1260744035243988, "step": 722 }, { "epoch": 0.4407864654778235, "grad_norm": 64.68698901349826, "learning_rate": 3.134634146341463e-08, "logits/chosen": 0.39107462763786316, "logits/rejected": 0.0693548172712326, "logps/chosen": -133.65731811523438, "logps/rejected": -388.3984375, "loss": 0.6925, "rewards/accuracies": 0.75, "rewards/chosen": 0.06935802102088928, "rewards/margins": 0.1404474377632141, "rewards/rejected": -0.07108942419290543, "step": 723 }, { "epoch": 0.44139612863892697, "grad_norm": 64.92084998872986, "learning_rate": 3.139024390243902e-08, "logits/chosen": 0.22215789556503296, "logits/rejected": 0.5344216227531433, "logps/chosen": -234.95579528808594, "logps/rejected": -183.60125732421875, "loss": 0.5487, "rewards/accuracies": 0.75, "rewards/chosen": 0.61018306016922, "rewards/margins": 0.33607998490333557, "rewards/rejected": 0.2741030156612396, "step": 724 }, { "epoch": 0.44200579180003047, "grad_norm": 64.20521057241812, "learning_rate": 3.1434146341463415e-08, "logits/chosen": 0.1445278376340866, "logits/rejected": 0.24122527241706848, "logps/chosen": -255.03555297851562, "logps/rejected": -215.16542053222656, "loss": 0.5486, "rewards/accuracies": 0.5, "rewards/chosen": 0.309909850358963, "rewards/margins": 0.14310120046138763, "rewards/rejected": 0.16680864989757538, "step": 725 }, { "epoch": 0.44261545496113397, "grad_norm": 60.5035617542339, "learning_rate": 3.1478048780487803e-08, "logits/chosen": 0.28575438261032104, "logits/rejected": 0.19118987023830414, "logps/chosen": -41.21653747558594, "logps/rejected": -101.06883239746094, "loss": 0.5956, "rewards/accuracies": 1.0, "rewards/chosen": 0.09021880477666855, "rewards/margins": 0.3644470274448395, "rewards/rejected": -0.2742282450199127, "step": 726 }, { "epoch": 0.44322511812223747, "grad_norm": 68.43650913601698, "learning_rate": 3.152195121951219e-08, "logits/chosen": 0.15166331827640533, "logits/rejected": 0.1310874968767166, "logps/chosen": -208.40158081054688, "logps/rejected": -109.47624206542969, "loss": 0.5784, "rewards/accuracies": 0.75, "rewards/chosen": 0.16175362467765808, "rewards/margins": 0.06366458535194397, "rewards/rejected": 0.09808903932571411, "step": 727 }, { "epoch": 0.443834781283341, "grad_norm": 64.958778101188, "learning_rate": 3.156585365853659e-08, "logits/chosen": 0.39212292432785034, "logits/rejected": 0.32695478200912476, "logps/chosen": -361.8396301269531, "logps/rejected": -218.62094116210938, "loss": 0.6119, "rewards/accuracies": 0.5, "rewards/chosen": 0.31615012884140015, "rewards/margins": 0.035128410905599594, "rewards/rejected": 0.28102171421051025, "step": 728 }, { "epoch": 0.4444444444444444, "grad_norm": 57.81165897409726, "learning_rate": 3.1609756097560975e-08, "logits/chosen": 0.2175525724887848, "logits/rejected": 0.17773234844207764, "logps/chosen": -161.65342712402344, "logps/rejected": -194.79393005371094, "loss": 0.5842, "rewards/accuracies": 0.5, "rewards/chosen": 0.27169203758239746, "rewards/margins": 0.369501531124115, "rewards/rejected": -0.09780949354171753, "step": 729 }, { "epoch": 0.4450541076055479, "grad_norm": 68.99268646805703, "learning_rate": 3.1653658536585363e-08, "logits/chosen": 0.13952380418777466, "logits/rejected": 0.15980111062526703, "logps/chosen": -12.834968566894531, "logps/rejected": -6.9011125564575195, "loss": 0.6208, "rewards/accuracies": 0.5, "rewards/chosen": -0.17561447620391846, "rewards/margins": -0.17670869827270508, "rewards/rejected": 0.0010942351073026657, "step": 730 }, { "epoch": 0.4456637707666514, "grad_norm": 81.46389111150688, "learning_rate": 3.169756097560975e-08, "logits/chosen": 0.15873503684997559, "logits/rejected": 0.10875527560710907, "logps/chosen": -317.6955871582031, "logps/rejected": -255.65602111816406, "loss": 0.686, "rewards/accuracies": 0.75, "rewards/chosen": 0.5661007165908813, "rewards/margins": 0.5059922337532043, "rewards/rejected": 0.06010846048593521, "step": 731 }, { "epoch": 0.4462734339277549, "grad_norm": 77.1070906210175, "learning_rate": 3.174146341463415e-08, "logits/chosen": 0.2867441177368164, "logits/rejected": 0.1727285385131836, "logps/chosen": -242.66256713867188, "logps/rejected": -271.26776123046875, "loss": 0.6342, "rewards/accuracies": 1.0, "rewards/chosen": 0.3202134370803833, "rewards/margins": 0.5896958112716675, "rewards/rejected": -0.26948240399360657, "step": 732 }, { "epoch": 0.4468830970888584, "grad_norm": 58.25214993847489, "learning_rate": 3.1785365853658535e-08, "logits/chosen": -0.1107330471277237, "logits/rejected": 0.035387180745601654, "logps/chosen": -206.23092651367188, "logps/rejected": -154.17787170410156, "loss": 0.5676, "rewards/accuracies": 0.25, "rewards/chosen": 0.06631675362586975, "rewards/margins": -0.09842417389154434, "rewards/rejected": 0.1647409200668335, "step": 733 }, { "epoch": 0.4474927602499619, "grad_norm": 59.82856209161144, "learning_rate": 3.1829268292682924e-08, "logits/chosen": -0.1534700095653534, "logits/rejected": 0.06522519141435623, "logps/chosen": -416.1253662109375, "logps/rejected": -306.8860168457031, "loss": 0.5623, "rewards/accuracies": 1.0, "rewards/chosen": 0.9827251434326172, "rewards/margins": 0.9360064268112183, "rewards/rejected": 0.04671870172023773, "step": 734 }, { "epoch": 0.4481024234110654, "grad_norm": 61.92935271858862, "learning_rate": 3.187317073170732e-08, "logits/chosen": -0.005850538611412048, "logits/rejected": -0.1219499334692955, "logps/chosen": -98.82186889648438, "logps/rejected": -199.60157775878906, "loss": 0.5855, "rewards/accuracies": 0.5, "rewards/chosen": 0.30541688203811646, "rewards/margins": 0.47873765230178833, "rewards/rejected": -0.1733207404613495, "step": 735 }, { "epoch": 0.4487120865721689, "grad_norm": 58.759158962511094, "learning_rate": 3.191707317073171e-08, "logits/chosen": -0.1131751537322998, "logits/rejected": 0.20346003770828247, "logps/chosen": -393.7450866699219, "logps/rejected": -288.71002197265625, "loss": 0.5451, "rewards/accuracies": 0.75, "rewards/chosen": 0.8758038282394409, "rewards/margins": 1.149061679840088, "rewards/rejected": -0.273257851600647, "step": 736 }, { "epoch": 0.4493217497332724, "grad_norm": 77.27702057249535, "learning_rate": 3.1960975609756095e-08, "logits/chosen": 0.20635244250297546, "logits/rejected": -0.15667811036109924, "logps/chosen": -94.55105590820312, "logps/rejected": -290.2279052734375, "loss": 0.6516, "rewards/accuracies": 1.0, "rewards/chosen": 0.2631429433822632, "rewards/margins": 0.18857741355895996, "rewards/rejected": 0.07456552982330322, "step": 737 }, { "epoch": 0.4499314128943759, "grad_norm": 62.02774374273467, "learning_rate": 3.2004878048780484e-08, "logits/chosen": 0.18018031120300293, "logits/rejected": 0.17460641264915466, "logps/chosen": -21.496278762817383, "logps/rejected": -11.181031227111816, "loss": 0.6009, "rewards/accuracies": 0.5, "rewards/chosen": -0.506889820098877, "rewards/margins": -0.26636621356010437, "rewards/rejected": -0.24052362143993378, "step": 738 }, { "epoch": 0.45054107605547933, "grad_norm": 59.67016395528802, "learning_rate": 3.204878048780488e-08, "logits/chosen": 0.2724151015281677, "logits/rejected": 0.16841967403888702, "logps/chosen": -197.8876953125, "logps/rejected": -262.4444274902344, "loss": 0.5878, "rewards/accuracies": 0.5, "rewards/chosen": 0.4625575542449951, "rewards/margins": 0.21330539882183075, "rewards/rejected": 0.24925214052200317, "step": 739 }, { "epoch": 0.45115073921658283, "grad_norm": 73.864837429737, "learning_rate": 3.209268292682927e-08, "logits/chosen": 0.020081382244825363, "logits/rejected": 0.18595314025878906, "logps/chosen": -279.2730712890625, "logps/rejected": -325.4978942871094, "loss": 0.6138, "rewards/accuracies": 0.75, "rewards/chosen": 1.013555884361267, "rewards/margins": 0.3410736918449402, "rewards/rejected": 0.6724821925163269, "step": 740 }, { "epoch": 0.45176040237768633, "grad_norm": 65.65281963931344, "learning_rate": 3.2136585365853655e-08, "logits/chosen": -0.26649999618530273, "logits/rejected": -0.09467080235481262, "logps/chosen": -210.10275268554688, "logps/rejected": -148.3544464111328, "loss": 0.5708, "rewards/accuracies": 0.5, "rewards/chosen": 0.09878063201904297, "rewards/margins": -0.03505063056945801, "rewards/rejected": 0.13383126258850098, "step": 741 }, { "epoch": 0.45237006553878983, "grad_norm": 75.23993241177425, "learning_rate": 3.218048780487805e-08, "logits/chosen": 0.0372052900493145, "logits/rejected": 0.13422900438308716, "logps/chosen": -291.0384826660156, "logps/rejected": -262.3160400390625, "loss": 0.6137, "rewards/accuracies": 0.75, "rewards/chosen": 0.5097771883010864, "rewards/margins": 0.5051177740097046, "rewards/rejected": 0.004659377038478851, "step": 742 }, { "epoch": 0.45297972869989334, "grad_norm": 61.379429711158096, "learning_rate": 3.222439024390244e-08, "logits/chosen": 0.32957613468170166, "logits/rejected": 0.33307385444641113, "logps/chosen": -110.95623016357422, "logps/rejected": -84.92561340332031, "loss": 0.5974, "rewards/accuracies": 1.0, "rewards/chosen": -0.02196323871612549, "rewards/margins": 0.2710154354572296, "rewards/rejected": -0.2929787039756775, "step": 743 }, { "epoch": 0.4535893918609968, "grad_norm": 68.18531020727382, "learning_rate": 3.226829268292683e-08, "logits/chosen": 0.32582759857177734, "logits/rejected": 0.25220736861228943, "logps/chosen": -272.1997375488281, "logps/rejected": -230.9661102294922, "loss": 0.6077, "rewards/accuracies": 1.0, "rewards/chosen": 0.2933657467365265, "rewards/margins": 0.34890851378440857, "rewards/rejected": -0.055542752146720886, "step": 744 }, { "epoch": 0.4541990550221003, "grad_norm": 58.3393249286719, "learning_rate": 3.2312195121951216e-08, "logits/chosen": 0.42729368805885315, "logits/rejected": 0.3558027148246765, "logps/chosen": -98.97964477539062, "logps/rejected": -104.35940551757812, "loss": 0.5867, "rewards/accuracies": 0.75, "rewards/chosen": 0.4682806730270386, "rewards/margins": 0.07165151834487915, "rewards/rejected": 0.3966291844844818, "step": 745 }, { "epoch": 0.4548087181832038, "grad_norm": 66.89483590330624, "learning_rate": 3.235609756097561e-08, "logits/chosen": 0.08102698624134064, "logits/rejected": -0.11477988958358765, "logps/chosen": -257.57708740234375, "logps/rejected": -305.8965148925781, "loss": 0.6073, "rewards/accuracies": 0.75, "rewards/chosen": 0.38230252265930176, "rewards/margins": 0.29078590869903564, "rewards/rejected": 0.09151659905910492, "step": 746 }, { "epoch": 0.4554183813443073, "grad_norm": 70.641732269457, "learning_rate": 3.24e-08, "logits/chosen": -0.10593121498823166, "logits/rejected": -0.029123404994606972, "logps/chosen": -417.87298583984375, "logps/rejected": -265.234619140625, "loss": 0.5788, "rewards/accuracies": 1.0, "rewards/chosen": 0.7471208572387695, "rewards/margins": 0.828920841217041, "rewards/rejected": -0.08180002123117447, "step": 747 }, { "epoch": 0.45602804450541073, "grad_norm": 55.11463059725924, "learning_rate": 3.244390243902439e-08, "logits/chosen": 0.4029293656349182, "logits/rejected": 0.32838955521583557, "logps/chosen": -159.41629028320312, "logps/rejected": -205.91880798339844, "loss": 0.6031, "rewards/accuracies": 1.0, "rewards/chosen": -0.11138274520635605, "rewards/margins": 0.7855024933815002, "rewards/rejected": -0.8968852758407593, "step": 748 }, { "epoch": 0.45663770766651424, "grad_norm": 63.761896045577224, "learning_rate": 3.248780487804878e-08, "logits/chosen": -0.12749917805194855, "logits/rejected": 0.07667224854230881, "logps/chosen": -210.73178100585938, "logps/rejected": -122.18229675292969, "loss": 0.5588, "rewards/accuracies": 0.75, "rewards/chosen": 0.33733201026916504, "rewards/margins": 0.7119225263595581, "rewards/rejected": -0.37459051609039307, "step": 749 }, { "epoch": 0.45724737082761774, "grad_norm": 62.32785592849077, "learning_rate": 3.253170731707317e-08, "logits/chosen": 0.1593983918428421, "logits/rejected": -0.07764932513237, "logps/chosen": -70.96792602539062, "logps/rejected": -189.618408203125, "loss": 0.6099, "rewards/accuracies": 0.75, "rewards/chosen": 0.015467909164726734, "rewards/margins": -0.08667121827602386, "rewards/rejected": 0.10213913023471832, "step": 750 }, { "epoch": 0.45785703398872124, "grad_norm": 60.625771850025146, "learning_rate": 3.257560975609756e-08, "logits/chosen": 0.17726343870162964, "logits/rejected": 0.08285678178071976, "logps/chosen": -150.5395050048828, "logps/rejected": -177.4819793701172, "loss": 0.5451, "rewards/accuracies": 1.0, "rewards/chosen": 0.4170660078525543, "rewards/margins": 0.6691288352012634, "rewards/rejected": -0.2520628273487091, "step": 751 }, { "epoch": 0.45846669714982474, "grad_norm": 62.306705278437036, "learning_rate": 3.261951219512195e-08, "logits/chosen": 0.4701884090900421, "logits/rejected": 0.14265279471874237, "logps/chosen": -153.5452423095703, "logps/rejected": -329.3506164550781, "loss": 0.5386, "rewards/accuracies": 0.75, "rewards/chosen": -0.16269707679748535, "rewards/margins": 0.24957947432994843, "rewards/rejected": -0.41227656602859497, "step": 752 }, { "epoch": 0.4590763603109282, "grad_norm": 56.55226551304226, "learning_rate": 3.266341463414634e-08, "logits/chosen": 0.1673717498779297, "logits/rejected": 0.3038822114467621, "logps/chosen": -96.20490264892578, "logps/rejected": -66.98352813720703, "loss": 0.5467, "rewards/accuracies": 0.75, "rewards/chosen": 0.062169261276721954, "rewards/margins": 0.11970269680023193, "rewards/rejected": -0.05753343924880028, "step": 753 }, { "epoch": 0.4596860234720317, "grad_norm": 78.33667368470162, "learning_rate": 3.270731707317073e-08, "logits/chosen": 0.06571709364652634, "logits/rejected": 0.0390804223716259, "logps/chosen": -42.88839340209961, "logps/rejected": -102.15044403076172, "loss": 0.6561, "rewards/accuracies": 0.5, "rewards/chosen": 0.013066366314888, "rewards/margins": 0.48513418436050415, "rewards/rejected": -0.47206783294677734, "step": 754 }, { "epoch": 0.4602956866331352, "grad_norm": 64.30184268769287, "learning_rate": 3.275121951219512e-08, "logits/chosen": 0.3882122337818146, "logits/rejected": 0.3242022395133972, "logps/chosen": -19.027729034423828, "logps/rejected": -46.444732666015625, "loss": 0.6074, "rewards/accuracies": 1.0, "rewards/chosen": 0.08555036038160324, "rewards/margins": 0.20770028233528137, "rewards/rejected": -0.12214992195367813, "step": 755 }, { "epoch": 0.4609053497942387, "grad_norm": 65.97320436866737, "learning_rate": 3.2795121951219514e-08, "logits/chosen": -0.12974077463150024, "logits/rejected": -0.003422953188419342, "logps/chosen": -273.21502685546875, "logps/rejected": -384.43841552734375, "loss": 0.6027, "rewards/accuracies": 0.5, "rewards/chosen": 0.04085254669189453, "rewards/margins": -0.15066151320934296, "rewards/rejected": 0.1915140599012375, "step": 756 }, { "epoch": 0.4615150129553422, "grad_norm": 69.53782871578429, "learning_rate": 3.28390243902439e-08, "logits/chosen": -0.09778957068920135, "logits/rejected": 0.13234618306159973, "logps/chosen": -320.2364501953125, "logps/rejected": -250.09844970703125, "loss": 0.6215, "rewards/accuracies": 0.75, "rewards/chosen": 0.20290826261043549, "rewards/margins": -0.07620492577552795, "rewards/rejected": 0.27911320328712463, "step": 757 }, { "epoch": 0.46212467611644564, "grad_norm": 53.91068098227483, "learning_rate": 3.288292682926829e-08, "logits/chosen": -0.09435955435037613, "logits/rejected": 0.3796374201774597, "logps/chosen": -120.08363342285156, "logps/rejected": -117.78384399414062, "loss": 0.5211, "rewards/accuracies": 1.0, "rewards/chosen": 0.44105276465415955, "rewards/margins": 0.37706083059310913, "rewards/rejected": 0.06399193406105042, "step": 758 }, { "epoch": 0.46273433927754914, "grad_norm": 61.732824810998856, "learning_rate": 3.292682926829268e-08, "logits/chosen": 0.15379494428634644, "logits/rejected": 0.1573495864868164, "logps/chosen": -17.440587997436523, "logps/rejected": -26.378684997558594, "loss": 0.5885, "rewards/accuracies": 0.5, "rewards/chosen": -0.09881146252155304, "rewards/margins": 0.1264808624982834, "rewards/rejected": -0.22529232501983643, "step": 759 }, { "epoch": 0.46334400243865265, "grad_norm": 67.92257104269588, "learning_rate": 3.2970731707317074e-08, "logits/chosen": 0.20818915963172913, "logits/rejected": 0.18402251601219177, "logps/chosen": -188.99795532226562, "logps/rejected": -129.03713989257812, "loss": 0.609, "rewards/accuracies": 1.0, "rewards/chosen": 0.17216423153877258, "rewards/margins": 0.8478347063064575, "rewards/rejected": -0.6756705045700073, "step": 760 }, { "epoch": 0.46395366559975615, "grad_norm": 58.53609886318093, "learning_rate": 3.301463414634146e-08, "logits/chosen": 0.1354576051235199, "logits/rejected": 0.09735407680273056, "logps/chosen": -47.41239547729492, "logps/rejected": -47.80207443237305, "loss": 0.5519, "rewards/accuracies": 1.0, "rewards/chosen": 0.21174819767475128, "rewards/margins": 0.41935431957244873, "rewards/rejected": -0.20760615170001984, "step": 761 }, { "epoch": 0.46456332876085965, "grad_norm": 68.31152706043322, "learning_rate": 3.305853658536585e-08, "logits/chosen": 0.46516382694244385, "logits/rejected": 0.38613224029541016, "logps/chosen": -70.07086181640625, "logps/rejected": -119.79405975341797, "loss": 0.6268, "rewards/accuracies": 0.75, "rewards/chosen": 0.27662283182144165, "rewards/margins": 0.43011096119880676, "rewards/rejected": -0.15348811447620392, "step": 762 }, { "epoch": 0.4651729919219631, "grad_norm": 56.83479023691454, "learning_rate": 3.3102439024390246e-08, "logits/chosen": 0.08870130777359009, "logits/rejected": 0.08552171289920807, "logps/chosen": -178.08047485351562, "logps/rejected": -160.9407501220703, "loss": 0.5903, "rewards/accuracies": 1.0, "rewards/chosen": 0.7953975796699524, "rewards/margins": 0.7640186548233032, "rewards/rejected": 0.031378939747810364, "step": 763 }, { "epoch": 0.4657826550830666, "grad_norm": 55.16864538813876, "learning_rate": 3.3146341463414634e-08, "logits/chosen": 0.22163109481334686, "logits/rejected": 0.15488380193710327, "logps/chosen": -106.62368774414062, "logps/rejected": -110.352783203125, "loss": 0.5992, "rewards/accuracies": 0.75, "rewards/chosen": 0.18241918087005615, "rewards/margins": 0.18216146528720856, "rewards/rejected": 0.0002577081322669983, "step": 764 }, { "epoch": 0.4663923182441701, "grad_norm": 59.708401288405774, "learning_rate": 3.319024390243902e-08, "logits/chosen": -0.07213751971721649, "logits/rejected": 0.07934854179620743, "logps/chosen": -121.12588500976562, "logps/rejected": -86.45989227294922, "loss": 0.5534, "rewards/accuracies": 0.75, "rewards/chosen": -0.13942264020442963, "rewards/margins": 0.250957727432251, "rewards/rejected": -0.3903804123401642, "step": 765 }, { "epoch": 0.4670019814052736, "grad_norm": 71.78748459918226, "learning_rate": 3.323414634146341e-08, "logits/chosen": 0.11978058516979218, "logits/rejected": 0.14904363453388214, "logps/chosen": -119.83893585205078, "logps/rejected": -123.9186782836914, "loss": 0.559, "rewards/accuracies": 0.75, "rewards/chosen": 0.0574684701859951, "rewards/margins": 0.0947904884815216, "rewards/rejected": -0.0373220220208168, "step": 766 }, { "epoch": 0.46761164456637705, "grad_norm": 64.72349909503079, "learning_rate": 3.3278048780487806e-08, "logits/chosen": 0.1507500261068344, "logits/rejected": 0.18580779433250427, "logps/chosen": -165.88743591308594, "logps/rejected": -148.12942504882812, "loss": 0.5935, "rewards/accuracies": 0.75, "rewards/chosen": 0.2788294553756714, "rewards/margins": 0.4257279634475708, "rewards/rejected": -0.1468985229730606, "step": 767 }, { "epoch": 0.46822130772748055, "grad_norm": 70.78417825316278, "learning_rate": 3.3321951219512195e-08, "logits/chosen": 0.1303892731666565, "logits/rejected": 0.1609259992837906, "logps/chosen": -254.47705078125, "logps/rejected": -325.6808776855469, "loss": 0.6062, "rewards/accuracies": 0.5, "rewards/chosen": 0.01449042558670044, "rewards/margins": -0.027748242020606995, "rewards/rejected": 0.04223868250846863, "step": 768 }, { "epoch": 0.46883097088858405, "grad_norm": 60.23105195383743, "learning_rate": 3.336585365853658e-08, "logits/chosen": 0.22307035326957703, "logits/rejected": 0.25752493739128113, "logps/chosen": -244.8804931640625, "logps/rejected": -125.85395050048828, "loss": 0.5232, "rewards/accuracies": 0.75, "rewards/chosen": 0.41586071252822876, "rewards/margins": 0.7935232520103455, "rewards/rejected": -0.3776625394821167, "step": 769 }, { "epoch": 0.46944063404968756, "grad_norm": 77.77191005675465, "learning_rate": 3.340975609756098e-08, "logits/chosen": -0.04936651140451431, "logits/rejected": -0.1360277533531189, "logps/chosen": -56.69487762451172, "logps/rejected": -89.06639099121094, "loss": 0.6283, "rewards/accuracies": 0.5, "rewards/chosen": 0.02953328937292099, "rewards/margins": 0.020573187619447708, "rewards/rejected": 0.00896010547876358, "step": 770 }, { "epoch": 0.47005029721079106, "grad_norm": 72.08810281987006, "learning_rate": 3.3453658536585366e-08, "logits/chosen": 0.43544837832450867, "logits/rejected": 0.35976696014404297, "logps/chosen": -77.24356842041016, "logps/rejected": -106.05728149414062, "loss": 0.6727, "rewards/accuracies": 0.25, "rewards/chosen": 0.12813350558280945, "rewards/margins": 0.019798681139945984, "rewards/rejected": 0.10833483189344406, "step": 771 }, { "epoch": 0.4706599603718945, "grad_norm": 65.0640779404551, "learning_rate": 3.3497560975609755e-08, "logits/chosen": 0.005504929460585117, "logits/rejected": 0.35064205527305603, "logps/chosen": -111.17489624023438, "logps/rejected": -67.89197540283203, "loss": 0.6024, "rewards/accuracies": 0.75, "rewards/chosen": 0.017039109021425247, "rewards/margins": 0.2605786621570587, "rewards/rejected": -0.24353954195976257, "step": 772 }, { "epoch": 0.471269623532998, "grad_norm": 71.73403148285288, "learning_rate": 3.354146341463414e-08, "logits/chosen": 0.22017183899879456, "logits/rejected": 0.06910734623670578, "logps/chosen": -134.75439453125, "logps/rejected": -162.3470916748047, "loss": 0.6302, "rewards/accuracies": 0.5, "rewards/chosen": -0.06771198660135269, "rewards/margins": -0.23933552205562592, "rewards/rejected": 0.17162355780601501, "step": 773 }, { "epoch": 0.4718792866941015, "grad_norm": 64.68294632257992, "learning_rate": 3.358536585365854e-08, "logits/chosen": 0.04541367292404175, "logits/rejected": 0.33783969283103943, "logps/chosen": -463.33154296875, "logps/rejected": -363.5344543457031, "loss": 0.4604, "rewards/accuracies": 0.75, "rewards/chosen": 0.3243526816368103, "rewards/margins": 0.3368990421295166, "rewards/rejected": -0.012546353042125702, "step": 774 }, { "epoch": 0.472488949855205, "grad_norm": 56.07396666651526, "learning_rate": 3.3629268292682926e-08, "logits/chosen": 0.055941544473171234, "logits/rejected": 0.06496048718690872, "logps/chosen": -14.561441421508789, "logps/rejected": -14.488389015197754, "loss": 0.5139, "rewards/accuracies": 1.0, "rewards/chosen": 0.004245104733854532, "rewards/margins": 0.3938329219818115, "rewards/rejected": -0.3895878493785858, "step": 775 }, { "epoch": 0.4730986130163085, "grad_norm": 75.1658068785713, "learning_rate": 3.3673170731707315e-08, "logits/chosen": 0.3035067617893219, "logits/rejected": 0.17092974483966827, "logps/chosen": -83.666015625, "logps/rejected": -142.3246612548828, "loss": 0.6228, "rewards/accuracies": 0.75, "rewards/chosen": 0.009215408936142921, "rewards/margins": 0.10090556740760803, "rewards/rejected": -0.09169016033411026, "step": 776 }, { "epoch": 0.47370827617741196, "grad_norm": 71.25566605177507, "learning_rate": 3.371707317073171e-08, "logits/chosen": -0.01590004190802574, "logits/rejected": 0.060013532638549805, "logps/chosen": -80.48770904541016, "logps/rejected": -15.409080505371094, "loss": 0.6352, "rewards/accuracies": 1.0, "rewards/chosen": -0.018049947917461395, "rewards/margins": 0.41123759746551514, "rewards/rejected": -0.4292875826358795, "step": 777 }, { "epoch": 0.47431793933851546, "grad_norm": 64.25828490732628, "learning_rate": 3.376097560975609e-08, "logits/chosen": -0.13783808052539825, "logits/rejected": -0.1846710592508316, "logps/chosen": -90.84169006347656, "logps/rejected": -165.5963897705078, "loss": 0.5637, "rewards/accuracies": 0.75, "rewards/chosen": 0.2065526843070984, "rewards/margins": 0.4452173709869385, "rewards/rejected": -0.2386646866798401, "step": 778 }, { "epoch": 0.47492760249961896, "grad_norm": 58.87728516311014, "learning_rate": 3.3804878048780487e-08, "logits/chosen": 0.06521575152873993, "logits/rejected": 0.16959774494171143, "logps/chosen": -236.0845947265625, "logps/rejected": -152.92738342285156, "loss": 0.6085, "rewards/accuracies": 0.75, "rewards/chosen": 0.16076885163784027, "rewards/margins": 0.25768551230430603, "rewards/rejected": -0.09691667556762695, "step": 779 }, { "epoch": 0.47553726566072246, "grad_norm": 68.22586588198327, "learning_rate": 3.3848780487804875e-08, "logits/chosen": 0.14443618059158325, "logits/rejected": 0.10864615440368652, "logps/chosen": -75.4122543334961, "logps/rejected": -193.24659729003906, "loss": 0.5583, "rewards/accuracies": 0.75, "rewards/chosen": 0.258346289396286, "rewards/margins": 1.209718108177185, "rewards/rejected": -0.9513717889785767, "step": 780 }, { "epoch": 0.47614692882182597, "grad_norm": 58.829637383991205, "learning_rate": 3.3892682926829263e-08, "logits/chosen": 0.1799907088279724, "logits/rejected": 0.1539163887500763, "logps/chosen": -20.66031265258789, "logps/rejected": -24.9801082611084, "loss": 0.555, "rewards/accuracies": 0.75, "rewards/chosen": -0.4109232723712921, "rewards/margins": 0.29648423194885254, "rewards/rejected": -0.7074074745178223, "step": 781 }, { "epoch": 0.4767565919829294, "grad_norm": 73.41252613376706, "learning_rate": 3.393658536585365e-08, "logits/chosen": 0.049098651856184006, "logits/rejected": 0.28672829270362854, "logps/chosen": -61.963924407958984, "logps/rejected": -24.833805084228516, "loss": 0.5672, "rewards/accuracies": 0.25, "rewards/chosen": -0.2053811103105545, "rewards/margins": -0.0545276403427124, "rewards/rejected": -0.1508534699678421, "step": 782 }, { "epoch": 0.4773662551440329, "grad_norm": 66.08018083342746, "learning_rate": 3.398048780487805e-08, "logits/chosen": 0.45004844665527344, "logits/rejected": 0.1189308911561966, "logps/chosen": -116.97759246826172, "logps/rejected": -171.6212615966797, "loss": 0.5687, "rewards/accuracies": 0.75, "rewards/chosen": 0.443662166595459, "rewards/margins": 0.3807649314403534, "rewards/rejected": 0.06289726495742798, "step": 783 }, { "epoch": 0.4779759183051364, "grad_norm": 75.15703338972078, "learning_rate": 3.4024390243902435e-08, "logits/chosen": 0.23602449893951416, "logits/rejected": 0.30505746603012085, "logps/chosen": -216.32568359375, "logps/rejected": -34.69765853881836, "loss": 0.6908, "rewards/accuracies": 0.75, "rewards/chosen": 0.19814635813236237, "rewards/margins": 0.23832713067531586, "rewards/rejected": -0.040180787444114685, "step": 784 }, { "epoch": 0.4785855814662399, "grad_norm": 73.88348076281862, "learning_rate": 3.4068292682926823e-08, "logits/chosen": 0.32259875535964966, "logits/rejected": 0.0982908308506012, "logps/chosen": -185.67340087890625, "logps/rejected": -297.8865661621094, "loss": 0.5996, "rewards/accuracies": 0.5, "rewards/chosen": 0.5188018083572388, "rewards/margins": 0.3751254975795746, "rewards/rejected": 0.1436762809753418, "step": 785 }, { "epoch": 0.4791952446273434, "grad_norm": 51.28441397571893, "learning_rate": 3.411219512195122e-08, "logits/chosen": -0.10330458730459213, "logits/rejected": 0.06638757139444351, "logps/chosen": -153.25030517578125, "logps/rejected": -143.166015625, "loss": 0.525, "rewards/accuracies": 0.75, "rewards/chosen": 0.29190418124198914, "rewards/margins": 0.3344435691833496, "rewards/rejected": -0.042539406567811966, "step": 786 }, { "epoch": 0.47980490778844687, "grad_norm": 65.09062711909723, "learning_rate": 3.415609756097561e-08, "logits/chosen": 0.33637189865112305, "logits/rejected": 0.19565415382385254, "logps/chosen": -157.9989013671875, "logps/rejected": -210.083251953125, "loss": 0.5036, "rewards/accuracies": 0.5, "rewards/chosen": 0.3962502181529999, "rewards/margins": 0.6286591291427612, "rewards/rejected": -0.23240889608860016, "step": 787 }, { "epoch": 0.48041457094955037, "grad_norm": 77.862511455403, "learning_rate": 3.4199999999999995e-08, "logits/chosen": 0.09237313270568848, "logits/rejected": 0.25440582633018494, "logps/chosen": -161.5814666748047, "logps/rejected": -174.3079071044922, "loss": 0.7826, "rewards/accuracies": 0.75, "rewards/chosen": 0.13777244091033936, "rewards/margins": 0.24795600771903992, "rewards/rejected": -0.11018357425928116, "step": 788 }, { "epoch": 0.48102423411065387, "grad_norm": 66.8724467149785, "learning_rate": 3.4243902439024384e-08, "logits/chosen": 0.10143324732780457, "logits/rejected": -0.1377866119146347, "logps/chosen": -98.13770294189453, "logps/rejected": -253.71200561523438, "loss": 0.541, "rewards/accuracies": 1.0, "rewards/chosen": 0.03572654724121094, "rewards/margins": 0.2794824540615082, "rewards/rejected": -0.24375592172145844, "step": 789 }, { "epoch": 0.48163389727175737, "grad_norm": 69.73316263825721, "learning_rate": 3.428780487804878e-08, "logits/chosen": 0.10662485659122467, "logits/rejected": -0.2586067318916321, "logps/chosen": -158.9857940673828, "logps/rejected": -393.30560302734375, "loss": 0.6193, "rewards/accuracies": 0.75, "rewards/chosen": 0.19432978332042694, "rewards/margins": 0.19431820511817932, "rewards/rejected": 1.1574476957321167e-05, "step": 790 }, { "epoch": 0.4822435604328608, "grad_norm": 61.74529493736086, "learning_rate": 3.433170731707317e-08, "logits/chosen": -0.05212089419364929, "logits/rejected": -0.020766697824001312, "logps/chosen": -66.56182861328125, "logps/rejected": -34.273502349853516, "loss": 0.6464, "rewards/accuracies": 0.75, "rewards/chosen": 0.17125204205513, "rewards/margins": 0.32071420550346375, "rewards/rejected": -0.14946216344833374, "step": 791 }, { "epoch": 0.4828532235939643, "grad_norm": 70.43491750749516, "learning_rate": 3.4375609756097555e-08, "logits/chosen": 0.25808200240135193, "logits/rejected": 0.18567781150341034, "logps/chosen": -238.76187133789062, "logps/rejected": -250.26937866210938, "loss": 0.5948, "rewards/accuracies": 0.75, "rewards/chosen": 0.35204005241394043, "rewards/margins": 0.3273167014122009, "rewards/rejected": 0.024723336100578308, "step": 792 }, { "epoch": 0.4834628867550678, "grad_norm": 66.69868840488981, "learning_rate": 3.441951219512195e-08, "logits/chosen": 0.030955903232097626, "logits/rejected": 0.27113527059555054, "logps/chosen": -398.656494140625, "logps/rejected": -278.81207275390625, "loss": 0.6059, "rewards/accuracies": 0.25, "rewards/chosen": 0.07109531760215759, "rewards/margins": 0.1095837950706482, "rewards/rejected": -0.0384884774684906, "step": 793 }, { "epoch": 0.4840725499161713, "grad_norm": 63.418428506006464, "learning_rate": 3.446341463414634e-08, "logits/chosen": -0.17916786670684814, "logits/rejected": 0.534211277961731, "logps/chosen": -233.7653350830078, "logps/rejected": -164.34133911132812, "loss": 0.6669, "rewards/accuracies": 0.75, "rewards/chosen": 0.025973699986934662, "rewards/margins": 0.17903375625610352, "rewards/rejected": -0.15306004881858826, "step": 794 }, { "epoch": 0.4846822130772748, "grad_norm": 76.56433277831285, "learning_rate": 3.450731707317073e-08, "logits/chosen": 0.09799869358539581, "logits/rejected": 0.17340679466724396, "logps/chosen": -222.72579956054688, "logps/rejected": -194.01608276367188, "loss": 0.6114, "rewards/accuracies": 1.0, "rewards/chosen": 0.159549742937088, "rewards/margins": 0.5828905701637268, "rewards/rejected": -0.4233408570289612, "step": 795 }, { "epoch": 0.4852918762383783, "grad_norm": 65.71962843442641, "learning_rate": 3.455121951219512e-08, "logits/chosen": 0.2741386592388153, "logits/rejected": 0.17925170063972473, "logps/chosen": -96.66497802734375, "logps/rejected": -116.22662353515625, "loss": 0.6093, "rewards/accuracies": 0.75, "rewards/chosen": -0.08882968872785568, "rewards/margins": 0.23059532046318054, "rewards/rejected": -0.31942498683929443, "step": 796 }, { "epoch": 0.4859015393994818, "grad_norm": 62.00727816555687, "learning_rate": 3.459512195121951e-08, "logits/chosen": 0.14391323924064636, "logits/rejected": 0.09784625470638275, "logps/chosen": -168.19680786132812, "logps/rejected": -135.7476348876953, "loss": 0.5661, "rewards/accuracies": 0.5, "rewards/chosen": -0.1364220678806305, "rewards/margins": 0.20289844274520874, "rewards/rejected": -0.33932051062583923, "step": 797 }, { "epoch": 0.4865112025605853, "grad_norm": 56.18647820236656, "learning_rate": 3.46390243902439e-08, "logits/chosen": 0.16822977364063263, "logits/rejected": 0.30192238092422485, "logps/chosen": -10.904973983764648, "logps/rejected": -24.79149055480957, "loss": 0.6166, "rewards/accuracies": 0.5, "rewards/chosen": -0.09225364774465561, "rewards/margins": 0.22615376114845276, "rewards/rejected": -0.31840741634368896, "step": 798 }, { "epoch": 0.4871208657216888, "grad_norm": 95.52585477358602, "learning_rate": 3.468292682926829e-08, "logits/chosen": 0.21458560228347778, "logits/rejected": 0.4480295479297638, "logps/chosen": -96.76959228515625, "logps/rejected": -62.71880340576172, "loss": 0.7004, "rewards/accuracies": 0.75, "rewards/chosen": -0.16143067181110382, "rewards/margins": 0.20936936140060425, "rewards/rejected": -0.3708000183105469, "step": 799 }, { "epoch": 0.4877305288827923, "grad_norm": 76.03065118312753, "learning_rate": 3.472682926829268e-08, "logits/chosen": 0.0231266301125288, "logits/rejected": 0.031965479254722595, "logps/chosen": -240.28280639648438, "logps/rejected": -293.67279052734375, "loss": 0.593, "rewards/accuracies": 0.75, "rewards/chosen": 0.24128007888793945, "rewards/margins": 0.6292192339897156, "rewards/rejected": -0.38793909549713135, "step": 800 }, { "epoch": 0.4883401920438957, "grad_norm": 66.69555162121341, "learning_rate": 3.477073170731707e-08, "logits/chosen": 0.1899629682302475, "logits/rejected": -0.011241592466831207, "logps/chosen": -166.8588409423828, "logps/rejected": -318.6372985839844, "loss": 0.65, "rewards/accuracies": 1.0, "rewards/chosen": 0.2232269048690796, "rewards/margins": 0.338846355676651, "rewards/rejected": -0.11561945825815201, "step": 801 }, { "epoch": 0.48894985520499923, "grad_norm": 63.70093626901587, "learning_rate": 3.481463414634146e-08, "logits/chosen": 0.14436128735542297, "logits/rejected": 0.17100831866264343, "logps/chosen": -100.22640228271484, "logps/rejected": -85.49988555908203, "loss": 0.5956, "rewards/accuracies": 0.75, "rewards/chosen": 0.2008926421403885, "rewards/margins": 0.4725527763366699, "rewards/rejected": -0.27166011929512024, "step": 802 }, { "epoch": 0.48955951836610273, "grad_norm": 57.13753896864066, "learning_rate": 3.4858536585365854e-08, "logits/chosen": 0.21252959966659546, "logits/rejected": 0.2079596370458603, "logps/chosen": -17.76156234741211, "logps/rejected": -30.843708038330078, "loss": 0.5982, "rewards/accuracies": 0.75, "rewards/chosen": -0.32302653789520264, "rewards/margins": 0.3257739543914795, "rewards/rejected": -0.6488004922866821, "step": 803 }, { "epoch": 0.49016918152720623, "grad_norm": 83.44429727902448, "learning_rate": 3.490243902439024e-08, "logits/chosen": 0.029475249350070953, "logits/rejected": 0.16109654307365417, "logps/chosen": -177.29476928710938, "logps/rejected": -206.28036499023438, "loss": 0.6798, "rewards/accuracies": 0.25, "rewards/chosen": 0.046961214393377304, "rewards/margins": -0.10674400627613068, "rewards/rejected": 0.15370520949363708, "step": 804 }, { "epoch": 0.49077884468830973, "grad_norm": 55.19283959826443, "learning_rate": 3.494634146341463e-08, "logits/chosen": 0.07019703090190887, "logits/rejected": 0.16316017508506775, "logps/chosen": -129.90567016601562, "logps/rejected": -123.85980224609375, "loss": 0.5812, "rewards/accuracies": 0.5, "rewards/chosen": 0.18906404078006744, "rewards/margins": -0.09188410639762878, "rewards/rejected": 0.28094813227653503, "step": 805 }, { "epoch": 0.4913885078494132, "grad_norm": 71.98262420823814, "learning_rate": 3.499024390243902e-08, "logits/chosen": 0.22487740218639374, "logits/rejected": 0.24013380706310272, "logps/chosen": -52.62399673461914, "logps/rejected": -78.3775634765625, "loss": 0.6248, "rewards/accuracies": 0.25, "rewards/chosen": -0.27006229758262634, "rewards/margins": -0.24889151751995087, "rewards/rejected": -0.021170781925320625, "step": 806 }, { "epoch": 0.4919981710105167, "grad_norm": 63.106620429570484, "learning_rate": 3.5034146341463414e-08, "logits/chosen": 0.17405636608600616, "logits/rejected": 0.15376809239387512, "logps/chosen": -63.981300354003906, "logps/rejected": -71.03421020507812, "loss": 0.5691, "rewards/accuracies": 0.75, "rewards/chosen": -0.07564292848110199, "rewards/margins": 0.6431266069412231, "rewards/rejected": -0.7187695503234863, "step": 807 }, { "epoch": 0.4926078341716202, "grad_norm": 77.21823359518474, "learning_rate": 3.50780487804878e-08, "logits/chosen": 0.2948898375034332, "logits/rejected": 0.07608085125684738, "logps/chosen": -281.1902770996094, "logps/rejected": -407.0839538574219, "loss": 0.5953, "rewards/accuracies": 0.75, "rewards/chosen": 0.11270199716091156, "rewards/margins": 0.30205726623535156, "rewards/rejected": -0.1893552839756012, "step": 808 }, { "epoch": 0.4932174973327237, "grad_norm": 65.03744614118033, "learning_rate": 3.512195121951219e-08, "logits/chosen": -0.0038300901651382446, "logits/rejected": -0.020449087023735046, "logps/chosen": -101.77850341796875, "logps/rejected": -89.84571838378906, "loss": 0.5875, "rewards/accuracies": 0.75, "rewards/chosen": -0.48315876722335815, "rewards/margins": 0.2094717025756836, "rewards/rejected": -0.6926305294036865, "step": 809 }, { "epoch": 0.49382716049382713, "grad_norm": 60.374855244179564, "learning_rate": 3.5165853658536586e-08, "logits/chosen": -0.4043070077896118, "logits/rejected": 0.2836885452270508, "logps/chosen": -317.43115234375, "logps/rejected": -238.12161254882812, "loss": 0.4963, "rewards/accuracies": 0.5, "rewards/chosen": 0.3807636499404907, "rewards/margins": 0.14152394235134125, "rewards/rejected": 0.23923969268798828, "step": 810 }, { "epoch": 0.49443682365493064, "grad_norm": 58.054693130003635, "learning_rate": 3.5209756097560974e-08, "logits/chosen": 0.2596984803676605, "logits/rejected": 0.18857869505882263, "logps/chosen": -195.6874237060547, "logps/rejected": -212.06976318359375, "loss": 0.5753, "rewards/accuracies": 1.0, "rewards/chosen": 0.16618141531944275, "rewards/margins": 1.0704714059829712, "rewards/rejected": -0.904289960861206, "step": 811 }, { "epoch": 0.49504648681603414, "grad_norm": 66.85845810270146, "learning_rate": 3.525365853658536e-08, "logits/chosen": 0.01069004088640213, "logits/rejected": 0.1511375904083252, "logps/chosen": -119.74539184570312, "logps/rejected": -60.879364013671875, "loss": 0.623, "rewards/accuracies": 0.25, "rewards/chosen": -0.029255736619234085, "rewards/margins": 0.06261497735977173, "rewards/rejected": -0.09187071770429611, "step": 812 }, { "epoch": 0.49565614997713764, "grad_norm": 64.44519241903701, "learning_rate": 3.529756097560975e-08, "logits/chosen": -0.05384228006005287, "logits/rejected": -0.09951944649219513, "logps/chosen": -115.72950744628906, "logps/rejected": -226.60693359375, "loss": 0.5969, "rewards/accuracies": 1.0, "rewards/chosen": 0.09954290091991425, "rewards/margins": 0.9953127503395081, "rewards/rejected": -0.8957698345184326, "step": 813 }, { "epoch": 0.49626581313824114, "grad_norm": 64.73999018145003, "learning_rate": 3.5341463414634146e-08, "logits/chosen": -0.02636069990694523, "logits/rejected": 0.27930817008018494, "logps/chosen": -138.90631103515625, "logps/rejected": -101.73639678955078, "loss": 0.6013, "rewards/accuracies": 0.5, "rewards/chosen": 0.15106813609600067, "rewards/margins": 0.04017619788646698, "rewards/rejected": 0.11089195311069489, "step": 814 }, { "epoch": 0.4968754762993446, "grad_norm": 80.06874543046527, "learning_rate": 3.5385365853658534e-08, "logits/chosen": 0.10787045955657959, "logits/rejected": 0.16068562865257263, "logps/chosen": -182.59161376953125, "logps/rejected": -177.74913024902344, "loss": 0.6295, "rewards/accuracies": 1.0, "rewards/chosen": 0.33856871724128723, "rewards/margins": 0.3896040916442871, "rewards/rejected": -0.051035378128290176, "step": 815 }, { "epoch": 0.4974851394604481, "grad_norm": 63.31855345344853, "learning_rate": 3.542926829268292e-08, "logits/chosen": -0.02349713072180748, "logits/rejected": 0.2184727042913437, "logps/chosen": -324.282470703125, "logps/rejected": -248.28179931640625, "loss": 0.5179, "rewards/accuracies": 1.0, "rewards/chosen": 0.9143310785293579, "rewards/margins": 1.4258538484573364, "rewards/rejected": -0.5115228891372681, "step": 816 }, { "epoch": 0.4980948026215516, "grad_norm": 68.31154055717026, "learning_rate": 3.547317073170732e-08, "logits/chosen": 0.2828770577907562, "logits/rejected": 0.13964536786079407, "logps/chosen": -114.00566864013672, "logps/rejected": -157.14462280273438, "loss": 0.5882, "rewards/accuracies": 0.5, "rewards/chosen": 0.21594074368476868, "rewards/margins": -0.019449200481176376, "rewards/rejected": 0.23538993299007416, "step": 817 }, { "epoch": 0.4987044657826551, "grad_norm": 62.69883087366106, "learning_rate": 3.5517073170731706e-08, "logits/chosen": 0.2960382401943207, "logits/rejected": 0.1413854956626892, "logps/chosen": -118.39256286621094, "logps/rejected": -212.56390380859375, "loss": 0.5799, "rewards/accuracies": 0.75, "rewards/chosen": 0.15546150505542755, "rewards/margins": 0.7765097618103027, "rewards/rejected": -0.6210482120513916, "step": 818 }, { "epoch": 0.4993141289437586, "grad_norm": 75.52294565825872, "learning_rate": 3.5560975609756094e-08, "logits/chosen": -0.18223996460437775, "logits/rejected": -0.20841598510742188, "logps/chosen": -236.8354949951172, "logps/rejected": -340.8524169921875, "loss": 0.5831, "rewards/accuracies": 0.5, "rewards/chosen": 0.37170901894569397, "rewards/margins": -0.08836756646633148, "rewards/rejected": 0.46007657051086426, "step": 819 }, { "epoch": 0.49992379210486204, "grad_norm": 53.98723231146931, "learning_rate": 3.560487804878048e-08, "logits/chosen": -0.06694330275058746, "logits/rejected": -0.0628398060798645, "logps/chosen": -155.86611938476562, "logps/rejected": -212.4217071533203, "loss": 0.5504, "rewards/accuracies": 0.75, "rewards/chosen": 0.22390814125537872, "rewards/margins": -0.0001362636685371399, "rewards/rejected": 0.22404442727565765, "step": 820 }, { "epoch": 0.5005334552659656, "grad_norm": 72.27416260264785, "learning_rate": 3.564878048780488e-08, "logits/chosen": -0.04299474135041237, "logits/rejected": -0.007550263777375221, "logps/chosen": -96.05432891845703, "logps/rejected": -57.760093688964844, "loss": 0.5394, "rewards/accuracies": 1.0, "rewards/chosen": 0.42605799436569214, "rewards/margins": 0.6398237347602844, "rewards/rejected": -0.21376578509807587, "step": 821 }, { "epoch": 0.501143118427069, "grad_norm": 55.982740371709816, "learning_rate": 3.5692682926829266e-08, "logits/chosen": 0.37543606758117676, "logits/rejected": 0.1279752552509308, "logps/chosen": -35.941551208496094, "logps/rejected": -66.75108337402344, "loss": 0.525, "rewards/accuracies": 0.25, "rewards/chosen": -0.15648934245109558, "rewards/margins": -0.1225501149892807, "rewards/rejected": -0.03393923491239548, "step": 822 }, { "epoch": 0.5017527815881725, "grad_norm": 64.53581083310333, "learning_rate": 3.5736585365853655e-08, "logits/chosen": 0.1085236594080925, "logits/rejected": 0.1588239073753357, "logps/chosen": -83.08793640136719, "logps/rejected": -34.56549072265625, "loss": 0.6437, "rewards/accuracies": 0.5, "rewards/chosen": 0.0573994405567646, "rewards/margins": 0.02727280557155609, "rewards/rejected": 0.03012663498520851, "step": 823 }, { "epoch": 0.502362444749276, "grad_norm": 67.90415545067161, "learning_rate": 3.578048780487805e-08, "logits/chosen": 0.019908029586076736, "logits/rejected": 0.31911247968673706, "logps/chosen": -297.48858642578125, "logps/rejected": -111.53302764892578, "loss": 0.6178, "rewards/accuracies": 0.0, "rewards/chosen": -0.19981935620307922, "rewards/margins": -0.2561364769935608, "rewards/rejected": 0.056317128241062164, "step": 824 }, { "epoch": 0.5029721079103795, "grad_norm": 74.7628221054287, "learning_rate": 3.582439024390244e-08, "logits/chosen": 0.09637068212032318, "logits/rejected": 0.14802901446819305, "logps/chosen": -271.69989013671875, "logps/rejected": -203.29689025878906, "loss": 0.5978, "rewards/accuracies": 0.5, "rewards/chosen": -0.02346685528755188, "rewards/margins": 0.22066234052181244, "rewards/rejected": -0.24412918090820312, "step": 825 }, { "epoch": 0.503581771071483, "grad_norm": 53.67901711490742, "learning_rate": 3.5868292682926826e-08, "logits/chosen": 0.10175606608390808, "logits/rejected": 0.3559953570365906, "logps/chosen": -196.19017028808594, "logps/rejected": -213.1588897705078, "loss": 0.6125, "rewards/accuracies": 0.5, "rewards/chosen": 0.038393739610910416, "rewards/margins": 0.41897690296173096, "rewards/rejected": -0.38058316707611084, "step": 826 }, { "epoch": 0.5041914342325865, "grad_norm": 61.24527426927978, "learning_rate": 3.5912195121951215e-08, "logits/chosen": 0.2617185711860657, "logits/rejected": 0.2609824538230896, "logps/chosen": -9.725150108337402, "logps/rejected": -24.58201026916504, "loss": 0.5685, "rewards/accuracies": 0.5, "rewards/chosen": -0.2893357276916504, "rewards/margins": -0.005685422569513321, "rewards/rejected": -0.28365030884742737, "step": 827 }, { "epoch": 0.50480109739369, "grad_norm": 60.296376350666556, "learning_rate": 3.595609756097561e-08, "logits/chosen": 0.08205138146877289, "logits/rejected": 0.09890572726726532, "logps/chosen": -92.11471557617188, "logps/rejected": -139.8546905517578, "loss": 0.5405, "rewards/accuracies": 0.5, "rewards/chosen": 0.185858815908432, "rewards/margins": 0.019047502428293228, "rewards/rejected": 0.16681131720542908, "step": 828 }, { "epoch": 0.5054107605547935, "grad_norm": 62.6042647877219, "learning_rate": 3.6e-08, "logits/chosen": 0.007369082421064377, "logits/rejected": -0.05879281833767891, "logps/chosen": -149.5255126953125, "logps/rejected": -332.4320373535156, "loss": 0.5992, "rewards/accuracies": 0.75, "rewards/chosen": 0.43801242113113403, "rewards/margins": 0.6028183698654175, "rewards/rejected": -0.16480599343776703, "step": 829 }, { "epoch": 0.506020423715897, "grad_norm": 68.81408940827944, "learning_rate": 3.6043902439024386e-08, "logits/chosen": -0.05554075911641121, "logits/rejected": 0.07496682554483414, "logps/chosen": -106.6781234741211, "logps/rejected": -69.4856185913086, "loss": 0.5799, "rewards/accuracies": 0.5, "rewards/chosen": -0.16205430030822754, "rewards/margins": 0.10765685886144638, "rewards/rejected": -0.2697111666202545, "step": 830 }, { "epoch": 0.5066300868770005, "grad_norm": 63.766841041160916, "learning_rate": 3.608780487804878e-08, "logits/chosen": 0.2696358561515808, "logits/rejected": 0.11477681994438171, "logps/chosen": -103.76864624023438, "logps/rejected": -169.93309020996094, "loss": 0.6062, "rewards/accuracies": 0.25, "rewards/chosen": -0.2814573645591736, "rewards/margins": -0.2618228495121002, "rewards/rejected": -0.019634537398815155, "step": 831 }, { "epoch": 0.507239750038104, "grad_norm": 66.44566325037502, "learning_rate": 3.613170731707317e-08, "logits/chosen": 0.006435887888073921, "logits/rejected": 0.14947624504566193, "logps/chosen": -276.1130065917969, "logps/rejected": -197.30471801757812, "loss": 0.5337, "rewards/accuracies": 0.5, "rewards/chosen": 0.0059828683733940125, "rewards/margins": -0.0971691757440567, "rewards/rejected": 0.10315204411745071, "step": 832 }, { "epoch": 0.5078494131992074, "grad_norm": 70.33743299351269, "learning_rate": 3.617560975609756e-08, "logits/chosen": 0.12623409926891327, "logits/rejected": 0.08283305913209915, "logps/chosen": -187.23097229003906, "logps/rejected": -140.1327362060547, "loss": 0.539, "rewards/accuracies": 0.75, "rewards/chosen": 0.0500035360455513, "rewards/margins": 0.39642155170440674, "rewards/rejected": -0.34641802310943604, "step": 833 }, { "epoch": 0.508459076360311, "grad_norm": 119.87922314374883, "learning_rate": 3.6219512195121947e-08, "logits/chosen": 0.28614529967308044, "logits/rejected": 0.12593020498752594, "logps/chosen": -44.036285400390625, "logps/rejected": -101.87958526611328, "loss": 0.6763, "rewards/accuracies": 0.75, "rewards/chosen": 0.10472209751605988, "rewards/margins": 0.09965291619300842, "rewards/rejected": 0.0050691841170191765, "step": 834 }, { "epoch": 0.5090687395214144, "grad_norm": 55.442194966055226, "learning_rate": 3.626341463414634e-08, "logits/chosen": 0.0738789290189743, "logits/rejected": 0.09468599408864975, "logps/chosen": -87.6014633178711, "logps/rejected": -74.97404479980469, "loss": 0.4878, "rewards/accuracies": 0.75, "rewards/chosen": 0.07335515320301056, "rewards/margins": 0.3396499752998352, "rewards/rejected": -0.26629480719566345, "step": 835 }, { "epoch": 0.509678402682518, "grad_norm": 69.88154759613471, "learning_rate": 3.630731707317073e-08, "logits/chosen": -0.13200712203979492, "logits/rejected": -0.39851513504981995, "logps/chosen": -58.32075119018555, "logps/rejected": -90.2609634399414, "loss": 0.6329, "rewards/accuracies": 0.75, "rewards/chosen": 0.1759040355682373, "rewards/margins": 0.18853089213371277, "rewards/rejected": -0.012626864947378635, "step": 836 }, { "epoch": 0.5102880658436214, "grad_norm": 77.89392463115561, "learning_rate": 3.635121951219512e-08, "logits/chosen": 0.0990343987941742, "logits/rejected": -0.007436167448759079, "logps/chosen": -202.43553161621094, "logps/rejected": -415.59759521484375, "loss": 0.629, "rewards/accuracies": 1.0, "rewards/chosen": 0.41394180059432983, "rewards/margins": 1.3712553977966309, "rewards/rejected": -0.957313597202301, "step": 837 }, { "epoch": 0.5108977290047249, "grad_norm": 78.6798043342552, "learning_rate": 3.6395121951219513e-08, "logits/chosen": 0.19441884756088257, "logits/rejected": 0.02460932731628418, "logps/chosen": -160.54754638671875, "logps/rejected": -182.2922821044922, "loss": 0.6971, "rewards/accuracies": 0.5, "rewards/chosen": 0.20073196291923523, "rewards/margins": 0.34828808903694153, "rewards/rejected": -0.1475561112165451, "step": 838 }, { "epoch": 0.5115073921658284, "grad_norm": 57.78983825593819, "learning_rate": 3.64390243902439e-08, "logits/chosen": 0.33323365449905396, "logits/rejected": 0.31273379921913147, "logps/chosen": -209.37721252441406, "logps/rejected": -98.8693618774414, "loss": 0.5343, "rewards/accuracies": 0.75, "rewards/chosen": -0.11098715662956238, "rewards/margins": 0.09107886254787445, "rewards/rejected": -0.20206600427627563, "step": 839 }, { "epoch": 0.5121170553269319, "grad_norm": 75.61649299804209, "learning_rate": 3.648292682926829e-08, "logits/chosen": 0.11817595362663269, "logits/rejected": 0.14771093428134918, "logps/chosen": -24.33318519592285, "logps/rejected": -10.833059310913086, "loss": 0.6331, "rewards/accuracies": 0.75, "rewards/chosen": -0.1404966413974762, "rewards/margins": 0.28867948055267334, "rewards/rejected": -0.42917612195014954, "step": 840 }, { "epoch": 0.5127267184880354, "grad_norm": 65.68720728079661, "learning_rate": 3.652682926829268e-08, "logits/chosen": 0.18452531099319458, "logits/rejected": 0.18431918323040009, "logps/chosen": -7.983917236328125, "logps/rejected": -15.626996994018555, "loss": 0.5683, "rewards/accuracies": 0.25, "rewards/chosen": -0.25485336780548096, "rewards/margins": -0.12557300925254822, "rewards/rejected": -0.12928032875061035, "step": 841 }, { "epoch": 0.5133363816491389, "grad_norm": 72.0016158402061, "learning_rate": 3.6570731707317073e-08, "logits/chosen": 0.04569125175476074, "logits/rejected": 0.0868416428565979, "logps/chosen": -43.657135009765625, "logps/rejected": -40.785888671875, "loss": 0.5914, "rewards/accuracies": 1.0, "rewards/chosen": 0.11346039175987244, "rewards/margins": 0.20358936488628387, "rewards/rejected": -0.09012897312641144, "step": 842 }, { "epoch": 0.5139460448102423, "grad_norm": 72.82023770265735, "learning_rate": 3.661463414634146e-08, "logits/chosen": -0.1773550808429718, "logits/rejected": -0.10552921891212463, "logps/chosen": -317.5165710449219, "logps/rejected": -306.7833251953125, "loss": 0.637, "rewards/accuracies": 1.0, "rewards/chosen": 0.8384914994239807, "rewards/margins": 0.588058590888977, "rewards/rejected": 0.25043290853500366, "step": 843 }, { "epoch": 0.5145557079713459, "grad_norm": 62.38563307817259, "learning_rate": 3.665853658536585e-08, "logits/chosen": 0.003631487488746643, "logits/rejected": 0.038138628005981445, "logps/chosen": -165.54515075683594, "logps/rejected": -152.398681640625, "loss": 0.5624, "rewards/accuracies": 0.75, "rewards/chosen": 0.12048117816448212, "rewards/margins": 0.28906017541885376, "rewards/rejected": -0.16857901215553284, "step": 844 }, { "epoch": 0.5151653711324493, "grad_norm": 59.068977108334586, "learning_rate": 3.6702439024390245e-08, "logits/chosen": 0.6819825172424316, "logits/rejected": 0.3414430618286133, "logps/chosen": -51.40108871459961, "logps/rejected": -103.67996215820312, "loss": 0.5724, "rewards/accuracies": 0.75, "rewards/chosen": -0.02640927955508232, "rewards/margins": 0.23062248528003693, "rewards/rejected": -0.25703176856040955, "step": 845 }, { "epoch": 0.5157750342935528, "grad_norm": 52.870770819009095, "learning_rate": 3.6746341463414634e-08, "logits/chosen": 0.09804527461528778, "logits/rejected": 0.22699229419231415, "logps/chosen": -123.43194580078125, "logps/rejected": -122.18966674804688, "loss": 0.5079, "rewards/accuracies": 0.75, "rewards/chosen": 0.11057642847299576, "rewards/margins": 0.23703187704086304, "rewards/rejected": -0.12645544111728668, "step": 846 }, { "epoch": 0.5163846974546563, "grad_norm": 65.58905706663117, "learning_rate": 3.679024390243902e-08, "logits/chosen": 0.08693084865808487, "logits/rejected": 0.333507776260376, "logps/chosen": -99.66699981689453, "logps/rejected": -41.3652458190918, "loss": 0.5967, "rewards/accuracies": 0.25, "rewards/chosen": -0.32221508026123047, "rewards/margins": -0.25529253482818604, "rewards/rejected": -0.06692254543304443, "step": 847 }, { "epoch": 0.5169943606157598, "grad_norm": 76.42954531141284, "learning_rate": 3.683414634146341e-08, "logits/chosen": -0.02600632980465889, "logits/rejected": 0.11422056704759598, "logps/chosen": -102.6796875, "logps/rejected": -38.83604431152344, "loss": 0.6332, "rewards/accuracies": 0.75, "rewards/chosen": -0.05238047614693642, "rewards/margins": 0.4671940803527832, "rewards/rejected": -0.5195745229721069, "step": 848 }, { "epoch": 0.5176040237768633, "grad_norm": 70.49971776592591, "learning_rate": 3.6878048780487805e-08, "logits/chosen": 0.038019824773073196, "logits/rejected": 0.04546715319156647, "logps/chosen": -92.1142807006836, "logps/rejected": -51.81432342529297, "loss": 0.6389, "rewards/accuracies": 0.25, "rewards/chosen": -0.29714739322662354, "rewards/margins": -0.02509160526096821, "rewards/rejected": -0.2720557749271393, "step": 849 }, { "epoch": 0.5182136869379668, "grad_norm": 98.01020646077737, "learning_rate": 3.6921951219512194e-08, "logits/chosen": -0.25814080238342285, "logits/rejected": 0.328311562538147, "logps/chosen": -208.58871459960938, "logps/rejected": -94.48977661132812, "loss": 0.5648, "rewards/accuracies": 0.75, "rewards/chosen": 0.17906787991523743, "rewards/margins": 0.3485424816608429, "rewards/rejected": -0.16947460174560547, "step": 850 }, { "epoch": 0.5188233500990702, "grad_norm": 59.49566034028254, "learning_rate": 3.696585365853658e-08, "logits/chosen": 0.04231855273246765, "logits/rejected": 0.1485375612974167, "logps/chosen": -116.27388000488281, "logps/rejected": -76.1816177368164, "loss": 0.5772, "rewards/accuracies": 1.0, "rewards/chosen": 0.14461486041545868, "rewards/margins": 0.4042167067527771, "rewards/rejected": -0.259601891040802, "step": 851 }, { "epoch": 0.5194330132601738, "grad_norm": 77.50850284821571, "learning_rate": 3.700975609756098e-08, "logits/chosen": 0.15469692647457123, "logits/rejected": -0.03396916389465332, "logps/chosen": -231.93939208984375, "logps/rejected": -350.9715881347656, "loss": 0.5876, "rewards/accuracies": 0.75, "rewards/chosen": 0.37171655893325806, "rewards/margins": 0.7721027135848999, "rewards/rejected": -0.40038609504699707, "step": 852 }, { "epoch": 0.5200426764212772, "grad_norm": 60.73460630736764, "learning_rate": 3.7053658536585365e-08, "logits/chosen": 0.23656564950942993, "logits/rejected": 0.1381596028804779, "logps/chosen": -174.81961059570312, "logps/rejected": -293.6786193847656, "loss": 0.4706, "rewards/accuracies": 1.0, "rewards/chosen": 0.0001943148672580719, "rewards/margins": 0.24363532662391663, "rewards/rejected": -0.24344101548194885, "step": 853 }, { "epoch": 0.5206523395823808, "grad_norm": 59.29308126765653, "learning_rate": 3.7097560975609754e-08, "logits/chosen": -0.01903049647808075, "logits/rejected": 0.09655285626649857, "logps/chosen": -266.18084716796875, "logps/rejected": -188.98684692382812, "loss": 0.5426, "rewards/accuracies": 1.0, "rewards/chosen": 0.3428747355937958, "rewards/margins": 0.3217155337333679, "rewards/rejected": 0.021159224212169647, "step": 854 }, { "epoch": 0.5212620027434842, "grad_norm": 67.15913121769145, "learning_rate": 3.714146341463414e-08, "logits/chosen": 0.1330699771642685, "logits/rejected": 0.23801834881305695, "logps/chosen": -152.31396484375, "logps/rejected": -93.39073944091797, "loss": 0.5694, "rewards/accuracies": 0.75, "rewards/chosen": 0.2283179610967636, "rewards/margins": 0.4553031921386719, "rewards/rejected": -0.22698521614074707, "step": 855 }, { "epoch": 0.5218716659045877, "grad_norm": 62.83732358509938, "learning_rate": 3.718536585365854e-08, "logits/chosen": 0.05359729379415512, "logits/rejected": 0.08941934257745743, "logps/chosen": -55.36945343017578, "logps/rejected": -112.3162841796875, "loss": 0.6318, "rewards/accuracies": 0.25, "rewards/chosen": -0.4063263237476349, "rewards/margins": -0.3283662497997284, "rewards/rejected": -0.0779600739479065, "step": 856 }, { "epoch": 0.5224813290656912, "grad_norm": 67.01778152575903, "learning_rate": 3.7229268292682926e-08, "logits/chosen": 0.14143003523349762, "logits/rejected": 0.033660512417554855, "logps/chosen": -113.7740249633789, "logps/rejected": -213.09373474121094, "loss": 0.5523, "rewards/accuracies": 0.5, "rewards/chosen": 0.1885804831981659, "rewards/margins": 0.6856738328933716, "rewards/rejected": -0.4970932900905609, "step": 857 }, { "epoch": 0.5230909922267947, "grad_norm": 79.12985412209427, "learning_rate": 3.7273170731707314e-08, "logits/chosen": 0.04618055000901222, "logits/rejected": 0.01835821568965912, "logps/chosen": -317.59649658203125, "logps/rejected": -311.95257568359375, "loss": 0.5556, "rewards/accuracies": 0.5, "rewards/chosen": -0.05542896315455437, "rewards/margins": -0.06943736970424652, "rewards/rejected": 0.01400841772556305, "step": 858 }, { "epoch": 0.5237006553878982, "grad_norm": 63.238133753750255, "learning_rate": 3.731707317073171e-08, "logits/chosen": 0.09808889776468277, "logits/rejected": 0.20892862975597382, "logps/chosen": -304.3889465332031, "logps/rejected": -354.45733642578125, "loss": 0.529, "rewards/accuracies": 0.5, "rewards/chosen": 0.2641480565071106, "rewards/margins": 0.3940035402774811, "rewards/rejected": -0.1298554539680481, "step": 859 }, { "epoch": 0.5243103185490017, "grad_norm": 59.80341584344199, "learning_rate": 3.73609756097561e-08, "logits/chosen": 0.18731476366519928, "logits/rejected": 0.1431311070919037, "logps/chosen": -176.1766357421875, "logps/rejected": -171.20559692382812, "loss": 0.5285, "rewards/accuracies": 0.75, "rewards/chosen": 0.4174979329109192, "rewards/margins": 0.9883413910865784, "rewards/rejected": -0.5708434581756592, "step": 860 }, { "epoch": 0.5249199817101051, "grad_norm": 63.5129674699019, "learning_rate": 3.7404878048780486e-08, "logits/chosen": 0.22578765451908112, "logits/rejected": 0.22811663150787354, "logps/chosen": -21.193035125732422, "logps/rejected": -17.314197540283203, "loss": 0.5605, "rewards/accuracies": 0.75, "rewards/chosen": -0.32173627614974976, "rewards/margins": 0.3135220408439636, "rewards/rejected": -0.6352583169937134, "step": 861 }, { "epoch": 0.5255296448712087, "grad_norm": 62.97554727957017, "learning_rate": 3.7448780487804874e-08, "logits/chosen": 0.1587522029876709, "logits/rejected": 0.171729177236557, "logps/chosen": -9.075733184814453, "logps/rejected": -16.870502471923828, "loss": 0.5485, "rewards/accuracies": 0.5, "rewards/chosen": -0.12192894518375397, "rewards/margins": 0.08783310651779175, "rewards/rejected": -0.2097620666027069, "step": 862 }, { "epoch": 0.5261393080323121, "grad_norm": 61.41324613191589, "learning_rate": 3.749268292682927e-08, "logits/chosen": 0.2167873978614807, "logits/rejected": 0.5014666318893433, "logps/chosen": -66.5404052734375, "logps/rejected": -27.729721069335938, "loss": 0.5666, "rewards/accuracies": 0.75, "rewards/chosen": -0.31119388341903687, "rewards/margins": 0.0655917152762413, "rewards/rejected": -0.37678560614585876, "step": 863 }, { "epoch": 0.5267489711934157, "grad_norm": 57.71637274764013, "learning_rate": 3.753658536585366e-08, "logits/chosen": 0.20771121978759766, "logits/rejected": 0.20334795117378235, "logps/chosen": -61.600059509277344, "logps/rejected": -48.765316009521484, "loss": 0.5433, "rewards/accuracies": 0.75, "rewards/chosen": -0.10763809084892273, "rewards/margins": 0.006966426968574524, "rewards/rejected": -0.11460452526807785, "step": 864 }, { "epoch": 0.5273586343545191, "grad_norm": 67.18479937320431, "learning_rate": 3.7580487804878046e-08, "logits/chosen": -0.12771549820899963, "logits/rejected": 0.0017118752002716064, "logps/chosen": -308.0665283203125, "logps/rejected": -276.1645812988281, "loss": 0.5406, "rewards/accuracies": 0.75, "rewards/chosen": 0.2535865902900696, "rewards/margins": 0.3989185392856598, "rewards/rejected": -0.1453319638967514, "step": 865 }, { "epoch": 0.5279682975156226, "grad_norm": 66.86041384032525, "learning_rate": 3.762439024390244e-08, "logits/chosen": 0.008922770619392395, "logits/rejected": 0.037095263600349426, "logps/chosen": -95.94760131835938, "logps/rejected": -49.44132995605469, "loss": 0.5749, "rewards/accuracies": 0.75, "rewards/chosen": 0.08840463310480118, "rewards/margins": 0.4873013198375702, "rewards/rejected": -0.3988966941833496, "step": 866 }, { "epoch": 0.5285779606767261, "grad_norm": 56.773550955702355, "learning_rate": 3.766829268292683e-08, "logits/chosen": -0.13588771224021912, "logits/rejected": -0.21735835075378418, "logps/chosen": -71.41838073730469, "logps/rejected": -94.79507446289062, "loss": 0.5806, "rewards/accuracies": 0.5, "rewards/chosen": -0.02211012877523899, "rewards/margins": 0.020595155656337738, "rewards/rejected": -0.04270528629422188, "step": 867 }, { "epoch": 0.5291876238378296, "grad_norm": 76.66512714672785, "learning_rate": 3.771219512195122e-08, "logits/chosen": 0.17641451954841614, "logits/rejected": 0.181544229388237, "logps/chosen": -59.29175567626953, "logps/rejected": -83.70962524414062, "loss": 0.5483, "rewards/accuracies": 0.5, "rewards/chosen": -0.016960304230451584, "rewards/margins": 0.236171156167984, "rewards/rejected": -0.2531314492225647, "step": 868 }, { "epoch": 0.5297972869989331, "grad_norm": 75.57283352876863, "learning_rate": 3.775609756097561e-08, "logits/chosen": -0.008752023801207542, "logits/rejected": 0.08318771421909332, "logps/chosen": -147.72816467285156, "logps/rejected": -135.38661193847656, "loss": 0.6189, "rewards/accuracies": 0.5, "rewards/chosen": 0.08691678941249847, "rewards/margins": -0.1018247976899147, "rewards/rejected": 0.18874159455299377, "step": 869 }, { "epoch": 0.5304069501600366, "grad_norm": 64.29654345833464, "learning_rate": 3.78e-08, "logits/chosen": 0.11647903174161911, "logits/rejected": 0.17515388131141663, "logps/chosen": -140.824462890625, "logps/rejected": -57.53905487060547, "loss": 0.6441, "rewards/accuracies": 0.25, "rewards/chosen": 0.11540143936872482, "rewards/margins": 0.5147880911827087, "rewards/rejected": -0.39938661456108093, "step": 870 }, { "epoch": 0.53101661332114, "grad_norm": 65.75072184060713, "learning_rate": 3.784390243902439e-08, "logits/chosen": 0.1211961880326271, "logits/rejected": 0.09717921912670135, "logps/chosen": -455.9720458984375, "logps/rejected": -421.9548645019531, "loss": 0.5417, "rewards/accuracies": 0.75, "rewards/chosen": 0.0023680515587329865, "rewards/margins": 1.077951431274414, "rewards/rejected": -1.0755834579467773, "step": 871 }, { "epoch": 0.5316262764822436, "grad_norm": 63.004622087496614, "learning_rate": 3.788780487804878e-08, "logits/chosen": -0.015010075643658638, "logits/rejected": 0.10390684753656387, "logps/chosen": -275.31884765625, "logps/rejected": -142.97242736816406, "loss": 0.5365, "rewards/accuracies": 0.75, "rewards/chosen": 0.734698474407196, "rewards/margins": 0.42196959257125854, "rewards/rejected": 0.3127288818359375, "step": 872 }, { "epoch": 0.532235939643347, "grad_norm": 61.66937202736482, "learning_rate": 3.793170731707317e-08, "logits/chosen": -0.03673422336578369, "logits/rejected": 0.0396769754588604, "logps/chosen": -183.277099609375, "logps/rejected": -121.23249053955078, "loss": 0.5385, "rewards/accuracies": 0.25, "rewards/chosen": -0.21187247335910797, "rewards/margins": -0.20256415009498596, "rewards/rejected": -0.009308312088251114, "step": 873 }, { "epoch": 0.5328456028044506, "grad_norm": 63.32267307609421, "learning_rate": 3.797560975609756e-08, "logits/chosen": -0.06537565588951111, "logits/rejected": 0.0261433944106102, "logps/chosen": -133.59780883789062, "logps/rejected": -172.2808837890625, "loss": 0.5802, "rewards/accuracies": 0.5, "rewards/chosen": 0.053683459758758545, "rewards/margins": 0.19040435552597046, "rewards/rejected": -0.13672089576721191, "step": 874 }, { "epoch": 0.533455265965554, "grad_norm": 66.27506589205731, "learning_rate": 3.801951219512195e-08, "logits/chosen": 0.07901205122470856, "logits/rejected": 0.07933249324560165, "logps/chosen": -52.86782455444336, "logps/rejected": -62.64750289916992, "loss": 0.5814, "rewards/accuracies": 0.25, "rewards/chosen": -0.1269572675228119, "rewards/margins": -0.14318525791168213, "rewards/rejected": 0.016227995976805687, "step": 875 }, { "epoch": 0.5340649291266575, "grad_norm": 54.33717921908857, "learning_rate": 3.8063414634146344e-08, "logits/chosen": 0.12236712872982025, "logits/rejected": 0.09126129746437073, "logps/chosen": -13.379688262939453, "logps/rejected": -44.06802749633789, "loss": 0.4841, "rewards/accuracies": 1.0, "rewards/chosen": -0.10507187992334366, "rewards/margins": 0.14723534882068634, "rewards/rejected": -0.2523072361946106, "step": 876 }, { "epoch": 0.534674592287761, "grad_norm": 66.47825394805919, "learning_rate": 3.810731707317073e-08, "logits/chosen": 0.06389956176280975, "logits/rejected": 0.28784507513046265, "logps/chosen": -306.0577087402344, "logps/rejected": -66.86945343017578, "loss": 0.6358, "rewards/accuracies": 0.25, "rewards/chosen": -0.242964506149292, "rewards/margins": 0.01772509515285492, "rewards/rejected": -0.2606896460056305, "step": 877 }, { "epoch": 0.5352842554488645, "grad_norm": 74.50690460636137, "learning_rate": 3.815121951219512e-08, "logits/chosen": 0.3442728519439697, "logits/rejected": 0.30142903327941895, "logps/chosen": -226.09994506835938, "logps/rejected": -223.4921417236328, "loss": 0.6019, "rewards/accuracies": 0.75, "rewards/chosen": 0.7171735167503357, "rewards/margins": 0.784739077091217, "rewards/rejected": -0.06756556034088135, "step": 878 }, { "epoch": 0.535893918609968, "grad_norm": 67.43150128420167, "learning_rate": 3.819512195121951e-08, "logits/chosen": 0.09721434116363525, "logits/rejected": 0.04542946070432663, "logps/chosen": -130.51011657714844, "logps/rejected": -212.75540161132812, "loss": 0.5525, "rewards/accuracies": 0.75, "rewards/chosen": -0.28920498490333557, "rewards/margins": 0.26738715171813965, "rewards/rejected": -0.5565921664237976, "step": 879 }, { "epoch": 0.5365035817710715, "grad_norm": 64.15874977445931, "learning_rate": 3.8239024390243905e-08, "logits/chosen": 0.04385033994913101, "logits/rejected": 0.07755731046199799, "logps/chosen": -23.495189666748047, "logps/rejected": -20.148441314697266, "loss": 0.5657, "rewards/accuracies": 0.5, "rewards/chosen": -0.47862696647644043, "rewards/margins": 0.13002800941467285, "rewards/rejected": -0.6086549758911133, "step": 880 }, { "epoch": 0.5371132449321749, "grad_norm": 53.00174423455976, "learning_rate": 3.828292682926829e-08, "logits/chosen": 0.2399432361125946, "logits/rejected": 0.07920615375041962, "logps/chosen": -80.57872009277344, "logps/rejected": -93.03114318847656, "loss": 0.5224, "rewards/accuracies": 0.75, "rewards/chosen": -0.13346940279006958, "rewards/margins": 0.15735048055648804, "rewards/rejected": -0.2908198833465576, "step": 881 }, { "epoch": 0.5377229080932785, "grad_norm": 58.27008171272014, "learning_rate": 3.832682926829268e-08, "logits/chosen": 0.2204049527645111, "logits/rejected": 0.2619776725769043, "logps/chosen": -112.0592269897461, "logps/rejected": -84.74361419677734, "loss": 0.5639, "rewards/accuracies": 1.0, "rewards/chosen": 0.29275035858154297, "rewards/margins": 0.8456393480300903, "rewards/rejected": -0.5528889298439026, "step": 882 }, { "epoch": 0.5383325712543819, "grad_norm": 62.09697051583322, "learning_rate": 3.8370731707317076e-08, "logits/chosen": 0.21296963095664978, "logits/rejected": 0.2081933617591858, "logps/chosen": -141.5391082763672, "logps/rejected": -146.18832397460938, "loss": 0.5585, "rewards/accuracies": 1.0, "rewards/chosen": 0.11810910701751709, "rewards/margins": 0.5128703117370605, "rewards/rejected": -0.39476117491722107, "step": 883 }, { "epoch": 0.5389422344154855, "grad_norm": 61.80920607763717, "learning_rate": 3.8414634146341465e-08, "logits/chosen": 0.2989388704299927, "logits/rejected": 0.336221843957901, "logps/chosen": -19.220060348510742, "logps/rejected": -17.0111083984375, "loss": 0.6451, "rewards/accuracies": 0.5, "rewards/chosen": -0.9095132350921631, "rewards/margins": 0.013400629162788391, "rewards/rejected": -0.9229139089584351, "step": 884 }, { "epoch": 0.539551897576589, "grad_norm": 58.934770284041925, "learning_rate": 3.845853658536585e-08, "logits/chosen": 0.2550952136516571, "logits/rejected": 0.28509005904197693, "logps/chosen": -191.87539672851562, "logps/rejected": -186.66815185546875, "loss": 0.5078, "rewards/accuracies": 1.0, "rewards/chosen": 0.20852214097976685, "rewards/margins": 0.72804856300354, "rewards/rejected": -0.5195264220237732, "step": 885 }, { "epoch": 0.5401615607376924, "grad_norm": 96.03745592632615, "learning_rate": 3.850243902439024e-08, "logits/chosen": 0.2564608156681061, "logits/rejected": 0.19282256066799164, "logps/chosen": -195.31539916992188, "logps/rejected": -199.86245727539062, "loss": 0.4757, "rewards/accuracies": 0.75, "rewards/chosen": 0.08128748834133148, "rewards/margins": 0.8918872475624084, "rewards/rejected": -0.8105998039245605, "step": 886 }, { "epoch": 0.540771223898796, "grad_norm": 70.6692264776895, "learning_rate": 3.8546341463414636e-08, "logits/chosen": 0.17313377559185028, "logits/rejected": 0.272353857755661, "logps/chosen": -82.64848327636719, "logps/rejected": -50.599117279052734, "loss": 0.6496, "rewards/accuracies": 0.25, "rewards/chosen": 0.050652459263801575, "rewards/margins": -0.05836388096213341, "rewards/rejected": 0.10901632905006409, "step": 887 }, { "epoch": 0.5413808870598994, "grad_norm": 69.05191052518173, "learning_rate": 3.8590243902439025e-08, "logits/chosen": -0.11841931194067001, "logits/rejected": 0.1710411161184311, "logps/chosen": -233.16030883789062, "logps/rejected": -112.22335052490234, "loss": 0.5506, "rewards/accuracies": 0.75, "rewards/chosen": -0.4198493957519531, "rewards/margins": 0.03017844259738922, "rewards/rejected": -0.45002782344818115, "step": 888 }, { "epoch": 0.5419905502210028, "grad_norm": 72.33339946514252, "learning_rate": 3.863414634146341e-08, "logits/chosen": 0.164263516664505, "logits/rejected": 0.2504717707633972, "logps/chosen": -300.3990783691406, "logps/rejected": -249.3133544921875, "loss": 0.617, "rewards/accuracies": 0.25, "rewards/chosen": 0.005677081644535065, "rewards/margins": 0.24519182741641998, "rewards/rejected": -0.23951473832130432, "step": 889 }, { "epoch": 0.5426002133821064, "grad_norm": 61.011371088142006, "learning_rate": 3.867804878048781e-08, "logits/chosen": 0.14612877368927002, "logits/rejected": 0.12802213430404663, "logps/chosen": -198.5043487548828, "logps/rejected": -97.168701171875, "loss": 0.4808, "rewards/accuracies": 1.0, "rewards/chosen": 0.4118140935897827, "rewards/margins": 1.3088165521621704, "rewards/rejected": -0.8970025777816772, "step": 890 }, { "epoch": 0.5432098765432098, "grad_norm": 70.19987568244505, "learning_rate": 3.8721951219512197e-08, "logits/chosen": 0.07287600636482239, "logits/rejected": 0.051877789199352264, "logps/chosen": -293.9187927246094, "logps/rejected": -132.78085327148438, "loss": 0.6426, "rewards/accuracies": 0.75, "rewards/chosen": 0.4249412417411804, "rewards/margins": 0.5122901797294617, "rewards/rejected": -0.08734893053770065, "step": 891 }, { "epoch": 0.5438195397043134, "grad_norm": 60.54433657791981, "learning_rate": 3.8765853658536585e-08, "logits/chosen": 0.021549783647060394, "logits/rejected": 0.06050729751586914, "logps/chosen": -61.32946014404297, "logps/rejected": -33.854331970214844, "loss": 0.528, "rewards/accuracies": 0.5, "rewards/chosen": -0.10759210586547852, "rewards/margins": 0.653812050819397, "rewards/rejected": -0.7614041566848755, "step": 892 }, { "epoch": 0.5444292028654169, "grad_norm": 65.48766296584395, "learning_rate": 3.8809756097560973e-08, "logits/chosen": 0.05073666572570801, "logits/rejected": -0.06363290548324585, "logps/chosen": -230.37246704101562, "logps/rejected": -273.86712646484375, "loss": 0.5731, "rewards/accuracies": 0.75, "rewards/chosen": 0.08037916570901871, "rewards/margins": 0.78180992603302, "rewards/rejected": -0.7014308571815491, "step": 893 }, { "epoch": 0.5450388660265203, "grad_norm": 70.97853505550982, "learning_rate": 3.885365853658537e-08, "logits/chosen": 0.1399250328540802, "logits/rejected": 0.22606408596038818, "logps/chosen": -87.23171997070312, "logps/rejected": -57.06981658935547, "loss": 0.5862, "rewards/accuracies": 1.0, "rewards/chosen": 0.46670445799827576, "rewards/margins": 0.7650110721588135, "rewards/rejected": -0.2983066439628601, "step": 894 }, { "epoch": 0.5456485291876239, "grad_norm": 63.7807241221083, "learning_rate": 3.889756097560976e-08, "logits/chosen": 0.0728025808930397, "logits/rejected": 0.027045216411352158, "logps/chosen": -124.65536499023438, "logps/rejected": -142.7744140625, "loss": 0.5279, "rewards/accuracies": 1.0, "rewards/chosen": -0.08896440267562866, "rewards/margins": 0.5423890352249146, "rewards/rejected": -0.6313533782958984, "step": 895 }, { "epoch": 0.5462581923487273, "grad_norm": 96.02331057254527, "learning_rate": 3.8941463414634145e-08, "logits/chosen": 0.1260644495487213, "logits/rejected": 0.12321975827217102, "logps/chosen": -155.9744415283203, "logps/rejected": -182.54986572265625, "loss": 0.5877, "rewards/accuracies": 0.5, "rewards/chosen": 0.024586722254753113, "rewards/margins": 0.18018847703933716, "rewards/rejected": -0.15560175478458405, "step": 896 }, { "epoch": 0.5468678555098309, "grad_norm": 66.8006183697143, "learning_rate": 3.898536585365854e-08, "logits/chosen": 0.02144142985343933, "logits/rejected": 0.5131258368492126, "logps/chosen": -351.69073486328125, "logps/rejected": -134.90220642089844, "loss": 0.5518, "rewards/accuracies": 0.5, "rewards/chosen": 0.5111550688743591, "rewards/margins": 0.5789368748664856, "rewards/rejected": -0.06778179109096527, "step": 897 }, { "epoch": 0.5474775186709343, "grad_norm": 72.3824971990929, "learning_rate": 3.902926829268293e-08, "logits/chosen": 0.08979152888059616, "logits/rejected": 0.06702014803886414, "logps/chosen": -48.506126403808594, "logps/rejected": -31.855939865112305, "loss": 0.57, "rewards/accuracies": 0.75, "rewards/chosen": -0.40456414222717285, "rewards/margins": 0.7067840695381165, "rewards/rejected": -1.1113481521606445, "step": 898 }, { "epoch": 0.5480871818320378, "grad_norm": 65.49358279038536, "learning_rate": 3.907317073170732e-08, "logits/chosen": 0.13155023753643036, "logits/rejected": 0.16425983607769012, "logps/chosen": -167.928955078125, "logps/rejected": -142.37261962890625, "loss": 0.5711, "rewards/accuracies": 1.0, "rewards/chosen": 0.509557843208313, "rewards/margins": 0.36642348766326904, "rewards/rejected": 0.14313434064388275, "step": 899 }, { "epoch": 0.5486968449931413, "grad_norm": 72.51322204893103, "learning_rate": 3.9117073170731705e-08, "logits/chosen": 0.16980314254760742, "logits/rejected": 0.1725219041109085, "logps/chosen": -122.37548065185547, "logps/rejected": -105.88607788085938, "loss": 0.5305, "rewards/accuracies": 0.75, "rewards/chosen": -0.15686455368995667, "rewards/margins": 0.8508955240249634, "rewards/rejected": -1.0077600479125977, "step": 900 }, { "epoch": 0.5493065081542448, "grad_norm": 66.56509400685799, "learning_rate": 3.91609756097561e-08, "logits/chosen": 0.06980474293231964, "logits/rejected": 0.1476413458585739, "logps/chosen": -149.52793884277344, "logps/rejected": -30.278118133544922, "loss": 0.5101, "rewards/accuracies": 1.0, "rewards/chosen": 0.29264557361602783, "rewards/margins": 0.5442541837692261, "rewards/rejected": -0.25160861015319824, "step": 901 }, { "epoch": 0.5499161713153483, "grad_norm": 83.17406807513954, "learning_rate": 3.920487804878049e-08, "logits/chosen": -0.29419541358947754, "logits/rejected": 0.16222888231277466, "logps/chosen": -263.3582763671875, "logps/rejected": -228.7915496826172, "loss": 0.6433, "rewards/accuracies": 0.25, "rewards/chosen": -0.2893264889717102, "rewards/margins": -0.17224933207035065, "rewards/rejected": -0.11707716435194016, "step": 902 }, { "epoch": 0.5505258344764518, "grad_norm": 68.37619134528198, "learning_rate": 3.924878048780488e-08, "logits/chosen": 0.2935391962528229, "logits/rejected": 0.32337260246276855, "logps/chosen": -30.69774627685547, "logps/rejected": -38.217796325683594, "loss": 0.5518, "rewards/accuracies": 1.0, "rewards/chosen": -0.022282440215349197, "rewards/margins": 0.13705119490623474, "rewards/rejected": -0.15933363139629364, "step": 903 }, { "epoch": 0.5511354976375552, "grad_norm": 70.41511856903495, "learning_rate": 3.929268292682927e-08, "logits/chosen": 0.33251404762268066, "logits/rejected": 0.22450914978981018, "logps/chosen": -298.6009521484375, "logps/rejected": -312.72027587890625, "loss": 0.595, "rewards/accuracies": 0.75, "rewards/chosen": 0.24860572814941406, "rewards/margins": 1.0966771841049194, "rewards/rejected": -0.8480713963508606, "step": 904 }, { "epoch": 0.5517451607986588, "grad_norm": 65.6598566284955, "learning_rate": 3.933658536585366e-08, "logits/chosen": -0.023178113624453545, "logits/rejected": 0.34410110116004944, "logps/chosen": -171.30035400390625, "logps/rejected": -213.72647094726562, "loss": 0.5795, "rewards/accuracies": 1.0, "rewards/chosen": 0.48885175585746765, "rewards/margins": 0.25117263197898865, "rewards/rejected": 0.23767909407615662, "step": 905 }, { "epoch": 0.5523548239597622, "grad_norm": 70.87554003593823, "learning_rate": 3.938048780487805e-08, "logits/chosen": 0.1324978768825531, "logits/rejected": 0.06611113250255585, "logps/chosen": -20.51483917236328, "logps/rejected": -63.52241897583008, "loss": 0.5555, "rewards/accuracies": 0.5, "rewards/chosen": 0.13383133709430695, "rewards/margins": 0.04824209585785866, "rewards/rejected": 0.08558925241231918, "step": 906 }, { "epoch": 0.5529644871208658, "grad_norm": 67.18938327107257, "learning_rate": 3.942439024390244e-08, "logits/chosen": -0.09553372114896774, "logits/rejected": 0.03629935532808304, "logps/chosen": -319.3539733886719, "logps/rejected": -232.38819885253906, "loss": 0.5308, "rewards/accuracies": 1.0, "rewards/chosen": 0.5030369758605957, "rewards/margins": 0.9751700162887573, "rewards/rejected": -0.4721331000328064, "step": 907 }, { "epoch": 0.5535741502819692, "grad_norm": 63.656136051102344, "learning_rate": 3.946829268292683e-08, "logits/chosen": 0.09303402900695801, "logits/rejected": 0.11336948722600937, "logps/chosen": -119.49852752685547, "logps/rejected": -111.59620666503906, "loss": 0.5668, "rewards/accuracies": 0.75, "rewards/chosen": 0.33455780148506165, "rewards/margins": 0.30913645029067993, "rewards/rejected": 0.025421354919672012, "step": 908 }, { "epoch": 0.5541838134430727, "grad_norm": 69.6908758824901, "learning_rate": 3.951219512195122e-08, "logits/chosen": -0.09655658900737762, "logits/rejected": -0.21397748589515686, "logps/chosen": -96.808349609375, "logps/rejected": -326.0218505859375, "loss": 0.5141, "rewards/accuracies": 0.5, "rewards/chosen": 0.17469826340675354, "rewards/margins": 0.20265799760818481, "rewards/rejected": -0.027959734201431274, "step": 909 }, { "epoch": 0.5547934766041762, "grad_norm": 89.759948990915, "learning_rate": 3.955609756097561e-08, "logits/chosen": 0.13062049448490143, "logits/rejected": 0.11106927692890167, "logps/chosen": -349.80029296875, "logps/rejected": -142.99050903320312, "loss": 0.6194, "rewards/accuracies": 0.75, "rewards/chosen": 0.4324016571044922, "rewards/margins": 0.0628436878323555, "rewards/rejected": 0.3695579469203949, "step": 910 }, { "epoch": 0.5554031397652797, "grad_norm": 69.4579941607974, "learning_rate": 3.9600000000000004e-08, "logits/chosen": 0.07106968015432358, "logits/rejected": 0.26455602049827576, "logps/chosen": -244.58111572265625, "logps/rejected": -200.248046875, "loss": 0.5304, "rewards/accuracies": 0.75, "rewards/chosen": 0.20474356412887573, "rewards/margins": 0.17258234322071075, "rewards/rejected": 0.032161224633455276, "step": 911 }, { "epoch": 0.5560128029263832, "grad_norm": 76.18955954738001, "learning_rate": 3.964390243902439e-08, "logits/chosen": -0.20763921737670898, "logits/rejected": -0.17405462265014648, "logps/chosen": -152.68344116210938, "logps/rejected": -226.15586853027344, "loss": 0.612, "rewards/accuracies": 1.0, "rewards/chosen": 0.05544380843639374, "rewards/margins": 0.907264232635498, "rewards/rejected": -0.8518204092979431, "step": 912 }, { "epoch": 0.5566224660874867, "grad_norm": 67.28490303937213, "learning_rate": 3.968780487804878e-08, "logits/chosen": 0.06502640247344971, "logits/rejected": 0.2051590085029602, "logps/chosen": -424.8998107910156, "logps/rejected": -411.7484130859375, "loss": 0.5342, "rewards/accuracies": 1.0, "rewards/chosen": 0.9498125314712524, "rewards/margins": 1.6604827642440796, "rewards/rejected": -0.7106701731681824, "step": 913 }, { "epoch": 0.5572321292485901, "grad_norm": 70.57081369071896, "learning_rate": 3.973170731707317e-08, "logits/chosen": 0.2460608035326004, "logits/rejected": 0.15885694324970245, "logps/chosen": -14.876495361328125, "logps/rejected": -28.577953338623047, "loss": 0.6335, "rewards/accuracies": 0.5, "rewards/chosen": -0.39212724566459656, "rewards/margins": 0.036636412143707275, "rewards/rejected": -0.42876365780830383, "step": 914 }, { "epoch": 0.5578417924096937, "grad_norm": 59.11274273520954, "learning_rate": 3.9775609756097564e-08, "logits/chosen": 0.0618540458381176, "logits/rejected": 0.059687189757823944, "logps/chosen": -68.97518157958984, "logps/rejected": -85.0937271118164, "loss": 0.5981, "rewards/accuracies": 0.75, "rewards/chosen": -0.006830386817455292, "rewards/margins": 0.06073179841041565, "rewards/rejected": -0.06756218522787094, "step": 915 }, { "epoch": 0.5584514555707971, "grad_norm": 51.8397678888205, "learning_rate": 3.981951219512195e-08, "logits/chosen": 0.2564215660095215, "logits/rejected": 0.2258543074131012, "logps/chosen": -18.547195434570312, "logps/rejected": -23.24953842163086, "loss": 0.4998, "rewards/accuracies": 0.75, "rewards/chosen": -0.3841609060764313, "rewards/margins": 0.4379531145095825, "rewards/rejected": -0.8221140503883362, "step": 916 }, { "epoch": 0.5590611187319007, "grad_norm": 65.88073805729441, "learning_rate": 3.986341463414634e-08, "logits/chosen": 0.18171361088752747, "logits/rejected": 0.0895804762840271, "logps/chosen": -57.35551071166992, "logps/rejected": -79.67073822021484, "loss": 0.5368, "rewards/accuracies": 0.75, "rewards/chosen": 0.07624735683202744, "rewards/margins": 0.8581210374832153, "rewards/rejected": -0.7818737030029297, "step": 917 }, { "epoch": 0.5596707818930041, "grad_norm": 62.610922622098954, "learning_rate": 3.9907317073170736e-08, "logits/chosen": -0.21924518048763275, "logits/rejected": -0.3091610074043274, "logps/chosen": -234.76693725585938, "logps/rejected": -350.0892028808594, "loss": 0.6026, "rewards/accuracies": 0.5, "rewards/chosen": -0.06699717789888382, "rewards/margins": 0.21809442341327667, "rewards/rejected": -0.2850916087627411, "step": 918 }, { "epoch": 0.5602804450541076, "grad_norm": 51.50259242433927, "learning_rate": 3.9951219512195124e-08, "logits/chosen": 0.0463281124830246, "logits/rejected": 0.09033738076686859, "logps/chosen": -159.4967803955078, "logps/rejected": -67.3938217163086, "loss": 0.4737, "rewards/accuracies": 0.75, "rewards/chosen": 0.40997254848480225, "rewards/margins": 0.6638243198394775, "rewards/rejected": -0.2538517713546753, "step": 919 }, { "epoch": 0.5608901082152111, "grad_norm": 66.66138931593925, "learning_rate": 3.999512195121951e-08, "logits/chosen": -0.0645560771226883, "logits/rejected": 0.14277854561805725, "logps/chosen": -244.21054077148438, "logps/rejected": -223.73312377929688, "loss": 0.5307, "rewards/accuracies": 1.0, "rewards/chosen": 0.47790470719337463, "rewards/margins": 0.6905884742736816, "rewards/rejected": -0.21268370747566223, "step": 920 }, { "epoch": 0.5614997713763146, "grad_norm": 67.47433297965408, "learning_rate": 4.00390243902439e-08, "logits/chosen": 0.3004991412162781, "logits/rejected": 0.3832394778728485, "logps/chosen": -248.2864532470703, "logps/rejected": -215.03794860839844, "loss": 0.5642, "rewards/accuracies": 1.0, "rewards/chosen": 0.5374934673309326, "rewards/margins": 0.7683519721031189, "rewards/rejected": -0.23085854947566986, "step": 921 }, { "epoch": 0.5621094345374181, "grad_norm": 54.94697086770058, "learning_rate": 4.0082926829268296e-08, "logits/chosen": -0.0013063345104455948, "logits/rejected": 0.19500063359737396, "logps/chosen": -227.93211364746094, "logps/rejected": -151.1141357421875, "loss": 0.4935, "rewards/accuracies": 0.75, "rewards/chosen": 0.5804883241653442, "rewards/margins": 0.4667474031448364, "rewards/rejected": 0.1137409657239914, "step": 922 }, { "epoch": 0.5627190976985216, "grad_norm": 64.04981408028173, "learning_rate": 4.0126829268292684e-08, "logits/chosen": 0.07343435287475586, "logits/rejected": -0.07359534502029419, "logps/chosen": -130.68255615234375, "logps/rejected": -151.26487731933594, "loss": 0.5517, "rewards/accuracies": 1.0, "rewards/chosen": 0.20980963110923767, "rewards/margins": 0.731520414352417, "rewards/rejected": -0.5217108726501465, "step": 923 }, { "epoch": 0.563328760859625, "grad_norm": 60.291269856199136, "learning_rate": 4.017073170731707e-08, "logits/chosen": -0.09406832605600357, "logits/rejected": 0.11262158304452896, "logps/chosen": -251.36114501953125, "logps/rejected": -160.82583618164062, "loss": 0.5505, "rewards/accuracies": 0.5, "rewards/chosen": 0.004199355840682983, "rewards/margins": 0.11875885725021362, "rewards/rejected": -0.11455950140953064, "step": 924 }, { "epoch": 0.5639384240207286, "grad_norm": 65.0621827815013, "learning_rate": 4.021463414634147e-08, "logits/chosen": 0.11223086714744568, "logits/rejected": -0.03253212571144104, "logps/chosen": -41.525733947753906, "logps/rejected": -84.82911682128906, "loss": 0.5188, "rewards/accuracies": 0.5, "rewards/chosen": -0.3485022783279419, "rewards/margins": 0.012273922562599182, "rewards/rejected": -0.3607761859893799, "step": 925 }, { "epoch": 0.564548087181832, "grad_norm": 56.796599648277606, "learning_rate": 4.0258536585365856e-08, "logits/chosen": 0.1416153758764267, "logits/rejected": 0.15603217482566833, "logps/chosen": -245.56536865234375, "logps/rejected": -110.10095977783203, "loss": 0.4634, "rewards/accuracies": 1.0, "rewards/chosen": 0.6076117753982544, "rewards/margins": 1.1378636360168457, "rewards/rejected": -0.5302518606185913, "step": 926 }, { "epoch": 0.5651577503429356, "grad_norm": 67.01272873587294, "learning_rate": 4.0302439024390244e-08, "logits/chosen": 0.051580868661403656, "logits/rejected": 0.20569553971290588, "logps/chosen": -174.1630859375, "logps/rejected": -179.910888671875, "loss": 0.6088, "rewards/accuracies": 0.75, "rewards/chosen": 0.3665463924407959, "rewards/margins": 0.7571026086807251, "rewards/rejected": -0.390556275844574, "step": 927 }, { "epoch": 0.565767413504039, "grad_norm": 66.70958922544808, "learning_rate": 4.034634146341463e-08, "logits/chosen": 0.1787756085395813, "logits/rejected": 0.1766122579574585, "logps/chosen": -135.0958251953125, "logps/rejected": -103.45654296875, "loss": 0.5702, "rewards/accuracies": 1.0, "rewards/chosen": 0.36315441131591797, "rewards/margins": 0.9478495121002197, "rewards/rejected": -0.5846951603889465, "step": 928 }, { "epoch": 0.5663770766651425, "grad_norm": 78.72233349624145, "learning_rate": 4.039024390243903e-08, "logits/chosen": 0.16744698584079742, "logits/rejected": -0.0412500724196434, "logps/chosen": -72.60401916503906, "logps/rejected": -136.37399291992188, "loss": 0.5395, "rewards/accuracies": 0.75, "rewards/chosen": -0.0803537592291832, "rewards/margins": 0.4975617229938507, "rewards/rejected": -0.5779154896736145, "step": 929 }, { "epoch": 0.566986739826246, "grad_norm": 65.98584845026838, "learning_rate": 4.0434146341463416e-08, "logits/chosen": -0.10447554290294647, "logits/rejected": -0.03891365975141525, "logps/chosen": -291.3166809082031, "logps/rejected": -287.75225830078125, "loss": 0.5893, "rewards/accuracies": 0.5, "rewards/chosen": 0.45769044756889343, "rewards/margins": 0.3405483365058899, "rewards/rejected": 0.11714211106300354, "step": 930 }, { "epoch": 0.5675964029873495, "grad_norm": 58.353142155605795, "learning_rate": 4.0478048780487805e-08, "logits/chosen": 0.2372773289680481, "logits/rejected": 0.10270366817712784, "logps/chosen": -177.44635009765625, "logps/rejected": -283.463623046875, "loss": 0.5667, "rewards/accuracies": 1.0, "rewards/chosen": 0.37482908368110657, "rewards/margins": 0.3479050099849701, "rewards/rejected": 0.026924047619104385, "step": 931 }, { "epoch": 0.5682060661484529, "grad_norm": 72.97153861954438, "learning_rate": 4.0521951219512186e-08, "logits/chosen": 0.09728141129016876, "logits/rejected": 0.06666161864995956, "logps/chosen": -266.215087890625, "logps/rejected": -192.38037109375, "loss": 0.5327, "rewards/accuracies": 0.75, "rewards/chosen": 0.8658161163330078, "rewards/margins": 0.7490353584289551, "rewards/rejected": 0.11678075790405273, "step": 932 }, { "epoch": 0.5688157293095565, "grad_norm": 51.366865251220304, "learning_rate": 4.056585365853658e-08, "logits/chosen": 0.3116620182991028, "logits/rejected": 0.25762253999710083, "logps/chosen": -179.5105743408203, "logps/rejected": -229.45407104492188, "loss": 0.4982, "rewards/accuracies": 1.0, "rewards/chosen": -0.1297607719898224, "rewards/margins": 1.1740977764129639, "rewards/rejected": -1.3038586378097534, "step": 933 }, { "epoch": 0.5694253924706599, "grad_norm": 70.49128080381824, "learning_rate": 4.060975609756097e-08, "logits/chosen": -0.3792869448661804, "logits/rejected": 0.07428139448165894, "logps/chosen": -390.8101806640625, "logps/rejected": -231.0935516357422, "loss": 0.5915, "rewards/accuracies": 1.0, "rewards/chosen": 0.7346107959747314, "rewards/margins": 0.936454176902771, "rewards/rejected": -0.20184330642223358, "step": 934 }, { "epoch": 0.5700350556317635, "grad_norm": 76.78091355138127, "learning_rate": 4.065365853658536e-08, "logits/chosen": 0.19513678550720215, "logits/rejected": 0.292969286441803, "logps/chosen": -280.4601745605469, "logps/rejected": -232.74989318847656, "loss": 0.5518, "rewards/accuracies": 0.75, "rewards/chosen": 0.11097672581672668, "rewards/margins": 0.47675424814224243, "rewards/rejected": -0.36577752232551575, "step": 935 }, { "epoch": 0.5706447187928669, "grad_norm": 83.47007403711588, "learning_rate": 4.069756097560975e-08, "logits/chosen": 0.40284186601638794, "logits/rejected": 0.1840941309928894, "logps/chosen": -217.3123779296875, "logps/rejected": -430.5103759765625, "loss": 0.5134, "rewards/accuracies": 1.0, "rewards/chosen": 0.9091841578483582, "rewards/margins": 1.3295221328735352, "rewards/rejected": -0.4203380346298218, "step": 936 }, { "epoch": 0.5712543819539704, "grad_norm": 60.346002831050065, "learning_rate": 4.074146341463414e-08, "logits/chosen": 0.17786885797977448, "logits/rejected": 0.19254018366336823, "logps/chosen": -221.40101623535156, "logps/rejected": -169.64413452148438, "loss": 0.5908, "rewards/accuracies": 0.75, "rewards/chosen": 0.26896461844444275, "rewards/margins": 0.8941740989685059, "rewards/rejected": -0.6252095699310303, "step": 937 }, { "epoch": 0.5718640451150739, "grad_norm": 70.27171933319133, "learning_rate": 4.078536585365853e-08, "logits/chosen": 0.22855284810066223, "logits/rejected": 0.2680366635322571, "logps/chosen": -60.72048568725586, "logps/rejected": -69.85613250732422, "loss": 0.6047, "rewards/accuracies": 0.5, "rewards/chosen": 0.02325417473912239, "rewards/margins": 0.02311794087290764, "rewards/rejected": 0.0001362226903438568, "step": 938 }, { "epoch": 0.5724737082761774, "grad_norm": 75.11512603751487, "learning_rate": 4.082926829268292e-08, "logits/chosen": -0.007381145842373371, "logits/rejected": 0.12239038944244385, "logps/chosen": -203.8936767578125, "logps/rejected": -116.77022552490234, "loss": 0.5919, "rewards/accuracies": 0.5, "rewards/chosen": 0.07102936506271362, "rewards/margins": 0.3084529638290405, "rewards/rejected": -0.2374236136674881, "step": 939 }, { "epoch": 0.5730833714372809, "grad_norm": 77.50486609865688, "learning_rate": 4.087317073170731e-08, "logits/chosen": -0.046913955360651016, "logits/rejected": 0.12509378790855408, "logps/chosen": -213.4773406982422, "logps/rejected": -218.0093994140625, "loss": 0.643, "rewards/accuracies": 0.5, "rewards/chosen": -0.20496401190757751, "rewards/margins": -0.17977844178676605, "rewards/rejected": -0.025185570120811462, "step": 940 }, { "epoch": 0.5736930345983844, "grad_norm": 91.34440591980292, "learning_rate": 4.09170731707317e-08, "logits/chosen": 0.04759877920150757, "logits/rejected": 0.032981082797050476, "logps/chosen": -256.04766845703125, "logps/rejected": -182.34393310546875, "loss": 0.5821, "rewards/accuracies": 0.75, "rewards/chosen": 0.005273975431919098, "rewards/margins": 0.3753475844860077, "rewards/rejected": -0.3700735867023468, "step": 941 }, { "epoch": 0.5743026977594878, "grad_norm": 70.78543660509564, "learning_rate": 4.096097560975609e-08, "logits/chosen": 0.031253427267074585, "logits/rejected": 0.09773498773574829, "logps/chosen": -184.8252410888672, "logps/rejected": -68.20289611816406, "loss": 0.6523, "rewards/accuracies": 0.75, "rewards/chosen": 0.12901097536087036, "rewards/margins": 0.34820958971977234, "rewards/rejected": -0.21919861435890198, "step": 942 }, { "epoch": 0.5749123609205914, "grad_norm": 61.64130036814762, "learning_rate": 4.1004878048780485e-08, "logits/chosen": 0.08989404141902924, "logits/rejected": 0.117643803358078, "logps/chosen": -237.77716064453125, "logps/rejected": -140.23069763183594, "loss": 0.5281, "rewards/accuracies": 0.75, "rewards/chosen": 0.4668736159801483, "rewards/margins": 0.7956620454788208, "rewards/rejected": -0.3287883698940277, "step": 943 }, { "epoch": 0.5755220240816948, "grad_norm": 68.6945218222372, "learning_rate": 4.104878048780487e-08, "logits/chosen": 0.20969705283641815, "logits/rejected": 0.17862199246883392, "logps/chosen": -64.42387390136719, "logps/rejected": -90.00393676757812, "loss": 0.5093, "rewards/accuracies": 0.75, "rewards/chosen": -0.11434374749660492, "rewards/margins": 0.636167049407959, "rewards/rejected": -0.7505108118057251, "step": 944 }, { "epoch": 0.5761316872427984, "grad_norm": 92.56663195856316, "learning_rate": 4.109268292682926e-08, "logits/chosen": 0.23893681168556213, "logits/rejected": 0.30128633975982666, "logps/chosen": -254.2822265625, "logps/rejected": -179.03640747070312, "loss": 0.7023, "rewards/accuracies": 0.5, "rewards/chosen": 0.20787009596824646, "rewards/margins": -0.08017578721046448, "rewards/rejected": 0.2880459129810333, "step": 945 }, { "epoch": 0.5767413504039018, "grad_norm": 70.43087604972249, "learning_rate": 4.113658536585365e-08, "logits/chosen": 0.18003559112548828, "logits/rejected": 0.21685735881328583, "logps/chosen": -154.2003936767578, "logps/rejected": -217.83892822265625, "loss": 0.5489, "rewards/accuracies": 0.5, "rewards/chosen": -8.031725883483887e-05, "rewards/margins": 0.23531605303287506, "rewards/rejected": -0.2353963553905487, "step": 946 }, { "epoch": 0.5773510135650053, "grad_norm": 71.21231053272065, "learning_rate": 4.1180487804878045e-08, "logits/chosen": -0.06945399940013885, "logits/rejected": -0.06084947660565376, "logps/chosen": -96.3125228881836, "logps/rejected": -100.3994369506836, "loss": 0.6224, "rewards/accuracies": 0.5, "rewards/chosen": -0.5278519988059998, "rewards/margins": 0.04955005645751953, "rewards/rejected": -0.5774020552635193, "step": 947 }, { "epoch": 0.5779606767261088, "grad_norm": 65.26086714554752, "learning_rate": 4.1224390243902433e-08, "logits/chosen": 0.22658458352088928, "logits/rejected": 0.12774381041526794, "logps/chosen": -270.35333251953125, "logps/rejected": -205.83335876464844, "loss": 0.5235, "rewards/accuracies": 0.75, "rewards/chosen": 0.33648747205734253, "rewards/margins": 0.10773984342813492, "rewards/rejected": 0.2287476658821106, "step": 948 }, { "epoch": 0.5785703398872123, "grad_norm": 81.54389536975877, "learning_rate": 4.126829268292682e-08, "logits/chosen": 0.04804135859012604, "logits/rejected": 0.0008035674691200256, "logps/chosen": -120.53761291503906, "logps/rejected": -125.85433959960938, "loss": 0.5928, "rewards/accuracies": 0.25, "rewards/chosen": -0.24643567204475403, "rewards/margins": -0.33742666244506836, "rewards/rejected": 0.09099098294973373, "step": 949 }, { "epoch": 0.5791800030483159, "grad_norm": 82.12086780288085, "learning_rate": 4.131219512195122e-08, "logits/chosen": 0.2091103047132492, "logits/rejected": -0.04949437081813812, "logps/chosen": -201.59471130371094, "logps/rejected": -286.5262756347656, "loss": 0.521, "rewards/accuracies": 1.0, "rewards/chosen": 0.27747267484664917, "rewards/margins": 1.1522293090820312, "rewards/rejected": -0.8747566342353821, "step": 950 }, { "epoch": 0.5797896662094193, "grad_norm": 69.04790105659762, "learning_rate": 4.1356097560975605e-08, "logits/chosen": 0.25812891125679016, "logits/rejected": 0.03673887252807617, "logps/chosen": -91.65919494628906, "logps/rejected": -90.58918762207031, "loss": 0.6029, "rewards/accuracies": 0.5, "rewards/chosen": -0.04524761438369751, "rewards/margins": 0.2664438784122467, "rewards/rejected": -0.3116915225982666, "step": 951 }, { "epoch": 0.5803993293705227, "grad_norm": 87.67035025167849, "learning_rate": 4.1399999999999994e-08, "logits/chosen": 0.06877303123474121, "logits/rejected": 0.15757039189338684, "logps/chosen": -244.7814178466797, "logps/rejected": -294.79486083984375, "loss": 0.687, "rewards/accuracies": 0.5, "rewards/chosen": -0.5146667957305908, "rewards/margins": 0.38214778900146484, "rewards/rejected": -0.8968145847320557, "step": 952 }, { "epoch": 0.5810089925316263, "grad_norm": 63.742718134835904, "learning_rate": 4.144390243902438e-08, "logits/chosen": -0.04098774120211601, "logits/rejected": 0.09652681648731232, "logps/chosen": -222.15682983398438, "logps/rejected": -176.15093994140625, "loss": 0.4896, "rewards/accuracies": 0.75, "rewards/chosen": 0.22891703248023987, "rewards/margins": 0.5597389936447144, "rewards/rejected": -0.3308219909667969, "step": 953 }, { "epoch": 0.5816186556927297, "grad_norm": 62.23053401449768, "learning_rate": 4.148780487804878e-08, "logits/chosen": -0.25104236602783203, "logits/rejected": -0.2953557074069977, "logps/chosen": -156.33468627929688, "logps/rejected": -279.6402587890625, "loss": 0.5958, "rewards/accuracies": 0.25, "rewards/chosen": 0.1959829032421112, "rewards/margins": 1.0025813579559326, "rewards/rejected": -0.806598424911499, "step": 954 }, { "epoch": 0.5822283188538333, "grad_norm": 67.38110967504043, "learning_rate": 4.1531707317073165e-08, "logits/chosen": -0.020015094429254532, "logits/rejected": 0.13997766375541687, "logps/chosen": -184.95852661132812, "logps/rejected": -136.2812957763672, "loss": 0.5132, "rewards/accuracies": 0.5, "rewards/chosen": 0.2717243432998657, "rewards/margins": 0.4409331679344177, "rewards/rejected": -0.1692088395357132, "step": 955 }, { "epoch": 0.5828379820149368, "grad_norm": 61.08016186118609, "learning_rate": 4.1575609756097554e-08, "logits/chosen": 0.007236507721245289, "logits/rejected": 0.09439484775066376, "logps/chosen": -262.1932373046875, "logps/rejected": -176.46446228027344, "loss": 0.5279, "rewards/accuracies": 0.75, "rewards/chosen": 0.20543357729911804, "rewards/margins": 0.30884745717048645, "rewards/rejected": -0.10341387242078781, "step": 956 }, { "epoch": 0.5834476451760402, "grad_norm": 69.39354863186936, "learning_rate": 4.161951219512195e-08, "logits/chosen": -0.15755020081996918, "logits/rejected": -0.3018410801887512, "logps/chosen": -358.4852294921875, "logps/rejected": -470.5343017578125, "loss": 0.5343, "rewards/accuracies": 0.5, "rewards/chosen": 0.21769274771213531, "rewards/margins": 0.16905879974365234, "rewards/rejected": 0.048633962869644165, "step": 957 }, { "epoch": 0.5840573083371438, "grad_norm": 86.61322570045859, "learning_rate": 4.166341463414634e-08, "logits/chosen": -0.02643953263759613, "logits/rejected": -0.18218819797039032, "logps/chosen": -41.61878967285156, "logps/rejected": -220.1629638671875, "loss": 0.6194, "rewards/accuracies": 0.5, "rewards/chosen": -0.44664257764816284, "rewards/margins": 0.0005926638841629028, "rewards/rejected": -0.44723525643348694, "step": 958 }, { "epoch": 0.5846669714982472, "grad_norm": 62.05519185121555, "learning_rate": 4.1707317073170725e-08, "logits/chosen": -0.34936612844467163, "logits/rejected": -0.14146625995635986, "logps/chosen": -313.269287109375, "logps/rejected": -150.72903442382812, "loss": 0.5197, "rewards/accuracies": 1.0, "rewards/chosen": 0.2663576602935791, "rewards/margins": 0.5314609408378601, "rewards/rejected": -0.2651032507419586, "step": 959 }, { "epoch": 0.5852766346593508, "grad_norm": 61.8538936273219, "learning_rate": 4.1751219512195114e-08, "logits/chosen": 0.08507747203111649, "logits/rejected": 0.04343874752521515, "logps/chosen": -97.35144805908203, "logps/rejected": -92.90192413330078, "loss": 0.5136, "rewards/accuracies": 0.5, "rewards/chosen": -0.20625300705432892, "rewards/margins": -0.16570445895195007, "rewards/rejected": -0.04054853320121765, "step": 960 }, { "epoch": 0.5858862978204542, "grad_norm": 69.59421865613955, "learning_rate": 4.179512195121951e-08, "logits/chosen": 0.022425949573516846, "logits/rejected": 0.0824100449681282, "logps/chosen": -365.5918884277344, "logps/rejected": -397.54443359375, "loss": 0.5295, "rewards/accuracies": 0.75, "rewards/chosen": 0.3914957046508789, "rewards/margins": 1.1910912990570068, "rewards/rejected": -0.7995957136154175, "step": 961 }, { "epoch": 0.5864959609815577, "grad_norm": 76.91340708220399, "learning_rate": 4.18390243902439e-08, "logits/chosen": 0.3572169840335846, "logits/rejected": 0.2854265570640564, "logps/chosen": -30.446258544921875, "logps/rejected": -36.61526107788086, "loss": 0.6124, "rewards/accuracies": 0.5, "rewards/chosen": -0.4780455231666565, "rewards/margins": -0.24744245409965515, "rewards/rejected": -0.23060305416584015, "step": 962 }, { "epoch": 0.5871056241426612, "grad_norm": 57.46941019347026, "learning_rate": 4.1882926829268286e-08, "logits/chosen": 0.21760831773281097, "logits/rejected": 0.13755109906196594, "logps/chosen": -193.99929809570312, "logps/rejected": -162.6146697998047, "loss": 0.5556, "rewards/accuracies": 0.75, "rewards/chosen": 0.17073407769203186, "rewards/margins": 0.8833505511283875, "rewards/rejected": -0.7126164436340332, "step": 963 }, { "epoch": 0.5877152873037647, "grad_norm": 62.629537187405504, "learning_rate": 4.192682926829268e-08, "logits/chosen": 0.04777471721172333, "logits/rejected": 0.055221885442733765, "logps/chosen": -98.55489349365234, "logps/rejected": -166.0889434814453, "loss": 0.526, "rewards/accuracies": 0.25, "rewards/chosen": -0.33463388681411743, "rewards/margins": -0.12675432860851288, "rewards/rejected": -0.20787954330444336, "step": 964 }, { "epoch": 0.5883249504648682, "grad_norm": 75.29882126736368, "learning_rate": 4.197073170731707e-08, "logits/chosen": 0.08066365867853165, "logits/rejected": 0.27275511622428894, "logps/chosen": -282.13165283203125, "logps/rejected": -201.79586791992188, "loss": 0.6298, "rewards/accuracies": 1.0, "rewards/chosen": 0.40546715259552, "rewards/margins": 1.2127996683120728, "rewards/rejected": -0.8073325753211975, "step": 965 }, { "epoch": 0.5889346136259717, "grad_norm": 73.94280467512128, "learning_rate": 4.201463414634146e-08, "logits/chosen": 0.14924946427345276, "logits/rejected": 0.12114415317773819, "logps/chosen": -24.413921356201172, "logps/rejected": -26.602420806884766, "loss": 0.559, "rewards/accuracies": 0.5, "rewards/chosen": -0.42113298177719116, "rewards/margins": 0.08444094657897949, "rewards/rejected": -0.5055739283561707, "step": 966 }, { "epoch": 0.5895442767870751, "grad_norm": 72.32887870017322, "learning_rate": 4.2058536585365846e-08, "logits/chosen": 0.37911567091941833, "logits/rejected": 0.14017818868160248, "logps/chosen": -34.29087448120117, "logps/rejected": -59.50763702392578, "loss": 0.6309, "rewards/accuracies": 0.75, "rewards/chosen": 0.01295555755496025, "rewards/margins": 0.1707940399646759, "rewards/rejected": -0.15783849358558655, "step": 967 }, { "epoch": 0.5901539399481787, "grad_norm": 85.10928808814798, "learning_rate": 4.210243902439024e-08, "logits/chosen": 0.02898680604994297, "logits/rejected": 0.17125096917152405, "logps/chosen": -128.51568603515625, "logps/rejected": -93.6944580078125, "loss": 0.5281, "rewards/accuracies": 0.25, "rewards/chosen": -0.4702191948890686, "rewards/margins": -0.3852922022342682, "rewards/rejected": -0.08492698520421982, "step": 968 }, { "epoch": 0.5907636031092821, "grad_norm": 64.60074775054339, "learning_rate": 4.214634146341463e-08, "logits/chosen": 0.4834749698638916, "logits/rejected": 0.6553069353103638, "logps/chosen": -277.7457275390625, "logps/rejected": -201.1106414794922, "loss": 0.5532, "rewards/accuracies": 0.25, "rewards/chosen": 0.4859507083892822, "rewards/margins": 0.24577556550502777, "rewards/rejected": 0.24017517268657684, "step": 969 }, { "epoch": 0.5913732662703857, "grad_norm": 71.28492793310289, "learning_rate": 4.219024390243902e-08, "logits/chosen": 0.2692300081253052, "logits/rejected": 0.31959807872772217, "logps/chosen": -174.9003143310547, "logps/rejected": -136.06768798828125, "loss": 0.5951, "rewards/accuracies": 0.5, "rewards/chosen": -0.0905541181564331, "rewards/margins": 0.4873233437538147, "rewards/rejected": -0.5778774619102478, "step": 970 }, { "epoch": 0.5919829294314891, "grad_norm": 63.96088036649985, "learning_rate": 4.223414634146341e-08, "logits/chosen": -0.14569628238677979, "logits/rejected": -0.007821448147296906, "logps/chosen": -106.27445983886719, "logps/rejected": -71.78903198242188, "loss": 0.5752, "rewards/accuracies": 0.5, "rewards/chosen": -0.2875051200389862, "rewards/margins": -0.14759160578250885, "rewards/rejected": -0.13991349935531616, "step": 971 }, { "epoch": 0.5925925925925926, "grad_norm": 78.3828918011369, "learning_rate": 4.22780487804878e-08, "logits/chosen": -0.027136696502566338, "logits/rejected": 0.2894710898399353, "logps/chosen": -156.28268432617188, "logps/rejected": -223.8277587890625, "loss": 0.6042, "rewards/accuracies": 0.25, "rewards/chosen": -0.290881872177124, "rewards/margins": -0.12722742557525635, "rewards/rejected": -0.16365444660186768, "step": 972 }, { "epoch": 0.5932022557536961, "grad_norm": 60.19492529290873, "learning_rate": 4.232195121951219e-08, "logits/chosen": 0.1515863537788391, "logits/rejected": -0.06478295475244522, "logps/chosen": -335.7998046875, "logps/rejected": -445.21929931640625, "loss": 0.4768, "rewards/accuracies": 1.0, "rewards/chosen": 0.6023845672607422, "rewards/margins": 1.3038933277130127, "rewards/rejected": -0.7015087008476257, "step": 973 }, { "epoch": 0.5938119189147996, "grad_norm": 56.192626708773645, "learning_rate": 4.236585365853658e-08, "logits/chosen": -0.017517834901809692, "logits/rejected": -0.14214028418064117, "logps/chosen": -217.49298095703125, "logps/rejected": -184.72012329101562, "loss": 0.4848, "rewards/accuracies": 0.75, "rewards/chosen": -0.18382145464420319, "rewards/margins": 0.42155730724334717, "rewards/rejected": -0.6053788065910339, "step": 974 }, { "epoch": 0.594421582075903, "grad_norm": 82.4116097946905, "learning_rate": 4.240975609756097e-08, "logits/chosen": -0.14134134352207184, "logits/rejected": -0.06352880597114563, "logps/chosen": -513.8828735351562, "logps/rejected": -412.7125549316406, "loss": 0.5264, "rewards/accuracies": 0.75, "rewards/chosen": 0.817517101764679, "rewards/margins": 0.8614957332611084, "rewards/rejected": -0.04397859424352646, "step": 975 }, { "epoch": 0.5950312452370066, "grad_norm": 68.60202196421983, "learning_rate": 4.245365853658536e-08, "logits/chosen": -0.06833840161561966, "logits/rejected": 0.08813981711864471, "logps/chosen": -86.92523193359375, "logps/rejected": -91.20805358886719, "loss": 0.538, "rewards/accuracies": 0.5, "rewards/chosen": 0.06409449130296707, "rewards/margins": -0.0014316122978925705, "rewards/rejected": 0.06552610546350479, "step": 976 }, { "epoch": 0.59564090839811, "grad_norm": 63.80535251075345, "learning_rate": 4.249756097560975e-08, "logits/chosen": -0.13767874240875244, "logits/rejected": -0.09763995558023453, "logps/chosen": -107.87483215332031, "logps/rejected": -147.85107421875, "loss": 0.5485, "rewards/accuracies": 0.0, "rewards/chosen": -0.4442089796066284, "rewards/margins": -0.43157342076301575, "rewards/rejected": -0.012635531835258007, "step": 977 }, { "epoch": 0.5962505715592136, "grad_norm": 58.96860958544613, "learning_rate": 4.2541463414634144e-08, "logits/chosen": 0.2365347146987915, "logits/rejected": 0.23233827948570251, "logps/chosen": -232.26263427734375, "logps/rejected": -258.4931335449219, "loss": 0.4819, "rewards/accuracies": 0.75, "rewards/chosen": 0.28829145431518555, "rewards/margins": 0.654391884803772, "rewards/rejected": -0.3661004602909088, "step": 978 }, { "epoch": 0.596860234720317, "grad_norm": 56.34013149049837, "learning_rate": 4.258536585365853e-08, "logits/chosen": 0.13740472495555878, "logits/rejected": 0.12650051712989807, "logps/chosen": -58.84000015258789, "logps/rejected": -66.1595458984375, "loss": 0.5355, "rewards/accuracies": 0.5, "rewards/chosen": -0.20688855648040771, "rewards/margins": 0.028081614524126053, "rewards/rejected": -0.23497018218040466, "step": 979 }, { "epoch": 0.5974698978814205, "grad_norm": 70.09919354884728, "learning_rate": 4.262926829268292e-08, "logits/chosen": -0.05875900015234947, "logits/rejected": -0.19873517751693726, "logps/chosen": -141.268310546875, "logps/rejected": -179.65872192382812, "loss": 0.6015, "rewards/accuracies": 0.25, "rewards/chosen": -0.4807414710521698, "rewards/margins": 0.02805427461862564, "rewards/rejected": -0.5087957382202148, "step": 980 }, { "epoch": 0.598079561042524, "grad_norm": 70.50180958730834, "learning_rate": 4.2673170731707316e-08, "logits/chosen": 0.22438283264636993, "logits/rejected": 0.2016701102256775, "logps/chosen": -172.8882598876953, "logps/rejected": -207.31109619140625, "loss": 0.5625, "rewards/accuracies": 0.75, "rewards/chosen": -0.1393517404794693, "rewards/margins": 0.16943779587745667, "rewards/rejected": -0.30878955125808716, "step": 981 }, { "epoch": 0.5986892242036275, "grad_norm": 47.63198791448597, "learning_rate": 4.2717073170731704e-08, "logits/chosen": 0.13777931034564972, "logits/rejected": 0.1058683693408966, "logps/chosen": -130.87457275390625, "logps/rejected": -157.2276611328125, "loss": 0.5121, "rewards/accuracies": 0.75, "rewards/chosen": -0.2509147822856903, "rewards/margins": 0.616559624671936, "rewards/rejected": -0.8674744367599487, "step": 982 }, { "epoch": 0.599298887364731, "grad_norm": 72.20509391638281, "learning_rate": 4.276097560975609e-08, "logits/chosen": 0.237956240773201, "logits/rejected": 0.13941863179206848, "logps/chosen": -145.96958923339844, "logps/rejected": -284.981201171875, "loss": 0.5956, "rewards/accuracies": 0.75, "rewards/chosen": 0.11076083034276962, "rewards/margins": 0.25600630044937134, "rewards/rejected": -0.14524544775485992, "step": 983 }, { "epoch": 0.5999085505258345, "grad_norm": 64.97602705586922, "learning_rate": 4.280487804878048e-08, "logits/chosen": 0.10785136371850967, "logits/rejected": -0.051660746335983276, "logps/chosen": -32.13536834716797, "logps/rejected": -46.07811737060547, "loss": 0.5479, "rewards/accuracies": 0.75, "rewards/chosen": -0.061608847230672836, "rewards/margins": 0.35192739963531494, "rewards/rejected": -0.4135362505912781, "step": 984 }, { "epoch": 0.6005182136869379, "grad_norm": 74.32472065832633, "learning_rate": 4.2848780487804876e-08, "logits/chosen": 0.22914312779903412, "logits/rejected": 0.04533011466264725, "logps/chosen": -81.81015014648438, "logps/rejected": -99.21920013427734, "loss": 0.5848, "rewards/accuracies": 1.0, "rewards/chosen": 0.3640258312225342, "rewards/margins": 0.7226523160934448, "rewards/rejected": -0.35862642526626587, "step": 985 }, { "epoch": 0.6011278768480415, "grad_norm": 63.60531552456201, "learning_rate": 4.2892682926829265e-08, "logits/chosen": 0.16781377792358398, "logits/rejected": 0.13715007901191711, "logps/chosen": -140.97671508789062, "logps/rejected": -256.8678283691406, "loss": 0.4367, "rewards/accuracies": 0.75, "rewards/chosen": -0.07384621351957321, "rewards/margins": 0.8329897522926331, "rewards/rejected": -0.9068359136581421, "step": 986 }, { "epoch": 0.6017375400091449, "grad_norm": 57.035252503405076, "learning_rate": 4.293658536585365e-08, "logits/chosen": -0.14472906291484833, "logits/rejected": 0.16360566020011902, "logps/chosen": -85.19625091552734, "logps/rejected": -50.558929443359375, "loss": 0.5303, "rewards/accuracies": 0.5, "rewards/chosen": -0.08855142444372177, "rewards/margins": -0.014545775949954987, "rewards/rejected": -0.07400565594434738, "step": 987 }, { "epoch": 0.6023472031702485, "grad_norm": 56.92118331145405, "learning_rate": 4.298048780487805e-08, "logits/chosen": 0.12844596803188324, "logits/rejected": 0.2380760908126831, "logps/chosen": -123.90941619873047, "logps/rejected": -85.47669219970703, "loss": 0.4726, "rewards/accuracies": 0.75, "rewards/chosen": 0.6185051798820496, "rewards/margins": 0.6352143883705139, "rewards/rejected": -0.016709256917238235, "step": 988 }, { "epoch": 0.6029568663313519, "grad_norm": 61.22634805888205, "learning_rate": 4.3024390243902436e-08, "logits/chosen": 0.137892946600914, "logits/rejected": 0.2502715289592743, "logps/chosen": -138.75047302246094, "logps/rejected": -18.938377380371094, "loss": 0.5409, "rewards/accuracies": 0.75, "rewards/chosen": -0.08287467807531357, "rewards/margins": 0.5290653109550476, "rewards/rejected": -0.6119399666786194, "step": 989 }, { "epoch": 0.6035665294924554, "grad_norm": 63.03320331999485, "learning_rate": 4.3068292682926825e-08, "logits/chosen": 0.2903212010860443, "logits/rejected": 0.25095999240875244, "logps/chosen": -284.98394775390625, "logps/rejected": -293.1748046875, "loss": 0.4376, "rewards/accuracies": 0.75, "rewards/chosen": 0.4981961250305176, "rewards/margins": 1.0244200229644775, "rewards/rejected": -0.5262239575386047, "step": 990 }, { "epoch": 0.6041761926535589, "grad_norm": 56.12693618714919, "learning_rate": 4.311219512195121e-08, "logits/chosen": 0.3138669729232788, "logits/rejected": 0.06886722147464752, "logps/chosen": -221.6787567138672, "logps/rejected": -371.2520751953125, "loss": 0.4959, "rewards/accuracies": 0.75, "rewards/chosen": 0.40865635871887207, "rewards/margins": 1.7064982652664185, "rewards/rejected": -1.2978421449661255, "step": 991 }, { "epoch": 0.6047858558146624, "grad_norm": 65.94861499842719, "learning_rate": 4.315609756097561e-08, "logits/chosen": 0.18619008362293243, "logits/rejected": 0.11499892175197601, "logps/chosen": -280.6131591796875, "logps/rejected": -166.84622192382812, "loss": 0.5502, "rewards/accuracies": 0.5, "rewards/chosen": 0.04074333980679512, "rewards/margins": 0.4415995478630066, "rewards/rejected": -0.40085622668266296, "step": 992 }, { "epoch": 0.6053955189757659, "grad_norm": 70.49265773787819, "learning_rate": 4.3199999999999996e-08, "logits/chosen": -0.009435049258172512, "logits/rejected": 0.2987651526927948, "logps/chosen": -302.96124267578125, "logps/rejected": -192.93569946289062, "loss": 0.5567, "rewards/accuracies": 0.75, "rewards/chosen": -0.06466828286647797, "rewards/margins": 0.46986982226371765, "rewards/rejected": -0.5345380902290344, "step": 993 }, { "epoch": 0.6060051821368694, "grad_norm": 61.530930320466695, "learning_rate": 4.3243902439024385e-08, "logits/chosen": 0.1342477798461914, "logits/rejected": 0.15137697756290436, "logps/chosen": -184.71934509277344, "logps/rejected": -140.88229370117188, "loss": 0.5371, "rewards/accuracies": 0.75, "rewards/chosen": 0.1671849489212036, "rewards/margins": 0.7406481504440308, "rewards/rejected": -0.5734632015228271, "step": 994 }, { "epoch": 0.6066148452979728, "grad_norm": 67.61584483534448, "learning_rate": 4.328780487804878e-08, "logits/chosen": 0.09232601523399353, "logits/rejected": 0.3603507876396179, "logps/chosen": -429.93292236328125, "logps/rejected": -311.12701416015625, "loss": 0.5626, "rewards/accuracies": 0.75, "rewards/chosen": 0.46371012926101685, "rewards/margins": 0.33862996101379395, "rewards/rejected": 0.12508010864257812, "step": 995 }, { "epoch": 0.6072245084590764, "grad_norm": 80.14941941324331, "learning_rate": 4.333170731707317e-08, "logits/chosen": -0.181582972407341, "logits/rejected": -0.0784536823630333, "logps/chosen": -293.38812255859375, "logps/rejected": -180.1998291015625, "loss": 0.6736, "rewards/accuracies": 0.5, "rewards/chosen": 0.05510110408067703, "rewards/margins": 0.2038726508617401, "rewards/rejected": -0.14877153933048248, "step": 996 }, { "epoch": 0.6078341716201798, "grad_norm": 64.12534841741636, "learning_rate": 4.3375609756097557e-08, "logits/chosen": -0.07904496788978577, "logits/rejected": 0.06720384955406189, "logps/chosen": -391.43597412109375, "logps/rejected": -271.29388427734375, "loss": 0.466, "rewards/accuracies": 0.5, "rewards/chosen": 0.9313393235206604, "rewards/margins": 1.4372742176055908, "rewards/rejected": -0.5059348940849304, "step": 997 }, { "epoch": 0.6084438347812834, "grad_norm": 62.79329595765985, "learning_rate": 4.3419512195121945e-08, "logits/chosen": 0.11170154809951782, "logits/rejected": 0.08879280090332031, "logps/chosen": -182.6605224609375, "logps/rejected": -114.03193664550781, "loss": 0.6013, "rewards/accuracies": 0.75, "rewards/chosen": 0.35989508032798767, "rewards/margins": 1.1250437498092651, "rewards/rejected": -0.7651485800743103, "step": 998 }, { "epoch": 0.6090534979423868, "grad_norm": 73.49008345337796, "learning_rate": 4.346341463414634e-08, "logits/chosen": 0.05520867183804512, "logits/rejected": 0.09620153158903122, "logps/chosen": -95.85297393798828, "logps/rejected": -52.026451110839844, "loss": 0.5837, "rewards/accuracies": 0.5, "rewards/chosen": -0.012775249779224396, "rewards/margins": 0.11057233810424805, "rewards/rejected": -0.12334759533405304, "step": 999 }, { "epoch": 0.6096631611034903, "grad_norm": 85.98186056287153, "learning_rate": 4.350731707317073e-08, "logits/chosen": 0.16709566116333008, "logits/rejected": 0.12938140332698822, "logps/chosen": -238.64108276367188, "logps/rejected": -247.4223175048828, "loss": 0.5785, "rewards/accuracies": 0.75, "rewards/chosen": 0.43460771441459656, "rewards/margins": 0.6481321454048157, "rewards/rejected": -0.21352441608905792, "step": 1000 }, { "epoch": 0.6096631611034903, "eval_logits/chosen": 0.015029330737888813, "eval_logits/rejected": 0.09182646870613098, "eval_logps/chosen": -162.3784637451172, "eval_logps/rejected": -112.31405639648438, "eval_loss": 0.5489267706871033, "eval_rewards/accuracies": 0.5454545617103577, "eval_rewards/chosen": 0.025668036192655563, "eval_rewards/margins": 0.33774468302726746, "eval_rewards/rejected": -0.3120766580104828, "eval_runtime": 36.0159, "eval_samples_per_second": 7.33, "eval_steps_per_second": 0.916, "step": 1000 } ], "logging_steps": 1, "max_steps": 1640, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 105686681518080.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }