{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9994767137624281, "eval_steps": 100, "global_step": 955, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0010465724751439038, "grad_norm": 21.102116873134612, "learning_rate": 5.208333333333333e-09, "logits/chosen": -2.924262046813965, "logits/rejected": -2.7925047874450684, "logps/chosen": -380.8447570800781, "logps/rejected": -358.51123046875, "loss": 4.6506, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.010465724751439037, "grad_norm": 15.822543074567085, "learning_rate": 5.208333333333333e-08, "logits/chosen": -2.595761299133301, "logits/rejected": -2.569227457046509, "logps/chosen": -256.6064453125, "logps/rejected": -234.93408203125, "loss": 4.5621, "rewards/accuracies": 0.5, "rewards/chosen": 0.00042897689854726195, "rewards/margins": 0.0009927540086209774, "rewards/rejected": -0.0005637770518660545, "step": 10 }, { "epoch": 0.020931449502878074, "grad_norm": 18.010820015079055, "learning_rate": 1.0416666666666667e-07, "logits/chosen": -2.613164186477661, "logits/rejected": -2.5756287574768066, "logps/chosen": -283.0158996582031, "logps/rejected": -282.265869140625, "loss": 4.4053, "rewards/accuracies": 0.543749988079071, "rewards/chosen": 0.0006733193295076489, "rewards/margins": 0.0005819452926516533, "rewards/rejected": 9.137402230408043e-05, "step": 20 }, { "epoch": 0.03139717425431711, "grad_norm": 21.44807572026145, "learning_rate": 1.5624999999999999e-07, "logits/chosen": -2.691143035888672, "logits/rejected": -2.6666667461395264, "logps/chosen": -269.9042053222656, "logps/rejected": -276.4795837402344, "loss": 5.105, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.0037794082891196012, "rewards/margins": 0.0018267262494191527, "rewards/rejected": 0.0019526820397004485, "step": 30 }, { "epoch": 0.04186289900575615, "grad_norm": 17.302023991146115, "learning_rate": 2.0833333333333333e-07, "logits/chosen": -2.6577816009521484, "logits/rejected": -2.5818943977355957, "logps/chosen": -288.9285888671875, "logps/rejected": -280.9770202636719, "loss": 4.9032, "rewards/accuracies": 0.606249988079071, "rewards/chosen": 0.020702064037322998, "rewards/margins": 0.009830506518483162, "rewards/rejected": 0.01087155845016241, "step": 40 }, { "epoch": 0.052328623757195186, "grad_norm": 22.46337927130885, "learning_rate": 2.604166666666667e-07, "logits/chosen": -2.6507585048675537, "logits/rejected": -2.5627222061157227, "logps/chosen": -263.1905212402344, "logps/rejected": -234.9305419921875, "loss": 4.8274, "rewards/accuracies": 0.606249988079071, "rewards/chosen": 0.044054824858903885, "rewards/margins": 0.02749818004667759, "rewards/rejected": 0.016556641086935997, "step": 50 }, { "epoch": 0.06279434850863422, "grad_norm": 18.98737987603255, "learning_rate": 3.1249999999999997e-07, "logits/chosen": -2.5976526737213135, "logits/rejected": -2.5587098598480225, "logps/chosen": -299.9574890136719, "logps/rejected": -276.1783142089844, "loss": 4.5279, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 0.040667824447155, "rewards/margins": 0.04492232948541641, "rewards/rejected": -0.004254504106938839, "step": 60 }, { "epoch": 0.07326007326007326, "grad_norm": 20.501382800234886, "learning_rate": 3.645833333333333e-07, "logits/chosen": -2.469130039215088, "logits/rejected": -2.452857732772827, "logps/chosen": -265.96978759765625, "logps/rejected": -271.6788330078125, "loss": 4.6703, "rewards/accuracies": 0.5625, "rewards/chosen": -0.0633089542388916, "rewards/margins": 0.07126398384571075, "rewards/rejected": -0.13457295298576355, "step": 70 }, { "epoch": 0.0837257980115123, "grad_norm": 25.49997843488533, "learning_rate": 4.1666666666666667e-07, "logits/chosen": -2.4551777839660645, "logits/rejected": -2.3624327182769775, "logps/chosen": -285.5320739746094, "logps/rejected": -276.4596252441406, "loss": 4.5605, "rewards/accuracies": 0.6875, "rewards/chosen": -0.09489366412162781, "rewards/margins": 0.15657536685466766, "rewards/rejected": -0.2514690160751343, "step": 80 }, { "epoch": 0.09419152276295134, "grad_norm": 30.61647338954573, "learning_rate": 4.6874999999999996e-07, "logits/chosen": -2.3756256103515625, "logits/rejected": -2.332918882369995, "logps/chosen": -277.46014404296875, "logps/rejected": -290.0049743652344, "loss": 4.1231, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.21862252056598663, "rewards/margins": 0.033695660531520844, "rewards/rejected": -0.25231820344924927, "step": 90 }, { "epoch": 0.10465724751439037, "grad_norm": 38.124561793065574, "learning_rate": 4.999732492681437e-07, "logits/chosen": -2.332035779953003, "logits/rejected": -2.2253689765930176, "logps/chosen": -314.4341125488281, "logps/rejected": -317.18695068359375, "loss": 4.5854, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.1460995227098465, "rewards/margins": 0.22573721408843994, "rewards/rejected": -0.37183672189712524, "step": 100 }, { "epoch": 0.10465724751439037, "eval_logits/chosen": -2.2812609672546387, "eval_logits/rejected": -2.192293167114258, "eval_logps/chosen": -309.1551818847656, "eval_logps/rejected": -310.1242370605469, "eval_loss": 4.381103515625, "eval_rewards/accuracies": 0.648809552192688, "eval_rewards/chosen": -0.2718724012374878, "eval_rewards/margins": 0.2273014634847641, "eval_rewards/rejected": -0.4991738498210907, "eval_runtime": 176.2372, "eval_samples_per_second": 11.348, "eval_steps_per_second": 0.357, "step": 100 }, { "epoch": 0.1151229722658294, "grad_norm": 47.336977780094564, "learning_rate": 4.996723692767926e-07, "logits/chosen": -2.0436112880706787, "logits/rejected": -1.9534924030303955, "logps/chosen": -310.6973571777344, "logps/rejected": -324.1681823730469, "loss": 3.758, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.6924275755882263, "rewards/margins": 0.17653007805347443, "rewards/rejected": -0.8689576387405396, "step": 110 }, { "epoch": 0.12558869701726844, "grad_norm": 109.43376131471078, "learning_rate": 4.990375746213598e-07, "logits/chosen": -0.08515436947345734, "logits/rejected": 0.34949326515197754, "logps/chosen": -343.26495361328125, "logps/rejected": -412.98577880859375, "loss": 4.0333, "rewards/accuracies": 0.65625, "rewards/chosen": -0.8068662881851196, "rewards/margins": 0.438527911901474, "rewards/rejected": -1.2453943490982056, "step": 120 }, { "epoch": 0.1360544217687075, "grad_norm": 95.04671304091885, "learning_rate": 4.980697142834314e-07, "logits/chosen": 0.396954745054245, "logits/rejected": 1.0232269763946533, "logps/chosen": -406.28521728515625, "logps/rejected": -430.10760498046875, "loss": 4.2005, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.2501262426376343, "rewards/margins": 0.5063079595565796, "rewards/rejected": -1.7564342021942139, "step": 130 }, { "epoch": 0.14652014652014653, "grad_norm": 144.39035434160894, "learning_rate": 4.967700826904229e-07, "logits/chosen": -0.1560676395893097, "logits/rejected": 0.6105406880378723, "logps/chosen": -416.2538146972656, "logps/rejected": -463.2472229003906, "loss": 3.7876, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.2516638040542603, "rewards/margins": 0.49105915427207947, "rewards/rejected": -1.7427231073379517, "step": 140 }, { "epoch": 0.15698587127158556, "grad_norm": 125.21681673589694, "learning_rate": 4.951404179843962e-07, "logits/chosen": 2.0407581329345703, "logits/rejected": 2.8481547832489014, "logps/chosen": -510.521484375, "logps/rejected": -534.6341552734375, "loss": 3.898, "rewards/accuracies": 0.625, "rewards/chosen": -2.226250648498535, "rewards/margins": 0.6501102447509766, "rewards/rejected": -2.876361131668091, "step": 150 }, { "epoch": 0.1674515960230246, "grad_norm": 66.88313091855639, "learning_rate": 4.931828996974498e-07, "logits/chosen": 2.163175106048584, "logits/rejected": 3.5420451164245605, "logps/chosen": -585.4688720703125, "logps/rejected": -635.2697143554688, "loss": 3.9393, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.89656138420105, "rewards/margins": 0.8964195251464844, "rewards/rejected": -3.792980909347534, "step": 160 }, { "epoch": 0.17791732077446362, "grad_norm": 188.98325062900707, "learning_rate": 4.909001458367866e-07, "logits/chosen": 0.49319368600845337, "logits/rejected": 1.3766599893569946, "logps/chosen": -599.5331420898438, "logps/rejected": -654.1383056640625, "loss": 3.9922, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -3.210639476776123, "rewards/margins": 0.922272801399231, "rewards/rejected": -4.132911682128906, "step": 170 }, { "epoch": 0.18838304552590268, "grad_norm": 320.6202106283321, "learning_rate": 4.882952093833627e-07, "logits/chosen": 0.6820823550224304, "logits/rejected": 1.588409185409546, "logps/chosen": -1040.5491943359375, "logps/rejected": -1233.1207275390625, "loss": 3.36, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -8.006011009216309, "rewards/margins": 1.8573882579803467, "rewards/rejected": -9.86340045928955, "step": 180 }, { "epoch": 0.1988487702773417, "grad_norm": 157.79546381015746, "learning_rate": 4.853715742087946e-07, "logits/chosen": 3.3087031841278076, "logits/rejected": 4.11985445022583, "logps/chosen": -1690.8167724609375, "logps/rejected": -1890.6634521484375, "loss": 2.6799, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -14.548372268676758, "rewards/margins": 1.8595011234283447, "rewards/rejected": -16.407875061035156, "step": 190 }, { "epoch": 0.20931449502878074, "grad_norm": 178.97245767319544, "learning_rate": 4.821331504159906e-07, "logits/chosen": 0.3337511122226715, "logits/rejected": 1.9961885213851929, "logps/chosen": -1578.712158203125, "logps/rejected": -1801.65625, "loss": 2.6464, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -12.680551528930664, "rewards/margins": 2.764849901199341, "rewards/rejected": -15.445402145385742, "step": 200 }, { "epoch": 0.20931449502878074, "eval_logits/chosen": -0.35622134804725647, "eval_logits/rejected": 0.6981890797615051, "eval_logps/chosen": -1244.43603515625, "eval_logps/rejected": -1423.3580322265625, "eval_loss": 2.606262683868408, "eval_rewards/accuracies": 0.625, "eval_rewards/chosen": -9.624680519104004, "eval_rewards/margins": 2.0068302154541016, "eval_rewards/rejected": -11.631510734558105, "eval_runtime": 177.3795, "eval_samples_per_second": 11.275, "eval_steps_per_second": 0.355, "step": 200 }, { "epoch": 0.21978021978021978, "grad_norm": 221.39959720400535, "learning_rate": 4.785842691097342e-07, "logits/chosen": 0.43124809861183167, "logits/rejected": 1.6196168661117554, "logps/chosen": -1394.329345703125, "logps/rejected": -1612.8701171875, "loss": 2.2192, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -11.115188598632812, "rewards/margins": 2.29093337059021, "rewards/rejected": -13.406122207641602, "step": 210 }, { "epoch": 0.2302459445316588, "grad_norm": 107.97254065213261, "learning_rate": 4.7472967660421603e-07, "logits/chosen": 0.5400440096855164, "logits/rejected": 1.9760030508041382, "logps/chosen": -1507.001220703125, "logps/rejected": -1713.616455078125, "loss": 2.018, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -12.197932243347168, "rewards/margins": 2.5764663219451904, "rewards/rejected": -14.774396896362305, "step": 220 }, { "epoch": 0.24071166928309787, "grad_norm": 217.88193736039008, "learning_rate": 4.705745280752585e-07, "logits/chosen": 1.4225207567214966, "logits/rejected": 2.4756038188934326, "logps/chosen": -1726.320068359375, "logps/rejected": -2005.7041015625, "loss": 1.9719, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -14.375930786132812, "rewards/margins": 2.995753526687622, "rewards/rejected": -17.37168312072754, "step": 230 }, { "epoch": 0.25117739403453687, "grad_norm": 109.77258728949327, "learning_rate": 4.6612438066572555e-07, "logits/chosen": 2.2113587856292725, "logits/rejected": 3.125591993331909, "logps/chosen": -1894.770751953125, "logps/rejected": -2110.86376953125, "loss": 1.9847, "rewards/accuracies": 0.59375, "rewards/chosen": -16.27196502685547, "rewards/margins": 2.294943332672119, "rewards/rejected": -18.56690788269043, "step": 240 }, { "epoch": 0.2616431187859759, "grad_norm": 276.53415343052893, "learning_rate": 4.6138518605333664e-07, "logits/chosen": 1.203977108001709, "logits/rejected": 1.9225616455078125, "logps/chosen": -1561.0047607421875, "logps/rejected": -1763.075439453125, "loss": 2.257, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -12.832204818725586, "rewards/margins": 2.222775936126709, "rewards/rejected": -15.05497932434082, "step": 250 }, { "epoch": 0.272108843537415, "grad_norm": 159.14963627253198, "learning_rate": 4.5636328249082514e-07, "logits/chosen": 1.134037733078003, "logits/rejected": 2.1568219661712646, "logps/chosen": -1608.8623046875, "logps/rejected": -1763.599853515625, "loss": 2.2606, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -13.198956489562988, "rewards/margins": 1.6145280599594116, "rewards/rejected": -14.813486099243164, "step": 260 }, { "epoch": 0.282574568288854, "grad_norm": 199.45417630865836, "learning_rate": 4.510653863290871e-07, "logits/chosen": 0.3547247350215912, "logits/rejected": 1.2751286029815674, "logps/chosen": -1781.0726318359375, "logps/rejected": -2089.05615234375, "loss": 1.7211, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -14.814226150512695, "rewards/margins": 3.540767192840576, "rewards/rejected": -18.354991912841797, "step": 270 }, { "epoch": 0.29304029304029305, "grad_norm": 162.5497817330968, "learning_rate": 4.4549858303465737e-07, "logits/chosen": 0.21130748093128204, "logits/rejected": 1.2269564867019653, "logps/chosen": -1743.0787353515625, "logps/rejected": -2033.669921875, "loss": 1.9445, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -14.468345642089844, "rewards/margins": 3.0355849266052246, "rewards/rejected": -17.50392723083496, "step": 280 }, { "epoch": 0.3035060177917321, "grad_norm": 307.15808847538113, "learning_rate": 4.396703177135261e-07, "logits/chosen": 0.7419403791427612, "logits/rejected": 1.9202260971069336, "logps/chosen": -1948.1787109375, "logps/rejected": -2273.5205078125, "loss": 1.9864, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -16.753948211669922, "rewards/margins": 3.6111111640930176, "rewards/rejected": -20.36505699157715, "step": 290 }, { "epoch": 0.3139717425431711, "grad_norm": 90.00202577382801, "learning_rate": 4.335883851539693e-07, "logits/chosen": 0.30849236249923706, "logits/rejected": 1.1072229146957397, "logps/chosen": -1431.3275146484375, "logps/rejected": -1653.4029541015625, "loss": 1.9069, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -11.623054504394531, "rewards/margins": 2.2337088584899902, "rewards/rejected": -13.856762886047363, "step": 300 }, { "epoch": 0.3139717425431711, "eval_logits/chosen": 0.45899611711502075, "eval_logits/rejected": 1.5569082498550415, "eval_logps/chosen": -1266.6490478515625, "eval_logps/rejected": -1452.7674560546875, "eval_loss": 2.262396812438965, "eval_rewards/accuracies": 0.6329365372657776, "eval_rewards/chosen": -9.846811294555664, "eval_rewards/margins": 2.0787949562072754, "eval_rewards/rejected": -11.925606727600098, "eval_runtime": 176.5188, "eval_samples_per_second": 11.33, "eval_steps_per_second": 0.357, "step": 300 }, { "epoch": 0.32443746729461015, "grad_norm": 177.8700388917398, "learning_rate": 4.272609194017105e-07, "logits/chosen": 0.647371768951416, "logits/rejected": 2.9104599952697754, "logps/chosen": -1395.496826171875, "logps/rejected": -1711.9573974609375, "loss": 2.3095, "rewards/accuracies": 0.75, "rewards/chosen": -11.117349624633789, "rewards/margins": 3.674748182296753, "rewards/rejected": -14.792098999023438, "step": 310 }, { "epoch": 0.3349031920460492, "grad_norm": 180.92515200199898, "learning_rate": 4.2069638288135547e-07, "logits/chosen": 0.9543863534927368, "logits/rejected": 1.7447538375854492, "logps/chosen": -1926.299560546875, "logps/rejected": -2217.88037109375, "loss": 2.1724, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -16.73282814025879, "rewards/margins": 2.939984083175659, "rewards/rejected": -19.672813415527344, "step": 320 }, { "epoch": 0.3453689167974882, "grad_norm": 145.6894284610869, "learning_rate": 4.139035550786494e-07, "logits/chosen": -0.039321091026067734, "logits/rejected": 0.5018073320388794, "logps/chosen": -1734.091796875, "logps/rejected": -1908.339111328125, "loss": 1.716, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -14.77747917175293, "rewards/margins": 1.9848415851593018, "rewards/rejected": -16.76232147216797, "step": 330 }, { "epoch": 0.35583464154892724, "grad_norm": 183.78050890033984, "learning_rate": 4.0689152079869306e-07, "logits/chosen": -0.5724295377731323, "logits/rejected": 0.023262571543455124, "logps/chosen": -1660.732177734375, "logps/rejected": -1876.1025390625, "loss": 1.8439, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -13.912447929382324, "rewards/margins": 2.488671064376831, "rewards/rejected": -16.401119232177734, "step": 340 }, { "epoch": 0.3663003663003663, "grad_norm": 149.28700648360655, "learning_rate": 3.99669658015821e-07, "logits/chosen": 0.006322336383163929, "logits/rejected": 0.6332755088806152, "logps/chosen": -1966.5765380859375, "logps/rejected": -2201.843505859375, "loss": 1.6671, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -16.753414154052734, "rewards/margins": 2.7041499614715576, "rewards/rejected": -19.457565307617188, "step": 350 }, { "epoch": 0.37676609105180536, "grad_norm": 237.65668361495474, "learning_rate": 3.92247625331392e-07, "logits/chosen": -0.21500203013420105, "logits/rejected": 0.6255682110786438, "logps/chosen": -1989.7509765625, "logps/rejected": -2207.83349609375, "loss": 1.6927, "rewards/accuracies": 0.625, "rewards/chosen": -17.038667678833008, "rewards/margins": 2.4120330810546875, "rewards/rejected": -19.450698852539062, "step": 360 }, { "epoch": 0.3872318158032444, "grad_norm": 152.55773033990448, "learning_rate": 3.846353490562664e-07, "logits/chosen": -0.39199286699295044, "logits/rejected": -0.043508779257535934, "logps/chosen": -1889.5286865234375, "logps/rejected": -2139.589111328125, "loss": 1.7098, "rewards/accuracies": 0.59375, "rewards/chosen": -16.262523651123047, "rewards/margins": 2.7768733501434326, "rewards/rejected": -19.039398193359375, "step": 370 }, { "epoch": 0.3976975405546834, "grad_norm": 239.86422108427834, "learning_rate": 3.768430099352445e-07, "logits/chosen": -0.5338395833969116, "logits/rejected": -0.10323655605316162, "logps/chosen": -1830.7080078125, "logps/rejected": -2104.773681640625, "loss": 1.786, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -15.76060962677002, "rewards/margins": 2.8237688541412354, "rewards/rejected": -18.58437728881836, "step": 380 }, { "epoch": 0.40816326530612246, "grad_norm": 137.89263121746114, "learning_rate": 3.6888102953122304e-07, "logits/chosen": -0.3421451449394226, "logits/rejected": 0.2877078056335449, "logps/chosen": -1774.384765625, "logps/rejected": -2007.7366943359375, "loss": 1.9274, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -14.92773151397705, "rewards/margins": 2.465951442718506, "rewards/rejected": -17.393680572509766, "step": 390 }, { "epoch": 0.4186289900575615, "grad_norm": 164.86784545063486, "learning_rate": 3.607600562872785e-07, "logits/chosen": -0.7335325479507446, "logits/rejected": -0.33919858932495117, "logps/chosen": -1733.375244140625, "logps/rejected": -1963.0279541015625, "loss": 1.6642, "rewards/accuracies": 0.5625, "rewards/chosen": -14.63383960723877, "rewards/margins": 2.319460391998291, "rewards/rejected": -16.95330047607422, "step": 400 }, { "epoch": 0.4186289900575615, "eval_logits/chosen": -0.7751028537750244, "eval_logits/rejected": -0.08748837560415268, "eval_logps/chosen": -1731.152587890625, "eval_logps/rejected": -2045.1492919921875, "eval_loss": 1.6421091556549072, "eval_rewards/accuracies": 0.625, "eval_rewards/chosen": -14.491846084594727, "eval_rewards/margins": 3.3575782775878906, "eval_rewards/rejected": -17.849422454833984, "eval_runtime": 176.0651, "eval_samples_per_second": 11.359, "eval_steps_per_second": 0.358, "step": 400 }, { "epoch": 0.4290947148090005, "grad_norm": 128.91689311765836, "learning_rate": 3.5249095128531856e-07, "logits/chosen": -0.10633065551519394, "logits/rejected": 0.350477933883667, "logps/chosen": -1862.1099853515625, "logps/rejected": -2067.15673828125, "loss": 1.7556, "rewards/accuracies": 0.59375, "rewards/chosen": -15.903741836547852, "rewards/margins": 2.2993245124816895, "rewards/rejected": -18.203065872192383, "step": 410 }, { "epoch": 0.43956043956043955, "grad_norm": 187.2282869549343, "learning_rate": 3.4408477372034736e-07, "logits/chosen": -0.2209610939025879, "logits/rejected": 0.7663095593452454, "logps/chosen": -1825.959228515625, "logps/rejected": -2182.580810546875, "loss": 1.8542, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -15.437875747680664, "rewards/margins": 3.867755174636841, "rewards/rejected": -19.30562973022461, "step": 420 }, { "epoch": 0.4500261643118786, "grad_norm": 150.13979068919696, "learning_rate": 3.3555276610977276e-07, "logits/chosen": -1.128701090812683, "logits/rejected": -0.5558885335922241, "logps/chosen": -1832.6103515625, "logps/rejected": -2176.197265625, "loss": 1.5079, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -15.788568496704102, "rewards/margins": 3.47161602973938, "rewards/rejected": -19.26018714904785, "step": 430 }, { "epoch": 0.4604918890633176, "grad_norm": 163.41066719667168, "learning_rate": 3.269063392575352e-07, "logits/chosen": -0.6949409246444702, "logits/rejected": -0.05746125057339668, "logps/chosen": -1597.5341796875, "logps/rejected": -1821.0198974609375, "loss": 1.4868, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -13.023595809936523, "rewards/margins": 2.632272481918335, "rewards/rejected": -15.655868530273438, "step": 440 }, { "epoch": 0.47095761381475665, "grad_norm": 133.46596474594617, "learning_rate": 3.1815705699316964e-07, "logits/chosen": -0.4808398187160492, "logits/rejected": 0.3264926075935364, "logps/chosen": -1599.6370849609375, "logps/rejected": -1936.6884765625, "loss": 1.5413, "rewards/accuracies": 0.625, "rewards/chosen": -13.172491073608398, "rewards/margins": 3.4112179279327393, "rewards/rejected": -16.583707809448242, "step": 450 }, { "epoch": 0.48142333856619574, "grad_norm": 155.84007478164062, "learning_rate": 3.0931662070620794e-07, "logits/chosen": -0.719369113445282, "logits/rejected": -0.06152462959289551, "logps/chosen": -1643.2447509765625, "logps/rejected": -1872.9976806640625, "loss": 1.7906, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -13.85786247253418, "rewards/margins": 2.4219117164611816, "rewards/rejected": -16.279773712158203, "step": 460 }, { "epoch": 0.49188906331763477, "grad_norm": 203.3322056694353, "learning_rate": 3.003968536966078e-07, "logits/chosen": -0.4609583020210266, "logits/rejected": -0.09374441206455231, "logps/chosen": -1654.1614990234375, "logps/rejected": -1845.5618896484375, "loss": 1.7718, "rewards/accuracies": 0.59375, "rewards/chosen": -13.703729629516602, "rewards/margins": 2.2525086402893066, "rewards/rejected": -15.956239700317383, "step": 470 }, { "epoch": 0.5023547880690737, "grad_norm": 156.4799546194198, "learning_rate": 2.9140968536213693e-07, "logits/chosen": -0.2353781908750534, "logits/rejected": 0.5946909785270691, "logps/chosen": -1859.3265380859375, "logps/rejected": -2325.88134765625, "loss": 1.3829, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -15.921140670776367, "rewards/margins": 4.824706077575684, "rewards/rejected": -20.745845794677734, "step": 480 }, { "epoch": 0.5128205128205128, "grad_norm": 160.19325879757844, "learning_rate": 2.823671352438608e-07, "logits/chosen": -0.9654836654663086, "logits/rejected": -0.002035105135291815, "logps/chosen": -1637.873291015625, "logps/rejected": -2143.010986328125, "loss": 1.6206, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -13.259417533874512, "rewards/margins": 5.606515407562256, "rewards/rejected": -18.86593246459961, "step": 490 }, { "epoch": 0.5232862375719518, "grad_norm": 221.83952267135834, "learning_rate": 2.73281296951072e-07, "logits/chosen": -0.6597784161567688, "logits/rejected": -0.14649493992328644, "logps/chosen": -1530.5738525390625, "logps/rejected": -1781.8070068359375, "loss": 1.6328, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -12.7192964553833, "rewards/margins": 2.8244967460632324, "rewards/rejected": -15.543792724609375, "step": 500 }, { "epoch": 0.5232862375719518, "eval_logits/chosen": -0.6590258479118347, "eval_logits/rejected": -0.091790109872818, "eval_logps/chosen": -1589.3370361328125, "eval_logps/rejected": -1890.562255859375, "eval_loss": 1.5119922161102295, "eval_rewards/accuracies": 0.6388888955116272, "eval_rewards/chosen": -13.073691368103027, "eval_rewards/margins": 3.229863166809082, "eval_rewards/rejected": -16.303556442260742, "eval_runtime": 177.8158, "eval_samples_per_second": 11.248, "eval_steps_per_second": 0.354, "step": 500 }, { "epoch": 0.533751962323391, "grad_norm": 187.4336485549293, "learning_rate": 2.641643219871597e-07, "logits/chosen": -0.5598984360694885, "logits/rejected": -0.2727218270301819, "logps/chosen": -1694.568359375, "logps/rejected": -2086.98193359375, "loss": 1.4069, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -14.106300354003906, "rewards/margins": 4.125433921813965, "rewards/rejected": -18.231733322143555, "step": 510 }, { "epoch": 0.54421768707483, "grad_norm": 180.24950333654212, "learning_rate": 2.550284034980507e-07, "logits/chosen": -0.652435302734375, "logits/rejected": -0.25857192277908325, "logps/chosen": -1941.6849365234375, "logps/rejected": -2231.46337890625, "loss": 1.6022, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -16.69613265991211, "rewards/margins": 3.1190426349639893, "rewards/rejected": -19.815174102783203, "step": 520 }, { "epoch": 0.554683411826269, "grad_norm": 147.71519410172087, "learning_rate": 2.4588575996495794e-07, "logits/chosen": -0.6198351979255676, "logits/rejected": -0.19036616384983063, "logps/chosen": -1601.6470947265625, "logps/rejected": -1820.4556884765625, "loss": 1.5136, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -13.329002380371094, "rewards/margins": 2.3253164291381836, "rewards/rejected": -15.654316902160645, "step": 530 }, { "epoch": 0.565149136577708, "grad_norm": 146.6770433780799, "learning_rate": 2.367486188632446e-07, "logits/chosen": -0.7303057909011841, "logits/rejected": 0.15564236044883728, "logps/chosen": -1670.916015625, "logps/rejected": -2011.5406494140625, "loss": 1.5458, "rewards/accuracies": 0.625, "rewards/chosen": -13.805659294128418, "rewards/margins": 3.703829288482666, "rewards/rejected": -17.509489059448242, "step": 540 }, { "epoch": 0.5756148613291471, "grad_norm": 206.94359776232758, "learning_rate": 2.276292003092593e-07, "logits/chosen": -0.22513580322265625, "logits/rejected": 0.4895138740539551, "logps/chosen": -1914.7532958984375, "logps/rejected": -2300.30322265625, "loss": 1.6801, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -16.434810638427734, "rewards/margins": 4.275403022766113, "rewards/rejected": -20.71021270751953, "step": 550 }, { "epoch": 0.5860805860805861, "grad_norm": 175.41735239090949, "learning_rate": 2.185397007170141e-07, "logits/chosen": -0.1453290730714798, "logits/rejected": 0.3121495842933655, "logps/chosen": -1876.300537109375, "logps/rejected": -2229.38134765625, "loss": 1.3878, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -15.934832572937012, "rewards/margins": 3.8099570274353027, "rewards/rejected": -19.744789123535156, "step": 560 }, { "epoch": 0.5965463108320251, "grad_norm": 142.79294258337345, "learning_rate": 2.094922764865619e-07, "logits/chosen": -0.276650995016098, "logits/rejected": 0.13945253193378448, "logps/chosen": -1827.0634765625, "logps/rejected": -2034.280517578125, "loss": 1.4902, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -15.475687980651855, "rewards/margins": 2.2050392627716064, "rewards/rejected": -17.680728912353516, "step": 570 }, { "epoch": 0.6070120355834642, "grad_norm": 245.80968468908674, "learning_rate": 2.0049902774588797e-07, "logits/chosen": -0.011815989390015602, "logits/rejected": 0.42436084151268005, "logps/chosen": -1794.5543212890625, "logps/rejected": -2061.93310546875, "loss": 1.4461, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -15.412150382995605, "rewards/margins": 3.078895330429077, "rewards/rejected": -18.491044998168945, "step": 580 }, { "epoch": 0.6174777603349032, "grad_norm": 175.38280547329734, "learning_rate": 1.9157198216806238e-07, "logits/chosen": -0.3044319152832031, "logits/rejected": 0.3406422734260559, "logps/chosen": -1649.8509521484375, "logps/rejected": -2006.366455078125, "loss": 1.5446, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -13.71898365020752, "rewards/margins": 3.657163143157959, "rewards/rejected": -17.376148223876953, "step": 590 }, { "epoch": 0.6279434850863422, "grad_norm": 203.04339818262545, "learning_rate": 1.8272307888529274e-07, "logits/chosen": 0.16477735340595245, "logits/rejected": 0.6171606183052063, "logps/chosen": -1870.41015625, "logps/rejected": -2165.638427734375, "loss": 1.6032, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -15.878863334655762, "rewards/margins": 3.1766743659973145, "rewards/rejected": -19.055538177490234, "step": 600 }, { "epoch": 0.6279434850863422, "eval_logits/chosen": 0.01903720200061798, "eval_logits/rejected": 0.6402472853660583, "eval_logps/chosen": -2015.7071533203125, "eval_logps/rejected": -2402.58447265625, "eval_loss": 1.4751698970794678, "eval_rewards/accuracies": 0.6230158805847168, "eval_rewards/chosen": -17.33738899230957, "eval_rewards/margins": 4.086385250091553, "eval_rewards/rejected": -21.42377471923828, "eval_runtime": 176.4506, "eval_samples_per_second": 11.335, "eval_steps_per_second": 0.357, "step": 600 }, { "epoch": 0.6384092098377813, "grad_norm": 184.64896406440843, "learning_rate": 1.7396415252139288e-07, "logits/chosen": 0.0034784465096890926, "logits/rejected": 0.6044633388519287, "logps/chosen": -2050.113037109375, "logps/rejected": -2622.564453125, "loss": 1.5229, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -17.44542694091797, "rewards/margins": 6.333140850067139, "rewards/rejected": -23.778566360473633, "step": 610 }, { "epoch": 0.6488749345892203, "grad_norm": 150.92780625161797, "learning_rate": 1.6530691736402316e-07, "logits/chosen": -0.05873150750994682, "logits/rejected": 0.2572210133075714, "logps/chosen": -1822.690185546875, "logps/rejected": -2140.002685546875, "loss": 1.3047, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -15.516670227050781, "rewards/margins": 3.519291400909424, "rewards/rejected": -19.035961151123047, "step": 620 }, { "epoch": 0.6593406593406593, "grad_norm": 158.62413320623054, "learning_rate": 1.5676295169786864e-07, "logits/chosen": -0.5535549521446228, "logits/rejected": -0.16974008083343506, "logps/chosen": -1799.411376953125, "logps/rejected": -2184.095458984375, "loss": 1.4004, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -15.247261047363281, "rewards/margins": 4.233900547027588, "rewards/rejected": -19.481159210205078, "step": 630 }, { "epoch": 0.6698063840920984, "grad_norm": 174.63990723873954, "learning_rate": 1.483436823197092e-07, "logits/chosen": -0.49727511405944824, "logits/rejected": -0.09024439752101898, "logps/chosen": -1910.181396484375, "logps/rejected": -2272.175537109375, "loss": 1.2582, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -16.431602478027344, "rewards/margins": 4.003415107727051, "rewards/rejected": -20.43501853942871, "step": 640 }, { "epoch": 0.6802721088435374, "grad_norm": 212.30897956956616, "learning_rate": 1.4006036925609243e-07, "logits/chosen": -0.5441917777061462, "logits/rejected": -0.3759006857872009, "logps/chosen": -1762.1038818359375, "logps/rejected": -1993.853515625, "loss": 1.3183, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -14.94421100616455, "rewards/margins": 2.2772457599639893, "rewards/rejected": -17.221454620361328, "step": 650 }, { "epoch": 0.6907378335949764, "grad_norm": 122.40725726992933, "learning_rate": 1.319240907040458e-07, "logits/chosen": -0.578727126121521, "logits/rejected": -0.15290720760822296, "logps/chosen": -1786.3648681640625, "logps/rejected": -2103.92919921875, "loss": 1.5482, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -15.042015075683594, "rewards/margins": 3.422727584838867, "rewards/rejected": -18.46474266052246, "step": 660 }, { "epoch": 0.7012035583464155, "grad_norm": 273.40146184819037, "learning_rate": 1.239457282149695e-07, "logits/chosen": -0.6542818546295166, "logits/rejected": -0.6405806541442871, "logps/chosen": -1718.8697509765625, "logps/rejected": -2025.167236328125, "loss": 1.1528, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -14.303213119506836, "rewards/margins": 3.1175124645233154, "rewards/rejected": -17.420726776123047, "step": 670 }, { "epoch": 0.7116692830978545, "grad_norm": 246.28508875936285, "learning_rate": 1.1613595214152711e-07, "logits/chosen": -0.6755629777908325, "logits/rejected": -0.26193898916244507, "logps/chosen": -1862.4000244140625, "logps/rejected": -2191.969482421875, "loss": 1.3671, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -15.507433891296387, "rewards/margins": 3.5333023071289062, "rewards/rejected": -19.04073715209961, "step": 680 }, { "epoch": 0.7221350078492935, "grad_norm": 216.14843384209277, "learning_rate": 1.0850520736699362e-07, "logits/chosen": -0.6002136468887329, "logits/rejected": 0.03606845811009407, "logps/chosen": -1838.6982421875, "logps/rejected": -2214.07470703125, "loss": 1.3895, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -15.680435180664062, "rewards/margins": 4.133326053619385, "rewards/rejected": -19.813762664794922, "step": 690 }, { "epoch": 0.7326007326007326, "grad_norm": 162.01079027631573, "learning_rate": 1.0106369933615042e-07, "logits/chosen": -0.7846351861953735, "logits/rejected": -0.5915166735649109, "logps/chosen": -1752.784423828125, "logps/rejected": -2021.7802734375, "loss": 1.5039, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -14.986889839172363, "rewards/margins": 2.845428943634033, "rewards/rejected": -17.832317352294922, "step": 700 }, { "epoch": 0.7326007326007326, "eval_logits/chosen": -0.8898468017578125, "eval_logits/rejected": -0.4967605769634247, "eval_logps/chosen": -1694.96240234375, "eval_logps/rejected": -2016.4490966796875, "eval_loss": 1.3852962255477905, "eval_rewards/accuracies": 0.6527777910232544, "eval_rewards/chosen": -14.129942893981934, "eval_rewards/margins": 3.432478666305542, "eval_rewards/rejected": -17.562421798706055, "eval_runtime": 176.0679, "eval_samples_per_second": 11.359, "eval_steps_per_second": 0.358, "step": 700 }, { "epoch": 0.7430664573521716, "grad_norm": 177.45761000957364, "learning_rate": 9.382138040640714e-08, "logits/chosen": -1.012629747390747, "logits/rejected": -0.6268833875656128, "logps/chosen": -1776.499755859375, "logps/rejected": -2017.539794921875, "loss": 1.5264, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -15.180249214172363, "rewards/margins": 2.565770387649536, "rewards/rejected": -17.74601936340332, "step": 710 }, { "epoch": 0.7535321821036107, "grad_norm": 140.94359920373847, "learning_rate": 8.678793653740632e-08, "logits/chosen": -0.9271895289421082, "logits/rejected": -0.47789469361305237, "logps/chosen": -1664.4437255859375, "logps/rejected": -1977.908447265625, "loss": 1.3295, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -13.8070650100708, "rewards/margins": 3.4972636699676514, "rewards/rejected": -17.3043270111084, "step": 720 }, { "epoch": 0.7639979068550498, "grad_norm": 190.75937551525504, "learning_rate": 7.997277433690983e-08, "logits/chosen": -0.8303499221801758, "logits/rejected": -0.2948521077632904, "logps/chosen": -1813.2340087890625, "logps/rejected": -2049.240234375, "loss": 1.4631, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -15.478715896606445, "rewards/margins": 2.6438088417053223, "rewards/rejected": -18.12252426147461, "step": 730 }, { "epoch": 0.7744636316064888, "grad_norm": 164.74206538760382, "learning_rate": 7.338500848029602e-08, "logits/chosen": -0.6835179924964905, "logits/rejected": -0.42263850569725037, "logps/chosen": -1808.6490478515625, "logps/rejected": -2096.81396484375, "loss": 1.2242, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -15.557826042175293, "rewards/margins": 3.0879101753234863, "rewards/rejected": -18.645736694335938, "step": 740 }, { "epoch": 0.7849293563579278, "grad_norm": 135.0757551116068, "learning_rate": 6.70334495204884e-08, "logits/chosen": -0.5583680868148804, "logits/rejected": -0.36530551314353943, "logps/chosen": -1854.912353515625, "logps/rejected": -2177.262451171875, "loss": 1.3344, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -16.08046531677246, "rewards/margins": 3.2556281089782715, "rewards/rejected": -19.33609390258789, "step": 750 }, { "epoch": 0.7953950811093669, "grad_norm": 158.01784405358154, "learning_rate": 6.092659210462231e-08, "logits/chosen": -0.653573215007782, "logits/rejected": -0.4876467287540436, "logps/chosen": -1903.880615234375, "logps/rejected": -2182.48291015625, "loss": 1.4038, "rewards/accuracies": 0.5625, "rewards/chosen": -16.625337600708008, "rewards/margins": 2.7693800926208496, "rewards/rejected": -19.394718170166016, "step": 760 }, { "epoch": 0.8058608058608059, "grad_norm": 169.97964049682443, "learning_rate": 5.507260361320737e-08, "logits/chosen": -1.0366981029510498, "logits/rejected": -0.9037246704101562, "logps/chosen": -1879.755126953125, "logps/rejected": -2001.697265625, "loss": 1.286, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -15.764042854309082, "rewards/margins": 1.033178687095642, "rewards/rejected": -16.797222137451172, "step": 770 }, { "epoch": 0.8163265306122449, "grad_norm": 162.02338031146334, "learning_rate": 4.947931323697982e-08, "logits/chosen": -1.0304605960845947, "logits/rejected": -0.9400796890258789, "logps/chosen": -1669.2073974609375, "logps/rejected": -2004.525390625, "loss": 1.5927, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -13.502462387084961, "rewards/margins": 3.8671538829803467, "rewards/rejected": -17.369617462158203, "step": 780 }, { "epoch": 0.826792255363684, "grad_norm": 140.7368428333841, "learning_rate": 4.415420150605398e-08, "logits/chosen": -1.0811887979507446, "logits/rejected": -0.5253428220748901, "logps/chosen": -1726.182373046875, "logps/rejected": -2063.27099609375, "loss": 1.4648, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -14.476069450378418, "rewards/margins": 3.7039875984191895, "rewards/rejected": -18.180057525634766, "step": 790 }, { "epoch": 0.837257980115123, "grad_norm": 202.82775780509928, "learning_rate": 3.9104390285376374e-08, "logits/chosen": -0.835501492023468, "logits/rejected": -0.5900505781173706, "logps/chosen": -1749.853759765625, "logps/rejected": -1951.329345703125, "loss": 1.3527, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -14.83747673034668, "rewards/margins": 2.0425891876220703, "rewards/rejected": -16.88006591796875, "step": 800 }, { "epoch": 0.837257980115123, "eval_logits/chosen": -1.0374784469604492, "eval_logits/rejected": -0.6750361919403076, "eval_logps/chosen": -1672.130615234375, "eval_logps/rejected": -1986.035888671875, "eval_loss": 1.366306185722351, "eval_rewards/accuracies": 0.6448412537574768, "eval_rewards/chosen": -13.901623725891113, "eval_rewards/margins": 3.3566668033599854, "eval_rewards/rejected": -17.25829315185547, "eval_runtime": 176.0547, "eval_samples_per_second": 11.36, "eval_steps_per_second": 0.358, "step": 800 }, { "epoch": 0.847723704866562, "grad_norm": 218.80895490878117, "learning_rate": 3.433663324986208e-08, "logits/chosen": -1.2597501277923584, "logits/rejected": -0.7243804931640625, "logps/chosen": -1665.3489990234375, "logps/rejected": -2045.541259765625, "loss": 1.4186, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -13.741106986999512, "rewards/margins": 4.249786853790283, "rewards/rejected": -17.990894317626953, "step": 810 }, { "epoch": 0.858189429618001, "grad_norm": 175.67069527310957, "learning_rate": 2.9857306851953897e-08, "logits/chosen": -1.075448751449585, "logits/rejected": -0.8459098935127258, "logps/chosen": -1705.802734375, "logps/rejected": -1971.207275390625, "loss": 1.1819, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -14.010282516479492, "rewards/margins": 2.8707687854766846, "rewards/rejected": -16.881052017211914, "step": 820 }, { "epoch": 0.8686551543694401, "grad_norm": 150.14969837730865, "learning_rate": 2.567240179368185e-08, "logits/chosen": -0.8211779594421387, "logits/rejected": -0.672277569770813, "logps/chosen": -1724.1959228515625, "logps/rejected": -1975.289306640625, "loss": 1.3771, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -14.66050910949707, "rewards/margins": 2.5944151878356934, "rewards/rejected": -17.25492286682129, "step": 830 }, { "epoch": 0.8791208791208791, "grad_norm": 143.51050018041488, "learning_rate": 2.1787515014630357e-08, "logits/chosen": -0.9592329263687134, "logits/rejected": -0.6304475665092468, "logps/chosen": -1664.050048828125, "logps/rejected": -2090.85107421875, "loss": 1.1841, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -13.817761421203613, "rewards/margins": 4.430028915405273, "rewards/rejected": -18.247791290283203, "step": 840 }, { "epoch": 0.8895866038723181, "grad_norm": 160.09590738302992, "learning_rate": 1.820784220652766e-08, "logits/chosen": -0.8976573944091797, "logits/rejected": -0.619744598865509, "logps/chosen": -1732.6185302734375, "logps/rejected": -2009.6126708984375, "loss": 1.3946, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -14.51783561706543, "rewards/margins": 3.2437214851379395, "rewards/rejected": -17.761554718017578, "step": 850 }, { "epoch": 0.9000523286237572, "grad_norm": 140.45079725700174, "learning_rate": 1.4938170864468636e-08, "logits/chosen": -1.2183126211166382, "logits/rejected": -0.7451462149620056, "logps/chosen": -1663.8861083984375, "logps/rejected": -2030.5501708984375, "loss": 1.403, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -13.755941390991211, "rewards/margins": 3.9360270500183105, "rewards/rejected": -17.691970825195312, "step": 860 }, { "epoch": 0.9105180533751962, "grad_norm": 177.87764974909854, "learning_rate": 1.1982873884064465e-08, "logits/chosen": -1.142114281654358, "logits/rejected": -0.8570957183837891, "logps/chosen": -1702.1165771484375, "logps/rejected": -2053.07568359375, "loss": 1.364, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -14.30346393585205, "rewards/margins": 3.762028932571411, "rewards/rejected": -18.06549072265625, "step": 870 }, { "epoch": 0.9209837781266352, "grad_norm": 138.3301348415624, "learning_rate": 9.345903713082304e-09, "logits/chosen": -1.0760080814361572, "logits/rejected": -0.866096019744873, "logps/chosen": -1735.3382568359375, "logps/rejected": -2023.660888671875, "loss": 1.355, "rewards/accuracies": 0.5625, "rewards/chosen": -14.312044143676758, "rewards/margins": 2.978205680847168, "rewards/rejected": -17.290246963500977, "step": 880 }, { "epoch": 0.9314495028780743, "grad_norm": 179.16273994251034, "learning_rate": 7.030787065396865e-09, "logits/chosen": -1.0234577655792236, "logits/rejected": -0.9720734357833862, "logps/chosen": -1736.5269775390625, "logps/rejected": -2083.37939453125, "loss": 1.4332, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -14.488537788391113, "rewards/margins": 3.543290376663208, "rewards/rejected": -18.031827926635742, "step": 890 }, { "epoch": 0.9419152276295133, "grad_norm": 163.4835379161221, "learning_rate": 5.04062020432286e-09, "logits/chosen": -0.8189510107040405, "logits/rejected": -0.8584410548210144, "logps/chosen": -1706.8818359375, "logps/rejected": -1968.8441162109375, "loss": 1.5137, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -14.264904975891113, "rewards/margins": 2.609503984451294, "rewards/rejected": -16.874408721923828, "step": 900 }, { "epoch": 0.9419152276295133, "eval_logits/chosen": -1.001752495765686, "eval_logits/rejected": -0.673967182636261, "eval_logps/chosen": -1735.9151611328125, "eval_logps/rejected": -2073.3388671875, "eval_loss": 1.3373700380325317, "eval_rewards/accuracies": 0.6408730149269104, "eval_rewards/chosen": -14.539473533630371, "eval_rewards/margins": 3.5918467044830322, "eval_rewards/rejected": -18.13132095336914, "eval_runtime": 176.3334, "eval_samples_per_second": 11.342, "eval_steps_per_second": 0.357, "step": 900 }, { "epoch": 0.9523809523809523, "grad_norm": 190.32378571700949, "learning_rate": 3.3780648016376866e-09, "logits/chosen": -0.9321626424789429, "logits/rejected": -0.5902298092842102, "logps/chosen": -1696.779296875, "logps/rejected": -1922.1607666015625, "loss": 1.4578, "rewards/accuracies": 0.625, "rewards/chosen": -14.7335786819458, "rewards/margins": 2.3592441082000732, "rewards/rejected": -17.092823028564453, "step": 910 }, { "epoch": 0.9628466771323915, "grad_norm": 183.98567167006505, "learning_rate": 2.0453443778310766e-09, "logits/chosen": -1.0600922107696533, "logits/rejected": -0.7931039929389954, "logps/chosen": -1763.392822265625, "logps/rejected": -2107.805419921875, "loss": 1.3202, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -14.767558097839355, "rewards/margins": 3.794466495513916, "rewards/rejected": -18.562023162841797, "step": 920 }, { "epoch": 0.9733124018838305, "grad_norm": 181.56437725274117, "learning_rate": 1.0442413283435758e-09, "logits/chosen": -1.1890182495117188, "logits/rejected": -0.5295430421829224, "logps/chosen": -1729.0921630859375, "logps/rejected": -1985.2783203125, "loss": 1.5669, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -14.14258098602295, "rewards/margins": 3.068037748336792, "rewards/rejected": -17.210617065429688, "step": 930 }, { "epoch": 0.9837781266352695, "grad_norm": 173.28786175289625, "learning_rate": 3.760945397705828e-10, "logits/chosen": -0.856045126914978, "logits/rejected": -0.7398639917373657, "logps/chosen": -1713.3883056640625, "logps/rejected": -2039.740966796875, "loss": 1.266, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -14.502069473266602, "rewards/margins": 3.2981293201446533, "rewards/rejected": -17.800199508666992, "step": 940 }, { "epoch": 0.9942438513867086, "grad_norm": 188.65879146663107, "learning_rate": 4.17975992204056e-11, "logits/chosen": -1.168084740638733, "logits/rejected": -0.8855546116828918, "logps/chosen": -1736.102783203125, "logps/rejected": -1955.3255615234375, "loss": 1.4604, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -14.33259391784668, "rewards/margins": 2.370856761932373, "rewards/rejected": -16.70345115661621, "step": 950 }, { "epoch": 0.9994767137624281, "step": 955, "total_flos": 0.0, "train_loss": 2.1165736393154604, "train_runtime": 18133.1885, "train_samples_per_second": 3.371, "train_steps_per_second": 0.053 } ], "logging_steps": 10, "max_steps": 955, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }