{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9994767137624281, "eval_steps": 100, "global_step": 955, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 5.208333333333333e-09, "logits/chosen": -4.156338691711426, "logits/rejected": -4.146947383880615, "logps/chosen": -276.527099609375, "logps/rejected": -253.10324096679688, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.01, "learning_rate": 5.208333333333333e-08, "logits/chosen": -4.072511672973633, "logits/rejected": -4.162118911743164, "logps/chosen": -398.7220764160156, "logps/rejected": -310.1617736816406, "loss": 0.6929, "rewards/accuracies": 0.4722222089767456, "rewards/chosen": 0.0006214036839082837, "rewards/margins": 0.0005763211520388722, "rewards/rejected": 4.508249185164459e-05, "step": 10 }, { "epoch": 0.02, "learning_rate": 1.0416666666666667e-07, "logits/chosen": -4.09468412399292, "logits/rejected": -4.120311737060547, "logps/chosen": -304.5065612792969, "logps/rejected": -286.7430725097656, "loss": 0.693, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": 0.0015097814612090588, "rewards/margins": -0.0002035536599578336, "rewards/rejected": 0.001713335164822638, "step": 20 }, { "epoch": 0.03, "learning_rate": 1.5624999999999999e-07, "logits/chosen": -4.106563568115234, "logits/rejected": -4.1498003005981445, "logps/chosen": -371.30047607421875, "logps/rejected": -341.87310791015625, "loss": 0.6908, "rewards/accuracies": 0.625, "rewards/chosen": 0.012897265143692493, "rewards/margins": 0.008245532400906086, "rewards/rejected": 0.0046517327427864075, "step": 30 }, { "epoch": 0.04, "learning_rate": 2.0833333333333333e-07, "logits/chosen": -4.051548480987549, "logits/rejected": -4.108038425445557, "logps/chosen": -362.06304931640625, "logps/rejected": -345.527099609375, "loss": 0.6869, "rewards/accuracies": 0.625, "rewards/chosen": 0.03629542142152786, "rewards/margins": 0.018107738345861435, "rewards/rejected": 0.018187683075666428, "step": 40 }, { "epoch": 0.05, "learning_rate": 2.604166666666667e-07, "logits/chosen": -4.083974361419678, "logits/rejected": -4.089003562927246, "logps/chosen": -328.7674865722656, "logps/rejected": -337.3554382324219, "loss": 0.6775, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.07210709154605865, "rewards/margins": 0.03594246506690979, "rewards/rejected": 0.03616461902856827, "step": 50 }, { "epoch": 0.06, "learning_rate": 3.1249999999999997e-07, "logits/chosen": -4.070071220397949, "logits/rejected": -4.125251293182373, "logps/chosen": -364.65863037109375, "logps/rejected": -336.72119140625, "loss": 0.6673, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.14206922054290771, "rewards/margins": 0.05489688366651535, "rewards/rejected": 0.08717232942581177, "step": 60 }, { "epoch": 0.07, "learning_rate": 3.645833333333333e-07, "logits/chosen": -4.12211799621582, "logits/rejected": -4.217160701751709, "logps/chosen": -381.5180358886719, "logps/rejected": -344.3231506347656, "loss": 0.6475, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.19605371356010437, "rewards/margins": 0.1255684494972229, "rewards/rejected": 0.07048525661230087, "step": 70 }, { "epoch": 0.08, "learning_rate": 4.1666666666666667e-07, "logits/chosen": -3.8739821910858154, "logits/rejected": -3.94097900390625, "logps/chosen": -425.6156311035156, "logps/rejected": -385.5126037597656, "loss": 0.6165, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.13462719321250916, "rewards/margins": 0.24346613883972168, "rewards/rejected": -0.10883896052837372, "step": 80 }, { "epoch": 0.09, "learning_rate": 4.6874999999999996e-07, "logits/chosen": -3.8873085975646973, "logits/rejected": -3.9699835777282715, "logps/chosen": -428.96026611328125, "logps/rejected": -417.627197265625, "loss": 0.617, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.02063266560435295, "rewards/margins": 0.2929636836051941, "rewards/rejected": -0.27233099937438965, "step": 90 }, { "epoch": 0.1, "learning_rate": 4.999732492681437e-07, "logits/chosen": -3.9544475078582764, "logits/rejected": -3.9958584308624268, "logps/chosen": -388.5050964355469, "logps/rejected": -426.59765625, "loss": 0.6051, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.17338475584983826, "rewards/margins": 0.28527265787124634, "rewards/rejected": -0.458657443523407, "step": 100 }, { "epoch": 0.1, "eval_logits/chosen": -3.8776931762695312, "eval_logits/rejected": -3.979984760284424, "eval_logps/chosen": -407.94769287109375, "eval_logps/rejected": -389.8706970214844, "eval_loss": 0.5848276615142822, "eval_rewards/accuracies": 0.6919999718666077, "eval_rewards/chosen": -0.22141031920909882, "eval_rewards/margins": 0.3733499050140381, "eval_rewards/rejected": -0.5947602391242981, "eval_runtime": 202.6123, "eval_samples_per_second": 9.871, "eval_steps_per_second": 0.617, "step": 100 }, { "epoch": 0.12, "learning_rate": 4.996723692767926e-07, "logits/chosen": -4.082424640655518, "logits/rejected": -4.156807899475098, "logps/chosen": -365.0426025390625, "logps/rejected": -362.7803649902344, "loss": 0.5643, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.3058136999607086, "rewards/margins": 0.49873948097229004, "rewards/rejected": -0.8045531511306763, "step": 110 }, { "epoch": 0.13, "learning_rate": 4.990375746213598e-07, "logits/chosen": -4.234003067016602, "logits/rejected": -4.319502830505371, "logps/chosen": -424.35980224609375, "logps/rejected": -453.1839904785156, "loss": 0.5679, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.47503072023391724, "rewards/margins": 0.55732262134552, "rewards/rejected": -1.0323532819747925, "step": 120 }, { "epoch": 0.14, "learning_rate": 4.980697142834314e-07, "logits/chosen": -4.125961780548096, "logits/rejected": -4.213648319244385, "logps/chosen": -462.6040954589844, "logps/rejected": -459.0274963378906, "loss": 0.5456, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.616773247718811, "rewards/margins": 0.5351831912994385, "rewards/rejected": -1.15195631980896, "step": 130 }, { "epoch": 0.15, "learning_rate": 4.967700826904229e-07, "logits/chosen": -4.07401704788208, "logits/rejected": -4.130145072937012, "logps/chosen": -381.79718017578125, "logps/rejected": -404.578369140625, "loss": 0.5092, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.7604867219924927, "rewards/margins": 0.5355066657066345, "rewards/rejected": -1.2959933280944824, "step": 140 }, { "epoch": 0.16, "learning_rate": 4.951404179843962e-07, "logits/chosen": -3.923344850540161, "logits/rejected": -3.991914749145508, "logps/chosen": -435.2518005371094, "logps/rejected": -472.1197204589844, "loss": 0.5597, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.4754977226257324, "rewards/margins": 0.5571349859237671, "rewards/rejected": -1.032632827758789, "step": 150 }, { "epoch": 0.17, "learning_rate": 4.931828996974498e-07, "logits/chosen": -3.7589492797851562, "logits/rejected": -3.853282928466797, "logps/chosen": -413.491455078125, "logps/rejected": -455.9912109375, "loss": 0.5265, "rewards/accuracies": 0.75, "rewards/chosen": -0.14779019355773926, "rewards/margins": 0.467332661151886, "rewards/rejected": -0.6151228547096252, "step": 160 }, { "epoch": 0.18, "learning_rate": 4.909001458367866e-07, "logits/chosen": -3.895158290863037, "logits/rejected": -4.054537773132324, "logps/chosen": -404.14532470703125, "logps/rejected": -403.3351135253906, "loss": 0.5177, "rewards/accuracies": 0.75, "rewards/chosen": -0.5085878968238831, "rewards/margins": 0.7046168446540833, "rewards/rejected": -1.2132047414779663, "step": 170 }, { "epoch": 0.19, "learning_rate": 4.882952093833627e-07, "logits/chosen": -3.9119632244110107, "logits/rejected": -3.9340500831604004, "logps/chosen": -415.5750427246094, "logps/rejected": -543.4241333007812, "loss": 0.4752, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.0291075706481934, "rewards/margins": 0.9823731184005737, "rewards/rejected": -2.0114803314208984, "step": 180 }, { "epoch": 0.2, "learning_rate": 4.853715742087946e-07, "logits/chosen": -3.9852688312530518, "logits/rejected": -4.081984519958496, "logps/chosen": -480.1775817871094, "logps/rejected": -522.3671264648438, "loss": 0.4535, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.1266310214996338, "rewards/margins": 0.9308539628982544, "rewards/rejected": -2.0574851036071777, "step": 190 }, { "epoch": 0.21, "learning_rate": 4.821331504159906e-07, "logits/chosen": -4.0369672775268555, "logits/rejected": -4.075179576873779, "logps/chosen": -509.94189453125, "logps/rejected": -613.8836059570312, "loss": 0.5134, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.5363746881484985, "rewards/margins": 0.8696328401565552, "rewards/rejected": -2.406007766723633, "step": 200 }, { "epoch": 0.21, "eval_logits/chosen": -3.8175740242004395, "eval_logits/rejected": -3.9154281616210938, "eval_logps/chosen": -545.1560668945312, "eval_logps/rejected": -580.6380004882812, "eval_loss": 0.5025292634963989, "eval_rewards/accuracies": 0.7160000205039978, "eval_rewards/chosen": -1.5934933423995972, "eval_rewards/margins": 0.9089404940605164, "eval_rewards/rejected": -2.5024337768554688, "eval_runtime": 203.658, "eval_samples_per_second": 9.82, "eval_steps_per_second": 0.614, "step": 200 }, { "epoch": 0.22, "learning_rate": 4.785842691097342e-07, "logits/chosen": -3.9959654808044434, "logits/rejected": -4.14572811126709, "logps/chosen": -542.1124267578125, "logps/rejected": -513.849853515625, "loss": 0.5114, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.1273362636566162, "rewards/margins": 1.0702550411224365, "rewards/rejected": -2.1975910663604736, "step": 210 }, { "epoch": 0.23, "learning_rate": 4.7472967660421603e-07, "logits/chosen": -3.90177583694458, "logits/rejected": -3.9888691902160645, "logps/chosen": -448.62091064453125, "logps/rejected": -494.3854064941406, "loss": 0.4834, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.7570297718048096, "rewards/margins": 0.9269709587097168, "rewards/rejected": -1.6840009689331055, "step": 220 }, { "epoch": 0.24, "learning_rate": 4.705745280752585e-07, "logits/chosen": -3.9159817695617676, "logits/rejected": -4.030351161956787, "logps/chosen": -501.67938232421875, "logps/rejected": -505.40618896484375, "loss": 0.4834, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.1143962144851685, "rewards/margins": 0.7790688276290894, "rewards/rejected": -1.8934648036956787, "step": 230 }, { "epoch": 0.25, "learning_rate": 4.6612438066572555e-07, "logits/chosen": -3.872812271118164, "logits/rejected": -3.9638118743896484, "logps/chosen": -486.70751953125, "logps/rejected": -500.75518798828125, "loss": 0.4416, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.2733886241912842, "rewards/margins": 0.9924365282058716, "rewards/rejected": -2.2658252716064453, "step": 240 }, { "epoch": 0.26, "learning_rate": 4.6138518605333664e-07, "logits/chosen": -3.8824219703674316, "logits/rejected": -3.9245293140411377, "logps/chosen": -505.47711181640625, "logps/rejected": -631.7078247070312, "loss": 0.4569, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.439082145690918, "rewards/margins": 1.2419004440307617, "rewards/rejected": -2.680982828140259, "step": 250 }, { "epoch": 0.27, "learning_rate": 4.5636328249082514e-07, "logits/chosen": -4.006925106048584, "logits/rejected": -4.075568199157715, "logps/chosen": -515.6424560546875, "logps/rejected": -610.078125, "loss": 0.4646, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.567647933959961, "rewards/margins": 0.9515784382820129, "rewards/rejected": -2.519226551055908, "step": 260 }, { "epoch": 0.28, "learning_rate": 4.510653863290871e-07, "logits/chosen": -4.042003154754639, "logits/rejected": -4.134153842926025, "logps/chosen": -530.5443115234375, "logps/rejected": -585.4014892578125, "loss": 0.4713, "rewards/accuracies": 0.875, "rewards/chosen": -1.408792495727539, "rewards/margins": 1.231185793876648, "rewards/rejected": -2.6399781703948975, "step": 270 }, { "epoch": 0.29, "learning_rate": 4.4549858303465737e-07, "logits/chosen": -3.96376371383667, "logits/rejected": -3.9938507080078125, "logps/chosen": -508.6741638183594, "logps/rejected": -597.8483276367188, "loss": 0.4507, "rewards/accuracies": 0.8125, "rewards/chosen": -1.4071502685546875, "rewards/margins": 1.0225669145584106, "rewards/rejected": -2.4297173023223877, "step": 280 }, { "epoch": 0.3, "learning_rate": 4.396703177135261e-07, "logits/chosen": -3.989154815673828, "logits/rejected": -4.069916248321533, "logps/chosen": -498.7647399902344, "logps/rejected": -517.9674072265625, "loss": 0.452, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.4257739782333374, "rewards/margins": 0.752226710319519, "rewards/rejected": -2.1780009269714355, "step": 290 }, { "epoch": 0.31, "learning_rate": 4.335883851539693e-07, "logits/chosen": -3.9300217628479004, "logits/rejected": -4.022156238555908, "logps/chosen": -441.078369140625, "logps/rejected": -534.4451904296875, "loss": 0.4489, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.3399455547332764, "rewards/margins": 1.284125566482544, "rewards/rejected": -2.6240711212158203, "step": 300 }, { "epoch": 0.31, "eval_logits/chosen": -3.670259714126587, "eval_logits/rejected": -3.7593655586242676, "eval_logps/chosen": -542.0072021484375, "eval_logps/rejected": -591.3610229492188, "eval_loss": 0.46141302585601807, "eval_rewards/accuracies": 0.7760000228881836, "eval_rewards/chosen": -1.5620052814483643, "eval_rewards/margins": 1.0476588010787964, "eval_rewards/rejected": -2.609663963317871, "eval_runtime": 204.4704, "eval_samples_per_second": 9.781, "eval_steps_per_second": 0.611, "step": 300 }, { "epoch": 0.32, "learning_rate": 4.272609194017105e-07, "logits/chosen": -3.7294070720672607, "logits/rejected": -3.789606809616089, "logps/chosen": -506.30218505859375, "logps/rejected": -687.495849609375, "loss": 0.4155, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -1.452553629875183, "rewards/margins": 1.6206657886505127, "rewards/rejected": -3.0732195377349854, "step": 310 }, { "epoch": 0.33, "learning_rate": 4.2069638288135547e-07, "logits/chosen": -3.735316753387451, "logits/rejected": -3.802316188812256, "logps/chosen": -556.4932861328125, "logps/rejected": -685.213134765625, "loss": 0.4397, "rewards/accuracies": 0.8125, "rewards/chosen": -1.5745762586593628, "rewards/margins": 1.468933343887329, "rewards/rejected": -3.0435097217559814, "step": 320 }, { "epoch": 0.35, "learning_rate": 4.139035550786494e-07, "logits/chosen": -3.783989667892456, "logits/rejected": -3.8774330615997314, "logps/chosen": -530.0714111328125, "logps/rejected": -536.76123046875, "loss": 0.4832, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.6511234045028687, "rewards/margins": 1.1192381381988525, "rewards/rejected": -2.7703614234924316, "step": 330 }, { "epoch": 0.36, "learning_rate": 4.0689152079869306e-07, "logits/chosen": -3.9041965007781982, "logits/rejected": -3.946739912033081, "logps/chosen": -430.56365966796875, "logps/rejected": -504.49786376953125, "loss": 0.4937, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.466640591621399, "rewards/margins": 0.9467592239379883, "rewards/rejected": -2.4133996963500977, "step": 340 }, { "epoch": 0.37, "learning_rate": 3.99669658015821e-07, "logits/chosen": -3.7613883018493652, "logits/rejected": -3.790003538131714, "logps/chosen": -495.4722595214844, "logps/rejected": -638.2640380859375, "loss": 0.4831, "rewards/accuracies": 0.8125, "rewards/chosen": -1.7402899265289307, "rewards/margins": 1.1566716432571411, "rewards/rejected": -2.8969614505767822, "step": 350 }, { "epoch": 0.38, "learning_rate": 3.92247625331392e-07, "logits/chosen": -3.937546968460083, "logits/rejected": -3.993028163909912, "logps/chosen": -548.022216796875, "logps/rejected": -609.3221435546875, "loss": 0.4675, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.875231385231018, "rewards/margins": 0.993303656578064, "rewards/rejected": -2.868535280227661, "step": 360 }, { "epoch": 0.39, "learning_rate": 3.846353490562664e-07, "logits/chosen": -3.8289928436279297, "logits/rejected": -3.8607897758483887, "logps/chosen": -511.55853271484375, "logps/rejected": -626.59814453125, "loss": 0.4382, "rewards/accuracies": 0.75, "rewards/chosen": -1.6782903671264648, "rewards/margins": 0.9424189329147339, "rewards/rejected": -2.6207094192504883, "step": 370 }, { "epoch": 0.4, "learning_rate": 3.768430099352445e-07, "logits/chosen": -3.8665966987609863, "logits/rejected": -3.9191222190856934, "logps/chosen": -600.85400390625, "logps/rejected": -636.7755126953125, "loss": 0.4414, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.972818374633789, "rewards/margins": 0.9349247217178345, "rewards/rejected": -2.907742977142334, "step": 380 }, { "epoch": 0.41, "learning_rate": 3.6888102953122304e-07, "logits/chosen": -3.8797574043273926, "logits/rejected": -3.9397029876708984, "logps/chosen": -541.1205444335938, "logps/rejected": -635.4256591796875, "loss": 0.4324, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.8883755207061768, "rewards/margins": 1.541088581085205, "rewards/rejected": -3.4294638633728027, "step": 390 }, { "epoch": 0.42, "learning_rate": 3.607600562872785e-07, "logits/chosen": -3.781221389770508, "logits/rejected": -3.8847899436950684, "logps/chosen": -629.6137084960938, "logps/rejected": -632.5733032226562, "loss": 0.4359, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.1249642372131348, "rewards/margins": 1.0126584768295288, "rewards/rejected": -3.137622833251953, "step": 400 }, { "epoch": 0.42, "eval_logits/chosen": -3.6220741271972656, "eval_logits/rejected": -3.702164888381958, "eval_logps/chosen": -594.591796875, "eval_logps/rejected": -651.9946899414062, "eval_loss": 0.4466642141342163, "eval_rewards/accuracies": 0.7680000066757202, "eval_rewards/chosen": -2.087851047515869, "eval_rewards/margins": 1.1281490325927734, "eval_rewards/rejected": -3.2160003185272217, "eval_runtime": 201.9384, "eval_samples_per_second": 9.904, "eval_steps_per_second": 0.619, "step": 400 }, { "epoch": 0.43, "learning_rate": 3.5249095128531856e-07, "logits/chosen": -3.7463316917419434, "logits/rejected": -3.8346214294433594, "logps/chosen": -645.6383666992188, "logps/rejected": -699.442626953125, "loss": 0.4482, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.9998807907104492, "rewards/margins": 1.0837665796279907, "rewards/rejected": -3.0836472511291504, "step": 410 }, { "epoch": 0.44, "learning_rate": 3.4408477372034736e-07, "logits/chosen": -3.732384204864502, "logits/rejected": -3.813812255859375, "logps/chosen": -501.06536865234375, "logps/rejected": -586.9138793945312, "loss": 0.4336, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.9944934844970703, "rewards/margins": 1.117249846458435, "rewards/rejected": -3.111743211746216, "step": 420 }, { "epoch": 0.45, "learning_rate": 3.3555276610977276e-07, "logits/chosen": -3.799959659576416, "logits/rejected": -3.7980499267578125, "logps/chosen": -466.7644958496094, "logps/rejected": -580.1270751953125, "loss": 0.4684, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.807948350906372, "rewards/margins": 1.1490823030471802, "rewards/rejected": -2.9570305347442627, "step": 430 }, { "epoch": 0.46, "learning_rate": 3.269063392575352e-07, "logits/chosen": -3.8081250190734863, "logits/rejected": -3.8595759868621826, "logps/chosen": -476.84326171875, "logps/rejected": -597.8814697265625, "loss": 0.4459, "rewards/accuracies": 0.75, "rewards/chosen": -1.5761287212371826, "rewards/margins": 1.3166580200195312, "rewards/rejected": -2.892787218093872, "step": 440 }, { "epoch": 0.47, "learning_rate": 3.1815705699316964e-07, "logits/chosen": -3.71815824508667, "logits/rejected": -3.7602291107177734, "logps/chosen": -502.3971252441406, "logps/rejected": -605.4251708984375, "loss": 0.4634, "rewards/accuracies": 0.8125, "rewards/chosen": -1.5930547714233398, "rewards/margins": 1.314320683479309, "rewards/rejected": -2.9073758125305176, "step": 450 }, { "epoch": 0.48, "learning_rate": 3.0931662070620794e-07, "logits/chosen": -3.741687774658203, "logits/rejected": -3.811591625213623, "logps/chosen": -549.6846923828125, "logps/rejected": -649.2257080078125, "loss": 0.4345, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.719300627708435, "rewards/margins": 1.1254762411117554, "rewards/rejected": -2.8447766304016113, "step": 460 }, { "epoch": 0.49, "learning_rate": 3.003968536966078e-07, "logits/chosen": -3.716046094894409, "logits/rejected": -3.803307056427002, "logps/chosen": -572.4447631835938, "logps/rejected": -625.49755859375, "loss": 0.4451, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.8884143829345703, "rewards/margins": 1.2486246824264526, "rewards/rejected": -3.1370389461517334, "step": 470 }, { "epoch": 0.5, "learning_rate": 2.9140968536213693e-07, "logits/chosen": -3.867417573928833, "logits/rejected": -3.8436825275421143, "logps/chosen": -478.40789794921875, "logps/rejected": -613.2086791992188, "loss": 0.4803, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.025106906890869, "rewards/margins": 1.0343272686004639, "rewards/rejected": -3.059434175491333, "step": 480 }, { "epoch": 0.51, "learning_rate": 2.823671352438608e-07, "logits/chosen": -3.8223514556884766, "logits/rejected": -3.864264965057373, "logps/chosen": -514.0059814453125, "logps/rejected": -626.1476440429688, "loss": 0.4292, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.7328517436981201, "rewards/margins": 1.1822339296340942, "rewards/rejected": -2.915085554122925, "step": 490 }, { "epoch": 0.52, "learning_rate": 2.73281296951072e-07, "logits/chosen": -3.8355319499969482, "logits/rejected": -3.8868179321289062, "logps/chosen": -587.6881103515625, "logps/rejected": -719.8489990234375, "loss": 0.4271, "rewards/accuracies": 0.8125, "rewards/chosen": -2.061047315597534, "rewards/margins": 1.5319583415985107, "rewards/rejected": -3.593005418777466, "step": 500 }, { "epoch": 0.52, "eval_logits/chosen": -3.7408037185668945, "eval_logits/rejected": -3.8189327716827393, "eval_logps/chosen": -591.3005981445312, "eval_logps/rejected": -652.2026977539062, "eval_loss": 0.4440613090991974, "eval_rewards/accuracies": 0.7839999794960022, "eval_rewards/chosen": -2.054938793182373, "eval_rewards/margins": 1.1631413698196411, "eval_rewards/rejected": -3.2180800437927246, "eval_runtime": 203.8018, "eval_samples_per_second": 9.813, "eval_steps_per_second": 0.613, "step": 500 }, { "epoch": 0.53, "learning_rate": 2.641643219871597e-07, "logits/chosen": -3.8902289867401123, "logits/rejected": -4.004373550415039, "logps/chosen": -559.3378295898438, "logps/rejected": -618.6632690429688, "loss": 0.3992, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.054337739944458, "rewards/margins": 1.3023302555084229, "rewards/rejected": -3.3566677570343018, "step": 510 }, { "epoch": 0.54, "learning_rate": 2.550284034980507e-07, "logits/chosen": -3.8437469005584717, "logits/rejected": -3.888261318206787, "logps/chosen": -559.3350830078125, "logps/rejected": -661.4388427734375, "loss": 0.4473, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.9623550176620483, "rewards/margins": 1.2001652717590332, "rewards/rejected": -3.162520170211792, "step": 520 }, { "epoch": 0.55, "learning_rate": 2.4588575996495794e-07, "logits/chosen": -3.9187228679656982, "logits/rejected": -3.973421096801758, "logps/chosen": -602.2457885742188, "logps/rejected": -682.3179931640625, "loss": 0.42, "rewards/accuracies": 0.75, "rewards/chosen": -1.9900497198104858, "rewards/margins": 1.2919167280197144, "rewards/rejected": -3.2819664478302, "step": 530 }, { "epoch": 0.57, "learning_rate": 2.367486188632446e-07, "logits/chosen": -3.9012649059295654, "logits/rejected": -3.986586093902588, "logps/chosen": -567.7922973632812, "logps/rejected": -682.8541259765625, "loss": 0.4298, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.8885074853897095, "rewards/margins": 1.357795000076294, "rewards/rejected": -3.2463021278381348, "step": 540 }, { "epoch": 0.58, "learning_rate": 2.276292003092593e-07, "logits/chosen": -3.9662468433380127, "logits/rejected": -4.023472785949707, "logps/chosen": -548.5009765625, "logps/rejected": -603.1859130859375, "loss": 0.4332, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.9097919464111328, "rewards/margins": 1.235344648361206, "rewards/rejected": -3.145136594772339, "step": 550 }, { "epoch": 0.59, "learning_rate": 2.185397007170141e-07, "logits/chosen": -3.96891713142395, "logits/rejected": -3.992366075515747, "logps/chosen": -527.6655883789062, "logps/rejected": -594.5096435546875, "loss": 0.4661, "rewards/accuracies": 0.8125, "rewards/chosen": -2.0592868328094482, "rewards/margins": 1.0083281993865967, "rewards/rejected": -3.067615032196045, "step": 560 }, { "epoch": 0.6, "learning_rate": 2.094922764865619e-07, "logits/chosen": -3.8889777660369873, "logits/rejected": -3.9203734397888184, "logps/chosen": -530.451904296875, "logps/rejected": -649.4442749023438, "loss": 0.4476, "rewards/accuracies": 0.8125, "rewards/chosen": -2.085118532180786, "rewards/margins": 1.273771047592163, "rewards/rejected": -3.3588898181915283, "step": 570 }, { "epoch": 0.61, "learning_rate": 2.0049902774588797e-07, "logits/chosen": -3.9504013061523438, "logits/rejected": -4.046439170837402, "logps/chosen": -551.9801635742188, "logps/rejected": -631.9030151367188, "loss": 0.4411, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.0798585414886475, "rewards/margins": 1.2492122650146484, "rewards/rejected": -3.329070568084717, "step": 580 }, { "epoch": 0.62, "learning_rate": 1.9157198216806238e-07, "logits/chosen": -3.846717119216919, "logits/rejected": -3.8746533393859863, "logps/chosen": -514.6439208984375, "logps/rejected": -652.5971069335938, "loss": 0.4229, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.9687875509262085, "rewards/margins": 1.090827465057373, "rewards/rejected": -3.059614896774292, "step": 590 }, { "epoch": 0.63, "learning_rate": 1.8272307888529274e-07, "logits/chosen": -3.835319995880127, "logits/rejected": -3.88202166557312, "logps/chosen": -595.4498291015625, "logps/rejected": -732.9796142578125, "loss": 0.4181, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.6396028995513916, "rewards/margins": 1.44252610206604, "rewards/rejected": -3.0821290016174316, "step": 600 }, { "epoch": 0.63, "eval_logits/chosen": -3.717047929763794, "eval_logits/rejected": -3.7949790954589844, "eval_logps/chosen": -584.56982421875, "eval_logps/rejected": -647.177734375, "eval_loss": 0.43660393357276917, "eval_rewards/accuracies": 0.7760000228881836, "eval_rewards/chosen": -1.9876312017440796, "eval_rewards/margins": 1.1801990270614624, "eval_rewards/rejected": -3.167830467224121, "eval_runtime": 203.6736, "eval_samples_per_second": 9.82, "eval_steps_per_second": 0.614, "step": 600 }, { "epoch": 0.64, "learning_rate": 1.7396415252139288e-07, "logits/chosen": -3.9154458045959473, "logits/rejected": -3.974060535430908, "logps/chosen": -533.2646484375, "logps/rejected": -585.8008422851562, "loss": 0.4206, "rewards/accuracies": 0.75, "rewards/chosen": -1.9508171081542969, "rewards/margins": 1.1387238502502441, "rewards/rejected": -3.089540958404541, "step": 610 }, { "epoch": 0.65, "learning_rate": 1.6530691736402316e-07, "logits/chosen": -3.8785858154296875, "logits/rejected": -3.912694215774536, "logps/chosen": -563.8421630859375, "logps/rejected": -652.9039306640625, "loss": 0.4436, "rewards/accuracies": 0.8125, "rewards/chosen": -2.181243658065796, "rewards/margins": 1.337597131729126, "rewards/rejected": -3.518840789794922, "step": 620 }, { "epoch": 0.66, "learning_rate": 1.5676295169786864e-07, "logits/chosen": -3.9640355110168457, "logits/rejected": -4.0246195793151855, "logps/chosen": -543.6168823242188, "logps/rejected": -720.9603271484375, "loss": 0.3924, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -1.9450584650039673, "rewards/margins": 1.83893620967865, "rewards/rejected": -3.783994674682617, "step": 630 }, { "epoch": 0.67, "learning_rate": 1.483436823197092e-07, "logits/chosen": -3.8995678424835205, "logits/rejected": -3.9963104724884033, "logps/chosen": -529.6903686523438, "logps/rejected": -631.6978149414062, "loss": 0.4512, "rewards/accuracies": 0.8125, "rewards/chosen": -1.9272207021713257, "rewards/margins": 1.3727694749832153, "rewards/rejected": -3.29999041557312, "step": 640 }, { "epoch": 0.68, "learning_rate": 1.4006036925609243e-07, "logits/chosen": -3.972506046295166, "logits/rejected": -4.015632152557373, "logps/chosen": -595.8973388671875, "logps/rejected": -727.6198120117188, "loss": 0.4132, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.108734369277954, "rewards/margins": 1.5102851390838623, "rewards/rejected": -3.6190192699432373, "step": 650 }, { "epoch": 0.69, "learning_rate": 1.319240907040458e-07, "logits/chosen": -3.9822299480438232, "logits/rejected": -4.0792555809021, "logps/chosen": -610.338134765625, "logps/rejected": -650.5155639648438, "loss": 0.4142, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.233393669128418, "rewards/margins": 1.272410273551941, "rewards/rejected": -3.5058040618896484, "step": 660 }, { "epoch": 0.7, "learning_rate": 1.239457282149695e-07, "logits/chosen": -4.008191108703613, "logits/rejected": -4.095564842224121, "logps/chosen": -588.9998168945312, "logps/rejected": -686.9320068359375, "loss": 0.4315, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.217425584793091, "rewards/margins": 1.2320573329925537, "rewards/rejected": -3.4494826793670654, "step": 670 }, { "epoch": 0.71, "learning_rate": 1.1613595214152711e-07, "logits/chosen": -3.9681782722473145, "logits/rejected": -4.009424686431885, "logps/chosen": -520.8330688476562, "logps/rejected": -595.3616943359375, "loss": 0.4556, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.8725841045379639, "rewards/margins": 1.0434370040893555, "rewards/rejected": -2.9160211086273193, "step": 680 }, { "epoch": 0.72, "learning_rate": 1.0850520736699362e-07, "logits/chosen": -3.910592555999756, "logits/rejected": -3.9710609912872314, "logps/chosen": -484.9183044433594, "logps/rejected": -601.7095336914062, "loss": 0.408, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.8240716457366943, "rewards/margins": 1.474691390991211, "rewards/rejected": -3.2987632751464844, "step": 690 }, { "epoch": 0.73, "learning_rate": 1.0106369933615042e-07, "logits/chosen": -3.870868682861328, "logits/rejected": -3.974864959716797, "logps/chosen": -537.2079467773438, "logps/rejected": -636.1268310546875, "loss": 0.4, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.947430968284607, "rewards/margins": 1.452516794204712, "rewards/rejected": -3.3999481201171875, "step": 700 }, { "epoch": 0.73, "eval_logits/chosen": -3.7969613075256348, "eval_logits/rejected": -3.87385630607605, "eval_logps/chosen": -602.2762451171875, "eval_logps/rejected": -665.6045532226562, "eval_loss": 0.4316871762275696, "eval_rewards/accuracies": 0.7639999985694885, "eval_rewards/chosen": -2.1646955013275146, "eval_rewards/margins": 1.1874041557312012, "eval_rewards/rejected": -3.352099657058716, "eval_runtime": 204.7157, "eval_samples_per_second": 9.77, "eval_steps_per_second": 0.611, "step": 700 }, { "epoch": 0.74, "learning_rate": 9.382138040640714e-08, "logits/chosen": -3.92706298828125, "logits/rejected": -3.9605202674865723, "logps/chosen": -567.6884155273438, "logps/rejected": -623.2738647460938, "loss": 0.4277, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.1468379497528076, "rewards/margins": 1.0972353219985962, "rewards/rejected": -3.2440733909606934, "step": 710 }, { "epoch": 0.75, "learning_rate": 8.678793653740632e-08, "logits/chosen": -3.9516849517822266, "logits/rejected": -4.008336544036865, "logps/chosen": -593.2915649414062, "logps/rejected": -721.9989013671875, "loss": 0.4049, "rewards/accuracies": 0.8125, "rewards/chosen": -2.1563327312469482, "rewards/margins": 1.5128753185272217, "rewards/rejected": -3.669208526611328, "step": 720 }, { "epoch": 0.76, "learning_rate": 7.997277433690983e-08, "logits/chosen": -3.8838019371032715, "logits/rejected": -3.978668689727783, "logps/chosen": -610.8298950195312, "logps/rejected": -639.4989624023438, "loss": 0.4293, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -2.255786418914795, "rewards/margins": 1.1647707223892212, "rewards/rejected": -3.4205574989318848, "step": 730 }, { "epoch": 0.77, "learning_rate": 7.338500848029602e-08, "logits/chosen": -3.8697731494903564, "logits/rejected": -4.00671911239624, "logps/chosen": -697.6756591796875, "logps/rejected": -759.2227783203125, "loss": 0.4093, "rewards/accuracies": 0.8125, "rewards/chosen": -2.504605293273926, "rewards/margins": 1.4814784526824951, "rewards/rejected": -3.986083507537842, "step": 740 }, { "epoch": 0.78, "learning_rate": 6.70334495204884e-08, "logits/chosen": -3.8761909008026123, "logits/rejected": -3.915804386138916, "logps/chosen": -594.4487915039062, "logps/rejected": -745.0926513671875, "loss": 0.4379, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.277494430541992, "rewards/margins": 1.3665436506271362, "rewards/rejected": -3.644038438796997, "step": 750 }, { "epoch": 0.8, "learning_rate": 6.092659210462231e-08, "logits/chosen": -3.8564085960388184, "logits/rejected": -3.92138671875, "logps/chosen": -573.0128173828125, "logps/rejected": -675.12060546875, "loss": 0.394, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.239450693130493, "rewards/margins": 1.328932285308838, "rewards/rejected": -3.568382740020752, "step": 760 }, { "epoch": 0.81, "learning_rate": 5.507260361320737e-08, "logits/chosen": -3.944653034210205, "logits/rejected": -3.991550922393799, "logps/chosen": -670.3541870117188, "logps/rejected": -742.4327392578125, "loss": 0.4177, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.406642198562622, "rewards/margins": 1.1013365983963013, "rewards/rejected": -3.5079784393310547, "step": 770 }, { "epoch": 0.82, "learning_rate": 4.947931323697982e-08, "logits/chosen": -3.9694862365722656, "logits/rejected": -3.969531297683716, "logps/chosen": -480.57232666015625, "logps/rejected": -576.7296142578125, "loss": 0.4341, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.20851469039917, "rewards/margins": 0.9807012677192688, "rewards/rejected": -3.189215898513794, "step": 780 }, { "epoch": 0.83, "learning_rate": 4.415420150605398e-08, "logits/chosen": -3.9178504943847656, "logits/rejected": -3.8871982097625732, "logps/chosen": -574.5850830078125, "logps/rejected": -722.9913330078125, "loss": 0.4176, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.2402052879333496, "rewards/margins": 1.2347979545593262, "rewards/rejected": -3.475003480911255, "step": 790 }, { "epoch": 0.84, "learning_rate": 3.9104390285376374e-08, "logits/chosen": -3.819741725921631, "logits/rejected": -3.860565185546875, "logps/chosen": -664.21142578125, "logps/rejected": -710.8525390625, "loss": 0.4123, "rewards/accuracies": 0.75, "rewards/chosen": -2.2596795558929443, "rewards/margins": 1.2178363800048828, "rewards/rejected": -3.4775161743164062, "step": 800 }, { "epoch": 0.84, "eval_logits/chosen": -3.7826802730560303, "eval_logits/rejected": -3.8605716228485107, "eval_logps/chosen": -606.1934204101562, "eval_logps/rejected": -675.3074951171875, "eval_loss": 0.42910608649253845, "eval_rewards/accuracies": 0.7680000066757202, "eval_rewards/chosen": -2.2038679122924805, "eval_rewards/margins": 1.245260238647461, "eval_rewards/rejected": -3.4491279125213623, "eval_runtime": 203.5751, "eval_samples_per_second": 9.824, "eval_steps_per_second": 0.614, "step": 800 }, { "epoch": 0.85, "learning_rate": 3.433663324986208e-08, "logits/chosen": -3.9174346923828125, "logits/rejected": -3.9697937965393066, "logps/chosen": -566.511962890625, "logps/rejected": -639.9072265625, "loss": 0.4185, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.194934606552124, "rewards/margins": 1.1658843755722046, "rewards/rejected": -3.3608193397521973, "step": 810 }, { "epoch": 0.86, "learning_rate": 2.9857306851953897e-08, "logits/chosen": -3.928525447845459, "logits/rejected": -4.02262544631958, "logps/chosen": -588.645751953125, "logps/rejected": -649.748046875, "loss": 0.4511, "rewards/accuracies": 0.8125, "rewards/chosen": -2.0678915977478027, "rewards/margins": 1.4058529138565063, "rewards/rejected": -3.4737441539764404, "step": 820 }, { "epoch": 0.87, "learning_rate": 2.567240179368185e-08, "logits/chosen": -3.9319825172424316, "logits/rejected": -3.9248032569885254, "logps/chosen": -541.6334228515625, "logps/rejected": -707.2698364257812, "loss": 0.4226, "rewards/accuracies": 0.8125, "rewards/chosen": -2.2310593128204346, "rewards/margins": 1.1511319875717163, "rewards/rejected": -3.3821911811828613, "step": 830 }, { "epoch": 0.88, "learning_rate": 2.1787515014630357e-08, "logits/chosen": -3.929730176925659, "logits/rejected": -3.983701705932617, "logps/chosen": -561.4950561523438, "logps/rejected": -643.8230590820312, "loss": 0.457, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.112525224685669, "rewards/margins": 1.0804126262664795, "rewards/rejected": -3.1929378509521484, "step": 840 }, { "epoch": 0.89, "learning_rate": 1.820784220652766e-08, "logits/chosen": -3.8999176025390625, "logits/rejected": -3.999703884124756, "logps/chosen": -581.9009399414062, "logps/rejected": -630.4606323242188, "loss": 0.4246, "rewards/accuracies": 0.8125, "rewards/chosen": -1.9299805164337158, "rewards/margins": 1.3858671188354492, "rewards/rejected": -3.315847873687744, "step": 850 }, { "epoch": 0.9, "learning_rate": 1.4938170864468636e-08, "logits/chosen": -3.9592201709747314, "logits/rejected": -3.982034206390381, "logps/chosen": -560.0491333007812, "logps/rejected": -656.3305053710938, "loss": 0.4568, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.186509847640991, "rewards/margins": 1.1325315237045288, "rewards/rejected": -3.3190410137176514, "step": 860 }, { "epoch": 0.91, "learning_rate": 1.1982873884064465e-08, "logits/chosen": -3.9274532794952393, "logits/rejected": -3.9674625396728516, "logps/chosen": -524.0363159179688, "logps/rejected": -671.90966796875, "loss": 0.424, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.886129379272461, "rewards/margins": 1.563849687576294, "rewards/rejected": -3.4499785900115967, "step": 870 }, { "epoch": 0.92, "learning_rate": 9.345903713082304e-09, "logits/chosen": -3.8932175636291504, "logits/rejected": -3.9537672996520996, "logps/chosen": -636.6458740234375, "logps/rejected": -725.6226196289062, "loss": 0.4073, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.2143332958221436, "rewards/margins": 1.2432187795639038, "rewards/rejected": -3.457551956176758, "step": 880 }, { "epoch": 0.93, "learning_rate": 7.030787065396865e-09, "logits/chosen": -3.920933961868286, "logits/rejected": -3.956883192062378, "logps/chosen": -529.4722290039062, "logps/rejected": -692.9610595703125, "loss": 0.431, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.8962398767471313, "rewards/margins": 1.446508765220642, "rewards/rejected": -3.3427481651306152, "step": 890 }, { "epoch": 0.94, "learning_rate": 5.04062020432286e-09, "logits/chosen": -3.8574442863464355, "logits/rejected": -3.9011623859405518, "logps/chosen": -620.3335571289062, "logps/rejected": -719.7374267578125, "loss": 0.4394, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.159029722213745, "rewards/margins": 1.208085298538208, "rewards/rejected": -3.367115020751953, "step": 900 }, { "epoch": 0.94, "eval_logits/chosen": -3.8000898361206055, "eval_logits/rejected": -3.8776535987854004, "eval_logps/chosen": -599.057373046875, "eval_logps/rejected": -666.7249755859375, "eval_loss": 0.42921942472457886, "eval_rewards/accuracies": 0.7680000066757202, "eval_rewards/chosen": -2.132506847381592, "eval_rewards/margins": 1.230795979499817, "eval_rewards/rejected": -3.3633029460906982, "eval_runtime": 203.7413, "eval_samples_per_second": 9.816, "eval_steps_per_second": 0.614, "step": 900 }, { "epoch": 0.95, "learning_rate": 3.3780648016376866e-09, "logits/chosen": -3.9453799724578857, "logits/rejected": -3.986725330352783, "logps/chosen": -560.10205078125, "logps/rejected": -671.8740234375, "loss": 0.4277, "rewards/accuracies": 0.875, "rewards/chosen": -2.1619820594787598, "rewards/margins": 1.315500259399414, "rewards/rejected": -3.477482557296753, "step": 910 }, { "epoch": 0.96, "learning_rate": 2.0453443778310766e-09, "logits/chosen": -3.936732530593872, "logits/rejected": -3.999469041824341, "logps/chosen": -593.9410400390625, "logps/rejected": -700.7437744140625, "loss": 0.3904, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -2.2095024585723877, "rewards/margins": 1.3912229537963867, "rewards/rejected": -3.600724697113037, "step": 920 }, { "epoch": 0.97, "learning_rate": 1.0442413283435758e-09, "logits/chosen": -3.881744384765625, "logits/rejected": -3.9535934925079346, "logps/chosen": -482.16058349609375, "logps/rejected": -652.9393310546875, "loss": 0.3867, "rewards/accuracies": 0.875, "rewards/chosen": -1.774622917175293, "rewards/margins": 1.7917388677597046, "rewards/rejected": -3.566361904144287, "step": 930 }, { "epoch": 0.98, "learning_rate": 3.760945397705828e-10, "logits/chosen": -3.958568572998047, "logits/rejected": -4.010906219482422, "logps/chosen": -617.6387939453125, "logps/rejected": -726.321044921875, "loss": 0.4054, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -2.300060749053955, "rewards/margins": 1.2705519199371338, "rewards/rejected": -3.570613145828247, "step": 940 }, { "epoch": 0.99, "learning_rate": 4.17975992204056e-11, "logits/chosen": -3.931025266647339, "logits/rejected": -3.9499351978302, "logps/chosen": -544.7789916992188, "logps/rejected": -728.7254638671875, "loss": 0.4304, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -2.2649009227752686, "rewards/margins": 1.7213318347930908, "rewards/rejected": -3.9862327575683594, "step": 950 }, { "epoch": 1.0, "step": 955, "total_flos": 0.0, "train_loss": 0.46914771214829687, "train_runtime": 18559.523, "train_samples_per_second": 3.294, "train_steps_per_second": 0.051 } ], "logging_steps": 10, "max_steps": 955, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }