{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 100, "global_step": 3821, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00026171159382360636, "grad_norm": 3.590468168258667, "learning_rate": 1.3054830287206268e-08, "logits/chosen": 0.6792653799057007, "logits/rejected": 1.31020188331604, "logps/chosen": -469.49981689453125, "logps/rejected": -525.3796997070312, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.0026171159382360636, "grad_norm": 3.1591908931732178, "learning_rate": 1.3054830287206266e-07, "logits/chosen": 1.5021909475326538, "logits/rejected": 1.427976131439209, "logps/chosen": -398.0495300292969, "logps/rejected": -356.86016845703125, "loss": 0.6928, "rewards/accuracies": 0.5138888955116272, "rewards/chosen": -8.217601134674624e-05, "rewards/margins": 0.0007430262048728764, "rewards/rejected": -0.000825202208943665, "step": 10 }, { "epoch": 0.005234231876472127, "grad_norm": 3.51601243019104, "learning_rate": 2.610966057441253e-07, "logits/chosen": 1.3317842483520508, "logits/rejected": 1.638771414756775, "logps/chosen": -435.8251953125, "logps/rejected": -342.2559509277344, "loss": 0.6929, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.00043620201176963747, "rewards/margins": 0.0005908687599003315, "rewards/rejected": -0.00015466664626728743, "step": 20 }, { "epoch": 0.007851347814708191, "grad_norm": 3.3784544467926025, "learning_rate": 3.9164490861618804e-07, "logits/chosen": 1.3975493907928467, "logits/rejected": 1.4588085412979126, "logps/chosen": -377.9482727050781, "logps/rejected": -355.25885009765625, "loss": 0.6929, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.0005684455973096192, "rewards/margins": 0.0005999829736538231, "rewards/rejected": -3.1537445465801284e-05, "step": 30 }, { "epoch": 0.010468463752944255, "grad_norm": 2.979464292526245, "learning_rate": 5.221932114882506e-07, "logits/chosen": 1.6848558187484741, "logits/rejected": 1.9189517498016357, "logps/chosen": -316.1363525390625, "logps/rejected": -315.66058349609375, "loss": 0.6934, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -0.00017661902529653162, "rewards/margins": -0.0004952313611283898, "rewards/rejected": 0.00031861235038377345, "step": 40 }, { "epoch": 0.01308557969118032, "grad_norm": 3.1099374294281006, "learning_rate": 6.527415143603135e-07, "logits/chosen": 1.5289150476455688, "logits/rejected": 1.5212490558624268, "logps/chosen": -398.5328674316406, "logps/rejected": -336.2831115722656, "loss": 0.6929, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.0001419751497451216, "rewards/margins": 0.00047931409790180624, "rewards/rejected": -0.0003373388899490237, "step": 50 }, { "epoch": 0.015702695629416383, "grad_norm": 2.992774248123169, "learning_rate": 7.832898172323761e-07, "logits/chosen": 1.4803454875946045, "logits/rejected": 1.6450494527816772, "logps/chosen": -373.1556396484375, "logps/rejected": -328.5540771484375, "loss": 0.6932, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.00015133472334127873, "rewards/margins": -8.949339098762721e-05, "rewards/rejected": 0.00024082818708848208, "step": 60 }, { "epoch": 0.018319811567652448, "grad_norm": 3.166975498199463, "learning_rate": 9.138381201044387e-07, "logits/chosen": 1.3975117206573486, "logits/rejected": 1.632108449935913, "logps/chosen": -385.858154296875, "logps/rejected": -334.81219482421875, "loss": 0.6934, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -0.00018815476505551487, "rewards/margins": -0.0004339146544225514, "rewards/rejected": 0.0002457600203342736, "step": 70 }, { "epoch": 0.02093692750588851, "grad_norm": 3.23146915435791, "learning_rate": 1.0443864229765013e-06, "logits/chosen": 1.7188682556152344, "logits/rejected": 1.6811161041259766, "logps/chosen": -383.59771728515625, "logps/rejected": -346.7442321777344, "loss": 0.6935, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -0.00110594870056957, "rewards/margins": -0.0007629155879840255, "rewards/rejected": -0.00034303305437788367, "step": 80 }, { "epoch": 0.023554043444124574, "grad_norm": 3.223733901977539, "learning_rate": 1.1749347258485642e-06, "logits/chosen": 1.5662400722503662, "logits/rejected": 1.790412187576294, "logps/chosen": -364.4346923828125, "logps/rejected": -327.1966552734375, "loss": 0.6928, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.0004470300336834043, "rewards/margins": 0.0006952629191800952, "rewards/rejected": -0.00114229298196733, "step": 90 }, { "epoch": 0.02617115938236064, "grad_norm": 2.721588373184204, "learning_rate": 1.305483028720627e-06, "logits/chosen": 1.4209253787994385, "logits/rejected": 1.6146681308746338, "logps/chosen": -368.99169921875, "logps/rejected": -334.1181640625, "loss": 0.6932, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.0008529865299351513, "rewards/margins": -8.540081762475893e-05, "rewards/rejected": -0.0007675857050344348, "step": 100 }, { "epoch": 0.02617115938236064, "eval_logits/chosen": 1.2742817401885986, "eval_logits/rejected": 1.4772121906280518, "eval_logps/chosen": -388.4209289550781, "eval_logps/rejected": -344.7744140625, "eval_loss": 0.6929848194122314, "eval_rewards/accuracies": 0.5210000276565552, "eval_rewards/chosen": -0.0005557815893553197, "eval_rewards/margins": 0.00034635106567293406, "eval_rewards/rejected": -0.0009021326550282538, "eval_runtime": 233.4714, "eval_samples_per_second": 8.566, "eval_steps_per_second": 1.071, "step": 100 }, { "epoch": 0.028788275320596704, "grad_norm": 3.0894269943237305, "learning_rate": 1.4360313315926894e-06, "logits/chosen": 1.4099972248077393, "logits/rejected": 1.5973542928695679, "logps/chosen": -405.90130615234375, "logps/rejected": -338.3550720214844, "loss": 0.6929, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.0014196943957358599, "rewards/margins": 0.00048225713544525206, "rewards/rejected": -0.0019019513856619596, "step": 110 }, { "epoch": 0.031405391258832765, "grad_norm": 3.1108322143554688, "learning_rate": 1.5665796344647521e-06, "logits/chosen": 1.4038909673690796, "logits/rejected": 1.6449644565582275, "logps/chosen": -425.57330322265625, "logps/rejected": -380.5539855957031, "loss": 0.6926, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.0003092998522333801, "rewards/margins": 0.0012004419695585966, "rewards/rejected": -0.0015097421128302813, "step": 120 }, { "epoch": 0.03402250719706883, "grad_norm": 3.5928196907043457, "learning_rate": 1.6971279373368146e-06, "logits/chosen": 1.4526954889297485, "logits/rejected": 1.6924293041229248, "logps/chosen": -368.2237548828125, "logps/rejected": -353.4678955078125, "loss": 0.6924, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.0008503898861818016, "rewards/margins": 0.0015992727130651474, "rewards/rejected": -0.00244966265745461, "step": 130 }, { "epoch": 0.036639623135304895, "grad_norm": 3.48256778717041, "learning_rate": 1.8276762402088774e-06, "logits/chosen": 1.583683729171753, "logits/rejected": 1.6599153280258179, "logps/chosen": -401.0623474121094, "logps/rejected": -320.5968017578125, "loss": 0.6925, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.0012277833884581923, "rewards/margins": 0.0012606108793988824, "rewards/rejected": -0.0024883942678570747, "step": 140 }, { "epoch": 0.03925673907354096, "grad_norm": 3.6008684635162354, "learning_rate": 1.9582245430809403e-06, "logits/chosen": 1.608473539352417, "logits/rejected": 1.6299419403076172, "logps/chosen": -419.42083740234375, "logps/rejected": -340.1468811035156, "loss": 0.6921, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.0011100767878815532, "rewards/margins": 0.0020758803002536297, "rewards/rejected": -0.003185956971719861, "step": 150 }, { "epoch": 0.04187385501177702, "grad_norm": 3.3119959831237793, "learning_rate": 2.0887728459530026e-06, "logits/chosen": 1.191816806793213, "logits/rejected": 1.4365403652191162, "logps/chosen": -375.5677185058594, "logps/rejected": -358.6431884765625, "loss": 0.6931, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.0033069555647671223, "rewards/margins": 0.0001626126904739067, "rewards/rejected": -0.0034695682115852833, "step": 160 }, { "epoch": 0.04449097095001309, "grad_norm": 3.996933937072754, "learning_rate": 2.2193211488250653e-06, "logits/chosen": 1.554457426071167, "logits/rejected": 1.7651093006134033, "logps/chosen": -324.51995849609375, "logps/rejected": -305.23175048828125, "loss": 0.6928, "rewards/accuracies": 0.53125, "rewards/chosen": -0.0038299753796309233, "rewards/margins": 0.0007525371038354933, "rewards/rejected": -0.004582512192428112, "step": 170 }, { "epoch": 0.04710808688824915, "grad_norm": 2.7113592624664307, "learning_rate": 2.3498694516971284e-06, "logits/chosen": 1.3284379243850708, "logits/rejected": 1.6581776142120361, "logps/chosen": -373.4314880371094, "logps/rejected": -329.12628173828125, "loss": 0.6916, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.0027942766901105642, "rewards/margins": 0.0031869211234152317, "rewards/rejected": -0.005981197580695152, "step": 180 }, { "epoch": 0.04972520282648522, "grad_norm": 3.2230942249298096, "learning_rate": 2.4804177545691907e-06, "logits/chosen": 1.333396315574646, "logits/rejected": 1.4282341003417969, "logps/chosen": -385.22369384765625, "logps/rejected": -338.3575744628906, "loss": 0.6923, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.0038786418735980988, "rewards/margins": 0.0016467362875118852, "rewards/rejected": -0.005525378044694662, "step": 190 }, { "epoch": 0.05234231876472128, "grad_norm": 2.9138338565826416, "learning_rate": 2.610966057441254e-06, "logits/chosen": 1.4856141805648804, "logits/rejected": 1.6409976482391357, "logps/chosen": -361.8666687011719, "logps/rejected": -304.0251159667969, "loss": 0.6916, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.005739121697843075, "rewards/margins": 0.00320308655500412, "rewards/rejected": -0.00894220918416977, "step": 200 }, { "epoch": 0.05234231876472128, "eval_logits/chosen": 1.275007724761963, "eval_logits/rejected": 1.4769618511199951, "eval_logps/chosen": -388.9444274902344, "eval_logps/rejected": -345.5613098144531, "eval_loss": 0.6916878819465637, "eval_rewards/accuracies": 0.5799999833106995, "eval_rewards/chosen": -0.005790840368717909, "eval_rewards/margins": 0.0029809444677084684, "eval_rewards/rejected": -0.008771784603595734, "eval_runtime": 233.1655, "eval_samples_per_second": 8.578, "eval_steps_per_second": 1.072, "step": 200 }, { "epoch": 0.05495943470295734, "grad_norm": 3.762040615081787, "learning_rate": 2.741514360313316e-06, "logits/chosen": 1.5017220973968506, "logits/rejected": 1.6038427352905273, "logps/chosen": -397.37506103515625, "logps/rejected": -332.4270324707031, "loss": 0.6909, "rewards/accuracies": 0.625, "rewards/chosen": -0.006693325936794281, "rewards/margins": 0.004596198443323374, "rewards/rejected": -0.011289524845778942, "step": 210 }, { "epoch": 0.05757655064119341, "grad_norm": 3.389441728591919, "learning_rate": 2.872062663185379e-06, "logits/chosen": 1.4440081119537354, "logits/rejected": 1.5644251108169556, "logps/chosen": -370.3480224609375, "logps/rejected": -320.7288818359375, "loss": 0.6904, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.008492258377373219, "rewards/margins": 0.005523340776562691, "rewards/rejected": -0.014015598222613335, "step": 220 }, { "epoch": 0.06019366657942947, "grad_norm": 3.2270307540893555, "learning_rate": 3.0026109660574416e-06, "logits/chosen": 1.2713916301727295, "logits/rejected": 1.3412346839904785, "logps/chosen": -443.724853515625, "logps/rejected": -387.1663513183594, "loss": 0.6909, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.011385348625481129, "rewards/margins": 0.00459087360650301, "rewards/rejected": -0.01597622036933899, "step": 230 }, { "epoch": 0.06281078251766553, "grad_norm": 3.316723346710205, "learning_rate": 3.1331592689295043e-06, "logits/chosen": 1.3093677759170532, "logits/rejected": 1.5563104152679443, "logps/chosen": -427.25787353515625, "logps/rejected": -385.2057189941406, "loss": 0.6907, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.016488298773765564, "rewards/margins": 0.005019473843276501, "rewards/rejected": -0.02150776982307434, "step": 240 }, { "epoch": 0.06542789845590159, "grad_norm": 3.3876242637634277, "learning_rate": 3.263707571801567e-06, "logits/chosen": 1.4990646839141846, "logits/rejected": 1.7924457788467407, "logps/chosen": -395.66754150390625, "logps/rejected": -346.0106506347656, "loss": 0.6898, "rewards/accuracies": 0.59375, "rewards/chosen": -0.021374255418777466, "rewards/margins": 0.006892757024616003, "rewards/rejected": -0.028267016634345055, "step": 250 }, { "epoch": 0.06804501439413765, "grad_norm": 3.321073055267334, "learning_rate": 3.3942558746736293e-06, "logits/chosen": 1.3270446062088013, "logits/rejected": 1.5052978992462158, "logps/chosen": -402.4129638671875, "logps/rejected": -366.2283630371094, "loss": 0.6878, "rewards/accuracies": 0.6875, "rewards/chosen": -0.028442109003663063, "rewards/margins": 0.010941008105874062, "rewards/rejected": -0.039383117109537125, "step": 260 }, { "epoch": 0.07066213033237373, "grad_norm": 2.7087340354919434, "learning_rate": 3.524804177545692e-06, "logits/chosen": 1.3542237281799316, "logits/rejected": 1.5894794464111328, "logps/chosen": -386.6691589355469, "logps/rejected": -324.80499267578125, "loss": 0.6861, "rewards/accuracies": 0.625, "rewards/chosen": -0.035862237215042114, "rewards/margins": 0.014425190165638924, "rewards/rejected": -0.05028742551803589, "step": 270 }, { "epoch": 0.07327924627060979, "grad_norm": 3.6236772537231445, "learning_rate": 3.6553524804177547e-06, "logits/chosen": 1.2839621305465698, "logits/rejected": 1.604859709739685, "logps/chosen": -376.8522033691406, "logps/rejected": -334.2494812011719, "loss": 0.687, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.046802129596471786, "rewards/margins": 0.012729302048683167, "rewards/rejected": -0.05953143909573555, "step": 280 }, { "epoch": 0.07589636220884585, "grad_norm": 4.159788131713867, "learning_rate": 3.7859007832898174e-06, "logits/chosen": 1.5032262802124023, "logits/rejected": 1.5165659189224243, "logps/chosen": -416.37554931640625, "logps/rejected": -369.2384338378906, "loss": 0.6851, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.05019887164235115, "rewards/margins": 0.017000939697027206, "rewards/rejected": -0.06719981133937836, "step": 290 }, { "epoch": 0.07851347814708191, "grad_norm": 3.8966574668884277, "learning_rate": 3.9164490861618806e-06, "logits/chosen": 1.5554125308990479, "logits/rejected": 1.7502315044403076, "logps/chosen": -363.2331237792969, "logps/rejected": -331.22332763671875, "loss": 0.6861, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.056589674204587936, "rewards/margins": 0.014802152290940285, "rewards/rejected": -0.07139183580875397, "step": 300 }, { "epoch": 0.07851347814708191, "eval_logits/chosen": 1.225297212600708, "eval_logits/rejected": 1.4230777025222778, "eval_logps/chosen": -394.262451171875, "eval_logps/rejected": -352.1134338378906, "eval_loss": 0.6859813332557678, "eval_rewards/accuracies": 0.5989999771118164, "eval_rewards/chosen": -0.05897095054388046, "eval_rewards/margins": 0.015322154387831688, "eval_rewards/rejected": -0.0742930993437767, "eval_runtime": 233.2642, "eval_samples_per_second": 8.574, "eval_steps_per_second": 1.072, "step": 300 }, { "epoch": 0.08113059408531798, "grad_norm": 4.554100513458252, "learning_rate": 4.046997389033943e-06, "logits/chosen": 1.3358051776885986, "logits/rejected": 1.3814319372177124, "logps/chosen": -428.4581604003906, "logps/rejected": -350.5341491699219, "loss": 0.6805, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.053123779594898224, "rewards/margins": 0.026760786771774292, "rewards/rejected": -0.07988456636667252, "step": 310 }, { "epoch": 0.08374771002355404, "grad_norm": 3.4826040267944336, "learning_rate": 4.177545691906005e-06, "logits/chosen": 1.4008252620697021, "logits/rejected": 1.653390884399414, "logps/chosen": -378.8886413574219, "logps/rejected": -348.64361572265625, "loss": 0.685, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.06168674677610397, "rewards/margins": 0.017807736992836, "rewards/rejected": -0.07949449121952057, "step": 320 }, { "epoch": 0.08636482596179011, "grad_norm": 3.816162586212158, "learning_rate": 4.308093994778068e-06, "logits/chosen": 1.3938804864883423, "logits/rejected": 1.5117136240005493, "logps/chosen": -373.7430114746094, "logps/rejected": -343.16607666015625, "loss": 0.6838, "rewards/accuracies": 0.65625, "rewards/chosen": -0.07209007441997528, "rewards/margins": 0.020073365420103073, "rewards/rejected": -0.09216342866420746, "step": 330 }, { "epoch": 0.08898194190002617, "grad_norm": 4.358788013458252, "learning_rate": 4.4386422976501306e-06, "logits/chosen": 1.266187071800232, "logits/rejected": 1.4517450332641602, "logps/chosen": -425.5559997558594, "logps/rejected": -387.93243408203125, "loss": 0.6809, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.06175302714109421, "rewards/margins": 0.02637804113328457, "rewards/rejected": -0.08813107013702393, "step": 340 }, { "epoch": 0.09159905783826224, "grad_norm": 3.675837755203247, "learning_rate": 4.569190600522193e-06, "logits/chosen": 1.0587990283966064, "logits/rejected": 1.394778847694397, "logps/chosen": -435.980224609375, "logps/rejected": -399.3922424316406, "loss": 0.6819, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.07773401588201523, "rewards/margins": 0.02536655031144619, "rewards/rejected": -0.10310056060552597, "step": 350 }, { "epoch": 0.0942161737764983, "grad_norm": 2.5249226093292236, "learning_rate": 4.699738903394257e-06, "logits/chosen": 1.3578028678894043, "logits/rejected": 1.5769567489624023, "logps/chosen": -368.1285705566406, "logps/rejected": -327.1725769042969, "loss": 0.6816, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.1062256470322609, "rewards/margins": 0.025300273671746254, "rewards/rejected": -0.131525918841362, "step": 360 }, { "epoch": 0.09683328971473436, "grad_norm": 4.508261680603027, "learning_rate": 4.8302872062663196e-06, "logits/chosen": 1.4787975549697876, "logits/rejected": 1.530562400817871, "logps/chosen": -423.5401306152344, "logps/rejected": -350.29522705078125, "loss": 0.6712, "rewards/accuracies": 0.65625, "rewards/chosen": -0.11056496202945709, "rewards/margins": 0.047700513154268265, "rewards/rejected": -0.15826547145843506, "step": 370 }, { "epoch": 0.09945040565297043, "grad_norm": 4.113176345825195, "learning_rate": 4.9608355091383814e-06, "logits/chosen": 1.3677705526351929, "logits/rejected": 1.590041995048523, "logps/chosen": -425.14569091796875, "logps/rejected": -369.8869934082031, "loss": 0.6761, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.12270758301019669, "rewards/margins": 0.03834443539381027, "rewards/rejected": -0.16105201840400696, "step": 380 }, { "epoch": 0.1020675215912065, "grad_norm": 4.8123579025268555, "learning_rate": 4.9999488562447675e-06, "logits/chosen": 1.3154010772705078, "logits/rejected": 1.4253056049346924, "logps/chosen": -410.3934631347656, "logps/rejected": -374.3392639160156, "loss": 0.6646, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.11665485054254532, "rewards/margins": 0.062035609036684036, "rewards/rejected": -0.17869044840335846, "step": 390 }, { "epoch": 0.10468463752944256, "grad_norm": 4.263484001159668, "learning_rate": 4.999698361256577e-06, "logits/chosen": 1.333717703819275, "logits/rejected": 1.4957849979400635, "logps/chosen": -405.444580078125, "logps/rejected": -337.0886535644531, "loss": 0.6757, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.1346513330936432, "rewards/margins": 0.039198193699121475, "rewards/rejected": -0.17384955286979675, "step": 400 }, { "epoch": 0.10468463752944256, "eval_logits/chosen": 1.2136365175247192, "eval_logits/rejected": 1.3996493816375732, "eval_logps/chosen": -403.8213195800781, "eval_logps/rejected": -363.8988037109375, "eval_loss": 0.6773815751075745, "eval_rewards/accuracies": 0.6025000214576721, "eval_rewards/chosen": -0.15455959737300873, "eval_rewards/margins": 0.03758702799677849, "eval_rewards/rejected": -0.19214662909507751, "eval_runtime": 232.5337, "eval_samples_per_second": 8.601, "eval_steps_per_second": 1.075, "step": 400 }, { "epoch": 0.10730175346767862, "grad_norm": 4.715285301208496, "learning_rate": 4.999239142174581e-06, "logits/chosen": 1.370822548866272, "logits/rejected": 1.4222373962402344, "logps/chosen": -386.025146484375, "logps/rejected": -370.2654724121094, "loss": 0.6825, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.15298400819301605, "rewards/margins": 0.026256907731294632, "rewards/rejected": -0.17924091219902039, "step": 410 }, { "epoch": 0.10991886940591468, "grad_norm": 4.306619167327881, "learning_rate": 4.99857123734344e-06, "logits/chosen": 1.382204294204712, "logits/rejected": 1.4024231433868408, "logps/chosen": -378.82562255859375, "logps/rejected": -337.8069152832031, "loss": 0.6765, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.15577563643455505, "rewards/margins": 0.039799682796001434, "rewards/rejected": -0.1955752968788147, "step": 420 }, { "epoch": 0.11253598534415074, "grad_norm": 4.360618591308594, "learning_rate": 4.997694702533016e-06, "logits/chosen": 1.2584686279296875, "logits/rejected": 1.6165319681167603, "logps/chosen": -416.8006286621094, "logps/rejected": -379.16168212890625, "loss": 0.6744, "rewards/accuracies": 0.59375, "rewards/chosen": -0.1570083498954773, "rewards/margins": 0.043786775320768356, "rewards/rejected": -0.20079509913921356, "step": 430 }, { "epoch": 0.11515310128238682, "grad_norm": 3.9530985355377197, "learning_rate": 4.996609610933713e-06, "logits/chosen": 1.2687292098999023, "logits/rejected": 1.2826169729232788, "logps/chosen": -423.39794921875, "logps/rejected": -383.66290283203125, "loss": 0.6748, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.14532431960105896, "rewards/margins": 0.042948439717292786, "rewards/rejected": -0.18827277421951294, "step": 440 }, { "epoch": 0.11777021722062288, "grad_norm": 4.231596946716309, "learning_rate": 4.995316053150366e-06, "logits/chosen": 1.1246349811553955, "logits/rejected": 1.26097571849823, "logps/chosen": -403.4371032714844, "logps/rejected": -370.98431396484375, "loss": 0.6652, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.12421522289514542, "rewards/margins": 0.0634991005063057, "rewards/rejected": -0.18771430850028992, "step": 450 }, { "epoch": 0.12038733315885894, "grad_norm": 4.21218729019165, "learning_rate": 4.9938141371946815e-06, "logits/chosen": 1.165198564529419, "logits/rejected": 1.3747196197509766, "logps/chosen": -396.4094543457031, "logps/rejected": -366.3015441894531, "loss": 0.6628, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.1397528052330017, "rewards/margins": 0.06794790923595428, "rewards/rejected": -0.2077006995677948, "step": 460 }, { "epoch": 0.123004449097095, "grad_norm": 5.95582389831543, "learning_rate": 4.992103988476206e-06, "logits/chosen": 1.2146246433258057, "logits/rejected": 1.2889845371246338, "logps/chosen": -386.59368896484375, "logps/rejected": -354.281005859375, "loss": 0.6667, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.13470637798309326, "rewards/margins": 0.06365668773651123, "rewards/rejected": -0.1983630657196045, "step": 470 }, { "epoch": 0.12562156503533106, "grad_norm": 4.180170059204102, "learning_rate": 4.990185749791866e-06, "logits/chosen": 1.0521671772003174, "logits/rejected": 1.2878140211105347, "logps/chosen": -396.9014587402344, "logps/rejected": -366.1920166015625, "loss": 0.6647, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.1293117105960846, "rewards/margins": 0.06927163153886795, "rewards/rejected": -0.19858333468437195, "step": 480 }, { "epoch": 0.12823868097356714, "grad_norm": 4.745175361633301, "learning_rate": 4.9880595813140395e-06, "logits/chosen": 1.0240387916564941, "logits/rejected": 1.2297166585922241, "logps/chosen": -430.8408203125, "logps/rejected": -386.9019470214844, "loss": 0.6617, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.1291750818490982, "rewards/margins": 0.07389305531978607, "rewards/rejected": -0.20306813716888428, "step": 490 }, { "epoch": 0.13085579691180318, "grad_norm": 4.876718521118164, "learning_rate": 4.985725660577184e-06, "logits/chosen": 0.956185519695282, "logits/rejected": 1.1582107543945312, "logps/chosen": -418.2518615722656, "logps/rejected": -358.01739501953125, "loss": 0.6581, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.15920230746269226, "rewards/margins": 0.08411301672458649, "rewards/rejected": -0.24331530928611755, "step": 500 }, { "epoch": 0.13085579691180318, "eval_logits/chosen": 0.9564015865325928, "eval_logits/rejected": 1.1466065645217896, "eval_logps/chosen": -404.720947265625, "eval_logps/rejected": -367.4447326660156, "eval_loss": 0.6681177020072937, "eval_rewards/accuracies": 0.6240000128746033, "eval_rewards/chosen": -0.16355587542057037, "eval_rewards/margins": 0.06404965370893478, "eval_rewards/rejected": -0.22760552167892456, "eval_runtime": 232.074, "eval_samples_per_second": 8.618, "eval_steps_per_second": 1.077, "step": 500 }, { "epoch": 0.13347291285003926, "grad_norm": 5.33396053314209, "learning_rate": 4.983184182463009e-06, "logits/chosen": 1.1448547840118408, "logits/rejected": 1.1940263509750366, "logps/chosen": -420.2618713378906, "logps/rejected": -372.01531982421875, "loss": 0.6574, "rewards/accuracies": 0.6875, "rewards/chosen": -0.1364787369966507, "rewards/margins": 0.08503785729408264, "rewards/rejected": -0.22151657938957214, "step": 510 }, { "epoch": 0.1360900287882753, "grad_norm": 5.8554887771606445, "learning_rate": 4.980435359184203e-06, "logits/chosen": 1.2189310789108276, "logits/rejected": 1.1558836698532104, "logps/chosen": -412.2118225097656, "logps/rejected": -386.9334411621094, "loss": 0.6653, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.18587224185466766, "rewards/margins": 0.07138343900442123, "rewards/rejected": -0.2572557032108307, "step": 520 }, { "epoch": 0.13870714472651138, "grad_norm": 5.761895656585693, "learning_rate": 4.9774794202667236e-06, "logits/chosen": 1.0874278545379639, "logits/rejected": 1.3288378715515137, "logps/chosen": -404.8669738769531, "logps/rejected": -405.3680419921875, "loss": 0.6552, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.18935520946979523, "rewards/margins": 0.0935024693608284, "rewards/rejected": -0.28285765647888184, "step": 530 }, { "epoch": 0.14132426066474746, "grad_norm": 5.3668413162231445, "learning_rate": 4.974316612530615e-06, "logits/chosen": 1.3489412069320679, "logits/rejected": 1.4930012226104736, "logps/chosen": -424.3651428222656, "logps/rejected": -362.0076599121094, "loss": 0.6354, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.20577266812324524, "rewards/margins": 0.13636724650859833, "rewards/rejected": -0.3421398997306824, "step": 540 }, { "epoch": 0.1439413766029835, "grad_norm": 5.151000022888184, "learning_rate": 4.970947200069416e-06, "logits/chosen": 1.1908769607543945, "logits/rejected": 1.2403991222381592, "logps/chosen": -418.28729248046875, "logps/rejected": -380.58282470703125, "loss": 0.6608, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.16425392031669617, "rewards/margins": 0.0870148092508316, "rewards/rejected": -0.2512687146663666, "step": 550 }, { "epoch": 0.14655849254121958, "grad_norm": 5.040604591369629, "learning_rate": 4.967371464228096e-06, "logits/chosen": 1.0223934650421143, "logits/rejected": 1.145374059677124, "logps/chosen": -404.68829345703125, "logps/rejected": -392.9267578125, "loss": 0.6599, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.19183708727359772, "rewards/margins": 0.08984865993261337, "rewards/rejected": -0.2816857099533081, "step": 560 }, { "epoch": 0.14917560847945563, "grad_norm": 6.167598724365234, "learning_rate": 4.963589703579569e-06, "logits/chosen": 1.0712225437164307, "logits/rejected": 1.3134175539016724, "logps/chosen": -472.494384765625, "logps/rejected": -424.765380859375, "loss": 0.6634, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.29787617921829224, "rewards/margins": 0.08284131437540054, "rewards/rejected": -0.3807174861431122, "step": 570 }, { "epoch": 0.1517927244176917, "grad_norm": 5.881194114685059, "learning_rate": 4.9596022338997615e-06, "logits/chosen": 0.8614290356636047, "logits/rejected": 1.003142237663269, "logps/chosen": -461.50421142578125, "logps/rejected": -403.1651916503906, "loss": 0.6505, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.2679108679294586, "rewards/margins": 0.11834441125392914, "rewards/rejected": -0.38625526428222656, "step": 580 }, { "epoch": 0.15440984035592778, "grad_norm": 5.650794506072998, "learning_rate": 4.955409388141243e-06, "logits/chosen": 0.8646121025085449, "logits/rejected": 1.1550391912460327, "logps/chosen": -393.10260009765625, "logps/rejected": -357.4300231933594, "loss": 0.6638, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.21748176217079163, "rewards/margins": 0.07719887048006058, "rewards/rejected": -0.2946805953979492, "step": 590 }, { "epoch": 0.15702695629416383, "grad_norm": 5.714654922485352, "learning_rate": 4.951011516405429e-06, "logits/chosen": 0.9980852007865906, "logits/rejected": 1.1865313053131104, "logps/chosen": -385.1533508300781, "logps/rejected": -367.5033264160156, "loss": 0.658, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.20146425068378448, "rewards/margins": 0.09325651824474335, "rewards/rejected": -0.29472076892852783, "step": 600 }, { "epoch": 0.15702695629416383, "eval_logits/chosen": 0.9498924016952515, "eval_logits/rejected": 1.1416826248168945, "eval_logps/chosen": -415.15386962890625, "eval_logps/rejected": -380.47955322265625, "eval_loss": 0.6596394181251526, "eval_rewards/accuracies": 0.6234999895095825, "eval_rewards/chosen": -0.2678852677345276, "eval_rewards/margins": 0.09006918221712112, "eval_rewards/rejected": -0.3579544723033905, "eval_runtime": 231.9217, "eval_samples_per_second": 8.624, "eval_steps_per_second": 1.078, "step": 600 }, { "epoch": 0.1596440722323999, "grad_norm": 5.238458156585693, "learning_rate": 4.946408985913344e-06, "logits/chosen": 1.1951860189437866, "logits/rejected": 1.289475679397583, "logps/chosen": -383.46795654296875, "logps/rejected": -362.46417236328125, "loss": 0.6652, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.29597288370132446, "rewards/margins": 0.07488597929477692, "rewards/rejected": -0.3708588182926178, "step": 610 }, { "epoch": 0.16226118817063595, "grad_norm": 6.363269805908203, "learning_rate": 4.941602180974958e-06, "logits/chosen": 1.051048994064331, "logits/rejected": 1.357006311416626, "logps/chosen": -452.8377380371094, "logps/rejected": -374.8196716308594, "loss": 0.6508, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.29147928953170776, "rewards/margins": 0.10656224191188812, "rewards/rejected": -0.3980415463447571, "step": 620 }, { "epoch": 0.16487830410887203, "grad_norm": 6.250793933868408, "learning_rate": 4.936591502957101e-06, "logits/chosen": 0.9079286456108093, "logits/rejected": 1.184887170791626, "logps/chosen": -387.5869445800781, "logps/rejected": -360.51885986328125, "loss": 0.6415, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.17826662957668304, "rewards/margins": 0.12794804573059082, "rewards/rejected": -0.30621469020843506, "step": 630 }, { "epoch": 0.16749542004710807, "grad_norm": 7.378194332122803, "learning_rate": 4.931377370249946e-06, "logits/chosen": 0.7079142332077026, "logits/rejected": 0.9907251596450806, "logps/chosen": -420.49560546875, "logps/rejected": -371.4837951660156, "loss": 0.657, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.23468086123466492, "rewards/margins": 0.09351176023483276, "rewards/rejected": -0.3281926214694977, "step": 640 }, { "epoch": 0.17011253598534415, "grad_norm": 10.734910011291504, "learning_rate": 4.925960218232073e-06, "logits/chosen": 0.8249115943908691, "logits/rejected": 1.072989583015442, "logps/chosen": -398.7706604003906, "logps/rejected": -382.7182922363281, "loss": 0.6394, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.24996908009052277, "rewards/margins": 0.1392596811056137, "rewards/rejected": -0.3892287611961365, "step": 650 }, { "epoch": 0.17272965192358022, "grad_norm": 6.875140190124512, "learning_rate": 4.920340499234116e-06, "logits/chosen": 0.8695880770683289, "logits/rejected": 1.1250814199447632, "logps/chosen": -394.7156982421875, "logps/rejected": -351.32427978515625, "loss": 0.639, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.18945911526679993, "rewards/margins": 0.13935169577598572, "rewards/rejected": -0.32881081104278564, "step": 660 }, { "epoch": 0.17534676786181627, "grad_norm": 7.728440761566162, "learning_rate": 4.914518682500995e-06, "logits/chosen": 0.9572404623031616, "logits/rejected": 1.0353472232818604, "logps/chosen": -424.74468994140625, "logps/rejected": -388.115234375, "loss": 0.6416, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.19141948223114014, "rewards/margins": 0.1302730143070221, "rewards/rejected": -0.32169249653816223, "step": 670 }, { "epoch": 0.17796388380005235, "grad_norm": 8.876616477966309, "learning_rate": 4.9084952541527315e-06, "logits/chosen": 0.7888752222061157, "logits/rejected": 0.9395732879638672, "logps/chosen": -428.37847900390625, "logps/rejected": -380.4373779296875, "loss": 0.6262, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.34512442350387573, "rewards/margins": 0.17706379294395447, "rewards/rejected": -0.5221882462501526, "step": 680 }, { "epoch": 0.1805809997382884, "grad_norm": 7.472078800201416, "learning_rate": 4.902270717143858e-06, "logits/chosen": 1.0732382535934448, "logits/rejected": 1.275301218032837, "logps/chosen": -385.16851806640625, "logps/rejected": -399.53167724609375, "loss": 0.6284, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.3828611969947815, "rewards/margins": 0.16961130499839783, "rewards/rejected": -0.5524724721908569, "step": 690 }, { "epoch": 0.18319811567652447, "grad_norm": 6.828216552734375, "learning_rate": 4.895845591221427e-06, "logits/chosen": 0.8568657636642456, "logits/rejected": 0.9334108233451843, "logps/chosen": -401.2918395996094, "logps/rejected": -391.73291015625, "loss": 0.6399, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.2788159251213074, "rewards/margins": 0.14835360646247864, "rewards/rejected": -0.4271695017814636, "step": 700 }, { "epoch": 0.18319811567652447, "eval_logits/chosen": 0.7495799660682678, "eval_logits/rejected": 0.9458017349243164, "eval_logps/chosen": -412.70025634765625, "eval_logps/rejected": -382.4018859863281, "eval_loss": 0.6480182409286499, "eval_rewards/accuracies": 0.640999972820282, "eval_rewards/chosen": -0.24334919452667236, "eval_rewards/margins": 0.13382813334465027, "eval_rewards/rejected": -0.37717729806900024, "eval_runtime": 232.1999, "eval_samples_per_second": 8.613, "eval_steps_per_second": 1.077, "step": 700 }, { "epoch": 0.18581523161476055, "grad_norm": 7.924125671386719, "learning_rate": 4.8892204128816e-06, "logits/chosen": 0.7438673377037048, "logits/rejected": 1.0076682567596436, "logps/chosen": -439.7687072753906, "logps/rejected": -405.40509033203125, "loss": 0.6513, "rewards/accuracies": 0.625, "rewards/chosen": -0.19243761897087097, "rewards/margins": 0.12698553502559662, "rewards/rejected": -0.3194231390953064, "step": 710 }, { "epoch": 0.1884323475529966, "grad_norm": 9.067682266235352, "learning_rate": 4.882395735324864e-06, "logits/chosen": 0.6921774744987488, "logits/rejected": 1.028187870979309, "logps/chosen": -423.8224182128906, "logps/rejected": -385.47412109375, "loss": 0.66, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.3247512876987457, "rewards/margins": 0.11575271934270859, "rewards/rejected": -0.4405040144920349, "step": 720 }, { "epoch": 0.19104946349123267, "grad_norm": 7.999166965484619, "learning_rate": 4.87537212840983e-06, "logits/chosen": 0.8577788472175598, "logits/rejected": 1.1802966594696045, "logps/chosen": -425.0936584472656, "logps/rejected": -394.3600769042969, "loss": 0.6348, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.457479327917099, "rewards/margins": 0.16357269883155823, "rewards/rejected": -0.6210519671440125, "step": 730 }, { "epoch": 0.19366657942946872, "grad_norm": 8.635887145996094, "learning_rate": 4.8681501786056545e-06, "logits/chosen": 1.0259983539581299, "logits/rejected": 1.2564369440078735, "logps/chosen": -366.40948486328125, "logps/rejected": -322.6649475097656, "loss": 0.6061, "rewards/accuracies": 0.6875, "rewards/chosen": -0.2946811616420746, "rewards/margins": 0.22826531529426575, "rewards/rejected": -0.5229464769363403, "step": 740 }, { "epoch": 0.1962836953677048, "grad_norm": 7.924373626708984, "learning_rate": 4.860730488943068e-06, "logits/chosen": 0.9873319864273071, "logits/rejected": 1.1646772623062134, "logps/chosen": -393.21209716796875, "logps/rejected": -388.07452392578125, "loss": 0.6309, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.27078738808631897, "rewards/margins": 0.1725112944841385, "rewards/rejected": -0.44329872727394104, "step": 750 }, { "epoch": 0.19890081130594087, "grad_norm": 8.054422378540039, "learning_rate": 4.853113678964022e-06, "logits/chosen": 0.6722021102905273, "logits/rejected": 0.8681543469429016, "logps/chosen": -433.37750244140625, "logps/rejected": -429.5255432128906, "loss": 0.6333, "rewards/accuracies": 0.65625, "rewards/chosen": -0.43984508514404297, "rewards/margins": 0.16787569224834442, "rewards/rejected": -0.6077207326889038, "step": 760 }, { "epoch": 0.20151792724417691, "grad_norm": 7.1459150314331055, "learning_rate": 4.845300384669958e-06, "logits/chosen": 0.7385894060134888, "logits/rejected": 0.9126585721969604, "logps/chosen": -414.93292236328125, "logps/rejected": -372.8153381347656, "loss": 0.6588, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.4202548861503601, "rewards/margins": 0.1190139502286911, "rewards/rejected": -0.5392688512802124, "step": 770 }, { "epoch": 0.204135043182413, "grad_norm": 9.462651252746582, "learning_rate": 4.837291258468701e-06, "logits/chosen": 0.6594001650810242, "logits/rejected": 0.8065937161445618, "logps/chosen": -452.4710388183594, "logps/rejected": -414.4306640625, "loss": 0.6375, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.2922487258911133, "rewards/margins": 0.16755884885787964, "rewards/rejected": -0.4598075747489929, "step": 780 }, { "epoch": 0.20675215912064904, "grad_norm": 9.359882354736328, "learning_rate": 4.829086969119984e-06, "logits/chosen": 0.8586047887802124, "logits/rejected": 1.0241984128952026, "logps/chosen": -411.26116943359375, "logps/rejected": -411.509033203125, "loss": 0.6571, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.47147512435913086, "rewards/margins": 0.1179923564195633, "rewards/rejected": -0.589467465877533, "step": 790 }, { "epoch": 0.2093692750588851, "grad_norm": 10.965747833251953, "learning_rate": 4.820688201679605e-06, "logits/chosen": 0.6815871000289917, "logits/rejected": 0.9268299341201782, "logps/chosen": -432.5615234375, "logps/rejected": -362.436767578125, "loss": 0.624, "rewards/accuracies": 0.6875, "rewards/chosen": -0.5902279019355774, "rewards/margins": 0.18446585536003113, "rewards/rejected": -0.7746937870979309, "step": 800 }, { "epoch": 0.2093692750588851, "eval_logits/chosen": 0.6242519021034241, "eval_logits/rejected": 0.8198153972625732, "eval_logps/chosen": -442.35064697265625, "eval_logps/rejected": -415.1210632324219, "eval_loss": 0.6389787793159485, "eval_rewards/accuracies": 0.6514999866485596, "eval_rewards/chosen": -0.5398533940315247, "eval_rewards/margins": 0.16451531648635864, "eval_rewards/rejected": -0.7043687105178833, "eval_runtime": 232.5028, "eval_samples_per_second": 8.602, "eval_steps_per_second": 1.075, "step": 800 }, { "epoch": 0.21198639099712116, "grad_norm": 9.325600624084473, "learning_rate": 4.8120956574422315e-06, "logits/chosen": 0.5215608477592468, "logits/rejected": 0.7568296194076538, "logps/chosen": -452.6517639160156, "logps/rejected": -429.9583435058594, "loss": 0.6763, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.4800487160682678, "rewards/margins": 0.09066037833690643, "rewards/rejected": -0.5707091093063354, "step": 810 }, { "epoch": 0.21460350693535724, "grad_norm": 9.269844055175781, "learning_rate": 4.803310053882831e-06, "logits/chosen": 1.0071885585784912, "logits/rejected": 1.0069457292556763, "logps/chosen": -374.1867370605469, "logps/rejected": -393.0777893066406, "loss": 0.6512, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.38367071747779846, "rewards/margins": 0.1374969780445099, "rewards/rejected": -0.5211676955223083, "step": 820 }, { "epoch": 0.2172206228735933, "grad_norm": 9.348451614379883, "learning_rate": 4.794332124596775e-06, "logits/chosen": 0.6886093616485596, "logits/rejected": 0.855501651763916, "logps/chosen": -426.2472229003906, "logps/rejected": -433.111083984375, "loss": 0.6489, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.3134116232395172, "rewards/margins": 0.13843798637390137, "rewards/rejected": -0.4518495500087738, "step": 830 }, { "epoch": 0.21983773881182936, "grad_norm": 9.539639472961426, "learning_rate": 4.785162619238575e-06, "logits/chosen": 0.7601666450500488, "logits/rejected": 0.965703010559082, "logps/chosen": -439.4012756347656, "logps/rejected": -400.94439697265625, "loss": 0.632, "rewards/accuracies": 0.625, "rewards/chosen": -0.5403390526771545, "rewards/margins": 0.17277175188064575, "rewards/rejected": -0.7131107449531555, "step": 840 }, { "epoch": 0.22245485475006543, "grad_norm": 8.668850898742676, "learning_rate": 4.775802303459288e-06, "logits/chosen": 0.8507216572761536, "logits/rejected": 0.9332345724105835, "logps/chosen": -440.80999755859375, "logps/rejected": -420.0621643066406, "loss": 0.6386, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.583116352558136, "rewards/margins": 0.16593685746192932, "rewards/rejected": -0.7490531206130981, "step": 850 }, { "epoch": 0.22507197068830148, "grad_norm": 12.724005699157715, "learning_rate": 4.766251958842589e-06, "logits/chosen": 0.8601281046867371, "logits/rejected": 0.9346219897270203, "logps/chosen": -455.83502197265625, "logps/rejected": -436.79815673828125, "loss": 0.6329, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.6050734519958496, "rewards/margins": 0.1777031421661377, "rewards/rejected": -0.7827765941619873, "step": 860 }, { "epoch": 0.22768908662653756, "grad_norm": 11.345113754272461, "learning_rate": 4.7565123828395066e-06, "logits/chosen": 0.7511667013168335, "logits/rejected": 0.9087456464767456, "logps/chosen": -427.225830078125, "logps/rejected": -417.6124572753906, "loss": 0.6469, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.47927623987197876, "rewards/margins": 0.181664377450943, "rewards/rejected": -0.6609406471252441, "step": 870 }, { "epoch": 0.23030620256477363, "grad_norm": 11.099597930908203, "learning_rate": 4.746584388701831e-06, "logits/chosen": 0.8890976905822754, "logits/rejected": 0.843266487121582, "logps/chosen": -428.44091796875, "logps/rejected": -412.8895568847656, "loss": 0.6171, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.4830327033996582, "rewards/margins": 0.21945062279701233, "rewards/rejected": -0.7024833559989929, "step": 880 }, { "epoch": 0.23292331850300968, "grad_norm": 11.448017120361328, "learning_rate": 4.736468805414218e-06, "logits/chosen": 0.9304903745651245, "logits/rejected": 1.2368038892745972, "logps/chosen": -408.22283935546875, "logps/rejected": -436.41815185546875, "loss": 0.6201, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.45535406470298767, "rewards/margins": 0.22440317273139954, "rewards/rejected": -0.6797571778297424, "step": 890 }, { "epoch": 0.23554043444124576, "grad_norm": 14.195487976074219, "learning_rate": 4.7261664776249595e-06, "logits/chosen": 0.8509295582771301, "logits/rejected": 1.094995141029358, "logps/chosen": -421.479248046875, "logps/rejected": -408.42083740234375, "loss": 0.62, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.5554867386817932, "rewards/margins": 0.23488807678222656, "rewards/rejected": -0.7903748750686646, "step": 900 }, { "epoch": 0.23554043444124576, "eval_logits/chosen": 0.7130433917045593, "eval_logits/rejected": 0.9079583883285522, "eval_logps/chosen": -454.4474792480469, "eval_logps/rejected": -431.30230712890625, "eval_loss": 0.6320670247077942, "eval_rewards/accuracies": 0.6485000252723694, "eval_rewards/chosen": -0.6608208417892456, "eval_rewards/margins": 0.2053609937429428, "eval_rewards/rejected": -0.8661818504333496, "eval_runtime": 232.3653, "eval_samples_per_second": 8.607, "eval_steps_per_second": 1.076, "step": 900 }, { "epoch": 0.2381575503794818, "grad_norm": 9.822765350341797, "learning_rate": 4.715678265575463e-06, "logits/chosen": 0.9885305166244507, "logits/rejected": 0.9318181872367859, "logps/chosen": -475.46728515625, "logps/rejected": -405.56658935546875, "loss": 0.6189, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.5485895872116089, "rewards/margins": 0.2235954999923706, "rewards/rejected": -0.7721850872039795, "step": 910 }, { "epoch": 0.24077466631771788, "grad_norm": 8.151389122009277, "learning_rate": 4.705005045028415e-06, "logits/chosen": 0.8163010478019714, "logits/rejected": 0.7590088844299316, "logps/chosen": -439.08563232421875, "logps/rejected": -413.9178771972656, "loss": 0.6244, "rewards/accuracies": 0.6875, "rewards/chosen": -0.5029494166374207, "rewards/margins": 0.2026442587375641, "rewards/rejected": -0.7055937051773071, "step": 920 }, { "epoch": 0.24339178225595393, "grad_norm": 12.819987297058105, "learning_rate": 4.694147707194659e-06, "logits/chosen": 0.6603835225105286, "logits/rejected": 0.8101722598075867, "logps/chosen": -468.04083251953125, "logps/rejected": -449.27423095703125, "loss": 0.595, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.5709540247917175, "rewards/margins": 0.2979514002799988, "rewards/rejected": -0.8689054250717163, "step": 930 }, { "epoch": 0.24600889819419, "grad_norm": 11.70290470123291, "learning_rate": 4.683107158658782e-06, "logits/chosen": 0.6703850030899048, "logits/rejected": 1.1179869174957275, "logps/chosen": -492.76904296875, "logps/rejected": -475.07244873046875, "loss": 0.6061, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.8049885630607605, "rewards/margins": 0.24929532408714294, "rewards/rejected": -1.054283857345581, "step": 940 }, { "epoch": 0.24862601413242608, "grad_norm": 9.882152557373047, "learning_rate": 4.671884321303407e-06, "logits/chosen": 0.8308131098747253, "logits/rejected": 0.9495989084243774, "logps/chosen": -425.8990173339844, "logps/rejected": -416.7378845214844, "loss": 0.6092, "rewards/accuracies": 0.6875, "rewards/chosen": -0.6653159856796265, "rewards/margins": 0.2662131190299988, "rewards/rejected": -0.9315292239189148, "step": 950 }, { "epoch": 0.2512431300706621, "grad_norm": 10.33724308013916, "learning_rate": 4.660480132232224e-06, "logits/chosen": 0.7533870935440063, "logits/rejected": 0.8427373766899109, "logps/chosen": -445.78387451171875, "logps/rejected": -416.93218994140625, "loss": 0.6309, "rewards/accuracies": 0.65625, "rewards/chosen": -0.417325496673584, "rewards/margins": 0.21080616116523743, "rewards/rejected": -0.6281316876411438, "step": 960 }, { "epoch": 0.25386024600889817, "grad_norm": 12.866530418395996, "learning_rate": 4.6488955436917414e-06, "logits/chosen": 0.6672025322914124, "logits/rejected": 0.8596879243850708, "logps/chosen": -435.473388671875, "logps/rejected": -387.2782287597656, "loss": 0.6097, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.4055888056755066, "rewards/margins": 0.25641921162605286, "rewards/rejected": -0.6620079278945923, "step": 970 }, { "epoch": 0.2564773619471343, "grad_norm": 12.976202964782715, "learning_rate": 4.6371315229917644e-06, "logits/chosen": 0.6292930841445923, "logits/rejected": 0.7828453183174133, "logps/chosen": -468.067138671875, "logps/rejected": -448.0206604003906, "loss": 0.5999, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.5305265784263611, "rewards/margins": 0.2725897431373596, "rewards/rejected": -0.8031163215637207, "step": 980 }, { "epoch": 0.2590944778853703, "grad_norm": 11.27432632446289, "learning_rate": 4.625189052424638e-06, "logits/chosen": 0.7714609503746033, "logits/rejected": 1.057544469833374, "logps/chosen": -416.63323974609375, "logps/rejected": -403.56793212890625, "loss": 0.5821, "rewards/accuracies": 0.71875, "rewards/chosen": -0.6991704702377319, "rewards/margins": 0.3505772650241852, "rewards/rejected": -1.0497477054595947, "step": 990 }, { "epoch": 0.26171159382360637, "grad_norm": 11.419754981994629, "learning_rate": 4.613069129183218e-06, "logits/chosen": 0.7187921404838562, "logits/rejected": 1.0341460704803467, "logps/chosen": -520.6634521484375, "logps/rejected": -485.7464904785156, "loss": 0.6255, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.8484965562820435, "rewards/margins": 0.24263262748718262, "rewards/rejected": -1.091129183769226, "step": 1000 }, { "epoch": 0.26171159382360637, "eval_logits/chosen": 0.5235105156898499, "eval_logits/rejected": 0.711112916469574, "eval_logps/chosen": -473.25457763671875, "eval_logps/rejected": -453.2762756347656, "eval_loss": 0.6269846558570862, "eval_rewards/accuracies": 0.6455000042915344, "eval_rewards/chosen": -0.848892092704773, "eval_rewards/margins": 0.2370292991399765, "eval_rewards/rejected": -1.0859214067459106, "eval_runtime": 232.3711, "eval_samples_per_second": 8.607, "eval_steps_per_second": 1.076, "step": 1000 }, { "epoch": 0.2643287097618425, "grad_norm": 21.2219295501709, "learning_rate": 4.600772765277607e-06, "logits/chosen": 0.5445064306259155, "logits/rejected": 0.8773614764213562, "logps/chosen": -415.87518310546875, "logps/rejected": -414.38336181640625, "loss": 0.6134, "rewards/accuracies": 0.65625, "rewards/chosen": -0.7228370904922485, "rewards/margins": 0.2761087417602539, "rewards/rejected": -0.9989458322525024, "step": 1010 }, { "epoch": 0.2669458257000785, "grad_norm": 12.491921424865723, "learning_rate": 4.588300987450652e-06, "logits/chosen": 0.7395948767662048, "logits/rejected": 1.0148208141326904, "logps/chosen": -416.64801025390625, "logps/rejected": -378.2071838378906, "loss": 0.6405, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.36920469999313354, "rewards/margins": 0.19941550493240356, "rewards/rejected": -0.5686202645301819, "step": 1020 }, { "epoch": 0.26956294163831457, "grad_norm": 11.955611228942871, "learning_rate": 4.5756548370922136e-06, "logits/chosen": 0.5796680450439453, "logits/rejected": 0.7352942228317261, "logps/chosen": -382.8646545410156, "logps/rejected": -367.3566589355469, "loss": 0.6535, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.2818133234977722, "rewards/margins": 0.14685805141925812, "rewards/rejected": -0.42867136001586914, "step": 1030 }, { "epoch": 0.2721800575765506, "grad_norm": 12.265511512756348, "learning_rate": 4.562835370152206e-06, "logits/chosen": 0.3429097533226013, "logits/rejected": 0.5369440913200378, "logps/chosen": -484.8700256347656, "logps/rejected": -457.33905029296875, "loss": 0.5784, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.4302898943424225, "rewards/margins": 0.34176695346832275, "rewards/rejected": -0.7720568180084229, "step": 1040 }, { "epoch": 0.2747971735147867, "grad_norm": 13.330986022949219, "learning_rate": 4.54984365705243e-06, "logits/chosen": 0.48474931716918945, "logits/rejected": 0.6024073362350464, "logps/chosen": -467.50018310546875, "logps/rejected": -466.6949768066406, "loss": 0.5833, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.7076524496078491, "rewards/margins": 0.3223820626735687, "rewards/rejected": -1.0300344228744507, "step": 1050 }, { "epoch": 0.27741428945302277, "grad_norm": 18.709505081176758, "learning_rate": 4.536680782597191e-06, "logits/chosen": 0.4200347363948822, "logits/rejected": 0.6729756593704224, "logps/chosen": -439.62579345703125, "logps/rejected": -422.578857421875, "loss": 0.6394, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.8081327676773071, "rewards/margins": 0.21756339073181152, "rewards/rejected": -1.0256961584091187, "step": 1060 }, { "epoch": 0.2800314053912588, "grad_norm": 14.954957962036133, "learning_rate": 4.523347845882718e-06, "logits/chosen": 0.44082099199295044, "logits/rejected": 0.5635146498680115, "logps/chosen": -464.96136474609375, "logps/rejected": -430.59521484375, "loss": 0.555, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.6111140847206116, "rewards/margins": 0.41279348731040955, "rewards/rejected": -1.0239075422286987, "step": 1070 }, { "epoch": 0.2826485213294949, "grad_norm": 14.649702072143555, "learning_rate": 4.50984596020539e-06, "logits/chosen": 0.3772805631160736, "logits/rejected": 0.5361444354057312, "logps/chosen": -446.0318298339844, "logps/rejected": -424.20208740234375, "loss": 0.6135, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.5018765926361084, "rewards/margins": 0.24910588562488556, "rewards/rejected": -0.7509824633598328, "step": 1080 }, { "epoch": 0.28526563726773096, "grad_norm": 12.78677749633789, "learning_rate": 4.4961762529687745e-06, "logits/chosen": 0.40019339323043823, "logits/rejected": 0.5373650789260864, "logps/chosen": -439.0340881347656, "logps/rejected": -412.9305725097656, "loss": 0.6463, "rewards/accuracies": 0.65625, "rewards/chosen": -0.6333376169204712, "rewards/margins": 0.1645718812942505, "rewards/rejected": -0.7979093790054321, "step": 1090 }, { "epoch": 0.287882753205967, "grad_norm": 13.784514427185059, "learning_rate": 4.482339865589492e-06, "logits/chosen": 0.5023793578147888, "logits/rejected": 0.6304869651794434, "logps/chosen": -468.329345703125, "logps/rejected": -410.9677734375, "loss": 0.6257, "rewards/accuracies": 0.625, "rewards/chosen": -0.7563449144363403, "rewards/margins": 0.21642132103443146, "rewards/rejected": -0.9727662205696106, "step": 1100 }, { "epoch": 0.287882753205967, "eval_logits/chosen": 0.2741233706474304, "eval_logits/rejected": 0.45645061135292053, "eval_logps/chosen": -476.776611328125, "eval_logps/rejected": -455.3140563964844, "eval_loss": 0.6249045133590698, "eval_rewards/accuracies": 0.6539999842643738, "eval_rewards/chosen": -0.8841127753257751, "eval_rewards/margins": 0.22218641638755798, "eval_rewards/rejected": -1.1062991619110107, "eval_runtime": 232.1027, "eval_samples_per_second": 8.617, "eval_steps_per_second": 1.077, "step": 1100 }, { "epoch": 0.2904998691442031, "grad_norm": 13.539247512817383, "learning_rate": 4.468337953401909e-06, "logits/chosen": 0.508640468120575, "logits/rejected": 0.6552512049674988, "logps/chosen": -493.59344482421875, "logps/rejected": -498.8395080566406, "loss": 0.6222, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.9703060984611511, "rewards/margins": 0.22064876556396484, "rewards/rejected": -1.1909549236297607, "step": 1110 }, { "epoch": 0.29311698508243916, "grad_norm": 11.673506736755371, "learning_rate": 4.45417168556166e-06, "logits/chosen": 0.32166963815689087, "logits/rejected": 0.6035802960395813, "logps/chosen": -456.86944580078125, "logps/rejected": -448.83294677734375, "loss": 0.6182, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.8095690608024597, "rewards/margins": 0.2476453334093094, "rewards/rejected": -1.0572144985198975, "step": 1120 }, { "epoch": 0.2957341010206752, "grad_norm": 9.967942237854004, "learning_rate": 4.439842244948036e-06, "logits/chosen": 0.2381783425807953, "logits/rejected": 0.4971030354499817, "logps/chosen": -444.052001953125, "logps/rejected": -437.64495849609375, "loss": 0.6464, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.6728368997573853, "rewards/margins": 0.2183237075805664, "rewards/rejected": -0.8911606073379517, "step": 1130 }, { "epoch": 0.29835121695891126, "grad_norm": 12.411190032958984, "learning_rate": 4.425350828065204e-06, "logits/chosen": 0.4094735085964203, "logits/rejected": 0.5186284184455872, "logps/chosen": -472.64093017578125, "logps/rejected": -419.64739990234375, "loss": 0.5997, "rewards/accuracies": 0.71875, "rewards/chosen": -0.5493424534797668, "rewards/margins": 0.3005039095878601, "rewards/rejected": -0.849846363067627, "step": 1140 }, { "epoch": 0.30096833289714736, "grad_norm": 17.98678207397461, "learning_rate": 4.410698644942303e-06, "logits/chosen": 0.19235338270664215, "logits/rejected": 0.4098784029483795, "logps/chosen": -451.133056640625, "logps/rejected": -423.66448974609375, "loss": 0.6083, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.474482923746109, "rewards/margins": 0.28159815073013306, "rewards/rejected": -0.7560810446739197, "step": 1150 }, { "epoch": 0.3035854488353834, "grad_norm": 11.179768562316895, "learning_rate": 4.395886919032406e-06, "logits/chosen": 0.5191640853881836, "logits/rejected": 0.6151641607284546, "logps/chosen": -423.7428283691406, "logps/rejected": -409.55303955078125, "loss": 0.6194, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.469729483127594, "rewards/margins": 0.2616717517375946, "rewards/rejected": -0.7314012050628662, "step": 1160 }, { "epoch": 0.30620256477361946, "grad_norm": 13.860902786254883, "learning_rate": 4.380916887110366e-06, "logits/chosen": 0.3877313733100891, "logits/rejected": 0.25373178720474243, "logps/chosen": -457.0462951660156, "logps/rejected": -414.69024658203125, "loss": 0.631, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.8331391215324402, "rewards/margins": 0.2332063466310501, "rewards/rejected": -1.066345453262329, "step": 1170 }, { "epoch": 0.30881968071185556, "grad_norm": 10.883024215698242, "learning_rate": 4.365789799169539e-06, "logits/chosen": 0.5446051955223083, "logits/rejected": 0.45434585213661194, "logps/chosen": -477.3246154785156, "logps/rejected": -486.14422607421875, "loss": 0.6232, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.1283986568450928, "rewards/margins": 0.22864672541618347, "rewards/rejected": -1.3570451736450195, "step": 1180 }, { "epoch": 0.3114367966500916, "grad_norm": 11.807137489318848, "learning_rate": 4.350506918317416e-06, "logits/chosen": 0.48607057332992554, "logits/rejected": 0.6121966242790222, "logps/chosen": -470.0353088378906, "logps/rejected": -458.76055908203125, "loss": 0.6312, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.05269455909729, "rewards/margins": 0.19717691838741302, "rewards/rejected": -1.2498712539672852, "step": 1190 }, { "epoch": 0.31405391258832765, "grad_norm": 14.276663780212402, "learning_rate": 4.335069520670149e-06, "logits/chosen": 0.42629900574684143, "logits/rejected": 0.4847659170627594, "logps/chosen": -411.36407470703125, "logps/rejected": -420.76239013671875, "loss": 0.6512, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.7268288731575012, "rewards/margins": 0.17092524468898773, "rewards/rejected": -0.8977540731430054, "step": 1200 }, { "epoch": 0.31405391258832765, "eval_logits/chosen": 0.18179067969322205, "eval_logits/rejected": 0.35566091537475586, "eval_logps/chosen": -452.5130920410156, "eval_logps/rejected": -432.3995056152344, "eval_loss": 0.6197048425674438, "eval_rewards/accuracies": 0.6629999876022339, "eval_rewards/chosen": -0.6414775252342224, "eval_rewards/margins": 0.23567558825016022, "eval_rewards/rejected": -0.8771531581878662, "eval_runtime": 232.4956, "eval_samples_per_second": 8.602, "eval_steps_per_second": 1.075, "step": 1200 }, { "epoch": 0.3166710285265637, "grad_norm": 9.877076148986816, "learning_rate": 4.319478895246e-06, "logits/chosen": 0.31032776832580566, "logits/rejected": 0.4457179009914398, "logps/chosen": -426.0264587402344, "logps/rejected": -397.55316162109375, "loss": 0.6026, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.5681090950965881, "rewards/margins": 0.26915818452835083, "rewards/rejected": -0.837267279624939, "step": 1210 }, { "epoch": 0.3192881444647998, "grad_norm": 11.549198150634766, "learning_rate": 4.303736343857704e-06, "logits/chosen": 0.3116544485092163, "logits/rejected": 0.5375791788101196, "logps/chosen": -420.4837951660156, "logps/rejected": -431.21929931640625, "loss": 0.6428, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.3886328339576721, "rewards/margins": 0.19474461674690247, "rewards/rejected": -0.5833774209022522, "step": 1220 }, { "epoch": 0.32190526040303585, "grad_norm": 11.519137382507324, "learning_rate": 4.287843181003772e-06, "logits/chosen": 0.19896575808525085, "logits/rejected": 0.25549182295799255, "logps/chosen": -476.87017822265625, "logps/rejected": -413.09197998046875, "loss": 0.6303, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.39009106159210205, "rewards/margins": 0.20869462192058563, "rewards/rejected": -0.5987856984138489, "step": 1230 }, { "epoch": 0.3245223763412719, "grad_norm": 11.55118465423584, "learning_rate": 4.27180073375873e-06, "logits/chosen": 0.39684659242630005, "logits/rejected": 0.3274468183517456, "logps/chosen": -453.70587158203125, "logps/rejected": -417.8851623535156, "loss": 0.5939, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.4496288299560547, "rewards/margins": 0.3144444525241852, "rewards/rejected": -0.7640732526779175, "step": 1240 }, { "epoch": 0.327139492279508, "grad_norm": 10.041382789611816, "learning_rate": 4.255610341662304e-06, "logits/chosen": 0.13150617480278015, "logits/rejected": 0.4586234986782074, "logps/chosen": -432.53570556640625, "logps/rejected": -412.88726806640625, "loss": 0.6162, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.56169593334198, "rewards/margins": 0.2593781054019928, "rewards/rejected": -0.8210738897323608, "step": 1250 }, { "epoch": 0.32975660821774405, "grad_norm": 15.793495178222656, "learning_rate": 4.2392733566075764e-06, "logits/chosen": 0.2116355448961258, "logits/rejected": 0.34284886717796326, "logps/chosen": -430.3287658691406, "logps/rejected": -423.35980224609375, "loss": 0.6349, "rewards/accuracies": 0.65625, "rewards/chosen": -0.6224905252456665, "rewards/margins": 0.2327694147825241, "rewards/rejected": -0.8552600145339966, "step": 1260 }, { "epoch": 0.3323737241559801, "grad_norm": 11.136332511901855, "learning_rate": 4.2227911427280975e-06, "logits/chosen": 0.19596245884895325, "logits/rejected": 0.3888585865497589, "logps/chosen": -438.890625, "logps/rejected": -403.1433410644531, "loss": 0.6353, "rewards/accuracies": 0.59375, "rewards/chosen": -0.6415246725082397, "rewards/margins": 0.2291472852230072, "rewards/rejected": -0.8706718683242798, "step": 1270 }, { "epoch": 0.33499084009421615, "grad_norm": 20.09882926940918, "learning_rate": 4.206165076283983e-06, "logits/chosen": 0.2686145603656769, "logits/rejected": 0.49140438437461853, "logps/chosen": -447.68115234375, "logps/rejected": -423.4256896972656, "loss": 0.585, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.7579549551010132, "rewards/margins": 0.31806570291519165, "rewards/rejected": -1.07602059841156, "step": 1280 }, { "epoch": 0.33760795603245225, "grad_norm": 15.394529342651367, "learning_rate": 4.189396545546995e-06, "logits/chosen": 0.0032246888149529696, "logits/rejected": 0.3693595230579376, "logps/chosen": -438.0142517089844, "logps/rejected": -418.50775146484375, "loss": 0.6328, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.6667572259902954, "rewards/margins": 0.2417328804731369, "rewards/rejected": -0.9084900617599487, "step": 1290 }, { "epoch": 0.3402250719706883, "grad_norm": 17.138986587524414, "learning_rate": 4.172486950684627e-06, "logits/chosen": 0.1562187373638153, "logits/rejected": 0.4008878171443939, "logps/chosen": -412.3981018066406, "logps/rejected": -425.25299072265625, "loss": 0.5864, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.4970017075538635, "rewards/margins": 0.32930120825767517, "rewards/rejected": -0.8263028860092163, "step": 1300 }, { "epoch": 0.3402250719706883, "eval_logits/chosen": 0.12899738550186157, "eval_logits/rejected": 0.29827845096588135, "eval_logps/chosen": -457.74737548828125, "eval_logps/rejected": -441.21051025390625, "eval_loss": 0.6130329370498657, "eval_rewards/accuracies": 0.6735000014305115, "eval_rewards/chosen": -0.693820059299469, "eval_rewards/margins": 0.27144384384155273, "eval_rewards/rejected": -0.965263843536377, "eval_runtime": 232.1045, "eval_samples_per_second": 8.617, "eval_steps_per_second": 1.077, "step": 1300 }, { "epoch": 0.34284218790892435, "grad_norm": 15.050552368164062, "learning_rate": 4.155437703643182e-06, "logits/chosen": 0.3422376215457916, "logits/rejected": 0.3849483132362366, "logps/chosen": -431.77166748046875, "logps/rejected": -416.0318298339844, "loss": 0.5837, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.7273216843605042, "rewards/margins": 0.3494306206703186, "rewards/rejected": -1.0767523050308228, "step": 1310 }, { "epoch": 0.34545930384716045, "grad_norm": 16.636903762817383, "learning_rate": 4.138250228029882e-06, "logits/chosen": 0.0636025071144104, "logits/rejected": 0.21294847130775452, "logps/chosen": -461.449951171875, "logps/rejected": -475.20361328125, "loss": 0.6341, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.8283727765083313, "rewards/margins": 0.2532418668270111, "rewards/rejected": -1.0816147327423096, "step": 1320 }, { "epoch": 0.3480764197853965, "grad_norm": 18.111169815063477, "learning_rate": 4.120925958993994e-06, "logits/chosen": 0.23635880649089813, "logits/rejected": 0.25746363401412964, "logps/chosen": -399.59442138671875, "logps/rejected": -405.0094299316406, "loss": 0.6418, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.571780800819397, "rewards/margins": 0.21083179116249084, "rewards/rejected": -0.7826126217842102, "step": 1330 }, { "epoch": 0.35069353572363254, "grad_norm": 16.408119201660156, "learning_rate": 4.103466343106999e-06, "logits/chosen": 0.3088940680027008, "logits/rejected": 0.4904406666755676, "logps/chosen": -449.12713623046875, "logps/rejected": -436.40716552734375, "loss": 0.6052, "rewards/accuracies": 0.71875, "rewards/chosen": -0.4821189045906067, "rewards/margins": 0.28531405329704285, "rewards/rejected": -0.7674329280853271, "step": 1340 }, { "epoch": 0.35331065166186865, "grad_norm": 13.181236267089844, "learning_rate": 4.085872838241797e-06, "logits/chosen": 0.26650765538215637, "logits/rejected": 0.46960416436195374, "logps/chosen": -435.21270751953125, "logps/rejected": -414.5130310058594, "loss": 0.6259, "rewards/accuracies": 0.65625, "rewards/chosen": -0.4914736747741699, "rewards/margins": 0.24917948246002197, "rewards/rejected": -0.7406532168388367, "step": 1350 }, { "epoch": 0.3559277676001047, "grad_norm": 14.832420349121094, "learning_rate": 4.06814691345098e-06, "logits/chosen": 0.24336537718772888, "logits/rejected": 0.30810093879699707, "logps/chosen": -410.4248962402344, "logps/rejected": -418.159423828125, "loss": 0.5866, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.5495951771736145, "rewards/margins": 0.3245389461517334, "rewards/rejected": -0.8741341829299927, "step": 1360 }, { "epoch": 0.35854488353834074, "grad_norm": 17.649641036987305, "learning_rate": 4.050290048844171e-06, "logits/chosen": 0.168039470911026, "logits/rejected": 0.265805184841156, "logps/chosen": -489.49468994140625, "logps/rejected": -486.09100341796875, "loss": 0.6079, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.8853768110275269, "rewards/margins": 0.28231385350227356, "rewards/rejected": -1.167690634727478, "step": 1370 }, { "epoch": 0.3611619994765768, "grad_norm": 15.066794395446777, "learning_rate": 4.032303735464422e-06, "logits/chosen": 0.2061309516429901, "logits/rejected": 0.3621533513069153, "logps/chosen": -498.93890380859375, "logps/rejected": -477.98974609375, "loss": 0.5812, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.9270712733268738, "rewards/margins": 0.3421303629875183, "rewards/rejected": -1.2692015171051025, "step": 1380 }, { "epoch": 0.3637791154148129, "grad_norm": 14.546058654785156, "learning_rate": 4.014189475163727e-06, "logits/chosen": 0.295467346906662, "logits/rejected": 0.3572823405265808, "logps/chosen": -464.5303649902344, "logps/rejected": -447.1439514160156, "loss": 0.6072, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.8640359044075012, "rewards/margins": 0.27750641107559204, "rewards/rejected": -1.1415421962738037, "step": 1390 }, { "epoch": 0.36639623135304894, "grad_norm": 24.52462387084961, "learning_rate": 3.995948780477605e-06, "logits/chosen": 0.19618520140647888, "logits/rejected": 0.33636996150016785, "logps/chosen": -475.90826416015625, "logps/rejected": -455.0912170410156, "loss": 0.6226, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.7381902933120728, "rewards/margins": 0.2615019679069519, "rewards/rejected": -0.9996922612190247, "step": 1400 }, { "epoch": 0.36639623135304894, "eval_logits/chosen": 0.1471870094537735, "eval_logits/rejected": 0.3099009096622467, "eval_logps/chosen": -460.8104553222656, "eval_logps/rejected": -446.4751281738281, "eval_loss": 0.608772337436676, "eval_rewards/accuracies": 0.6790000200271606, "eval_rewards/chosen": -0.7244512438774109, "eval_rewards/margins": 0.29345834255218506, "eval_rewards/rejected": -1.0179095268249512, "eval_runtime": 232.5568, "eval_samples_per_second": 8.6, "eval_steps_per_second": 1.075, "step": 1400 }, { "epoch": 0.369013347291285, "grad_norm": 18.559682846069336, "learning_rate": 3.977583174498816e-06, "logits/chosen": 0.2730047106742859, "logits/rejected": 0.5137112140655518, "logps/chosen": -473.2476501464844, "logps/rejected": -456.55609130859375, "loss": 0.5941, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.868022084236145, "rewards/margins": 0.3129437267780304, "rewards/rejected": -1.1809656620025635, "step": 1410 }, { "epoch": 0.3716304632295211, "grad_norm": 15.352100372314453, "learning_rate": 3.959094190750172e-06, "logits/chosen": 0.22451026737689972, "logits/rejected": 0.3914637267589569, "logps/chosen": -482.42010498046875, "logps/rejected": -452.3109436035156, "loss": 0.607, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.7052127718925476, "rewards/margins": 0.31599652767181396, "rewards/rejected": -1.0212092399597168, "step": 1420 }, { "epoch": 0.37424757916775714, "grad_norm": 20.098033905029297, "learning_rate": 3.9404833730564975e-06, "logits/chosen": 0.1833394318819046, "logits/rejected": 0.30950406193733215, "logps/chosen": -431.68438720703125, "logps/rejected": -421.0174255371094, "loss": 0.6279, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.5920284986495972, "rewards/margins": 0.2607758641242981, "rewards/rejected": -0.8528043627738953, "step": 1430 }, { "epoch": 0.3768646951059932, "grad_norm": 16.98012351989746, "learning_rate": 3.921752275415712e-06, "logits/chosen": 0.39584654569625854, "logits/rejected": 0.6170969605445862, "logps/chosen": -442.19915771484375, "logps/rejected": -441.6769104003906, "loss": 0.5883, "rewards/accuracies": 0.6875, "rewards/chosen": -0.8056619763374329, "rewards/margins": 0.3388480842113495, "rewards/rejected": -1.1445101499557495, "step": 1440 }, { "epoch": 0.37948181104422923, "grad_norm": 13.900838851928711, "learning_rate": 3.902902461869079e-06, "logits/chosen": 0.3026077151298523, "logits/rejected": 0.5092092156410217, "logps/chosen": -435.91571044921875, "logps/rejected": -435.0750427246094, "loss": 0.6001, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.8758746981620789, "rewards/margins": 0.34648483991622925, "rewards/rejected": -1.2223594188690186, "step": 1450 }, { "epoch": 0.38209892698246534, "grad_norm": 17.466562271118164, "learning_rate": 3.883935506370605e-06, "logits/chosen": 0.2131034880876541, "logits/rejected": 0.42597731947898865, "logps/chosen": -423.911376953125, "logps/rejected": -408.1385192871094, "loss": 0.613, "rewards/accuracies": 0.65625, "rewards/chosen": -0.6358692049980164, "rewards/margins": 0.3088337481021881, "rewards/rejected": -0.9447029232978821, "step": 1460 }, { "epoch": 0.3847160429207014, "grad_norm": 11.749993324279785, "learning_rate": 3.864852992655617e-06, "logits/chosen": 0.30351871252059937, "logits/rejected": 0.3591347336769104, "logps/chosen": -443.84735107421875, "logps/rejected": -447.32763671875, "loss": 0.5455, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.6212112903594971, "rewards/margins": 0.44100433588027954, "rewards/rejected": -1.0622155666351318, "step": 1470 }, { "epoch": 0.38733315885893743, "grad_norm": 15.301169395446777, "learning_rate": 3.845656514108516e-06, "logits/chosen": 0.30313563346862793, "logits/rejected": 0.3141325116157532, "logps/chosen": -478.4485778808594, "logps/rejected": -414.6806640625, "loss": 0.6202, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.9370290040969849, "rewards/margins": 0.28061023354530334, "rewards/rejected": -1.2176392078399658, "step": 1480 }, { "epoch": 0.38995027479717354, "grad_norm": 16.264097213745117, "learning_rate": 3.826347673629738e-06, "logits/chosen": 0.2156684696674347, "logits/rejected": 0.3405511975288391, "logps/chosen": -454.978515625, "logps/rejected": -446.1961975097656, "loss": 0.5828, "rewards/accuracies": 0.6875, "rewards/chosen": -0.8145572543144226, "rewards/margins": 0.3832133412361145, "rewards/rejected": -1.1977707147598267, "step": 1490 }, { "epoch": 0.3925673907354096, "grad_norm": 17.33570098876953, "learning_rate": 3.8069280835019062e-06, "logits/chosen": 0.197922945022583, "logits/rejected": 0.31282711029052734, "logps/chosen": -465.8154296875, "logps/rejected": -456.82562255859375, "loss": 0.5748, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.8404709696769714, "rewards/margins": 0.37942442297935486, "rewards/rejected": -1.219895362854004, "step": 1500 }, { "epoch": 0.3925673907354096, "eval_logits/chosen": 0.08960460871458054, "eval_logits/rejected": 0.23796047270298004, "eval_logps/chosen": -480.02130126953125, "eval_logps/rejected": -468.2979431152344, "eval_loss": 0.6048462986946106, "eval_rewards/accuracies": 0.6754999756813049, "eval_rewards/chosen": -0.9165594577789307, "eval_rewards/margins": 0.3195783197879791, "eval_rewards/rejected": -1.236137866973877, "eval_runtime": 231.9563, "eval_samples_per_second": 8.622, "eval_steps_per_second": 1.078, "step": 1500 }, { "epoch": 0.39518450667364563, "grad_norm": 14.863387107849121, "learning_rate": 3.7873993652552077e-06, "logits/chosen": 0.23345918953418732, "logits/rejected": 0.28646907210350037, "logps/chosen": -454.75897216796875, "logps/rejected": -453.5748596191406, "loss": 0.6597, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.0461337566375732, "rewards/margins": 0.21926303207874298, "rewards/rejected": -1.2653969526290894, "step": 1510 }, { "epoch": 0.39780162261188173, "grad_norm": 16.0816593170166, "learning_rate": 3.7677631495319953e-06, "logits/chosen": 0.2003081738948822, "logits/rejected": 0.36653000116348267, "logps/chosen": -488.4042053222656, "logps/rejected": -490.30853271484375, "loss": 0.581, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.9618334770202637, "rewards/margins": 0.38063231110572815, "rewards/rejected": -1.3424657583236694, "step": 1520 }, { "epoch": 0.4004187385501178, "grad_norm": 15.07925033569336, "learning_rate": 3.748021075950633e-06, "logits/chosen": -0.012658292427659035, "logits/rejected": 0.16820164024829865, "logps/chosen": -478.895751953125, "logps/rejected": -465.083984375, "loss": 0.6449, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.834586501121521, "rewards/margins": 0.23068487644195557, "rewards/rejected": -1.0652713775634766, "step": 1530 }, { "epoch": 0.40303585448835383, "grad_norm": 13.859041213989258, "learning_rate": 3.7281747929685824e-06, "logits/chosen": 0.3084440231323242, "logits/rejected": 0.4003655016422272, "logps/chosen": -465.21844482421875, "logps/rejected": -456.33465576171875, "loss": 0.6143, "rewards/accuracies": 0.625, "rewards/chosen": -1.1239235401153564, "rewards/margins": 0.2595583498477936, "rewards/rejected": -1.3834818601608276, "step": 1540 }, { "epoch": 0.4056529704265899, "grad_norm": 12.961313247680664, "learning_rate": 3.7082259577447604e-06, "logits/chosen": 0.20876283943653107, "logits/rejected": 0.4126996099948883, "logps/chosen": -499.909912109375, "logps/rejected": -486.24749755859375, "loss": 0.5952, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.9955336451530457, "rewards/margins": 0.3382071256637573, "rewards/rejected": -1.3337408304214478, "step": 1550 }, { "epoch": 0.408270086364826, "grad_norm": 15.985527038574219, "learning_rate": 3.6881762360011688e-06, "logits/chosen": 0.12506040930747986, "logits/rejected": 0.17536480724811554, "logps/chosen": -477.6473083496094, "logps/rejected": -436.26751708984375, "loss": 0.6062, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.7066253423690796, "rewards/margins": 0.3402264714241028, "rewards/rejected": -1.0468518733978271, "step": 1560 }, { "epoch": 0.410887202303062, "grad_norm": 22.8974666595459, "learning_rate": 3.668027301883802e-06, "logits/chosen": 0.0588788278400898, "logits/rejected": 0.12763305008411407, "logps/chosen": -440.4716796875, "logps/rejected": -444.61834716796875, "loss": 0.6024, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.7403033375740051, "rewards/margins": 0.37406405806541443, "rewards/rejected": -1.1143672466278076, "step": 1570 }, { "epoch": 0.4135043182412981, "grad_norm": 15.58066463470459, "learning_rate": 3.64778083782286e-06, "logits/chosen": 0.27244722843170166, "logits/rejected": 0.3991895318031311, "logps/chosen": -443.55645751953125, "logps/rejected": -494.92254638671875, "loss": 0.5942, "rewards/accuracies": 0.6875, "rewards/chosen": -0.7955921292304993, "rewards/margins": 0.36010387539863586, "rewards/rejected": -1.1556960344314575, "step": 1580 }, { "epoch": 0.4161214341795342, "grad_norm": 13.258322715759277, "learning_rate": 3.627438534392268e-06, "logits/chosen": -0.04510800167918205, "logits/rejected": 0.03904765844345093, "logps/chosen": -436.66802978515625, "logps/rejected": -471.71142578125, "loss": 0.5738, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.744075357913971, "rewards/margins": 0.400721937417984, "rewards/rejected": -1.1447973251342773, "step": 1590 }, { "epoch": 0.4187385501177702, "grad_norm": 14.653417587280273, "learning_rate": 3.607002090168506e-06, "logits/chosen": 0.014941488392651081, "logits/rejected": 0.006946629378944635, "logps/chosen": -499.9811096191406, "logps/rejected": -463.3321228027344, "loss": 0.6615, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.9098555445671082, "rewards/margins": 0.21712689101696014, "rewards/rejected": -1.1269824504852295, "step": 1600 }, { "epoch": 0.4187385501177702, "eval_logits/chosen": -0.08284498751163483, "eval_logits/rejected": 0.05344332382082939, "eval_logps/chosen": -487.8919982910156, "eval_logps/rejected": -479.4829406738281, "eval_loss": 0.6062743067741394, "eval_rewards/accuracies": 0.6704999804496765, "eval_rewards/chosen": -0.9952664971351624, "eval_rewards/margins": 0.35272136330604553, "eval_rewards/rejected": -1.3479877710342407, "eval_runtime": 231.9207, "eval_samples_per_second": 8.624, "eval_steps_per_second": 1.078, "step": 1600 }, { "epoch": 0.4213556660560063, "grad_norm": 14.061836242675781, "learning_rate": 3.586473211588787e-06, "logits/chosen": 0.043830014765262604, "logits/rejected": 0.0922635942697525, "logps/chosen": -452.1427307128906, "logps/rejected": -491.6693420410156, "loss": 0.5682, "rewards/accuracies": 0.6875, "rewards/chosen": -0.8795869946479797, "rewards/margins": 0.42901507019996643, "rewards/rejected": -1.3086020946502686, "step": 1610 }, { "epoch": 0.4239727819942423, "grad_norm": 27.227012634277344, "learning_rate": 3.5658536128085623e-06, "logits/chosen": 0.06352569162845612, "logits/rejected": 0.29197776317596436, "logps/chosen": -464.83154296875, "logps/rejected": -456.54644775390625, "loss": 0.6573, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.966151237487793, "rewards/margins": 0.261862576007843, "rewards/rejected": -1.2280137538909912, "step": 1620 }, { "epoch": 0.4265898979324784, "grad_norm": 17.129220962524414, "learning_rate": 3.545145015558399e-06, "logits/chosen": 0.09727749973535538, "logits/rejected": 0.07895330339670181, "logps/chosen": -418.5646057128906, "logps/rejected": -414.50347900390625, "loss": 0.635, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.9049051403999329, "rewards/margins": 0.2770025134086609, "rewards/rejected": -1.1819076538085938, "step": 1630 }, { "epoch": 0.42920701387071447, "grad_norm": 13.82589340209961, "learning_rate": 3.5243491490002056e-06, "logits/chosen": -0.012812698259949684, "logits/rejected": -0.03551667556166649, "logps/chosen": -469.703857421875, "logps/rejected": -462.33489990234375, "loss": 0.6404, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.8767460584640503, "rewards/margins": 0.29123008251190186, "rewards/rejected": -1.1679762601852417, "step": 1640 }, { "epoch": 0.4318241298089505, "grad_norm": 15.320180892944336, "learning_rate": 3.503467749582857e-06, "logits/chosen": 0.16704775393009186, "logits/rejected": 0.20051440596580505, "logps/chosen": -468.2303161621094, "logps/rejected": -435.6004333496094, "loss": 0.6469, "rewards/accuracies": 0.625, "rewards/chosen": -0.9747709035873413, "rewards/margins": 0.2773420214653015, "rewards/rejected": -1.2521127462387085, "step": 1650 }, { "epoch": 0.4344412457471866, "grad_norm": 14.137253761291504, "learning_rate": 3.4825025608971947e-06, "logits/chosen": 0.20161625742912292, "logits/rejected": 0.34558919072151184, "logps/chosen": -457.50775146484375, "logps/rejected": -459.1556091308594, "loss": 0.6473, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.1610692739486694, "rewards/margins": 0.21469669044017792, "rewards/rejected": -1.3757660388946533, "step": 1660 }, { "epoch": 0.43705836168542267, "grad_norm": 14.53886604309082, "learning_rate": 3.4614553335304407e-06, "logits/chosen": 0.08533845096826553, "logits/rejected": 0.3017124533653259, "logps/chosen": -496.14801025390625, "logps/rejected": -463.46807861328125, "loss": 0.601, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.9761697053909302, "rewards/margins": 0.33948642015457153, "rewards/rejected": -1.3156561851501465, "step": 1670 }, { "epoch": 0.4396754776236587, "grad_norm": 24.965370178222656, "learning_rate": 3.4403278249200222e-06, "logits/chosen": 0.06942877918481827, "logits/rejected": 0.1279851198196411, "logps/chosen": -469.521728515625, "logps/rejected": -445.25592041015625, "loss": 0.5627, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.6999729871749878, "rewards/margins": 0.4319036900997162, "rewards/rejected": -1.1318767070770264, "step": 1680 }, { "epoch": 0.44229259356189476, "grad_norm": 15.348676681518555, "learning_rate": 3.4191217992068293e-06, "logits/chosen": 0.06874585151672363, "logits/rejected": 0.13581883907318115, "logps/chosen": -501.0218200683594, "logps/rejected": -451.73345947265625, "loss": 0.6089, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.9231773614883423, "rewards/margins": 0.31995171308517456, "rewards/rejected": -1.243129014968872, "step": 1690 }, { "epoch": 0.44490970950013087, "grad_norm": 19.28348731994629, "learning_rate": 3.3978390270879056e-06, "logits/chosen": 0.1793755143880844, "logits/rejected": 0.4792613983154297, "logps/chosen": -450.6305236816406, "logps/rejected": -468.32086181640625, "loss": 0.6395, "rewards/accuracies": 0.65625, "rewards/chosen": -1.2974417209625244, "rewards/margins": 0.2531173825263977, "rewards/rejected": -1.5505590438842773, "step": 1700 }, { "epoch": 0.44490970950013087, "eval_logits/chosen": 0.0606597363948822, "eval_logits/rejected": 0.20719292759895325, "eval_logps/chosen": -515.066162109375, "eval_logps/rejected": -504.28570556640625, "eval_loss": 0.6020700931549072, "eval_rewards/accuracies": 0.6744999885559082, "eval_rewards/chosen": -1.2670079469680786, "eval_rewards/margins": 0.32900768518447876, "eval_rewards/rejected": -1.5960155725479126, "eval_runtime": 232.1756, "eval_samples_per_second": 8.614, "eval_steps_per_second": 1.077, "step": 1700 }, { "epoch": 0.4475268254383669, "grad_norm": 26.0466365814209, "learning_rate": 3.3764812856685995e-06, "logits/chosen": 0.11054261028766632, "logits/rejected": 0.11302468925714493, "logps/chosen": -463.68212890625, "logps/rejected": -505.44464111328125, "loss": 0.6036, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.1472562551498413, "rewards/margins": 0.3336459994316101, "rewards/rejected": -1.4809024333953857, "step": 1710 }, { "epoch": 0.45014394137660296, "grad_norm": 12.329084396362305, "learning_rate": 3.3550503583141726e-06, "logits/chosen": 0.14257648587226868, "logits/rejected": 0.2127263993024826, "logps/chosen": -476.0755920410156, "logps/rejected": -482.536376953125, "loss": 0.5637, "rewards/accuracies": 0.71875, "rewards/chosen": -0.8113042712211609, "rewards/margins": 0.41295966506004333, "rewards/rejected": -1.2242640256881714, "step": 1720 }, { "epoch": 0.45276105731483907, "grad_norm": 14.099928855895996, "learning_rate": 3.3335480345008907e-06, "logits/chosen": 0.11843159049749374, "logits/rejected": 0.2315410077571869, "logps/chosen": -438.8074645996094, "logps/rejected": -451.05816650390625, "loss": 0.6125, "rewards/accuracies": 0.65625, "rewards/chosen": -0.6928716897964478, "rewards/margins": 0.3718397617340088, "rewards/rejected": -1.064711570739746, "step": 1730 }, { "epoch": 0.4553781732530751, "grad_norm": 12.286911010742188, "learning_rate": 3.3119761096666055e-06, "logits/chosen": 0.025634441524744034, "logits/rejected": 0.12711207568645477, "logps/chosen": -447.642578125, "logps/rejected": -428.96246337890625, "loss": 0.6114, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.6357846856117249, "rewards/margins": 0.2916674017906189, "rewards/rejected": -0.9274520874023438, "step": 1740 }, { "epoch": 0.45799528919131116, "grad_norm": 14.236140251159668, "learning_rate": 3.290336385060832e-06, "logits/chosen": 0.07329438626766205, "logits/rejected": 0.2325226366519928, "logps/chosen": -458.25213623046875, "logps/rejected": -445.90557861328125, "loss": 0.5975, "rewards/accuracies": 0.6875, "rewards/chosen": -0.8362909555435181, "rewards/margins": 0.3581870198249817, "rewards/rejected": -1.1944780349731445, "step": 1750 }, { "epoch": 0.46061240512954726, "grad_norm": 15.764031410217285, "learning_rate": 3.268630667594348e-06, "logits/chosen": 0.11211331933736801, "logits/rejected": 0.12331026792526245, "logps/chosen": -460.3636169433594, "logps/rejected": -454.080078125, "loss": 0.5906, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.8475208282470703, "rewards/margins": 0.3742133677005768, "rewards/rejected": -1.2217340469360352, "step": 1760 }, { "epoch": 0.4632295210677833, "grad_norm": 23.4930362701416, "learning_rate": 3.2468607696883147e-06, "logits/chosen": 0.24953755736351013, "logits/rejected": 0.34629741311073303, "logps/chosen": -477.17791748046875, "logps/rejected": -515.0440673828125, "loss": 0.5613, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.0197489261627197, "rewards/margins": 0.4735226035118103, "rewards/rejected": -1.4932715892791748, "step": 1770 }, { "epoch": 0.46584663700601936, "grad_norm": 13.870096206665039, "learning_rate": 3.225028509122944e-06, "logits/chosen": 0.12339513003826141, "logits/rejected": 0.26224666833877563, "logps/chosen": -495.1161193847656, "logps/rejected": -489.4485778808594, "loss": 0.613, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.2092769145965576, "rewards/margins": 0.3337065279483795, "rewards/rejected": -1.5429834127426147, "step": 1780 }, { "epoch": 0.4684637529442554, "grad_norm": 19.58800506591797, "learning_rate": 3.2031357088857083e-06, "logits/chosen": 0.04813681170344353, "logits/rejected": 0.21235807240009308, "logps/chosen": -539.0248413085938, "logps/rejected": -540.2000732421875, "loss": 0.6242, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.3390930891036987, "rewards/margins": 0.3306949734687805, "rewards/rejected": -1.669788122177124, "step": 1790 }, { "epoch": 0.4710808688824915, "grad_norm": 16.04212760925293, "learning_rate": 3.181184197019127e-06, "logits/chosen": 0.1568802297115326, "logits/rejected": 0.3320377767086029, "logps/chosen": -475.43487548828125, "logps/rejected": -508.42083740234375, "loss": 0.5924, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.2451696395874023, "rewards/margins": 0.4058682918548584, "rewards/rejected": -1.6510378122329712, "step": 1800 }, { "epoch": 0.4710808688824915, "eval_logits/chosen": -0.0006541285547427833, "eval_logits/rejected": 0.12322327494621277, "eval_logps/chosen": -501.73223876953125, "eval_logps/rejected": -494.2189636230469, "eval_loss": 0.5999693870544434, "eval_rewards/accuracies": 0.6654999852180481, "eval_rewards/chosen": -1.133669137954712, "eval_rewards/margins": 0.3616788983345032, "eval_rewards/rejected": -1.4953482151031494, "eval_runtime": 232.2077, "eval_samples_per_second": 8.613, "eval_steps_per_second": 1.077, "step": 1800 }, { "epoch": 0.47369798482072756, "grad_norm": 16.250635147094727, "learning_rate": 3.159175806468126e-06, "logits/chosen": -0.059101611375808716, "logits/rejected": 0.07854647934436798, "logps/chosen": -468.9906311035156, "logps/rejected": -467.6946716308594, "loss": 0.5778, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.1028201580047607, "rewards/margins": 0.41374215483665466, "rewards/rejected": -1.5165622234344482, "step": 1810 }, { "epoch": 0.4763151007589636, "grad_norm": 17.706560134887695, "learning_rate": 3.1371123749269804e-06, "logits/chosen": 0.029452210292220116, "logits/rejected": 0.0658307746052742, "logps/chosen": -503.1075134277344, "logps/rejected": -495.166259765625, "loss": 0.6576, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.9831246137619019, "rewards/margins": 0.2420085221529007, "rewards/rejected": -1.2251330614089966, "step": 1820 }, { "epoch": 0.4789322166971997, "grad_norm": 11.094071388244629, "learning_rate": 3.114995744685877e-06, "logits/chosen": 0.15399818122386932, "logits/rejected": 0.09563325345516205, "logps/chosen": -425.6133728027344, "logps/rejected": -413.8519592285156, "loss": 0.6349, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.6909037828445435, "rewards/margins": 0.21615329384803772, "rewards/rejected": -0.907056987285614, "step": 1830 }, { "epoch": 0.48154933263543576, "grad_norm": 13.76625919342041, "learning_rate": 3.0928277624770743e-06, "logits/chosen": 0.03457440435886383, "logits/rejected": 0.3043617010116577, "logps/chosen": -482.749267578125, "logps/rejected": -466.3192443847656, "loss": 0.575, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.6658245921134949, "rewards/margins": 0.4141596257686615, "rewards/rejected": -1.079984188079834, "step": 1840 }, { "epoch": 0.4841664485736718, "grad_norm": 13.214118003845215, "learning_rate": 3.070610279320708e-06, "logits/chosen": 0.10331498086452484, "logits/rejected": 0.20391520857810974, "logps/chosen": -504.2576599121094, "logps/rejected": -493.19635009765625, "loss": 0.5699, "rewards/accuracies": 0.6875, "rewards/chosen": -0.9062315225601196, "rewards/margins": 0.4142914414405823, "rewards/rejected": -1.3205230236053467, "step": 1850 }, { "epoch": 0.48678356451190785, "grad_norm": 15.12294864654541, "learning_rate": 3.0483451503702264e-06, "logits/chosen": 0.22254931926727295, "logits/rejected": 0.15011247992515564, "logps/chosen": -541.047119140625, "logps/rejected": -546.2518310546875, "loss": 0.5984, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.3822373151779175, "rewards/margins": 0.3731249272823334, "rewards/rejected": -1.7553622722625732, "step": 1860 }, { "epoch": 0.48940068045014395, "grad_norm": 19.084123611450195, "learning_rate": 3.0260342347574916e-06, "logits/chosen": 0.16859467327594757, "logits/rejected": 0.2447008639574051, "logps/chosen": -518.3531494140625, "logps/rejected": -513.4488525390625, "loss": 0.567, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.2804909944534302, "rewards/margins": 0.4136219620704651, "rewards/rejected": -1.69411301612854, "step": 1870 }, { "epoch": 0.49201779638838, "grad_norm": 15.792978286743164, "learning_rate": 3.0036793954375358e-06, "logits/chosen": 0.11333123594522476, "logits/rejected": 0.27916672825813293, "logps/chosen": -503.6768493652344, "logps/rejected": -482.2354431152344, "loss": 0.5638, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.0980170965194702, "rewards/margins": 0.46111616492271423, "rewards/rejected": -1.5591331720352173, "step": 1880 }, { "epoch": 0.49463491232661605, "grad_norm": 16.74842071533203, "learning_rate": 2.981282499033009e-06, "logits/chosen": -0.023114752024412155, "logits/rejected": 0.1362176537513733, "logps/chosen": -517.9249267578125, "logps/rejected": -498.3343200683594, "loss": 0.6287, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.0479462146759033, "rewards/margins": 0.3232496380805969, "rewards/rejected": -1.3711960315704346, "step": 1890 }, { "epoch": 0.49725202826485215, "grad_norm": 14.8408784866333, "learning_rate": 2.9588454156783163e-06, "logits/chosen": -0.03425337374210358, "logits/rejected": 0.01658450812101364, "logps/chosen": -510.95263671875, "logps/rejected": -505.4061584472656, "loss": 0.5875, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.101326584815979, "rewards/margins": 0.37253618240356445, "rewards/rejected": -1.4738627672195435, "step": 1900 }, { "epoch": 0.49725202826485215, "eval_logits/chosen": 0.06292513012886047, "eval_logits/rejected": 0.18740317225456238, "eval_logps/chosen": -508.2808532714844, "eval_logps/rejected": -502.21710205078125, "eval_loss": 0.5985915660858154, "eval_rewards/accuracies": 0.6744999885559082, "eval_rewards/chosen": -1.1991546154022217, "eval_rewards/margins": 0.3761745095252991, "eval_rewards/rejected": -1.5753291845321655, "eval_runtime": 232.6562, "eval_samples_per_second": 8.596, "eval_steps_per_second": 1.075, "step": 1900 }, { "epoch": 0.4998691442030882, "grad_norm": 13.105058670043945, "learning_rate": 2.9363700188634597e-06, "logits/chosen": 0.08050940185785294, "logits/rejected": 0.27998119592666626, "logps/chosen": -500.57720947265625, "logps/rejected": -478.83624267578125, "loss": 0.5974, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.2193187475204468, "rewards/margins": 0.34275728464126587, "rewards/rejected": -1.5620760917663574, "step": 1910 }, { "epoch": 0.5024862601413242, "grad_norm": 17.656320571899414, "learning_rate": 2.9138581852776053e-06, "logits/chosen": 0.2168809473514557, "logits/rejected": 0.3105737566947937, "logps/chosen": -496.070556640625, "logps/rejected": -499.7674865722656, "loss": 0.5806, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.1544568538665771, "rewards/margins": 0.4054805636405945, "rewards/rejected": -1.5599374771118164, "step": 1920 }, { "epoch": 0.5051033760795604, "grad_norm": 14.08234977722168, "learning_rate": 2.8913117946523805e-06, "logits/chosen": 0.2844335436820984, "logits/rejected": 0.31109169125556946, "logps/chosen": -513.3364868164062, "logps/rejected": -496.12200927734375, "loss": 0.5716, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.3235498666763306, "rewards/margins": 0.4011419713497162, "rewards/rejected": -1.7246919870376587, "step": 1930 }, { "epoch": 0.5077204920177963, "grad_norm": 15.062522888183594, "learning_rate": 2.8687327296049126e-06, "logits/chosen": 0.230653315782547, "logits/rejected": 0.3991672396659851, "logps/chosen": -498.00933837890625, "logps/rejected": -510.3081970214844, "loss": 0.5859, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.1839993000030518, "rewards/margins": 0.4453356862068176, "rewards/rejected": -1.6293350458145142, "step": 1940 }, { "epoch": 0.5103376079560324, "grad_norm": 19.26543426513672, "learning_rate": 2.8461228754806376e-06, "logits/chosen": 0.14215265214443207, "logits/rejected": 0.23736266791820526, "logps/chosen": -510.92657470703125, "logps/rejected": -494.1617126464844, "loss": 0.5901, "rewards/accuracies": 0.6875, "rewards/chosen": -1.0615785121917725, "rewards/margins": 0.360460102558136, "rewards/rejected": -1.4220386743545532, "step": 1950 }, { "epoch": 0.5129547238942685, "grad_norm": 13.139251708984375, "learning_rate": 2.823484120195865e-06, "logits/chosen": 0.15903696417808533, "logits/rejected": 0.29216212034225464, "logps/chosen": -534.9984741210938, "logps/rejected": -512.8052978515625, "loss": 0.5718, "rewards/accuracies": 0.6875, "rewards/chosen": -1.2044841051101685, "rewards/margins": 0.41181907057762146, "rewards/rejected": -1.6163032054901123, "step": 1960 }, { "epoch": 0.5155718398325045, "grad_norm": 21.8537654876709, "learning_rate": 2.8008183540801486e-06, "logits/chosen": 0.16536223888397217, "logits/rejected": 0.2516060173511505, "logps/chosen": -516.1781005859375, "logps/rejected": -486.567138671875, "loss": 0.6009, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.140059471130371, "rewards/margins": 0.38968387246131897, "rewards/rejected": -1.5297433137893677, "step": 1970 }, { "epoch": 0.5181889557707406, "grad_norm": 18.018861770629883, "learning_rate": 2.7781274697184353e-06, "logits/chosen": 0.16366654634475708, "logits/rejected": 0.278939425945282, "logps/chosen": -435.4315490722656, "logps/rejected": -477.2860412597656, "loss": 0.6252, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.983991265296936, "rewards/margins": 0.28731080889701843, "rewards/rejected": -1.2713019847869873, "step": 1980 }, { "epoch": 0.5208060717089767, "grad_norm": 15.223612785339355, "learning_rate": 2.7554133617930397e-06, "logits/chosen": 0.05067938566207886, "logits/rejected": 0.06571893393993378, "logps/chosen": -452.55804443359375, "logps/rejected": -454.023193359375, "loss": 0.5889, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.8877269625663757, "rewards/margins": 0.38499319553375244, "rewards/rejected": -1.2727200984954834, "step": 1990 }, { "epoch": 0.5234231876472127, "grad_norm": 17.48723602294922, "learning_rate": 2.7326779269254363e-06, "logits/chosen": -0.015536749735474586, "logits/rejected": 0.16148407757282257, "logps/chosen": -518.9827880859375, "logps/rejected": -480.99163818359375, "loss": 0.5849, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.1106268167495728, "rewards/margins": 0.4065285623073578, "rewards/rejected": -1.5171552896499634, "step": 2000 }, { "epoch": 0.5234231876472127, "eval_logits/chosen": -0.0025373969692736864, "eval_logits/rejected": 0.12474588304758072, "eval_logps/chosen": -524.7777099609375, "eval_logps/rejected": -517.0887451171875, "eval_loss": 0.5969316959381104, "eval_rewards/accuracies": 0.6819999814033508, "eval_rewards/chosen": -1.3641233444213867, "eval_rewards/margins": 0.3599224388599396, "eval_rewards/rejected": -1.7240456342697144, "eval_runtime": 232.6406, "eval_samples_per_second": 8.597, "eval_steps_per_second": 1.075, "step": 2000 }, { "epoch": 0.5260403035854488, "grad_norm": 17.689542770385742, "learning_rate": 2.7099230635178954e-06, "logits/chosen": 0.2274688184261322, "logits/rejected": 0.25372716784477234, "logps/chosen": -526.2362060546875, "logps/rejected": -533.8927001953125, "loss": 0.5753, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.3567404747009277, "rewards/margins": 0.4240929186344147, "rewards/rejected": -1.7808334827423096, "step": 2010 }, { "epoch": 0.528657419523685, "grad_norm": 15.211199760437012, "learning_rate": 2.6871506715949608e-06, "logits/chosen": 0.058562636375427246, "logits/rejected": 0.18520574271678925, "logps/chosen": -499.6806640625, "logps/rejected": -491.3092346191406, "loss": 0.5954, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.1945785284042358, "rewards/margins": 0.3566603362560272, "rewards/rejected": -1.551238775253296, "step": 2020 }, { "epoch": 0.5312745354619209, "grad_norm": 15.991866111755371, "learning_rate": 2.6643626526448063e-06, "logits/chosen": -0.09116406738758087, "logits/rejected": -0.05098678544163704, "logps/chosen": -533.4771728515625, "logps/rejected": -504.9336853027344, "loss": 0.5447, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.0096272230148315, "rewards/margins": 0.49922627210617065, "rewards/rejected": -1.508853554725647, "step": 2030 }, { "epoch": 0.533891651400157, "grad_norm": 15.529874801635742, "learning_rate": 2.6415609094604562e-06, "logits/chosen": 0.0014547407627105713, "logits/rejected": -0.014313450083136559, "logps/chosen": -489.4344177246094, "logps/rejected": -481.03558349609375, "loss": 0.6275, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.1068042516708374, "rewards/margins": 0.29210469126701355, "rewards/rejected": -1.3989089727401733, "step": 2040 }, { "epoch": 0.5365087673383931, "grad_norm": 14.723438262939453, "learning_rate": 2.618747345980904e-06, "logits/chosen": 0.06219317764043808, "logits/rejected": 0.29134300351142883, "logps/chosen": -482.73651123046875, "logps/rejected": -454.9242248535156, "loss": 0.577, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.2507166862487793, "rewards/margins": 0.40432801842689514, "rewards/rejected": -1.6550447940826416, "step": 2050 }, { "epoch": 0.5391258832766291, "grad_norm": 14.240797996520996, "learning_rate": 2.595923867132136e-06, "logits/chosen": -0.00904160737991333, "logits/rejected": -0.042715176939964294, "logps/chosen": -539.594482421875, "logps/rejected": -535.2293701171875, "loss": 0.5718, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.3310154676437378, "rewards/margins": 0.48368293046951294, "rewards/rejected": -1.814698576927185, "step": 2060 }, { "epoch": 0.5417429992148652, "grad_norm": 14.852096557617188, "learning_rate": 2.5730923786680672e-06, "logits/chosen": -0.025297870859503746, "logits/rejected": 0.19150254130363464, "logps/chosen": -528.1922607421875, "logps/rejected": -553.71142578125, "loss": 0.587, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.5110130310058594, "rewards/margins": 0.3934626281261444, "rewards/rejected": -1.9044758081436157, "step": 2070 }, { "epoch": 0.5443601151531012, "grad_norm": 16.273681640625, "learning_rate": 2.5502547870114137e-06, "logits/chosen": -0.0205762330442667, "logits/rejected": 0.12017925083637238, "logps/chosen": -543.7810668945312, "logps/rejected": -533.4263916015625, "loss": 0.6134, "rewards/accuracies": 0.6875, "rewards/chosen": -1.7067127227783203, "rewards/margins": 0.3284439742565155, "rewards/rejected": -2.035156726837158, "step": 2080 }, { "epoch": 0.5469772310913373, "grad_norm": 23.068370819091797, "learning_rate": 2.527412999094507e-06, "logits/chosen": 0.06104808300733566, "logits/rejected": 0.21988165378570557, "logps/chosen": -568.421630859375, "logps/rejected": -585.5074462890625, "loss": 0.5597, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.414439082145691, "rewards/margins": 0.4884655475616455, "rewards/rejected": -1.902904748916626, "step": 2090 }, { "epoch": 0.5495943470295734, "grad_norm": 18.669837951660156, "learning_rate": 2.504568922200064e-06, "logits/chosen": 0.03404618427157402, "logits/rejected": 0.2708562910556793, "logps/chosen": -476.51324462890625, "logps/rejected": -472.4541931152344, "loss": 0.6106, "rewards/accuracies": 0.6875, "rewards/chosen": -1.2663251161575317, "rewards/margins": 0.32750216126441956, "rewards/rejected": -1.5938273668289185, "step": 2100 }, { "epoch": 0.5495943470295734, "eval_logits/chosen": -0.016485024243593216, "eval_logits/rejected": 0.10226120799779892, "eval_logps/chosen": -514.2800903320312, "eval_logps/rejected": -508.7902526855469, "eval_loss": 0.5930544137954712, "eval_rewards/accuracies": 0.6834999918937683, "eval_rewards/chosen": -1.2591471672058105, "eval_rewards/margins": 0.38191384077072144, "eval_rewards/rejected": -1.6410611867904663, "eval_runtime": 232.4639, "eval_samples_per_second": 8.603, "eval_steps_per_second": 1.075, "step": 2100 }, { "epoch": 0.5522114629678094, "grad_norm": 17.219968795776367, "learning_rate": 2.4817244638019333e-06, "logits/chosen": 0.07551795244216919, "logits/rejected": 0.11292078346014023, "logps/chosen": -530.4259033203125, "logps/rejected": -491.468017578125, "loss": 0.5957, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.1608909368515015, "rewards/margins": 0.4003227651119232, "rewards/rejected": -1.561213731765747, "step": 2110 }, { "epoch": 0.5548285789060455, "grad_norm": 19.8704776763916, "learning_rate": 2.4588815314058155e-06, "logits/chosen": 0.16289404034614563, "logits/rejected": 0.24848175048828125, "logps/chosen": -452.452880859375, "logps/rejected": -434.36090087890625, "loss": 0.5882, "rewards/accuracies": 0.71875, "rewards/chosen": -0.9118096232414246, "rewards/margins": 0.3754151463508606, "rewards/rejected": -1.2872246503829956, "step": 2120 }, { "epoch": 0.5574456948442816, "grad_norm": 15.798819541931152, "learning_rate": 2.4360420323899922e-06, "logits/chosen": 0.11843502521514893, "logits/rejected": 0.15708817541599274, "logps/chosen": -505.29168701171875, "logps/rejected": -489.7765197753906, "loss": 0.5713, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.04365873336792, "rewards/margins": 0.4483584761619568, "rewards/rejected": -1.492017149925232, "step": 2130 }, { "epoch": 0.5600628107825176, "grad_norm": 17.67369842529297, "learning_rate": 2.4132078738460585e-06, "logits/chosen": 0.17071916162967682, "logits/rejected": 0.209875226020813, "logps/chosen": -521.3018798828125, "logps/rejected": -482.3414001464844, "loss": 0.5975, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.2791138887405396, "rewards/margins": 0.3632059097290039, "rewards/rejected": -1.642319679260254, "step": 2140 }, { "epoch": 0.5626799267207537, "grad_norm": 21.954570770263672, "learning_rate": 2.3903809624196826e-06, "logits/chosen": 0.3113669753074646, "logits/rejected": 0.2932060956954956, "logps/chosen": -475.5943908691406, "logps/rejected": -454.9044494628906, "loss": 0.6099, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.2669929265975952, "rewards/margins": 0.3418061137199402, "rewards/rejected": -1.6087989807128906, "step": 2150 }, { "epoch": 0.5652970426589898, "grad_norm": 25.539379119873047, "learning_rate": 2.3675632041513978e-06, "logits/chosen": 0.1442708671092987, "logits/rejected": 0.23462197184562683, "logps/chosen": -535.3931884765625, "logps/rejected": -477.28369140625, "loss": 0.567, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.161911964416504, "rewards/margins": 0.46348389983177185, "rewards/rejected": -1.6253957748413086, "step": 2160 }, { "epoch": 0.5679141585972258, "grad_norm": 19.380157470703125, "learning_rate": 2.3447565043174533e-06, "logits/chosen": 0.21937327086925507, "logits/rejected": 0.30190104246139526, "logps/chosen": -508.240966796875, "logps/rejected": -481.8168029785156, "loss": 0.5906, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.4066482782363892, "rewards/margins": 0.38738176226615906, "rewards/rejected": -1.7940301895141602, "step": 2170 }, { "epoch": 0.5705312745354619, "grad_norm": 18.660661697387695, "learning_rate": 2.321962767270724e-06, "logits/chosen": 0.22592106461524963, "logits/rejected": 0.28465738892555237, "logps/chosen": -527.2122802734375, "logps/rejected": -483.3441467285156, "loss": 0.6442, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.577532410621643, "rewards/margins": 0.2664092779159546, "rewards/rejected": -1.8439416885375977, "step": 2180 }, { "epoch": 0.573148390473698, "grad_norm": 20.082679748535156, "learning_rate": 2.299183896281692e-06, "logits/chosen": 0.16324841976165771, "logits/rejected": 0.29000192880630493, "logps/chosen": -521.3306274414062, "logps/rejected": -535.11328125, "loss": 0.6251, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.5316604375839233, "rewards/margins": 0.3131243586540222, "rewards/rejected": -1.8447847366333008, "step": 2190 }, { "epoch": 0.575765506411934, "grad_norm": 15.897833824157715, "learning_rate": 2.2764217933795297e-06, "logits/chosen": 0.2278885841369629, "logits/rejected": 0.2919641137123108, "logps/chosen": -514.6935424804688, "logps/rejected": -508.80096435546875, "loss": 0.5783, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.2934855222702026, "rewards/margins": 0.41693955659866333, "rewards/rejected": -1.7104251384735107, "step": 2200 }, { "epoch": 0.575765506411934, "eval_logits/chosen": 0.14214755594730377, "eval_logits/rejected": 0.2720797061920166, "eval_logps/chosen": -512.494873046875, "eval_logps/rejected": -506.34222412109375, "eval_loss": 0.594093382358551, "eval_rewards/accuracies": 0.6809999942779541, "eval_rewards/chosen": -1.2412952184677124, "eval_rewards/margins": 0.3752853572368622, "eval_rewards/rejected": -1.616580605506897, "eval_runtime": 232.5254, "eval_samples_per_second": 8.601, "eval_steps_per_second": 1.075, "step": 2200 }, { "epoch": 0.5783826223501701, "grad_norm": 15.479647636413574, "learning_rate": 2.2536783591932786e-06, "logits/chosen": 0.140645831823349, "logits/rejected": 0.33969563245773315, "logps/chosen": -515.4466552734375, "logps/rejected": -524.2593994140625, "loss": 0.604, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.2855024337768555, "rewards/margins": 0.35720211267471313, "rewards/rejected": -1.6427046060562134, "step": 2210 }, { "epoch": 0.5809997382884062, "grad_norm": 15.445068359375, "learning_rate": 2.230955492793149e-06, "logits/chosen": 0.2617935538291931, "logits/rejected": 0.2599295973777771, "logps/chosen": -544.3765869140625, "logps/rejected": -532.53271484375, "loss": 0.6289, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.3259150981903076, "rewards/margins": 0.3112506568431854, "rewards/rejected": -1.6371657848358154, "step": 2220 }, { "epoch": 0.5836168542266422, "grad_norm": 16.804216384887695, "learning_rate": 2.208255091531947e-06, "logits/chosen": 0.33556073904037476, "logits/rejected": 0.504830539226532, "logps/chosen": -567.1902465820312, "logps/rejected": -558.8784790039062, "loss": 0.5847, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.4798108339309692, "rewards/margins": 0.43180447816848755, "rewards/rejected": -1.9116153717041016, "step": 2230 }, { "epoch": 0.5862339701648783, "grad_norm": 15.775924682617188, "learning_rate": 2.1855790508866435e-06, "logits/chosen": 0.34685009717941284, "logits/rejected": 0.377047598361969, "logps/chosen": -602.8464965820312, "logps/rejected": -586.4207763671875, "loss": 0.6197, "rewards/accuracies": 0.625, "rewards/chosen": -1.6576731204986572, "rewards/margins": 0.36601829528808594, "rewards/rejected": -2.0236916542053223, "step": 2240 }, { "epoch": 0.5888510861031143, "grad_norm": 14.079179763793945, "learning_rate": 2.162929264300107e-06, "logits/chosen": 0.2139190137386322, "logits/rejected": 0.42656293511390686, "logps/chosen": -567.9656982421875, "logps/rejected": -579.9403076171875, "loss": 0.555, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.7237701416015625, "rewards/margins": 0.4748326241970062, "rewards/rejected": -2.1986026763916016, "step": 2250 }, { "epoch": 0.5914682020413504, "grad_norm": 18.067665100097656, "learning_rate": 2.1403076230230006e-06, "logits/chosen": 0.4777112603187561, "logits/rejected": 0.43725594878196716, "logps/chosen": -557.2151489257812, "logps/rejected": -545.1485595703125, "loss": 0.6474, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.7369823455810547, "rewards/margins": 0.29405802488327026, "rewards/rejected": -2.0310404300689697, "step": 2260 }, { "epoch": 0.5940853179795865, "grad_norm": 19.236621856689453, "learning_rate": 2.11771601595586e-06, "logits/chosen": 0.40373674035072327, "logits/rejected": 0.3856516480445862, "logps/chosen": -574.0790405273438, "logps/rejected": -524.0260009765625, "loss": 0.5974, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.5017083883285522, "rewards/margins": 0.43927374482154846, "rewards/rejected": -1.9409822225570679, "step": 2270 }, { "epoch": 0.5967024339178225, "grad_norm": 16.725879669189453, "learning_rate": 2.0951563294913737e-06, "logits/chosen": 0.3718245029449463, "logits/rejected": 0.4380251467227936, "logps/chosen": -513.0089111328125, "logps/rejected": -508.85760498046875, "loss": 0.5285, "rewards/accuracies": 0.71875, "rewards/chosen": -1.3701801300048828, "rewards/margins": 0.5219612717628479, "rewards/rejected": -1.892141342163086, "step": 2280 }, { "epoch": 0.5993195498560586, "grad_norm": 17.321361541748047, "learning_rate": 2.0726304473568693e-06, "logits/chosen": 0.25947481393814087, "logits/rejected": 0.3379734754562378, "logps/chosen": -519.076904296875, "logps/rejected": -498.84307861328125, "loss": 0.5948, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.3756697177886963, "rewards/margins": 0.35469603538513184, "rewards/rejected": -1.730365514755249, "step": 2290 }, { "epoch": 0.6019366657942947, "grad_norm": 27.503204345703125, "learning_rate": 2.050140250457023e-06, "logits/chosen": 0.13081859052181244, "logits/rejected": 0.22465069591999054, "logps/chosen": -527.054931640625, "logps/rejected": -524.3948974609375, "loss": 0.574, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.3721132278442383, "rewards/margins": 0.4912486672401428, "rewards/rejected": -1.8633617162704468, "step": 2300 }, { "epoch": 0.6019366657942947, "eval_logits/chosen": 0.20482583343982697, "eval_logits/rejected": 0.3316424489021301, "eval_logps/chosen": -533.3546752929688, "eval_logps/rejected": -529.7435302734375, "eval_loss": 0.5939305424690247, "eval_rewards/accuracies": 0.6819999814033508, "eval_rewards/chosen": -1.4498937129974365, "eval_rewards/margins": 0.40070000290870667, "eval_rewards/rejected": -1.8505936861038208, "eval_runtime": 232.6476, "eval_samples_per_second": 8.597, "eval_steps_per_second": 1.075, "step": 2300 }, { "epoch": 0.6045537817325307, "grad_norm": 14.945230484008789, "learning_rate": 2.0276876167168042e-06, "logits/chosen": 0.3673093914985657, "logits/rejected": 0.3980174660682678, "logps/chosen": -465.44775390625, "logps/rejected": -453.03076171875, "loss": 0.5977, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.4102133512496948, "rewards/margins": 0.36651554703712463, "rewards/rejected": -1.7767289876937866, "step": 2310 }, { "epoch": 0.6071708976707668, "grad_norm": 14.364018440246582, "learning_rate": 2.0052744209246682e-06, "logits/chosen": 0.34174802899360657, "logits/rejected": 0.3505280613899231, "logps/chosen": -476.4820861816406, "logps/rejected": -462.1893615722656, "loss": 0.5789, "rewards/accuracies": 0.71875, "rewards/chosen": -1.1048295497894287, "rewards/margins": 0.4239567816257477, "rewards/rejected": -1.5287864208221436, "step": 2320 }, { "epoch": 0.6097880136090029, "grad_norm": 19.915328979492188, "learning_rate": 1.9829025345760127e-06, "logits/chosen": 0.22554683685302734, "logits/rejected": 0.3321411609649658, "logps/chosen": -512.6522827148438, "logps/rejected": -518.2416381835938, "loss": 0.618, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.0541471242904663, "rewards/margins": 0.31440818309783936, "rewards/rejected": -1.3685553073883057, "step": 2330 }, { "epoch": 0.6124051295472389, "grad_norm": 27.91488265991211, "learning_rate": 1.9605738257169115e-06, "logits/chosen": 0.38059157133102417, "logits/rejected": 0.4925920367240906, "logps/chosen": -471.82098388671875, "logps/rejected": -469.058837890625, "loss": 0.6262, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.1800696849822998, "rewards/margins": 0.3158087134361267, "rewards/rejected": -1.4958784580230713, "step": 2340 }, { "epoch": 0.615022245485475, "grad_norm": 13.445463180541992, "learning_rate": 1.9382901587881275e-06, "logits/chosen": 0.22165732085704803, "logits/rejected": 0.3755477964878082, "logps/chosen": -512.750244140625, "logps/rejected": -493.84869384765625, "loss": 0.5605, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.2496296167373657, "rewards/margins": 0.4642421305179596, "rewards/rejected": -1.713871717453003, "step": 2350 }, { "epoch": 0.6176393614237111, "grad_norm": 17.654233932495117, "learning_rate": 1.916053394469437e-06, "logits/chosen": 0.22464020550251007, "logits/rejected": 0.412786066532135, "logps/chosen": -504.1710510253906, "logps/rejected": -522.4702758789062, "loss": 0.5487, "rewards/accuracies": 0.75, "rewards/chosen": -1.2194569110870361, "rewards/margins": 0.5263023972511292, "rewards/rejected": -1.7457596063613892, "step": 2360 }, { "epoch": 0.6202564773619471, "grad_norm": 14.313925743103027, "learning_rate": 1.8938653895242604e-06, "logits/chosen": 0.3136211335659027, "logits/rejected": 0.38369834423065186, "logps/chosen": -522.3583984375, "logps/rejected": -514.4608154296875, "loss": 0.5689, "rewards/accuracies": 0.75, "rewards/chosen": -1.2894423007965088, "rewards/margins": 0.50419020652771, "rewards/rejected": -1.7936325073242188, "step": 2370 }, { "epoch": 0.6228735933001832, "grad_norm": 22.269620895385742, "learning_rate": 1.8717279966446267e-06, "logits/chosen": 0.28245988488197327, "logits/rejected": 0.3623971939086914, "logps/chosen": -478.1153869628906, "logps/rejected": -492.8341369628906, "loss": 0.6142, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.2947511672973633, "rewards/margins": 0.3845621645450592, "rewards/rejected": -1.6793134212493896, "step": 2380 }, { "epoch": 0.6254907092384192, "grad_norm": 15.111832618713379, "learning_rate": 1.8496430642964698e-06, "logits/chosen": 0.28817129135131836, "logits/rejected": 0.31284889578819275, "logps/chosen": -510.97576904296875, "logps/rejected": -507.06689453125, "loss": 0.6027, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.2796354293823242, "rewards/margins": 0.38231566548347473, "rewards/rejected": -1.6619510650634766, "step": 2390 }, { "epoch": 0.6281078251766553, "grad_norm": 17.842960357666016, "learning_rate": 1.827612436565286e-06, "logits/chosen": 0.2098797857761383, "logits/rejected": 0.3998289704322815, "logps/chosen": -493.5147399902344, "logps/rejected": -490.57366943359375, "loss": 0.581, "rewards/accuracies": 0.6875, "rewards/chosen": -1.1564563512802124, "rewards/margins": 0.40797433257102966, "rewards/rejected": -1.564430594444275, "step": 2400 }, { "epoch": 0.6281078251766553, "eval_logits/chosen": 0.1449422538280487, "eval_logits/rejected": 0.2640918791294098, "eval_logps/chosen": -505.70361328125, "eval_logps/rejected": -501.2297668457031, "eval_loss": 0.5944039225578308, "eval_rewards/accuracies": 0.6869999766349792, "eval_rewards/chosen": -1.1733826398849487, "eval_rewards/margins": 0.39207327365875244, "eval_rewards/rejected": -1.5654560327529907, "eval_runtime": 232.3606, "eval_samples_per_second": 8.607, "eval_steps_per_second": 1.076, "step": 2400 }, { "epoch": 0.6307249411148914, "grad_norm": 17.78997802734375, "learning_rate": 1.8056379530021492e-06, "logits/chosen": 0.18455150723457336, "logits/rejected": 0.3481082618236542, "logps/chosen": -462.8374938964844, "logps/rejected": -461.21051025390625, "loss": 0.6006, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.1027500629425049, "rewards/margins": 0.3634553551673889, "rewards/rejected": -1.466205358505249, "step": 2410 }, { "epoch": 0.6333420570531274, "grad_norm": 20.840116500854492, "learning_rate": 1.7837214484701154e-06, "logits/chosen": 0.2612837255001068, "logits/rejected": 0.32097476720809937, "logps/chosen": -474.12774658203125, "logps/rejected": -475.11376953125, "loss": 0.5625, "rewards/accuracies": 0.71875, "rewards/chosen": -1.1177818775177002, "rewards/margins": 0.450967937707901, "rewards/rejected": -1.5687499046325684, "step": 2420 }, { "epoch": 0.6359591729913635, "grad_norm": 17.94804573059082, "learning_rate": 1.7618647529910043e-06, "logits/chosen": 0.24861130118370056, "logits/rejected": 0.3079971969127655, "logps/chosen": -495.91021728515625, "logps/rejected": -495.01434326171875, "loss": 0.5697, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.1746553182601929, "rewards/margins": 0.4058234691619873, "rewards/rejected": -1.5804787874221802, "step": 2430 }, { "epoch": 0.6385762889295996, "grad_norm": 16.069711685180664, "learning_rate": 1.7400696915925996e-06, "logits/chosen": 0.05449223518371582, "logits/rejected": 0.3324371874332428, "logps/chosen": -504.40509033203125, "logps/rejected": -470.92529296875, "loss": 0.5925, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.2023115158081055, "rewards/margins": 0.4562528729438782, "rewards/rejected": -1.6585643291473389, "step": 2440 }, { "epoch": 0.6411934048678356, "grad_norm": 23.202844619750977, "learning_rate": 1.718338084156254e-06, "logits/chosen": 0.07136271893978119, "logits/rejected": 0.224413201212883, "logps/chosen": -545.9983520507812, "logps/rejected": -521.3710327148438, "loss": 0.5708, "rewards/accuracies": 0.6875, "rewards/chosen": -1.2068856954574585, "rewards/margins": 0.4760914742946625, "rewards/rejected": -1.682977318763733, "step": 2450 }, { "epoch": 0.6438105208060717, "grad_norm": 13.684337615966797, "learning_rate": 1.6966717452649372e-06, "logits/chosen": 0.2635645270347595, "logits/rejected": 0.22264519333839417, "logps/chosen": -511.80584716796875, "logps/rejected": -482.666015625, "loss": 0.5498, "rewards/accuracies": 0.75, "rewards/chosen": -1.1622530221939087, "rewards/margins": 0.5128196477890015, "rewards/rejected": -1.6750726699829102, "step": 2460 }, { "epoch": 0.6464276367443078, "grad_norm": 16.29932975769043, "learning_rate": 1.6750724840517103e-06, "logits/chosen": 0.17669948935508728, "logits/rejected": 0.28178560733795166, "logps/chosen": -506.5672912597656, "logps/rejected": -530.7100830078125, "loss": 0.5961, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.264910340309143, "rewards/margins": 0.4237841069698334, "rewards/rejected": -1.6886943578720093, "step": 2470 }, { "epoch": 0.6490447526825438, "grad_norm": 22.445497512817383, "learning_rate": 1.6535421040486686e-06, "logits/chosen": 0.25904372334480286, "logits/rejected": 0.34389209747314453, "logps/chosen": -510.6214294433594, "logps/rejected": -497.93377685546875, "loss": 0.5676, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.5339219570159912, "rewards/margins": 0.4442078173160553, "rewards/rejected": -1.9781297445297241, "step": 2480 }, { "epoch": 0.6516618686207799, "grad_norm": 13.630302429199219, "learning_rate": 1.6320824030363458e-06, "logits/chosen": 0.07676917314529419, "logits/rejected": 0.0776657983660698, "logps/chosen": -501.290771484375, "logps/rejected": -504.23504638671875, "loss": 0.5775, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.433424711227417, "rewards/margins": 0.43815937638282776, "rewards/rejected": -1.871584177017212, "step": 2490 }, { "epoch": 0.654278984559016, "grad_norm": 17.672494888305664, "learning_rate": 1.6106951728936028e-06, "logits/chosen": 0.06636019051074982, "logits/rejected": 0.24429766833782196, "logps/chosen": -503.6896057128906, "logps/rejected": -528.3905029296875, "loss": 0.5516, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.2013368606567383, "rewards/margins": 0.5229911208152771, "rewards/rejected": -1.724327802658081, "step": 2500 }, { "epoch": 0.654278984559016, "eval_logits/chosen": 0.06589578092098236, "eval_logits/rejected": 0.17816391587257385, "eval_logps/chosen": -510.28302001953125, "eval_logps/rejected": -507.39532470703125, "eval_loss": 0.5968104004859924, "eval_rewards/accuracies": 0.6775000095367432, "eval_rewards/chosen": -1.219177007675171, "eval_rewards/margins": 0.4079345464706421, "eval_rewards/rejected": -1.627111554145813, "eval_runtime": 232.5509, "eval_samples_per_second": 8.6, "eval_steps_per_second": 1.075, "step": 2500 }, { "epoch": 0.656896100497252, "grad_norm": 21.569828033447266, "learning_rate": 1.5893821994479996e-06, "logits/chosen": 0.22845594584941864, "logits/rejected": 0.29911938309669495, "logps/chosen": -505.616943359375, "logps/rejected": -486.45947265625, "loss": 0.5867, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.05279541015625, "rewards/margins": 0.43759018182754517, "rewards/rejected": -1.4903854131698608, "step": 2510 }, { "epoch": 0.6595132164354881, "grad_norm": 19.897043228149414, "learning_rate": 1.5681452623266868e-06, "logits/chosen": -0.006847086362540722, "logits/rejected": 0.17560932040214539, "logps/chosen": -525.3328857421875, "logps/rejected": -500.6075134277344, "loss": 0.5357, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.1965337991714478, "rewards/margins": 0.5665737390518188, "rewards/rejected": -1.7631075382232666, "step": 2520 }, { "epoch": 0.6621303323737242, "grad_norm": 14.37187385559082, "learning_rate": 1.5469861348078014e-06, "logits/chosen": 0.15944847464561462, "logits/rejected": 0.2913690209388733, "logps/chosen": -473.12652587890625, "logps/rejected": -505.186279296875, "loss": 0.5359, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.2122926712036133, "rewards/margins": 0.5305719375610352, "rewards/rejected": -1.7428646087646484, "step": 2530 }, { "epoch": 0.6647474483119602, "grad_norm": 14.125144958496094, "learning_rate": 1.5259065836724035e-06, "logits/chosen": 0.17984794080257416, "logits/rejected": 0.2315172702074051, "logps/chosen": -485.05938720703125, "logps/rejected": -505.0470275878906, "loss": 0.6083, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.2600443363189697, "rewards/margins": 0.4279232621192932, "rewards/rejected": -1.6879676580429077, "step": 2540 }, { "epoch": 0.6673645642501963, "grad_norm": 27.50576400756836, "learning_rate": 1.5049083690569456e-06, "logits/chosen": 0.17406558990478516, "logits/rejected": 0.3330245018005371, "logps/chosen": -472.1449279785156, "logps/rejected": -494.12677001953125, "loss": 0.6191, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.262279987335205, "rewards/margins": 0.39653074741363525, "rewards/rejected": -1.6588106155395508, "step": 2550 }, { "epoch": 0.6699816801884323, "grad_norm": 27.87693977355957, "learning_rate": 1.4839932443063057e-06, "logits/chosen": 0.2273554801940918, "logits/rejected": 0.20852844417095184, "logps/chosen": -547.5466918945312, "logps/rejected": -501.408203125, "loss": 0.5473, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.1462863683700562, "rewards/margins": 0.48758673667907715, "rewards/rejected": -1.6338729858398438, "step": 2560 }, { "epoch": 0.6725987961266684, "grad_norm": 23.339811325073242, "learning_rate": 1.4631629558273803e-06, "logits/chosen": 0.1783367097377777, "logits/rejected": 0.3085087239742279, "logps/chosen": -486.72100830078125, "logps/rejected": -489.66387939453125, "loss": 0.6158, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.1978503465652466, "rewards/margins": 0.3335895240306854, "rewards/rejected": -1.5314397811889648, "step": 2570 }, { "epoch": 0.6752159120649045, "grad_norm": 15.076952934265137, "learning_rate": 1.4424192429432657e-06, "logits/chosen": 0.16626477241516113, "logits/rejected": 0.18546536564826965, "logps/chosen": -478.89300537109375, "logps/rejected": -508.7444763183594, "loss": 0.5569, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.0055302381515503, "rewards/margins": 0.4730769693851471, "rewards/rejected": -1.478607177734375, "step": 2580 }, { "epoch": 0.6778330280031405, "grad_norm": 27.09433364868164, "learning_rate": 1.421763837748016e-06, "logits/chosen": 0.2768346667289734, "logits/rejected": 0.35653841495513916, "logps/chosen": -494.996337890625, "logps/rejected": -491.51116943359375, "loss": 0.6019, "rewards/accuracies": 0.65625, "rewards/chosen": -1.140509009361267, "rewards/margins": 0.38551202416419983, "rewards/rejected": -1.5260212421417236, "step": 2590 }, { "epoch": 0.6804501439413766, "grad_norm": 22.913333892822266, "learning_rate": 1.401198464962021e-06, "logits/chosen": 0.15527863800525665, "logits/rejected": 0.21972744166851044, "logps/chosen": -525.5530395507812, "logps/rejected": -504.53814697265625, "loss": 0.5515, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.2042077779769897, "rewards/margins": 0.5288984179496765, "rewards/rejected": -1.7331063747406006, "step": 2600 }, { "epoch": 0.6804501439413766, "eval_logits/chosen": 0.08117599785327911, "eval_logits/rejected": 0.1933600753545761, "eval_logps/chosen": -520.3548583984375, "eval_logps/rejected": -520.2850952148438, "eval_loss": 0.5958514213562012, "eval_rewards/accuracies": 0.6765000224113464, "eval_rewards/chosen": -1.3198949098587036, "eval_rewards/margins": 0.43611443042755127, "eval_rewards/rejected": -1.7560093402862549, "eval_runtime": 232.575, "eval_samples_per_second": 8.599, "eval_steps_per_second": 1.075, "step": 2600 }, { "epoch": 0.6830672598796127, "grad_norm": 23.55072593688965, "learning_rate": 1.3807248417879896e-06, "logits/chosen": 0.03950003907084465, "logits/rejected": 0.09041625261306763, "logps/chosen": -530.7230224609375, "logps/rejected": -526.6931762695312, "loss": 0.5821, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.3074090480804443, "rewards/margins": 0.4792531430721283, "rewards/rejected": -1.7866621017456055, "step": 2610 }, { "epoch": 0.6856843758178487, "grad_norm": 52.15616226196289, "learning_rate": 1.3603446777675665e-06, "logits/chosen": 0.2288985550403595, "logits/rejected": 0.39459601044654846, "logps/chosen": -510.23687744140625, "logps/rejected": -519.2042236328125, "loss": 0.5945, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.2479314804077148, "rewards/margins": 0.4636309742927551, "rewards/rejected": -1.7115623950958252, "step": 2620 }, { "epoch": 0.6883014917560848, "grad_norm": 17.42203712463379, "learning_rate": 1.3400596746385817e-06, "logits/chosen": 0.1373758614063263, "logits/rejected": 0.24959711730480194, "logps/chosen": -511.2616271972656, "logps/rejected": -500.71527099609375, "loss": 0.6122, "rewards/accuracies": 0.65625, "rewards/chosen": -1.1762168407440186, "rewards/margins": 0.40532511472702026, "rewards/rejected": -1.5815417766571045, "step": 2630 }, { "epoch": 0.6909186076943209, "grad_norm": 23.382965087890625, "learning_rate": 1.3198715261929587e-06, "logits/chosen": 0.24336513876914978, "logits/rejected": 0.290175199508667, "logps/chosen": -476.31549072265625, "logps/rejected": -488.55010986328125, "loss": 0.5516, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.297133207321167, "rewards/margins": 0.47855645418167114, "rewards/rejected": -1.775689721107483, "step": 2640 }, { "epoch": 0.6935357236325569, "grad_norm": 27.794416427612305, "learning_rate": 1.2997819181352823e-06, "logits/chosen": 0.029675770550966263, "logits/rejected": 0.17278780043125153, "logps/chosen": -564.1571655273438, "logps/rejected": -551.5831909179688, "loss": 0.552, "rewards/accuracies": 0.75, "rewards/chosen": -1.2936184406280518, "rewards/margins": 0.5748482942581177, "rewards/rejected": -1.8684667348861694, "step": 2650 }, { "epoch": 0.696152839570793, "grad_norm": 27.250629425048828, "learning_rate": 1.2797925279420454e-06, "logits/chosen": 0.1551249921321869, "logits/rejected": 0.22874709963798523, "logps/chosen": -531.533447265625, "logps/rejected": -539.8661499023438, "loss": 0.5713, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.3435332775115967, "rewards/margins": 0.5102267265319824, "rewards/rejected": -1.8537601232528687, "step": 2660 }, { "epoch": 0.6987699555090291, "grad_norm": 17.067325592041016, "learning_rate": 1.2599050247215764e-06, "logits/chosen": 0.08357984572649002, "logits/rejected": 0.1700626015663147, "logps/chosen": -526.3895263671875, "logps/rejected": -539.6080322265625, "loss": 0.5181, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.2903873920440674, "rewards/margins": 0.6351937651634216, "rewards/rejected": -1.9255812168121338, "step": 2670 }, { "epoch": 0.7013870714472651, "grad_norm": 21.607013702392578, "learning_rate": 1.2401210690746705e-06, "logits/chosen": 0.07735034078359604, "logits/rejected": 0.2136649787425995, "logps/chosen": -531.7027587890625, "logps/rejected": -509.68701171875, "loss": 0.6257, "rewards/accuracies": 0.625, "rewards/chosen": -1.3712577819824219, "rewards/margins": 0.37780770659446716, "rewards/rejected": -1.749065637588501, "step": 2680 }, { "epoch": 0.7040041873855012, "grad_norm": 22.4666748046875, "learning_rate": 1.2204423129559306e-06, "logits/chosen": 0.19079174101352692, "logits/rejected": 0.2903195023536682, "logps/chosen": -524.7943725585938, "logps/rejected": -556.605224609375, "loss": 0.5842, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.4239646196365356, "rewards/margins": 0.49202004075050354, "rewards/rejected": -1.9159847497940063, "step": 2690 }, { "epoch": 0.7066213033237373, "grad_norm": 26.54542350769043, "learning_rate": 1.20087039953583e-06, "logits/chosen": 0.18427929282188416, "logits/rejected": 0.3345043957233429, "logps/chosen": -519.188720703125, "logps/rejected": -516.6758422851562, "loss": 0.6139, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.3359040021896362, "rewards/margins": 0.4217115342617035, "rewards/rejected": -1.7576156854629517, "step": 2700 }, { "epoch": 0.7066213033237373, "eval_logits/chosen": 0.08023716509342194, "eval_logits/rejected": 0.194534033536911, "eval_logps/chosen": -529.7188720703125, "eval_logps/rejected": -528.66455078125, "eval_loss": 0.594265878200531, "eval_rewards/accuracies": 0.6784999966621399, "eval_rewards/chosen": -1.4135349988937378, "eval_rewards/margins": 0.4262690842151642, "eval_rewards/rejected": -1.8398040533065796, "eval_runtime": 232.327, "eval_samples_per_second": 8.609, "eval_steps_per_second": 1.076, "step": 2700 }, { "epoch": 0.7092384192619733, "grad_norm": 21.013721466064453, "learning_rate": 1.181406963063507e-06, "logits/chosen": 0.23000892996788025, "logits/rejected": 0.30790433287620544, "logps/chosen": -526.0445556640625, "logps/rejected": -554.0514526367188, "loss": 0.594, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.3565577268600464, "rewards/margins": 0.46391409635543823, "rewards/rejected": -1.8204717636108398, "step": 2710 }, { "epoch": 0.7118555352002094, "grad_norm": 18.518587112426758, "learning_rate": 1.1620536287303052e-06, "logits/chosen": 0.19795748591423035, "logits/rejected": 0.2533331513404846, "logps/chosen": -560.046630859375, "logps/rejected": -535.3902587890625, "loss": 0.653, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.4724786281585693, "rewards/margins": 0.31183096766471863, "rewards/rejected": -1.7843097448349, "step": 2720 }, { "epoch": 0.7144726511384454, "grad_norm": 18.251569747924805, "learning_rate": 1.1428120125340717e-06, "logits/chosen": 0.37381523847579956, "logits/rejected": 0.42608457803726196, "logps/chosen": -497.0757751464844, "logps/rejected": -489.75286865234375, "loss": 0.5237, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.37063729763031, "rewards/margins": 0.5976725816726685, "rewards/rejected": -1.9683096408843994, "step": 2730 }, { "epoch": 0.7170897670766815, "grad_norm": 33.466217041015625, "learning_rate": 1.123683721144223e-06, "logits/chosen": 0.33738958835601807, "logits/rejected": 0.4564022123813629, "logps/chosen": -546.6898193359375, "logps/rejected": -542.0783081054688, "loss": 0.5693, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.379553198814392, "rewards/margins": 0.47975024580955505, "rewards/rejected": -1.8593032360076904, "step": 2740 }, { "epoch": 0.7197068830149176, "grad_norm": 16.89527702331543, "learning_rate": 1.1046703517675848e-06, "logits/chosen": 0.325847327709198, "logits/rejected": 0.5274810791015625, "logps/chosen": -478.98846435546875, "logps/rejected": -527.6720581054688, "loss": 0.5674, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.1666440963745117, "rewards/margins": 0.46127814054489136, "rewards/rejected": -1.6279222965240479, "step": 2750 }, { "epoch": 0.7223239989531536, "grad_norm": 26.121501922607422, "learning_rate": 1.085773492015028e-06, "logits/chosen": 0.14125783741474152, "logits/rejected": 0.25838667154312134, "logps/chosen": -488.41827392578125, "logps/rejected": -485.6160583496094, "loss": 0.5577, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.267000436782837, "rewards/margins": 0.5229637622833252, "rewards/rejected": -1.7899643182754517, "step": 2760 }, { "epoch": 0.7249411148913897, "grad_norm": 26.272478103637695, "learning_rate": 1.0669947197689034e-06, "logits/chosen": 0.3034301698207855, "logits/rejected": 0.33288899064064026, "logps/chosen": -546.6353149414062, "logps/rejected": -535.7976684570312, "loss": 0.5917, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.4825105667114258, "rewards/margins": 0.4382646679878235, "rewards/rejected": -1.920775055885315, "step": 2770 }, { "epoch": 0.7275582308296258, "grad_norm": 24.98711585998535, "learning_rate": 1.048335603051291e-06, "logits/chosen": 0.2563607692718506, "logits/rejected": 0.35739272832870483, "logps/chosen": -576.2966918945312, "logps/rejected": -583.8340454101562, "loss": 0.5134, "rewards/accuracies": 0.75, "rewards/chosen": -1.4835859537124634, "rewards/margins": 0.6780148148536682, "rewards/rejected": -2.1616008281707764, "step": 2780 }, { "epoch": 0.7301753467678618, "grad_norm": 28.010181427001953, "learning_rate": 1.0297976998930665e-06, "logits/chosen": 0.23113617300987244, "logits/rejected": 0.324890673160553, "logps/chosen": -531.7484130859375, "logps/rejected": -531.4666748046875, "loss": 0.5544, "rewards/accuracies": 0.71875, "rewards/chosen": -1.5267302989959717, "rewards/margins": 0.5725789666175842, "rewards/rejected": -2.0993094444274902, "step": 2790 }, { "epoch": 0.7327924627060979, "grad_norm": 32.755393981933594, "learning_rate": 1.0113825582038078e-06, "logits/chosen": 0.2566104531288147, "logits/rejected": 0.37072187662124634, "logps/chosen": -558.6234741210938, "logps/rejected": -554.4527587890625, "loss": 0.5976, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.625914216041565, "rewards/margins": 0.4005354344844818, "rewards/rejected": -2.026449680328369, "step": 2800 }, { "epoch": 0.7327924627060979, "eval_logits/chosen": 0.11360778659582138, "eval_logits/rejected": 0.23131908476352692, "eval_logps/chosen": -545.891845703125, "eval_logps/rejected": -547.037109375, "eval_loss": 0.5920617580413818, "eval_rewards/accuracies": 0.6784999966621399, "eval_rewards/chosen": -1.5752650499343872, "eval_rewards/margins": 0.44826528429985046, "eval_rewards/rejected": -2.0235302448272705, "eval_runtime": 232.6397, "eval_samples_per_second": 8.597, "eval_steps_per_second": 1.075, "step": 2800 }, { "epoch": 0.735409578644334, "grad_norm": 14.789016723632812, "learning_rate": 9.930917156425477e-07, "logits/chosen": 0.17329376935958862, "logits/rejected": 0.3116983473300934, "logps/chosen": -529.4105224609375, "logps/rejected": -557.4210815429688, "loss": 0.574, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.6027743816375732, "rewards/margins": 0.5013204216957092, "rewards/rejected": -2.104094982147217, "step": 2810 }, { "epoch": 0.73802669458257, "grad_norm": 22.683055877685547, "learning_rate": 9.749266994893756e-07, "logits/chosen": 0.33349448442459106, "logits/rejected": 0.34794288873672485, "logps/chosen": -520.3793334960938, "logps/rejected": -538.2870483398438, "loss": 0.6172, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.6407073736190796, "rewards/margins": 0.37328147888183594, "rewards/rejected": -2.013988971710205, "step": 2820 }, { "epoch": 0.7406438105208061, "grad_norm": 27.596086502075195, "learning_rate": 9.56889026517913e-07, "logits/chosen": 0.3088427484035492, "logits/rejected": 0.48099619150161743, "logps/chosen": -532.8345947265625, "logps/rejected": -520.9815673828125, "loss": 0.6012, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.565934181213379, "rewards/margins": 0.41419801115989685, "rewards/rejected": -1.9801323413848877, "step": 2830 }, { "epoch": 0.7432609264590422, "grad_norm": 27.7423095703125, "learning_rate": 9.389802028686617e-07, "logits/chosen": 0.35369396209716797, "logits/rejected": 0.2665908932685852, "logps/chosen": -524.5148315429688, "logps/rejected": -510.5908203125, "loss": 0.616, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.4806346893310547, "rewards/margins": 0.37274661660194397, "rewards/rejected": -1.8533813953399658, "step": 2840 }, { "epoch": 0.7458780423972782, "grad_norm": 16.51445770263672, "learning_rate": 9.212017239232427e-07, "logits/chosen": 0.1338292360305786, "logits/rejected": 0.34962359070777893, "logps/chosen": -557.58154296875, "logps/rejected": -550.517578125, "loss": 0.5427, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.4360605478286743, "rewards/margins": 0.5636521577835083, "rewards/rejected": -1.9997127056121826, "step": 2850 }, { "epoch": 0.7484951583355143, "grad_norm": 25.57477378845215, "learning_rate": 9.03555074179533e-07, "logits/chosen": 0.12778687477111816, "logits/rejected": 0.31376713514328003, "logps/chosen": -526.8626098632812, "logps/rejected": -553.7708740234375, "loss": 0.5618, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.3927412033081055, "rewards/margins": 0.521297812461853, "rewards/rejected": -1.9140390157699585, "step": 2860 }, { "epoch": 0.7511122742737504, "grad_norm": 18.769468307495117, "learning_rate": 8.860417271277067e-07, "logits/chosen": 0.10975948721170425, "logits/rejected": 0.36845940351486206, "logps/chosen": -543.9274291992188, "logps/rejected": -546.5499267578125, "loss": 0.6186, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.3972618579864502, "rewards/margins": 0.3473066985607147, "rewards/rejected": -1.7445685863494873, "step": 2870 }, { "epoch": 0.7537293902119864, "grad_norm": 22.453828811645508, "learning_rate": 8.686631451272029e-07, "logits/chosen": 0.16480425000190735, "logits/rejected": 0.2969481647014618, "logps/chosen": -519.5490112304688, "logps/rejected": -512.5626831054688, "loss": 0.6033, "rewards/accuracies": 0.65625, "rewards/chosen": -1.5151922702789307, "rewards/margins": 0.39446836709976196, "rewards/rejected": -1.9096605777740479, "step": 2880 }, { "epoch": 0.7563465061502225, "grad_norm": 20.046194076538086, "learning_rate": 8.514207792846168e-07, "logits/chosen": 0.31565916538238525, "logits/rejected": 0.39657875895500183, "logps/chosen": -525.2118530273438, "logps/rejected": -516.3604736328125, "loss": 0.5859, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.533144235610962, "rewards/margins": 0.43004053831100464, "rewards/rejected": -1.9631845951080322, "step": 2890 }, { "epoch": 0.7589636220884585, "grad_norm": 19.951534271240234, "learning_rate": 8.343160693325356e-07, "logits/chosen": 0.24349746108055115, "logits/rejected": 0.3447602689266205, "logps/chosen": -516.9713134765625, "logps/rejected": -535.793212890625, "loss": 0.586, "rewards/accuracies": 0.6875, "rewards/chosen": -1.4326432943344116, "rewards/margins": 0.43857187032699585, "rewards/rejected": -1.8712152242660522, "step": 2900 }, { "epoch": 0.7589636220884585, "eval_logits/chosen": 0.12212979793548584, "eval_logits/rejected": 0.24171759188175201, "eval_logps/chosen": -535.29541015625, "eval_logps/rejected": -534.5630493164062, "eval_loss": 0.5905064940452576, "eval_rewards/accuracies": 0.6779999732971191, "eval_rewards/chosen": -1.4693007469177246, "eval_rewards/margins": 0.4294882118701935, "eval_rewards/rejected": -1.8987890481948853, "eval_runtime": 232.2009, "eval_samples_per_second": 8.613, "eval_steps_per_second": 1.077, "step": 2900 }, { "epoch": 0.7615807380266946, "grad_norm": 15.65670394897461, "learning_rate": 8.173504435093174e-07, "logits/chosen": 0.3191450834274292, "logits/rejected": 0.452759325504303, "logps/chosen": -491.33831787109375, "logps/rejected": -494.2073669433594, "loss": 0.5618, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.4124243259429932, "rewards/margins": 0.518468976020813, "rewards/rejected": -1.9308933019638062, "step": 2910 }, { "epoch": 0.7641978539649307, "grad_norm": 17.081989288330078, "learning_rate": 8.00525318439836e-07, "logits/chosen": 0.25279700756073, "logits/rejected": 0.3255782127380371, "logps/chosen": -534.6649169921875, "logps/rejected": -551.4168701171875, "loss": 0.6125, "rewards/accuracies": 0.6875, "rewards/chosen": -1.3950837850570679, "rewards/margins": 0.39895501732826233, "rewards/rejected": -1.7940387725830078, "step": 2920 }, { "epoch": 0.7668149699031667, "grad_norm": 21.055017471313477, "learning_rate": 7.838420990171927e-07, "logits/chosen": 0.20293910801410675, "logits/rejected": 0.274809867143631, "logps/chosen": -526.1995849609375, "logps/rejected": -538.4735717773438, "loss": 0.5446, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.3020565509796143, "rewards/margins": 0.5154568552970886, "rewards/rejected": -1.8175132274627686, "step": 2930 }, { "epoch": 0.7694320858414028, "grad_norm": 17.768627166748047, "learning_rate": 7.673021782854084e-07, "logits/chosen": 0.3839934468269348, "logits/rejected": 0.3556897044181824, "logps/chosen": -529.2373046875, "logps/rejected": -502.952880859375, "loss": 0.5787, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.4653526544570923, "rewards/margins": 0.48818325996398926, "rewards/rejected": -1.953536033630371, "step": 2940 }, { "epoch": 0.7720492017796389, "grad_norm": 19.30914878845215, "learning_rate": 7.509069373231039e-07, "logits/chosen": 0.23126430809497833, "logits/rejected": 0.22901423275470734, "logps/chosen": -515.2665405273438, "logps/rejected": -515.766357421875, "loss": 0.5974, "rewards/accuracies": 0.65625, "rewards/chosen": -1.515284538269043, "rewards/margins": 0.42790335416793823, "rewards/rejected": -1.9431880712509155, "step": 2950 }, { "epoch": 0.7746663177178749, "grad_norm": 20.416574478149414, "learning_rate": 7.346577451281822e-07, "logits/chosen": 0.31689321994781494, "logits/rejected": 0.37642043828964233, "logps/chosen": -539.6643676757812, "logps/rejected": -538.9885864257812, "loss": 0.5562, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.518949270248413, "rewards/margins": 0.5324884653091431, "rewards/rejected": -2.0514376163482666, "step": 2960 }, { "epoch": 0.777283433656111, "grad_norm": 28.220792770385742, "learning_rate": 7.185559585035138e-07, "logits/chosen": 0.07478724420070648, "logits/rejected": 0.2702252268791199, "logps/chosen": -551.8621215820312, "logps/rejected": -564.0684814453125, "loss": 0.5573, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.43454909324646, "rewards/margins": 0.5544053912162781, "rewards/rejected": -1.988954782485962, "step": 2970 }, { "epoch": 0.7799005495943471, "grad_norm": 18.90873146057129, "learning_rate": 7.026029219436504e-07, "logits/chosen": 0.16967260837554932, "logits/rejected": 0.33861225843429565, "logps/chosen": -534.6746215820312, "logps/rejected": -526.2569580078125, "loss": 0.6105, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.4762811660766602, "rewards/margins": 0.38691508769989014, "rewards/rejected": -1.8631963729858398, "step": 2980 }, { "epoch": 0.7825176655325831, "grad_norm": 15.61099910736084, "learning_rate": 6.867999675225523e-07, "logits/chosen": 0.2207004576921463, "logits/rejected": 0.2613358795642853, "logps/chosen": -489.48529052734375, "logps/rejected": -498.1441955566406, "loss": 0.5601, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.4658935070037842, "rewards/margins": 0.5164823532104492, "rewards/rejected": -1.9823758602142334, "step": 2990 }, { "epoch": 0.7851347814708192, "grad_norm": 29.250232696533203, "learning_rate": 6.711484147823663e-07, "logits/chosen": 0.1829605996608734, "logits/rejected": 0.3020482361316681, "logps/chosen": -495.75927734375, "logps/rejected": -530.975830078125, "loss": 0.5671, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.4143750667572021, "rewards/margins": 0.502804696559906, "rewards/rejected": -1.9171797037124634, "step": 3000 }, { "epoch": 0.7851347814708192, "eval_logits/chosen": 0.1228410005569458, "eval_logits/rejected": 0.24235635995864868, "eval_logps/chosen": -534.0778198242188, "eval_logps/rejected": -533.9715576171875, "eval_loss": 0.5899218916893005, "eval_rewards/accuracies": 0.6794999837875366, "eval_rewards/chosen": -1.4571242332458496, "eval_rewards/margins": 0.4357497990131378, "eval_rewards/rejected": -1.8928741216659546, "eval_runtime": 232.4162, "eval_samples_per_second": 8.605, "eval_steps_per_second": 1.076, "step": 3000 }, { "epoch": 0.7877518974090553, "grad_norm": 21.612812042236328, "learning_rate": 6.556495706232413e-07, "logits/chosen": 0.28465574979782104, "logits/rejected": 0.3168385922908783, "logps/chosen": -515.8154296875, "logps/rejected": -536.3228759765625, "loss": 0.5725, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.4145126342773438, "rewards/margins": 0.49681711196899414, "rewards/rejected": -1.9113296270370483, "step": 3010 }, { "epoch": 0.7903690133472913, "grad_norm": 19.919925689697266, "learning_rate": 6.403047291942057e-07, "logits/chosen": 0.25551754236221313, "logits/rejected": 0.3729092478752136, "logps/chosen": -478.81451416015625, "logps/rejected": -482.24725341796875, "loss": 0.5659, "rewards/accuracies": 0.6875, "rewards/chosen": -1.4676393270492554, "rewards/margins": 0.47457486391067505, "rewards/rejected": -1.9422142505645752, "step": 3020 }, { "epoch": 0.7929861292855274, "grad_norm": 25.699249267578125, "learning_rate": 6.251151717851023e-07, "logits/chosen": 0.3613748848438263, "logits/rejected": 0.464524507522583, "logps/chosen": -483.13775634765625, "logps/rejected": -510.97576904296875, "loss": 0.6078, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.4159023761749268, "rewards/margins": 0.4669385850429535, "rewards/rejected": -1.882840871810913, "step": 3030 }, { "epoch": 0.7956032452237635, "grad_norm": 18.529563903808594, "learning_rate": 6.100821667196041e-07, "logits/chosen": 0.1572251319885254, "logits/rejected": 0.26168403029441833, "logps/chosen": -538.1602783203125, "logps/rejected": -491.727783203125, "loss": 0.5572, "rewards/accuracies": 0.71875, "rewards/chosen": -1.387502908706665, "rewards/margins": 0.5043372511863708, "rewards/rejected": -1.8918402194976807, "step": 3040 }, { "epoch": 0.7982203611619995, "grad_norm": 18.66206169128418, "learning_rate": 5.952069692493062e-07, "logits/chosen": 0.15295560657978058, "logits/rejected": 0.2732795178890228, "logps/chosen": -466.65350341796875, "logps/rejected": -510.0077209472656, "loss": 0.5483, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.4113075733184814, "rewards/margins": 0.5156591534614563, "rewards/rejected": -1.926966667175293, "step": 3050 }, { "epoch": 0.8008374771002356, "grad_norm": 32.57720947265625, "learning_rate": 5.80490821448918e-07, "logits/chosen": 0.18402081727981567, "logits/rejected": 0.25058144330978394, "logps/chosen": -528.1961059570312, "logps/rejected": -616.8970947265625, "loss": 0.5587, "rewards/accuracies": 0.6875, "rewards/chosen": -1.4876148700714111, "rewards/margins": 0.5890025496482849, "rewards/rejected": -2.0766172409057617, "step": 3060 }, { "epoch": 0.8034545930384716, "grad_norm": 22.268369674682617, "learning_rate": 5.659349521125459e-07, "logits/chosen": 0.08234192430973053, "logits/rejected": 0.09182853996753693, "logps/chosen": -555.3732299804688, "logps/rejected": -554.161865234375, "loss": 0.596, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.4457889795303345, "rewards/margins": 0.41241535544395447, "rewards/rejected": -1.8582042455673218, "step": 3070 }, { "epoch": 0.8060717089767077, "grad_norm": 18.810224533081055, "learning_rate": 5.5154057665109e-07, "logits/chosen": 0.11081822216510773, "logits/rejected": 0.25048089027404785, "logps/chosen": -493.99493408203125, "logps/rejected": -511.19464111328125, "loss": 0.5361, "rewards/accuracies": 0.75, "rewards/chosen": -1.3138765096664429, "rewards/margins": 0.5303700566291809, "rewards/rejected": -1.844246506690979, "step": 3080 }, { "epoch": 0.8086888249149438, "grad_norm": 20.452537536621094, "learning_rate": 5.373088969907586e-07, "logits/chosen": 0.1712993085384369, "logits/rejected": 0.28175634145736694, "logps/chosen": -530.8388671875, "logps/rejected": -525.1220703125, "loss": 0.5466, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.279170274734497, "rewards/margins": 0.5140555500984192, "rewards/rejected": -1.793225884437561, "step": 3090 }, { "epoch": 0.8113059408531798, "grad_norm": 18.429285049438477, "learning_rate": 5.23241101472709e-07, "logits/chosen": 0.1318766474723816, "logits/rejected": 0.12211046367883682, "logps/chosen": -517.6220092773438, "logps/rejected": -529.6044921875, "loss": 0.56, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.189562201499939, "rewards/margins": 0.5340698957443237, "rewards/rejected": -1.7236320972442627, "step": 3100 }, { "epoch": 0.8113059408531798, "eval_logits/chosen": 0.08669886738061905, "eval_logits/rejected": 0.2020561546087265, "eval_logps/chosen": -520.4529418945312, "eval_logps/rejected": -520.4334106445312, "eval_loss": 0.5915951132774353, "eval_rewards/accuracies": 0.6809999942779541, "eval_rewards/chosen": -1.3208762407302856, "eval_rewards/margins": 0.43661609292030334, "eval_rewards/rejected": -1.757492184638977, "eval_runtime": 232.359, "eval_samples_per_second": 8.607, "eval_steps_per_second": 1.076, "step": 3100 }, { "epoch": 0.8139230567914159, "grad_norm": 21.272666931152344, "learning_rate": 5.09338364753818e-07, "logits/chosen": 0.22215643525123596, "logits/rejected": 0.2583310008049011, "logps/chosen": -535.0568237304688, "logps/rejected": -544.3440551757812, "loss": 0.5716, "rewards/accuracies": 0.6875, "rewards/chosen": -1.234817624092102, "rewards/margins": 0.4773196280002594, "rewards/rejected": -1.71213698387146, "step": 3110 }, { "epoch": 0.816540172729652, "grad_norm": 26.98297882080078, "learning_rate": 4.956018477086005e-07, "logits/chosen": 0.21868661046028137, "logits/rejected": 0.32554829120635986, "logps/chosen": -545.9993896484375, "logps/rejected": -515.1287231445312, "loss": 0.6281, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.3831281661987305, "rewards/margins": 0.3917158544063568, "rewards/rejected": -1.7748441696166992, "step": 3120 }, { "epoch": 0.819157288667888, "grad_norm": 19.549560546875, "learning_rate": 4.820326973322764e-07, "logits/chosen": 0.072993703186512, "logits/rejected": 0.19295060634613037, "logps/chosen": -512.5438842773438, "logps/rejected": -527.2860107421875, "loss": 0.5642, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.34549880027771, "rewards/margins": 0.4567781090736389, "rewards/rejected": -1.8022769689559937, "step": 3130 }, { "epoch": 0.821774404606124, "grad_norm": 22.311878204345703, "learning_rate": 4.686320466449981e-07, "logits/chosen": 0.20619972050189972, "logits/rejected": 0.35505905747413635, "logps/chosen": -488.4278869628906, "logps/rejected": -507.36785888671875, "loss": 0.5786, "rewards/accuracies": 0.6875, "rewards/chosen": -1.3550437688827515, "rewards/margins": 0.4867793619632721, "rewards/rejected": -1.841822862625122, "step": 3140 }, { "epoch": 0.8243915205443602, "grad_norm": 16.643848419189453, "learning_rate": 4.554010145972418e-07, "logits/chosen": 0.21577072143554688, "logits/rejected": 0.33193427324295044, "logps/chosen": -521.0413208007812, "logps/rejected": -540.6106567382812, "loss": 0.5785, "rewards/accuracies": 0.65625, "rewards/chosen": -1.3828619718551636, "rewards/margins": 0.4862847924232483, "rewards/rejected": -1.869146704673767, "step": 3150 }, { "epoch": 0.8270086364825961, "grad_norm": 17.693086624145508, "learning_rate": 4.4234070597637455e-07, "logits/chosen": 0.11184321343898773, "logits/rejected": 0.19088369607925415, "logps/chosen": -536.9678955078125, "logps/rejected": -554.2827758789062, "loss": 0.5834, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.369461178779602, "rewards/margins": 0.48122233152389526, "rewards/rejected": -1.850683569908142, "step": 3160 }, { "epoch": 0.8296257524208323, "grad_norm": 16.465007781982422, "learning_rate": 4.2945221131440783e-07, "logits/chosen": 0.24646055698394775, "logits/rejected": 0.306671679019928, "logps/chosen": -524.2401123046875, "logps/rejected": -516.0563354492188, "loss": 0.5793, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.3744683265686035, "rewards/margins": 0.4835619032382965, "rewards/rejected": -1.8580303192138672, "step": 3170 }, { "epoch": 0.8322428683590684, "grad_norm": 22.869001388549805, "learning_rate": 4.167366067969381e-07, "logits/chosen": 0.07770199328660965, "logits/rejected": 0.25347238779067993, "logps/chosen": -455.5186462402344, "logps/rejected": -507.84356689453125, "loss": 0.5827, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.2709424495697021, "rewards/margins": 0.4785892367362976, "rewards/rejected": -1.7495317459106445, "step": 3180 }, { "epoch": 0.8348599842973043, "grad_norm": 20.31063461303711, "learning_rate": 4.041949541732826e-07, "logits/chosen": 0.24958959221839905, "logits/rejected": 0.255808025598526, "logps/chosen": -538.5, "logps/rejected": -554.1825561523438, "loss": 0.5788, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.4217197895050049, "rewards/margins": 0.490082323551178, "rewards/rejected": -1.9118019342422485, "step": 3190 }, { "epoch": 0.8374771002355405, "grad_norm": 17.181446075439453, "learning_rate": 3.9182830066782614e-07, "logits/chosen": 0.1832321137189865, "logits/rejected": 0.20756061375141144, "logps/chosen": -523.9074096679688, "logps/rejected": -562.7293701171875, "loss": 0.5796, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.473734736442566, "rewards/margins": 0.465187132358551, "rewards/rejected": -1.9389216899871826, "step": 3200 }, { "epoch": 0.8374771002355405, "eval_logits/chosen": 0.07104705274105072, "eval_logits/rejected": 0.18436755239963531, "eval_logps/chosen": -532.9956665039062, "eval_logps/rejected": -534.0466918945312, "eval_loss": 0.590150773525238, "eval_rewards/accuracies": 0.684499979019165, "eval_rewards/chosen": -1.4463036060333252, "eval_rewards/margins": 0.44732123613357544, "eval_rewards/rejected": -1.8936247825622559, "eval_runtime": 232.4834, "eval_samples_per_second": 8.603, "eval_steps_per_second": 1.075, "step": 3200 }, { "epoch": 0.8400942161737766, "grad_norm": 15.772056579589844, "learning_rate": 3.796376788925771e-07, "logits/chosen": 0.17947904765605927, "logits/rejected": 0.38756972551345825, "logps/chosen": -536.089599609375, "logps/rejected": -513.5861206054688, "loss": 0.6027, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.4111121892929077, "rewards/margins": 0.3696535527706146, "rewards/rejected": -1.7807658910751343, "step": 3210 }, { "epoch": 0.8427113321120125, "grad_norm": 22.664976119995117, "learning_rate": 3.676241067609465e-07, "logits/chosen": 0.21588890254497528, "logits/rejected": 0.31630846858024597, "logps/chosen": -577.00146484375, "logps/rejected": -551.9093017578125, "loss": 0.5948, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.4189621210098267, "rewards/margins": 0.4751489758491516, "rewards/rejected": -1.8941110372543335, "step": 3220 }, { "epoch": 0.8453284480502486, "grad_norm": 30.950883865356445, "learning_rate": 3.5578858740274976e-07, "logits/chosen": 0.08771739155054092, "logits/rejected": 0.16890794038772583, "logps/chosen": -532.2547607421875, "logps/rejected": -527.4913330078125, "loss": 0.6527, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.4851154088974, "rewards/margins": 0.3038921654224396, "rewards/rejected": -1.789007544517517, "step": 3230 }, { "epoch": 0.8479455639884846, "grad_norm": 22.29638671875, "learning_rate": 3.44132109080447e-07, "logits/chosen": 0.05612761899828911, "logits/rejected": 0.13626542687416077, "logps/chosen": -520.4820556640625, "logps/rejected": -517.2299194335938, "loss": 0.554, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.3912557363510132, "rewards/margins": 0.4790167808532715, "rewards/rejected": -1.8702728748321533, "step": 3240 }, { "epoch": 0.8505626799267207, "grad_norm": 18.589750289916992, "learning_rate": 3.3265564510662344e-07, "logits/chosen": 0.04296109080314636, "logits/rejected": 0.2635151743888855, "logps/chosen": -563.4909057617188, "logps/rejected": -560.6986694335938, "loss": 0.538, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.3561357259750366, "rewards/margins": 0.6071338653564453, "rewards/rejected": -1.963269591331482, "step": 3250 }, { "epoch": 0.8531797958649568, "grad_norm": 23.977201461791992, "learning_rate": 3.213601537627195e-07, "logits/chosen": 0.2044192999601364, "logits/rejected": 0.20763865113258362, "logps/chosen": -533.2398681640625, "logps/rejected": -539.3159790039062, "loss": 0.6017, "rewards/accuracies": 0.65625, "rewards/chosen": -1.5438867807388306, "rewards/margins": 0.37856417894363403, "rewards/rejected": -1.9224510192871094, "step": 3260 }, { "epoch": 0.8557969118031928, "grad_norm": 19.610824584960938, "learning_rate": 3.1024657821901063e-07, "logits/chosen": 0.09625023603439331, "logits/rejected": 0.09076298773288727, "logps/chosen": -519.69287109375, "logps/rejected": -526.6182861328125, "loss": 0.5844, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.3638288974761963, "rewards/margins": 0.4889647364616394, "rewards/rejected": -1.8527934551239014, "step": 3270 }, { "epoch": 0.8584140277414289, "grad_norm": 17.58568000793457, "learning_rate": 2.9931584645585654e-07, "logits/chosen": 0.24739189445972443, "logits/rejected": 0.3013184666633606, "logps/chosen": -541.0255126953125, "logps/rejected": -576.6962890625, "loss": 0.5773, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.3129136562347412, "rewards/margins": 0.46883121132850647, "rewards/rejected": -1.781745195388794, "step": 3280 }, { "epoch": 0.861031143679665, "grad_norm": 12.26266860961914, "learning_rate": 2.885688711862136e-07, "logits/chosen": 0.14454945921897888, "logits/rejected": 0.2927783131599426, "logps/chosen": -537.5690307617188, "logps/rejected": -562.3256225585938, "loss": 0.5883, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.4469431638717651, "rewards/margins": 0.4720059335231781, "rewards/rejected": -1.9189491271972656, "step": 3290 }, { "epoch": 0.863648259617901, "grad_norm": 20.302330017089844, "learning_rate": 2.7800654977942486e-07, "logits/chosen": 0.06817831099033356, "logits/rejected": 0.22290131449699402, "logps/chosen": -533.7552490234375, "logps/rejected": -575.96875, "loss": 0.55, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.4392116069793701, "rewards/margins": 0.5773890018463135, "rewards/rejected": -2.0166003704071045, "step": 3300 }, { "epoch": 0.863648259617901, "eval_logits/chosen": 0.06103089079260826, "eval_logits/rejected": 0.17368435859680176, "eval_logps/chosen": -540.1639404296875, "eval_logps/rejected": -541.0390625, "eval_loss": 0.5897097587585449, "eval_rewards/accuracies": 0.6834999918937683, "eval_rewards/chosen": -1.5179859399795532, "eval_rewards/margins": 0.44556280970573425, "eval_rewards/rejected": -1.9635487794876099, "eval_runtime": 232.0365, "eval_samples_per_second": 8.619, "eval_steps_per_second": 1.077, "step": 3300 }, { "epoch": 0.8662653755561371, "grad_norm": 45.810630798339844, "learning_rate": 2.6762976418628797e-07, "logits/chosen": 0.32271671295166016, "logits/rejected": 0.39709824323654175, "logps/chosen": -474.7618103027344, "logps/rejected": -456.5228576660156, "loss": 0.5846, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.407684087753296, "rewards/margins": 0.46905022859573364, "rewards/rejected": -1.8767343759536743, "step": 3310 }, { "epoch": 0.8688824914943732, "grad_norm": 20.996036529541016, "learning_rate": 2.5743938086541354e-07, "logits/chosen": 0.15398995578289032, "logits/rejected": 0.3004111349582672, "logps/chosen": -523.9410400390625, "logps/rejected": -531.103759765625, "loss": 0.5832, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.478339672088623, "rewards/margins": 0.49801668524742126, "rewards/rejected": -1.9763562679290771, "step": 3320 }, { "epoch": 0.8714996074326092, "grad_norm": 21.416170120239258, "learning_rate": 2.4743625071087574e-07, "logits/chosen": 0.09900239109992981, "logits/rejected": 0.1008148342370987, "logps/chosen": -542.8681640625, "logps/rejected": -543.7512817382812, "loss": 0.562, "rewards/accuracies": 0.65625, "rewards/chosen": -1.3520573377609253, "rewards/margins": 0.5512968301773071, "rewards/rejected": -1.9033544063568115, "step": 3330 }, { "epoch": 0.8741167233708453, "grad_norm": 20.47162437438965, "learning_rate": 2.3762120898116498e-07, "logits/chosen": 0.07932907342910767, "logits/rejected": 0.2061280757188797, "logps/chosen": -550.5972290039062, "logps/rejected": -576.6636962890625, "loss": 0.5604, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.505156397819519, "rewards/margins": 0.4955100119113922, "rewards/rejected": -2.000666379928589, "step": 3340 }, { "epoch": 0.8767338393090814, "grad_norm": 18.11985969543457, "learning_rate": 2.2799507522944048e-07, "logits/chosen": 0.16951636970043182, "logits/rejected": 0.21532103419303894, "logps/chosen": -531.1329345703125, "logps/rejected": -555.5535278320312, "loss": 0.5539, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.3905164003372192, "rewards/margins": 0.5168679356575012, "rewards/rejected": -1.9073841571807861, "step": 3350 }, { "epoch": 0.8793509552473174, "grad_norm": 23.173076629638672, "learning_rate": 2.1855865323510056e-07, "logits/chosen": 0.10670924186706543, "logits/rejected": 0.25408512353897095, "logps/chosen": -534.140625, "logps/rejected": -571.8042602539062, "loss": 0.5092, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.2978779077529907, "rewards/margins": 0.7049534320831299, "rewards/rejected": -2.00283145904541, "step": 3360 }, { "epoch": 0.8819680711855535, "grad_norm": 26.09234619140625, "learning_rate": 2.0931273093666575e-07, "logits/chosen": 0.1856442391872406, "logits/rejected": 0.2775765657424927, "logps/chosen": -504.78472900390625, "logps/rejected": -510.97772216796875, "loss": 0.5706, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.4099186658859253, "rewards/margins": 0.48924511671066284, "rewards/rejected": -1.899163842201233, "step": 3370 }, { "epoch": 0.8845851871237895, "grad_norm": 18.44064712524414, "learning_rate": 2.002580803659873e-07, "logits/chosen": 0.18279746174812317, "logits/rejected": 0.25130945444107056, "logps/chosen": -501.90191650390625, "logps/rejected": -510.632080078125, "loss": 0.5763, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.326120138168335, "rewards/margins": 0.5073062777519226, "rewards/rejected": -1.8334262371063232, "step": 3380 }, { "epoch": 0.8872023030620256, "grad_norm": 20.44449234008789, "learning_rate": 1.913954575837826e-07, "logits/chosen": 0.26278918981552124, "logits/rejected": 0.3742792010307312, "logps/chosen": -547.27490234375, "logps/rejected": -502.9208984375, "loss": 0.569, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.3170632123947144, "rewards/margins": 0.5174871683120728, "rewards/rejected": -1.8345504999160767, "step": 3390 }, { "epoch": 0.8898194190002617, "grad_norm": 23.813940048217773, "learning_rate": 1.827256026165028e-07, "logits/chosen": 0.15093761682510376, "logits/rejected": 0.15371516346931458, "logps/chosen": -576.1170654296875, "logps/rejected": -542.66357421875, "loss": 0.5519, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.2780691385269165, "rewards/margins": 0.5459933280944824, "rewards/rejected": -1.8240623474121094, "step": 3400 }, { "epoch": 0.8898194190002617, "eval_logits/chosen": 0.049354203045368195, "eval_logits/rejected": 0.16207368671894073, "eval_logps/chosen": -524.5573120117188, "eval_logps/rejected": -524.9038696289062, "eval_loss": 0.5905484557151794, "eval_rewards/accuracies": 0.684499979019165, "eval_rewards/chosen": -1.3619197607040405, "eval_rewards/margins": 0.4402773082256317, "eval_rewards/rejected": -1.8021970987319946, "eval_runtime": 232.2898, "eval_samples_per_second": 8.61, "eval_steps_per_second": 1.076, "step": 3400 }, { "epoch": 0.8924365349384977, "grad_norm": 22.652015686035156, "learning_rate": 1.7424923939454274e-07, "logits/chosen": 0.09193596988916397, "logits/rejected": 0.12269000709056854, "logps/chosen": -560.306884765625, "logps/rejected": -547.3897705078125, "loss": 0.5522, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.3574447631835938, "rewards/margins": 0.512332558631897, "rewards/rejected": -1.8697776794433594, "step": 3410 }, { "epoch": 0.8950536508767338, "grad_norm": 20.60724449157715, "learning_rate": 1.6596707569179304e-07, "logits/chosen": 0.11068376153707504, "logits/rejected": 0.2603934407234192, "logps/chosen": -541.2059326171875, "logps/rejected": -526.3267211914062, "loss": 0.5604, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.222874402999878, "rewards/margins": 0.528007447719574, "rewards/rejected": -1.7508817911148071, "step": 3420 }, { "epoch": 0.8976707668149699, "grad_norm": 19.67418098449707, "learning_rate": 1.578798030665385e-07, "logits/chosen": 0.07019755989313126, "logits/rejected": 0.25848856568336487, "logps/chosen": -518.75146484375, "logps/rejected": -542.2596435546875, "loss": 0.528, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.2500635385513306, "rewards/margins": 0.5684723854064941, "rewards/rejected": -1.8185360431671143, "step": 3430 }, { "epoch": 0.9002878827532059, "grad_norm": 21.947776794433594, "learning_rate": 1.499880968037165e-07, "logits/chosen": 0.2768905758857727, "logits/rejected": 0.3939592242240906, "logps/chosen": -490.6131896972656, "logps/rejected": -477.58380126953125, "loss": 0.5807, "rewards/accuracies": 0.65625, "rewards/chosen": -1.2620429992675781, "rewards/margins": 0.421647846698761, "rewards/rejected": -1.6836907863616943, "step": 3440 }, { "epoch": 0.902904998691442, "grad_norm": 23.7869930267334, "learning_rate": 1.4229261585852805e-07, "logits/chosen": 0.28201746940612793, "logits/rejected": 0.20732636749744415, "logps/chosen": -523.0221557617188, "logps/rejected": -520.0089721679688, "loss": 0.5617, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.2637730836868286, "rewards/margins": 0.4759213328361511, "rewards/rejected": -1.739694595336914, "step": 3450 }, { "epoch": 0.9055221146296781, "grad_norm": 24.20222282409668, "learning_rate": 1.3479400280141886e-07, "logits/chosen": 0.11287301778793335, "logits/rejected": 0.17427489161491394, "logps/chosen": -487.00262451171875, "logps/rejected": -517.9505615234375, "loss": 0.5644, "rewards/accuracies": 0.71875, "rewards/chosen": -1.300456166267395, "rewards/margins": 0.5554144382476807, "rewards/rejected": -1.8558704853057861, "step": 3460 }, { "epoch": 0.9081392305679141, "grad_norm": 23.366870880126953, "learning_rate": 1.2749288376442044e-07, "logits/chosen": 0.08213616907596588, "logits/rejected": 0.1927454173564911, "logps/chosen": -560.6731567382812, "logps/rejected": -532.0469970703125, "loss": 0.5432, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.3204230070114136, "rewards/margins": 0.5710306763648987, "rewards/rejected": -1.891453742980957, "step": 3470 }, { "epoch": 0.9107563465061502, "grad_norm": 21.51507568359375, "learning_rate": 1.203898683888713e-07, "logits/chosen": 0.22756421566009521, "logits/rejected": 0.25787419080734253, "logps/chosen": -500.9013671875, "logps/rejected": -513.2033081054688, "loss": 0.6123, "rewards/accuracies": 0.65625, "rewards/chosen": -1.4208608865737915, "rewards/margins": 0.40193843841552734, "rewards/rejected": -1.8227994441986084, "step": 3480 }, { "epoch": 0.9133734624443863, "grad_norm": 22.2541446685791, "learning_rate": 1.1348554977451132e-07, "logits/chosen": 0.16574542224407196, "logits/rejected": 0.22871682047843933, "logps/chosen": -550.3922119140625, "logps/rejected": -538.7455444335938, "loss": 0.5858, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.4138902425765991, "rewards/margins": 0.44176238775253296, "rewards/rejected": -1.8556525707244873, "step": 3490 }, { "epoch": 0.9159905783826223, "grad_norm": 17.759902954101562, "learning_rate": 1.0678050442995802e-07, "logits/chosen": 0.006184411235153675, "logits/rejected": 0.05719981715083122, "logps/chosen": -541.510009765625, "logps/rejected": -514.5938110351562, "loss": 0.5939, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.4001209735870361, "rewards/margins": 0.437242329120636, "rewards/rejected": -1.8373632431030273, "step": 3500 }, { "epoch": 0.9159905783826223, "eval_logits/chosen": 0.056149620562791824, "eval_logits/rejected": 0.16871705651283264, "eval_logps/chosen": -528.9203491210938, "eval_logps/rejected": -529.6995239257812, "eval_loss": 0.589903712272644, "eval_rewards/accuracies": 0.6800000071525574, "eval_rewards/chosen": -1.405550241470337, "eval_rewards/margins": 0.44460350275039673, "eval_rewards/rejected": -1.8501536846160889, "eval_runtime": 232.5763, "eval_samples_per_second": 8.599, "eval_steps_per_second": 1.075, "step": 3500 }, { "epoch": 0.9186076943208584, "grad_norm": 16.307315826416016, "learning_rate": 1.0027529222456755e-07, "logits/chosen": 0.26652759313583374, "logits/rejected": 0.27262991666793823, "logps/chosen": -505.27471923828125, "logps/rejected": -525.823974609375, "loss": 0.5144, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.2464582920074463, "rewards/margins": 0.6129791140556335, "rewards/rejected": -1.859437346458435, "step": 3510 }, { "epoch": 0.9212248102590945, "grad_norm": 19.756351470947266, "learning_rate": 9.397045634168766e-08, "logits/chosen": 0.0008102863794192672, "logits/rejected": 0.19065120816230774, "logps/chosen": -516.8560791015625, "logps/rejected": -553.0278930664062, "loss": 0.5415, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.2412116527557373, "rewards/margins": 0.5996343493461609, "rewards/rejected": -1.8408458232879639, "step": 3520 }, { "epoch": 0.9238419261973305, "grad_norm": 20.353652954101562, "learning_rate": 8.78665232332998e-08, "logits/chosen": 0.12386944144964218, "logits/rejected": 0.27118274569511414, "logps/chosen": -481.002685546875, "logps/rejected": -510.55230712890625, "loss": 0.5669, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.342132806777954, "rewards/margins": 0.4541274905204773, "rewards/rejected": -1.7962604761123657, "step": 3530 }, { "epoch": 0.9264590421355666, "grad_norm": 21.46174430847168, "learning_rate": 8.196400257606208e-08, "logits/chosen": 0.1211334615945816, "logits/rejected": 0.4012266993522644, "logps/chosen": -564.7858276367188, "logps/rejected": -576.843994140625, "loss": 0.5587, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.3764750957489014, "rewards/margins": 0.5135365724563599, "rewards/rejected": -1.8900115489959717, "step": 3540 }, { "epoch": 0.9290761580738026, "grad_norm": 21.205127716064453, "learning_rate": 7.626338722875076e-08, "logits/chosen": 0.26407915353775024, "logits/rejected": 0.28373825550079346, "logps/chosen": -528.8353271484375, "logps/rejected": -558.7482299804688, "loss": 0.6023, "rewards/accuracies": 0.65625, "rewards/chosen": -1.4041508436203003, "rewards/margins": 0.4070183336734772, "rewards/rejected": -1.8111692667007446, "step": 3550 }, { "epoch": 0.9316932740120387, "grad_norm": 19.687849044799805, "learning_rate": 7.076515319110688e-08, "logits/chosen": 0.2693914771080017, "logits/rejected": 0.3067939281463623, "logps/chosen": -492.1351623535156, "logps/rejected": -476.29388427734375, "loss": 0.5713, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.274938941001892, "rewards/margins": 0.5187603235244751, "rewards/rejected": -1.7936992645263672, "step": 3560 }, { "epoch": 0.9343103899502748, "grad_norm": 21.987916946411133, "learning_rate": 6.54697595640899e-08, "logits/chosen": 0.05399775505065918, "logits/rejected": 0.1626650094985962, "logps/chosen": -542.186767578125, "logps/rejected": -550.5693359375, "loss": 0.5266, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.230676293373108, "rewards/margins": 0.5804111361503601, "rewards/rejected": -1.8110872507095337, "step": 3570 }, { "epoch": 0.9369275058885108, "grad_norm": 13.982370376586914, "learning_rate": 6.037764851154426e-08, "logits/chosen": 0.10340269654989243, "logits/rejected": 0.30877891182899475, "logps/chosen": -515.5814819335938, "logps/rejected": -562.509521484375, "loss": 0.5631, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.2640832662582397, "rewards/margins": 0.5411828756332397, "rewards/rejected": -1.8052661418914795, "step": 3580 }, { "epoch": 0.9395446218267469, "grad_norm": 18.1751708984375, "learning_rate": 5.548924522327748e-08, "logits/chosen": 0.06784630566835403, "logits/rejected": 0.2033408135175705, "logps/chosen": -519.9273681640625, "logps/rejected": -530.5051879882812, "loss": 0.5538, "rewards/accuracies": 0.6875, "rewards/chosen": -1.2889801263809204, "rewards/margins": 0.5127965211868286, "rewards/rejected": -1.801776647567749, "step": 3590 }, { "epoch": 0.942161737764983, "grad_norm": 20.21278953552246, "learning_rate": 5.0804957879556915e-08, "logits/chosen": 0.056394852697849274, "logits/rejected": 0.23709776997566223, "logps/chosen": -462.7340393066406, "logps/rejected": -502.6092834472656, "loss": 0.5632, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.2453665733337402, "rewards/margins": 0.4758334755897522, "rewards/rejected": -1.7211999893188477, "step": 3600 }, { "epoch": 0.942161737764983, "eval_logits/chosen": 0.051600489765405655, "eval_logits/rejected": 0.1634536236524582, "eval_logps/chosen": -525.2161254882812, "eval_logps/rejected": -525.723388671875, "eval_loss": 0.5906327962875366, "eval_rewards/accuracies": 0.6815000176429749, "eval_rewards/chosen": -1.3685081005096436, "eval_rewards/margins": 0.4418841004371643, "eval_rewards/rejected": -1.8103920221328735, "eval_runtime": 232.6085, "eval_samples_per_second": 8.598, "eval_steps_per_second": 1.075, "step": 3600 }, { "epoch": 0.944778853703219, "grad_norm": 24.784046173095703, "learning_rate": 4.632517761702815e-08, "logits/chosen": 0.1571771204471588, "logits/rejected": 0.32081273198127747, "logps/chosen": -489.2625427246094, "logps/rejected": -496.1024475097656, "loss": 0.5707, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.3370298147201538, "rewards/margins": 0.49337905645370483, "rewards/rejected": -1.830409049987793, "step": 3610 }, { "epoch": 0.9473959696414551, "grad_norm": 31.509357452392578, "learning_rate": 4.205027849605359e-08, "logits/chosen": 0.29533377289772034, "logits/rejected": 0.265569269657135, "logps/chosen": -500.92803955078125, "logps/rejected": -486.66973876953125, "loss": 0.6225, "rewards/accuracies": 0.65625, "rewards/chosen": -1.3463305234909058, "rewards/margins": 0.3861822783946991, "rewards/rejected": -1.7325128316879272, "step": 3620 }, { "epoch": 0.9500130855796912, "grad_norm": 21.457324981689453, "learning_rate": 3.798061746947995e-08, "logits/chosen": 0.2158433496952057, "logits/rejected": 0.3043590486049652, "logps/chosen": -524.4801025390625, "logps/rejected": -508.20953369140625, "loss": 0.5838, "rewards/accuracies": 0.6875, "rewards/chosen": -1.4179335832595825, "rewards/margins": 0.46254149079322815, "rewards/rejected": -1.8804748058319092, "step": 3630 }, { "epoch": 0.9526302015179272, "grad_norm": 25.418264389038086, "learning_rate": 3.411653435283158e-08, "logits/chosen": 0.06112390756607056, "logits/rejected": 0.14306578040122986, "logps/chosen": -532.900390625, "logps/rejected": -495.4185485839844, "loss": 0.5824, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.300320029258728, "rewards/margins": 0.4503448009490967, "rewards/rejected": -1.7506647109985352, "step": 3640 }, { "epoch": 0.9552473174561633, "grad_norm": 17.69974708557129, "learning_rate": 3.04583517959367e-08, "logits/chosen": 0.006334272213280201, "logits/rejected": 0.10581526905298233, "logps/chosen": -490.5302734375, "logps/rejected": -496.6221618652344, "loss": 0.5431, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.2481968402862549, "rewards/margins": 0.5572769045829773, "rewards/rejected": -1.8054739236831665, "step": 3650 }, { "epoch": 0.9578644333943994, "grad_norm": 21.300947189331055, "learning_rate": 2.7006375255985984e-08, "logits/chosen": 0.26631540060043335, "logits/rejected": 0.2855226993560791, "logps/chosen": -517.7824096679688, "logps/rejected": -547.991455078125, "loss": 0.6044, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.3668075799942017, "rewards/margins": 0.38124316930770874, "rewards/rejected": -1.7480506896972656, "step": 3660 }, { "epoch": 0.9604815493326354, "grad_norm": 20.375226974487305, "learning_rate": 2.3760892972027328e-08, "logits/chosen": 0.05325336381793022, "logits/rejected": 0.11790470033884048, "logps/chosen": -533.8929443359375, "logps/rejected": -533.8286743164062, "loss": 0.5873, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.4277513027191162, "rewards/margins": 0.46243423223495483, "rewards/rejected": -1.8901855945587158, "step": 3670 }, { "epoch": 0.9630986652708715, "grad_norm": 22.46816062927246, "learning_rate": 2.072217594089765e-08, "logits/chosen": 0.17507997155189514, "logits/rejected": 0.20632827281951904, "logps/chosen": -517.567138671875, "logps/rejected": -549.365234375, "loss": 0.543, "rewards/accuracies": 0.71875, "rewards/chosen": -1.3189188241958618, "rewards/margins": 0.5807808637619019, "rewards/rejected": -1.8996999263763428, "step": 3680 }, { "epoch": 0.9657157812091076, "grad_norm": 17.85133934020996, "learning_rate": 1.789047789459375e-08, "logits/chosen": 0.027010198682546616, "logits/rejected": 0.21749186515808105, "logps/chosen": -594.4285278320312, "logps/rejected": -558.7322387695312, "loss": 0.5585, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.2619202136993408, "rewards/margins": 0.5537473559379578, "rewards/rejected": -1.8156675100326538, "step": 3690 }, { "epoch": 0.9683328971473436, "grad_norm": 28.183317184448242, "learning_rate": 1.5266035279088708e-08, "logits/chosen": 0.10956914722919464, "logits/rejected": 0.19487416744232178, "logps/chosen": -572.9104614257812, "logps/rejected": -580.4852905273438, "loss": 0.5488, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.4588168859481812, "rewards/margins": 0.5328525304794312, "rewards/rejected": -1.9916694164276123, "step": 3700 }, { "epoch": 0.9683328971473436, "eval_logits/chosen": 0.05304437875747681, "eval_logits/rejected": 0.1655142903327942, "eval_logps/chosen": -525.91552734375, "eval_logps/rejected": -526.5010375976562, "eval_loss": 0.5903691649436951, "eval_rewards/accuracies": 0.6834999918937683, "eval_rewards/chosen": -1.375501275062561, "eval_rewards/margins": 0.4426679015159607, "eval_rewards/rejected": -1.818169116973877, "eval_runtime": 232.1562, "eval_samples_per_second": 8.615, "eval_steps_per_second": 1.077, "step": 3700 }, { "epoch": 0.9709500130855797, "grad_norm": 33.79894256591797, "learning_rate": 1.2849067234584623e-08, "logits/chosen": 0.24655885994434357, "logits/rejected": 0.3253239095211029, "logps/chosen": -476.681396484375, "logps/rejected": -499.00494384765625, "loss": 0.6091, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.3145527839660645, "rewards/margins": 0.4425618648529053, "rewards/rejected": -1.7571147680282593, "step": 3710 }, { "epoch": 0.9735671290238157, "grad_norm": 20.961793899536133, "learning_rate": 1.0639775577218625e-08, "logits/chosen": 0.16103433072566986, "logits/rejected": 0.22914746403694153, "logps/chosen": -490.27777099609375, "logps/rejected": -488.0320739746094, "loss": 0.5535, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.3744248151779175, "rewards/margins": 0.5473370552062988, "rewards/rejected": -1.9217618703842163, "step": 3720 }, { "epoch": 0.9761842449620518, "grad_norm": 18.385028839111328, "learning_rate": 8.638344782207486e-09, "logits/chosen": 0.1808285266160965, "logits/rejected": 0.2911062240600586, "logps/chosen": -493.48321533203125, "logps/rejected": -501.9202575683594, "loss": 0.56, "rewards/accuracies": 0.71875, "rewards/chosen": -1.277074933052063, "rewards/margins": 0.5030657649040222, "rewards/rejected": -1.7801406383514404, "step": 3730 }, { "epoch": 0.9788013609002879, "grad_norm": 19.80834197998047, "learning_rate": 6.84494196844715e-09, "logits/chosen": 0.11059341579675674, "logits/rejected": 0.3094932436943054, "logps/chosen": -525.2141723632812, "logps/rejected": -548.9354248046875, "loss": 0.5209, "rewards/accuracies": 0.71875, "rewards/chosen": -1.2858692407608032, "rewards/margins": 0.6285649538040161, "rewards/rejected": -1.9144340753555298, "step": 3740 }, { "epoch": 0.9814184768385239, "grad_norm": 17.796337127685547, "learning_rate": 5.259716884556121e-09, "logits/chosen": 0.08546547591686249, "logits/rejected": 0.2145168036222458, "logps/chosen": -523.2117919921875, "logps/rejected": -539.4160766601562, "loss": 0.5475, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.280202865600586, "rewards/margins": 0.5271276831626892, "rewards/rejected": -1.8073304891586304, "step": 3750 }, { "epoch": 0.98403559277676, "grad_norm": 19.8670654296875, "learning_rate": 3.882801896372967e-09, "logits/chosen": 0.17666058242321014, "logits/rejected": 0.23271194100379944, "logps/chosen": -512.79736328125, "logps/rejected": -507.11505126953125, "loss": 0.6112, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.3305553197860718, "rewards/margins": 0.4106437563896179, "rewards/rejected": -1.7411991357803345, "step": 3760 }, { "epoch": 0.9866527087149961, "grad_norm": 21.29877281188965, "learning_rate": 2.7143119759026614e-09, "logits/chosen": 0.22256436944007874, "logits/rejected": 0.329804003238678, "logps/chosen": -550.2042236328125, "logps/rejected": -548.9179077148438, "loss": 0.5132, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.2466299533843994, "rewards/margins": 0.5815029144287109, "rewards/rejected": -1.8281329870224, "step": 3770 }, { "epoch": 0.9892698246532321, "grad_norm": 21.455854415893555, "learning_rate": 1.754344691717591e-09, "logits/chosen": 0.06916506588459015, "logits/rejected": 0.15464463829994202, "logps/chosen": -504.8829040527344, "logps/rejected": -547.6551513671875, "loss": 0.6246, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.3374298810958862, "rewards/margins": 0.33050769567489624, "rewards/rejected": -1.6679375171661377, "step": 3780 }, { "epoch": 0.9918869405914682, "grad_norm": 24.640779495239258, "learning_rate": 1.0029802008096335e-09, "logits/chosen": 0.07244547456502914, "logits/rejected": 0.21235093474388123, "logps/chosen": -528.9513549804688, "logps/rejected": -539.2667236328125, "loss": 0.5648, "rewards/accuracies": 0.71875, "rewards/chosen": -1.3598108291625977, "rewards/margins": 0.5505877733230591, "rewards/rejected": -1.9103988409042358, "step": 3790 }, { "epoch": 0.9945040565297043, "grad_norm": 23.488046646118164, "learning_rate": 4.602812418974534e-10, "logits/chosen": 0.006186048500239849, "logits/rejected": 0.1412689983844757, "logps/chosen": -535.95263671875, "logps/rejected": -533.4951782226562, "loss": 0.5674, "rewards/accuracies": 0.65625, "rewards/chosen": -1.274775505065918, "rewards/margins": 0.5218435525894165, "rewards/rejected": -1.7966190576553345, "step": 3800 }, { "epoch": 0.9945040565297043, "eval_logits/chosen": 0.052397292107343674, "eval_logits/rejected": 0.16461612284183502, "eval_logps/chosen": -525.971923828125, "eval_logps/rejected": -526.6827392578125, "eval_loss": 0.5901351571083069, "eval_rewards/accuracies": 0.684499979019165, "eval_rewards/chosen": -1.3760651350021362, "eval_rewards/margins": 0.44392091035842896, "eval_rewards/rejected": -1.81998610496521, "eval_runtime": 232.5667, "eval_samples_per_second": 8.6, "eval_steps_per_second": 1.075, "step": 3800 }, { "epoch": 0.9971211724679403, "grad_norm": 21.794204711914062, "learning_rate": 1.2629313018819312e-10, "logits/chosen": 0.02779226377606392, "logits/rejected": 0.21272841095924377, "logps/chosen": -511.3211364746094, "logps/rejected": -515.0205688476562, "loss": 0.5489, "rewards/accuracies": 0.75, "rewards/chosen": -1.2764627933502197, "rewards/margins": 0.537833571434021, "rewards/rejected": -1.8142963647842407, "step": 3810 }, { "epoch": 0.9997382884061764, "grad_norm": 21.642515182495117, "learning_rate": 1.0437535929996855e-12, "logits/chosen": 0.1357661634683609, "logits/rejected": 0.14389568567276, "logps/chosen": -550.05810546875, "logps/rejected": -537.67041015625, "loss": 0.543, "rewards/accuracies": 0.71875, "rewards/chosen": -1.337266206741333, "rewards/margins": 0.5570909380912781, "rewards/rejected": -1.8943573236465454, "step": 3820 }, { "epoch": 1.0, "step": 3821, "total_flos": 0.0, "train_loss": 0.0, "train_runtime": 0.0101, "train_samples_per_second": 6068984.159, "train_steps_per_second": 379323.919 } ], "logging_steps": 10, "max_steps": 3821, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }