diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,6395 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 100, + "global_step": 3821, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00026171159382360636, + "grad_norm": 3.590468168258667, + "learning_rate": 1.3054830287206268e-08, + "logits/chosen": 0.6792653799057007, + "logits/rejected": 1.31020188331604, + "logps/chosen": -469.49981689453125, + "logps/rejected": -525.3796997070312, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1 + }, + { + "epoch": 0.0026171159382360636, + "grad_norm": 3.1591908931732178, + "learning_rate": 1.3054830287206266e-07, + "logits/chosen": 1.5021909475326538, + "logits/rejected": 1.427976131439209, + "logps/chosen": -398.0495300292969, + "logps/rejected": -356.86016845703125, + "loss": 0.6928, + "rewards/accuracies": 0.5138888955116272, + "rewards/chosen": -8.217601134674624e-05, + "rewards/margins": 0.0007430262048728764, + "rewards/rejected": -0.000825202208943665, + "step": 10 + }, + { + "epoch": 0.005234231876472127, + "grad_norm": 3.51601243019104, + "learning_rate": 2.610966057441253e-07, + "logits/chosen": 1.3317842483520508, + "logits/rejected": 1.638771414756775, + "logps/chosen": -435.8251953125, + "logps/rejected": -342.2559509277344, + "loss": 0.6929, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.00043620201176963747, + "rewards/margins": 0.0005908687599003315, + "rewards/rejected": -0.00015466664626728743, + "step": 20 + }, + { + "epoch": 0.007851347814708191, + "grad_norm": 3.3784544467926025, + "learning_rate": 3.9164490861618804e-07, + "logits/chosen": 1.3975493907928467, + "logits/rejected": 1.4588085412979126, + "logps/chosen": -377.9482727050781, + "logps/rejected": -355.25885009765625, + "loss": 0.6929, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.0005684455973096192, + "rewards/margins": 0.0005999829736538231, + "rewards/rejected": -3.1537445465801284e-05, + "step": 30 + }, + { + "epoch": 0.010468463752944255, + "grad_norm": 2.979464292526245, + "learning_rate": 5.221932114882506e-07, + "logits/chosen": 1.6848558187484741, + "logits/rejected": 1.9189517498016357, + "logps/chosen": -316.1363525390625, + "logps/rejected": -315.66058349609375, + "loss": 0.6934, + "rewards/accuracies": 0.4937500059604645, + "rewards/chosen": -0.00017661902529653162, + "rewards/margins": -0.0004952313611283898, + "rewards/rejected": 0.00031861235038377345, + "step": 40 + }, + { + "epoch": 0.01308557969118032, + "grad_norm": 3.1099374294281006, + "learning_rate": 6.527415143603135e-07, + "logits/chosen": 1.5289150476455688, + "logits/rejected": 1.5212490558624268, + "logps/chosen": -398.5328674316406, + "logps/rejected": -336.2831115722656, + "loss": 0.6929, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.0001419751497451216, + "rewards/margins": 0.00047931409790180624, + "rewards/rejected": -0.0003373388899490237, + "step": 50 + }, + { + "epoch": 0.015702695629416383, + "grad_norm": 2.992774248123169, + "learning_rate": 7.832898172323761e-07, + "logits/chosen": 1.4803454875946045, + "logits/rejected": 1.6450494527816772, + "logps/chosen": -373.1556396484375, + "logps/rejected": -328.5540771484375, + "loss": 0.6932, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.00015133472334127873, + "rewards/margins": -8.949339098762721e-05, + "rewards/rejected": 0.00024082818708848208, + "step": 60 + }, + { + "epoch": 0.018319811567652448, + "grad_norm": 3.166975498199463, + "learning_rate": 9.138381201044387e-07, + "logits/chosen": 1.3975117206573486, + "logits/rejected": 1.632108449935913, + "logps/chosen": -385.858154296875, + "logps/rejected": -334.81219482421875, + "loss": 0.6934, + "rewards/accuracies": 0.48124998807907104, + "rewards/chosen": -0.00018815476505551487, + "rewards/margins": -0.0004339146544225514, + "rewards/rejected": 0.0002457600203342736, + "step": 70 + }, + { + "epoch": 0.02093692750588851, + "grad_norm": 3.23146915435791, + "learning_rate": 1.0443864229765013e-06, + "logits/chosen": 1.7188682556152344, + "logits/rejected": 1.6811161041259766, + "logps/chosen": -383.59771728515625, + "logps/rejected": -346.7442321777344, + "loss": 0.6935, + "rewards/accuracies": 0.4937500059604645, + "rewards/chosen": -0.00110594870056957, + "rewards/margins": -0.0007629155879840255, + "rewards/rejected": -0.00034303305437788367, + "step": 80 + }, + { + "epoch": 0.023554043444124574, + "grad_norm": 3.223733901977539, + "learning_rate": 1.1749347258485642e-06, + "logits/chosen": 1.5662400722503662, + "logits/rejected": 1.790412187576294, + "logps/chosen": -364.4346923828125, + "logps/rejected": -327.1966552734375, + "loss": 0.6928, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -0.0004470300336834043, + "rewards/margins": 0.0006952629191800952, + "rewards/rejected": -0.00114229298196733, + "step": 90 + }, + { + "epoch": 0.02617115938236064, + "grad_norm": 2.721588373184204, + "learning_rate": 1.305483028720627e-06, + "logits/chosen": 1.4209253787994385, + "logits/rejected": 1.6146681308746338, + "logps/chosen": -368.99169921875, + "logps/rejected": -334.1181640625, + "loss": 0.6932, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.0008529865299351513, + "rewards/margins": -8.540081762475893e-05, + "rewards/rejected": -0.0007675857050344348, + "step": 100 + }, + { + "epoch": 0.02617115938236064, + "eval_logits/chosen": 1.2742817401885986, + "eval_logits/rejected": 1.4772121906280518, + "eval_logps/chosen": -388.4209289550781, + "eval_logps/rejected": -344.7744140625, + "eval_loss": 0.6929848194122314, + "eval_rewards/accuracies": 0.5210000276565552, + "eval_rewards/chosen": -0.0005557815893553197, + "eval_rewards/margins": 0.00034635106567293406, + "eval_rewards/rejected": -0.0009021326550282538, + "eval_runtime": 233.4714, + "eval_samples_per_second": 8.566, + "eval_steps_per_second": 1.071, + "step": 100 + }, + { + "epoch": 0.028788275320596704, + "grad_norm": 3.0894269943237305, + "learning_rate": 1.4360313315926894e-06, + "logits/chosen": 1.4099972248077393, + "logits/rejected": 1.5973542928695679, + "logps/chosen": -405.90130615234375, + "logps/rejected": -338.3550720214844, + "loss": 0.6929, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -0.0014196943957358599, + "rewards/margins": 0.00048225713544525206, + "rewards/rejected": -0.0019019513856619596, + "step": 110 + }, + { + "epoch": 0.031405391258832765, + "grad_norm": 3.1108322143554688, + "learning_rate": 1.5665796344647521e-06, + "logits/chosen": 1.4038909673690796, + "logits/rejected": 1.6449644565582275, + "logps/chosen": -425.57330322265625, + "logps/rejected": -380.5539855957031, + "loss": 0.6926, + "rewards/accuracies": 0.5062500238418579, + "rewards/chosen": -0.0003092998522333801, + "rewards/margins": 0.0012004419695585966, + "rewards/rejected": -0.0015097421128302813, + "step": 120 + }, + { + "epoch": 0.03402250719706883, + "grad_norm": 3.5928196907043457, + "learning_rate": 1.6971279373368146e-06, + "logits/chosen": 1.4526954889297485, + "logits/rejected": 1.6924293041229248, + "logps/chosen": -368.2237548828125, + "logps/rejected": -353.4678955078125, + "loss": 0.6924, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.0008503898861818016, + "rewards/margins": 0.0015992727130651474, + "rewards/rejected": -0.00244966265745461, + "step": 130 + }, + { + "epoch": 0.036639623135304895, + "grad_norm": 3.48256778717041, + "learning_rate": 1.8276762402088774e-06, + "logits/chosen": 1.583683729171753, + "logits/rejected": 1.6599153280258179, + "logps/chosen": -401.0623474121094, + "logps/rejected": -320.5968017578125, + "loss": 0.6925, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.0012277833884581923, + "rewards/margins": 0.0012606108793988824, + "rewards/rejected": -0.0024883942678570747, + "step": 140 + }, + { + "epoch": 0.03925673907354096, + "grad_norm": 3.6008684635162354, + "learning_rate": 1.9582245430809403e-06, + "logits/chosen": 1.608473539352417, + "logits/rejected": 1.6299419403076172, + "logps/chosen": -419.42083740234375, + "logps/rejected": -340.1468811035156, + "loss": 0.6921, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -0.0011100767878815532, + "rewards/margins": 0.0020758803002536297, + "rewards/rejected": -0.003185956971719861, + "step": 150 + }, + { + "epoch": 0.04187385501177702, + "grad_norm": 3.3119959831237793, + "learning_rate": 2.0887728459530026e-06, + "logits/chosen": 1.191816806793213, + "logits/rejected": 1.4365403652191162, + "logps/chosen": -375.5677185058594, + "logps/rejected": -358.6431884765625, + "loss": 0.6931, + "rewards/accuracies": 0.5062500238418579, + "rewards/chosen": -0.0033069555647671223, + "rewards/margins": 0.0001626126904739067, + "rewards/rejected": -0.0034695682115852833, + "step": 160 + }, + { + "epoch": 0.04449097095001309, + "grad_norm": 3.996933937072754, + "learning_rate": 2.2193211488250653e-06, + "logits/chosen": 1.554457426071167, + "logits/rejected": 1.7651093006134033, + "logps/chosen": -324.51995849609375, + "logps/rejected": -305.23175048828125, + "loss": 0.6928, + "rewards/accuracies": 0.53125, + "rewards/chosen": -0.0038299753796309233, + "rewards/margins": 0.0007525371038354933, + "rewards/rejected": -0.004582512192428112, + "step": 170 + }, + { + "epoch": 0.04710808688824915, + "grad_norm": 2.7113592624664307, + "learning_rate": 2.3498694516971284e-06, + "logits/chosen": 1.3284379243850708, + "logits/rejected": 1.6581776142120361, + "logps/chosen": -373.4314880371094, + "logps/rejected": -329.12628173828125, + "loss": 0.6916, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.0027942766901105642, + "rewards/margins": 0.0031869211234152317, + "rewards/rejected": -0.005981197580695152, + "step": 180 + }, + { + "epoch": 0.04972520282648522, + "grad_norm": 3.2230942249298096, + "learning_rate": 2.4804177545691907e-06, + "logits/chosen": 1.333396315574646, + "logits/rejected": 1.4282341003417969, + "logps/chosen": -385.22369384765625, + "logps/rejected": -338.3575744628906, + "loss": 0.6923, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.0038786418735980988, + "rewards/margins": 0.0016467362875118852, + "rewards/rejected": -0.005525378044694662, + "step": 190 + }, + { + "epoch": 0.05234231876472128, + "grad_norm": 2.9138338565826416, + "learning_rate": 2.610966057441254e-06, + "logits/chosen": 1.4856141805648804, + "logits/rejected": 1.6409976482391357, + "logps/chosen": -361.8666687011719, + "logps/rejected": -304.0251159667969, + "loss": 0.6916, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.005739121697843075, + "rewards/margins": 0.00320308655500412, + "rewards/rejected": -0.00894220918416977, + "step": 200 + }, + { + "epoch": 0.05234231876472128, + "eval_logits/chosen": 1.275007724761963, + "eval_logits/rejected": 1.4769618511199951, + "eval_logps/chosen": -388.9444274902344, + "eval_logps/rejected": -345.5613098144531, + "eval_loss": 0.6916878819465637, + "eval_rewards/accuracies": 0.5799999833106995, + "eval_rewards/chosen": -0.005790840368717909, + "eval_rewards/margins": 0.0029809444677084684, + "eval_rewards/rejected": -0.008771784603595734, + "eval_runtime": 233.1655, + "eval_samples_per_second": 8.578, + "eval_steps_per_second": 1.072, + "step": 200 + }, + { + "epoch": 0.05495943470295734, + "grad_norm": 3.762040615081787, + "learning_rate": 2.741514360313316e-06, + "logits/chosen": 1.5017220973968506, + "logits/rejected": 1.6038427352905273, + "logps/chosen": -397.37506103515625, + "logps/rejected": -332.4270324707031, + "loss": 0.6909, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.006693325936794281, + "rewards/margins": 0.004596198443323374, + "rewards/rejected": -0.011289524845778942, + "step": 210 + }, + { + "epoch": 0.05757655064119341, + "grad_norm": 3.389441728591919, + "learning_rate": 2.872062663185379e-06, + "logits/chosen": 1.4440081119537354, + "logits/rejected": 1.5644251108169556, + "logps/chosen": -370.3480224609375, + "logps/rejected": -320.7288818359375, + "loss": 0.6904, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.008492258377373219, + "rewards/margins": 0.005523340776562691, + "rewards/rejected": -0.014015598222613335, + "step": 220 + }, + { + "epoch": 0.06019366657942947, + "grad_norm": 3.2270307540893555, + "learning_rate": 3.0026109660574416e-06, + "logits/chosen": 1.2713916301727295, + "logits/rejected": 1.3412346839904785, + "logps/chosen": -443.724853515625, + "logps/rejected": -387.1663513183594, + "loss": 0.6909, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.011385348625481129, + "rewards/margins": 0.00459087360650301, + "rewards/rejected": -0.01597622036933899, + "step": 230 + }, + { + "epoch": 0.06281078251766553, + "grad_norm": 3.316723346710205, + "learning_rate": 3.1331592689295043e-06, + "logits/chosen": 1.3093677759170532, + "logits/rejected": 1.5563104152679443, + "logps/chosen": -427.25787353515625, + "logps/rejected": -385.2057189941406, + "loss": 0.6907, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.016488298773765564, + "rewards/margins": 0.005019473843276501, + "rewards/rejected": -0.02150776982307434, + "step": 240 + }, + { + "epoch": 0.06542789845590159, + "grad_norm": 3.3876242637634277, + "learning_rate": 3.263707571801567e-06, + "logits/chosen": 1.4990646839141846, + "logits/rejected": 1.7924457788467407, + "logps/chosen": -395.66754150390625, + "logps/rejected": -346.0106506347656, + "loss": 0.6898, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.021374255418777466, + "rewards/margins": 0.006892757024616003, + "rewards/rejected": -0.028267016634345055, + "step": 250 + }, + { + "epoch": 0.06804501439413765, + "grad_norm": 3.321073055267334, + "learning_rate": 3.3942558746736293e-06, + "logits/chosen": 1.3270446062088013, + "logits/rejected": 1.5052978992462158, + "logps/chosen": -402.4129638671875, + "logps/rejected": -366.2283630371094, + "loss": 0.6878, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.028442109003663063, + "rewards/margins": 0.010941008105874062, + "rewards/rejected": -0.039383117109537125, + "step": 260 + }, + { + "epoch": 0.07066213033237373, + "grad_norm": 2.7087340354919434, + "learning_rate": 3.524804177545692e-06, + "logits/chosen": 1.3542237281799316, + "logits/rejected": 1.5894794464111328, + "logps/chosen": -386.6691589355469, + "logps/rejected": -324.80499267578125, + "loss": 0.6861, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.035862237215042114, + "rewards/margins": 0.014425190165638924, + "rewards/rejected": -0.05028742551803589, + "step": 270 + }, + { + "epoch": 0.07327924627060979, + "grad_norm": 3.6236772537231445, + "learning_rate": 3.6553524804177547e-06, + "logits/chosen": 1.2839621305465698, + "logits/rejected": 1.604859709739685, + "logps/chosen": -376.8522033691406, + "logps/rejected": -334.2494812011719, + "loss": 0.687, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.046802129596471786, + "rewards/margins": 0.012729302048683167, + "rewards/rejected": -0.05953143909573555, + "step": 280 + }, + { + "epoch": 0.07589636220884585, + "grad_norm": 4.159788131713867, + "learning_rate": 3.7859007832898174e-06, + "logits/chosen": 1.5032262802124023, + "logits/rejected": 1.5165659189224243, + "logps/chosen": -416.37554931640625, + "logps/rejected": -369.2384338378906, + "loss": 0.6851, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.05019887164235115, + "rewards/margins": 0.017000939697027206, + "rewards/rejected": -0.06719981133937836, + "step": 290 + }, + { + "epoch": 0.07851347814708191, + "grad_norm": 3.8966574668884277, + "learning_rate": 3.9164490861618806e-06, + "logits/chosen": 1.5554125308990479, + "logits/rejected": 1.7502315044403076, + "logps/chosen": -363.2331237792969, + "logps/rejected": -331.22332763671875, + "loss": 0.6861, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.056589674204587936, + "rewards/margins": 0.014802152290940285, + "rewards/rejected": -0.07139183580875397, + "step": 300 + }, + { + "epoch": 0.07851347814708191, + "eval_logits/chosen": 1.225297212600708, + "eval_logits/rejected": 1.4230777025222778, + "eval_logps/chosen": -394.262451171875, + "eval_logps/rejected": -352.1134338378906, + "eval_loss": 0.6859813332557678, + "eval_rewards/accuracies": 0.5989999771118164, + "eval_rewards/chosen": -0.05897095054388046, + "eval_rewards/margins": 0.015322154387831688, + "eval_rewards/rejected": -0.0742930993437767, + "eval_runtime": 233.2642, + "eval_samples_per_second": 8.574, + "eval_steps_per_second": 1.072, + "step": 300 + }, + { + "epoch": 0.08113059408531798, + "grad_norm": 4.554100513458252, + "learning_rate": 4.046997389033943e-06, + "logits/chosen": 1.3358051776885986, + "logits/rejected": 1.3814319372177124, + "logps/chosen": -428.4581604003906, + "logps/rejected": -350.5341491699219, + "loss": 0.6805, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.053123779594898224, + "rewards/margins": 0.026760786771774292, + "rewards/rejected": -0.07988456636667252, + "step": 310 + }, + { + "epoch": 0.08374771002355404, + "grad_norm": 3.4826040267944336, + "learning_rate": 4.177545691906005e-06, + "logits/chosen": 1.4008252620697021, + "logits/rejected": 1.653390884399414, + "logps/chosen": -378.8886413574219, + "logps/rejected": -348.64361572265625, + "loss": 0.685, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.06168674677610397, + "rewards/margins": 0.017807736992836, + "rewards/rejected": -0.07949449121952057, + "step": 320 + }, + { + "epoch": 0.08636482596179011, + "grad_norm": 3.816162586212158, + "learning_rate": 4.308093994778068e-06, + "logits/chosen": 1.3938804864883423, + "logits/rejected": 1.5117136240005493, + "logps/chosen": -373.7430114746094, + "logps/rejected": -343.16607666015625, + "loss": 0.6838, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.07209007441997528, + "rewards/margins": 0.020073365420103073, + "rewards/rejected": -0.09216342866420746, + "step": 330 + }, + { + "epoch": 0.08898194190002617, + "grad_norm": 4.358788013458252, + "learning_rate": 4.4386422976501306e-06, + "logits/chosen": 1.266187071800232, + "logits/rejected": 1.4517450332641602, + "logps/chosen": -425.5559997558594, + "logps/rejected": -387.93243408203125, + "loss": 0.6809, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.06175302714109421, + "rewards/margins": 0.02637804113328457, + "rewards/rejected": -0.08813107013702393, + "step": 340 + }, + { + "epoch": 0.09159905783826224, + "grad_norm": 3.675837755203247, + "learning_rate": 4.569190600522193e-06, + "logits/chosen": 1.0587990283966064, + "logits/rejected": 1.394778847694397, + "logps/chosen": -435.980224609375, + "logps/rejected": -399.3922424316406, + "loss": 0.6819, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.07773401588201523, + "rewards/margins": 0.02536655031144619, + "rewards/rejected": -0.10310056060552597, + "step": 350 + }, + { + "epoch": 0.0942161737764983, + "grad_norm": 2.5249226093292236, + "learning_rate": 4.699738903394257e-06, + "logits/chosen": 1.3578028678894043, + "logits/rejected": 1.5769567489624023, + "logps/chosen": -368.1285705566406, + "logps/rejected": -327.1725769042969, + "loss": 0.6816, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.1062256470322609, + "rewards/margins": 0.025300273671746254, + "rewards/rejected": -0.131525918841362, + "step": 360 + }, + { + "epoch": 0.09683328971473436, + "grad_norm": 4.508261680603027, + "learning_rate": 4.8302872062663196e-06, + "logits/chosen": 1.4787975549697876, + "logits/rejected": 1.530562400817871, + "logps/chosen": -423.5401306152344, + "logps/rejected": -350.29522705078125, + "loss": 0.6712, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.11056496202945709, + "rewards/margins": 0.047700513154268265, + "rewards/rejected": -0.15826547145843506, + "step": 370 + }, + { + "epoch": 0.09945040565297043, + "grad_norm": 4.113176345825195, + "learning_rate": 4.9608355091383814e-06, + "logits/chosen": 1.3677705526351929, + "logits/rejected": 1.590041995048523, + "logps/chosen": -425.14569091796875, + "logps/rejected": -369.8869934082031, + "loss": 0.6761, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.12270758301019669, + "rewards/margins": 0.03834443539381027, + "rewards/rejected": -0.16105201840400696, + "step": 380 + }, + { + "epoch": 0.1020675215912065, + "grad_norm": 4.8123579025268555, + "learning_rate": 4.9999488562447675e-06, + "logits/chosen": 1.3154010772705078, + "logits/rejected": 1.4253056049346924, + "logps/chosen": -410.3934631347656, + "logps/rejected": -374.3392639160156, + "loss": 0.6646, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.11665485054254532, + "rewards/margins": 0.062035609036684036, + "rewards/rejected": -0.17869044840335846, + "step": 390 + }, + { + "epoch": 0.10468463752944256, + "grad_norm": 4.263484001159668, + "learning_rate": 4.999698361256577e-06, + "logits/chosen": 1.333717703819275, + "logits/rejected": 1.4957849979400635, + "logps/chosen": -405.444580078125, + "logps/rejected": -337.0886535644531, + "loss": 0.6757, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.1346513330936432, + "rewards/margins": 0.039198193699121475, + "rewards/rejected": -0.17384955286979675, + "step": 400 + }, + { + "epoch": 0.10468463752944256, + "eval_logits/chosen": 1.2136365175247192, + "eval_logits/rejected": 1.3996493816375732, + "eval_logps/chosen": -403.8213195800781, + "eval_logps/rejected": -363.8988037109375, + "eval_loss": 0.6773815751075745, + "eval_rewards/accuracies": 0.6025000214576721, + "eval_rewards/chosen": -0.15455959737300873, + "eval_rewards/margins": 0.03758702799677849, + "eval_rewards/rejected": -0.19214662909507751, + "eval_runtime": 232.5337, + "eval_samples_per_second": 8.601, + "eval_steps_per_second": 1.075, + "step": 400 + }, + { + "epoch": 0.10730175346767862, + "grad_norm": 4.715285301208496, + "learning_rate": 4.999239142174581e-06, + "logits/chosen": 1.370822548866272, + "logits/rejected": 1.4222373962402344, + "logps/chosen": -386.025146484375, + "logps/rejected": -370.2654724121094, + "loss": 0.6825, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -0.15298400819301605, + "rewards/margins": 0.026256907731294632, + "rewards/rejected": -0.17924091219902039, + "step": 410 + }, + { + "epoch": 0.10991886940591468, + "grad_norm": 4.306619167327881, + "learning_rate": 4.99857123734344e-06, + "logits/chosen": 1.382204294204712, + "logits/rejected": 1.4024231433868408, + "logps/chosen": -378.82562255859375, + "logps/rejected": -337.8069152832031, + "loss": 0.6765, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.15577563643455505, + "rewards/margins": 0.039799682796001434, + "rewards/rejected": -0.1955752968788147, + "step": 420 + }, + { + "epoch": 0.11253598534415074, + "grad_norm": 4.360618591308594, + "learning_rate": 4.997694702533016e-06, + "logits/chosen": 1.2584686279296875, + "logits/rejected": 1.6165319681167603, + "logps/chosen": -416.8006286621094, + "logps/rejected": -379.16168212890625, + "loss": 0.6744, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.1570083498954773, + "rewards/margins": 0.043786775320768356, + "rewards/rejected": -0.20079509913921356, + "step": 430 + }, + { + "epoch": 0.11515310128238682, + "grad_norm": 3.9530985355377197, + "learning_rate": 4.996609610933713e-06, + "logits/chosen": 1.2687292098999023, + "logits/rejected": 1.2826169729232788, + "logps/chosen": -423.39794921875, + "logps/rejected": -383.66290283203125, + "loss": 0.6748, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.14532431960105896, + "rewards/margins": 0.042948439717292786, + "rewards/rejected": -0.18827277421951294, + "step": 440 + }, + { + "epoch": 0.11777021722062288, + "grad_norm": 4.231596946716309, + "learning_rate": 4.995316053150366e-06, + "logits/chosen": 1.1246349811553955, + "logits/rejected": 1.26097571849823, + "logps/chosen": -403.4371032714844, + "logps/rejected": -370.98431396484375, + "loss": 0.6652, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.12421522289514542, + "rewards/margins": 0.0634991005063057, + "rewards/rejected": -0.18771430850028992, + "step": 450 + }, + { + "epoch": 0.12038733315885894, + "grad_norm": 4.21218729019165, + "learning_rate": 4.9938141371946815e-06, + "logits/chosen": 1.165198564529419, + "logits/rejected": 1.3747196197509766, + "logps/chosen": -396.4094543457031, + "logps/rejected": -366.3015441894531, + "loss": 0.6628, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.1397528052330017, + "rewards/margins": 0.06794790923595428, + "rewards/rejected": -0.2077006995677948, + "step": 460 + }, + { + "epoch": 0.123004449097095, + "grad_norm": 5.95582389831543, + "learning_rate": 4.992103988476206e-06, + "logits/chosen": 1.2146246433258057, + "logits/rejected": 1.2889845371246338, + "logps/chosen": -386.59368896484375, + "logps/rejected": -354.281005859375, + "loss": 0.6667, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.13470637798309326, + "rewards/margins": 0.06365668773651123, + "rewards/rejected": -0.1983630657196045, + "step": 470 + }, + { + "epoch": 0.12562156503533106, + "grad_norm": 4.180170059204102, + "learning_rate": 4.990185749791866e-06, + "logits/chosen": 1.0521671772003174, + "logits/rejected": 1.2878140211105347, + "logps/chosen": -396.9014587402344, + "logps/rejected": -366.1920166015625, + "loss": 0.6647, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.1293117105960846, + "rewards/margins": 0.06927163153886795, + "rewards/rejected": -0.19858333468437195, + "step": 480 + }, + { + "epoch": 0.12823868097356714, + "grad_norm": 4.745175361633301, + "learning_rate": 4.9880595813140395e-06, + "logits/chosen": 1.0240387916564941, + "logits/rejected": 1.2297166585922241, + "logps/chosen": -430.8408203125, + "logps/rejected": -386.9019470214844, + "loss": 0.6617, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.1291750818490982, + "rewards/margins": 0.07389305531978607, + "rewards/rejected": -0.20306813716888428, + "step": 490 + }, + { + "epoch": 0.13085579691180318, + "grad_norm": 4.876718521118164, + "learning_rate": 4.985725660577184e-06, + "logits/chosen": 0.956185519695282, + "logits/rejected": 1.1582107543945312, + "logps/chosen": -418.2518615722656, + "logps/rejected": -358.01739501953125, + "loss": 0.6581, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.15920230746269226, + "rewards/margins": 0.08411301672458649, + "rewards/rejected": -0.24331530928611755, + "step": 500 + }, + { + "epoch": 0.13085579691180318, + "eval_logits/chosen": 0.9564015865325928, + "eval_logits/rejected": 1.1466065645217896, + "eval_logps/chosen": -404.720947265625, + "eval_logps/rejected": -367.4447326660156, + "eval_loss": 0.6681177020072937, + "eval_rewards/accuracies": 0.6240000128746033, + "eval_rewards/chosen": -0.16355587542057037, + "eval_rewards/margins": 0.06404965370893478, + "eval_rewards/rejected": -0.22760552167892456, + "eval_runtime": 232.074, + "eval_samples_per_second": 8.618, + "eval_steps_per_second": 1.077, + "step": 500 + }, + { + "epoch": 0.13347291285003926, + "grad_norm": 5.33396053314209, + "learning_rate": 4.983184182463009e-06, + "logits/chosen": 1.1448547840118408, + "logits/rejected": 1.1940263509750366, + "logps/chosen": -420.2618713378906, + "logps/rejected": -372.01531982421875, + "loss": 0.6574, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.1364787369966507, + "rewards/margins": 0.08503785729408264, + "rewards/rejected": -0.22151657938957214, + "step": 510 + }, + { + "epoch": 0.1360900287882753, + "grad_norm": 5.8554887771606445, + "learning_rate": 4.980435359184203e-06, + "logits/chosen": 1.2189310789108276, + "logits/rejected": 1.1558836698532104, + "logps/chosen": -412.2118225097656, + "logps/rejected": -386.9334411621094, + "loss": 0.6653, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.18587224185466766, + "rewards/margins": 0.07138343900442123, + "rewards/rejected": -0.2572557032108307, + "step": 520 + }, + { + "epoch": 0.13870714472651138, + "grad_norm": 5.761895656585693, + "learning_rate": 4.9774794202667236e-06, + "logits/chosen": 1.0874278545379639, + "logits/rejected": 1.3288378715515137, + "logps/chosen": -404.8669738769531, + "logps/rejected": -405.3680419921875, + "loss": 0.6552, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.18935520946979523, + "rewards/margins": 0.0935024693608284, + "rewards/rejected": -0.28285765647888184, + "step": 530 + }, + { + "epoch": 0.14132426066474746, + "grad_norm": 5.3668413162231445, + "learning_rate": 4.974316612530615e-06, + "logits/chosen": 1.3489412069320679, + "logits/rejected": 1.4930012226104736, + "logps/chosen": -424.3651428222656, + "logps/rejected": -362.0076599121094, + "loss": 0.6354, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.20577266812324524, + "rewards/margins": 0.13636724650859833, + "rewards/rejected": -0.3421398997306824, + "step": 540 + }, + { + "epoch": 0.1439413766029835, + "grad_norm": 5.151000022888184, + "learning_rate": 4.970947200069416e-06, + "logits/chosen": 1.1908769607543945, + "logits/rejected": 1.2403991222381592, + "logps/chosen": -418.28729248046875, + "logps/rejected": -380.58282470703125, + "loss": 0.6608, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.16425392031669617, + "rewards/margins": 0.0870148092508316, + "rewards/rejected": -0.2512687146663666, + "step": 550 + }, + { + "epoch": 0.14655849254121958, + "grad_norm": 5.040604591369629, + "learning_rate": 4.967371464228096e-06, + "logits/chosen": 1.0223934650421143, + "logits/rejected": 1.145374059677124, + "logps/chosen": -404.68829345703125, + "logps/rejected": -392.9267578125, + "loss": 0.6599, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.19183708727359772, + "rewards/margins": 0.08984865993261337, + "rewards/rejected": -0.2816857099533081, + "step": 560 + }, + { + "epoch": 0.14917560847945563, + "grad_norm": 6.167598724365234, + "learning_rate": 4.963589703579569e-06, + "logits/chosen": 1.0712225437164307, + "logits/rejected": 1.3134175539016724, + "logps/chosen": -472.494384765625, + "logps/rejected": -424.765380859375, + "loss": 0.6634, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.29787617921829224, + "rewards/margins": 0.08284131437540054, + "rewards/rejected": -0.3807174861431122, + "step": 570 + }, + { + "epoch": 0.1517927244176917, + "grad_norm": 5.881194114685059, + "learning_rate": 4.9596022338997615e-06, + "logits/chosen": 0.8614290356636047, + "logits/rejected": 1.003142237663269, + "logps/chosen": -461.50421142578125, + "logps/rejected": -403.1651916503906, + "loss": 0.6505, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.2679108679294586, + "rewards/margins": 0.11834441125392914, + "rewards/rejected": -0.38625526428222656, + "step": 580 + }, + { + "epoch": 0.15440984035592778, + "grad_norm": 5.650794506072998, + "learning_rate": 4.955409388141243e-06, + "logits/chosen": 0.8646121025085449, + "logits/rejected": 1.1550391912460327, + "logps/chosen": -393.10260009765625, + "logps/rejected": -357.4300231933594, + "loss": 0.6638, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.21748176217079163, + "rewards/margins": 0.07719887048006058, + "rewards/rejected": -0.2946805953979492, + "step": 590 + }, + { + "epoch": 0.15702695629416383, + "grad_norm": 5.714654922485352, + "learning_rate": 4.951011516405429e-06, + "logits/chosen": 0.9980852007865906, + "logits/rejected": 1.1865313053131104, + "logps/chosen": -385.1533508300781, + "logps/rejected": -367.5033264160156, + "loss": 0.658, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.20146425068378448, + "rewards/margins": 0.09325651824474335, + "rewards/rejected": -0.29472076892852783, + "step": 600 + }, + { + "epoch": 0.15702695629416383, + "eval_logits/chosen": 0.9498924016952515, + "eval_logits/rejected": 1.1416826248168945, + "eval_logps/chosen": -415.15386962890625, + "eval_logps/rejected": -380.47955322265625, + "eval_loss": 0.6596394181251526, + "eval_rewards/accuracies": 0.6234999895095825, + "eval_rewards/chosen": -0.2678852677345276, + "eval_rewards/margins": 0.09006918221712112, + "eval_rewards/rejected": -0.3579544723033905, + "eval_runtime": 231.9217, + "eval_samples_per_second": 8.624, + "eval_steps_per_second": 1.078, + "step": 600 + }, + { + "epoch": 0.1596440722323999, + "grad_norm": 5.238458156585693, + "learning_rate": 4.946408985913344e-06, + "logits/chosen": 1.1951860189437866, + "logits/rejected": 1.289475679397583, + "logps/chosen": -383.46795654296875, + "logps/rejected": -362.46417236328125, + "loss": 0.6652, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -0.29597288370132446, + "rewards/margins": 0.07488597929477692, + "rewards/rejected": -0.3708588182926178, + "step": 610 + }, + { + "epoch": 0.16226118817063595, + "grad_norm": 6.363269805908203, + "learning_rate": 4.941602180974958e-06, + "logits/chosen": 1.051048994064331, + "logits/rejected": 1.357006311416626, + "logps/chosen": -452.8377380371094, + "logps/rejected": -374.8196716308594, + "loss": 0.6508, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.29147928953170776, + "rewards/margins": 0.10656224191188812, + "rewards/rejected": -0.3980415463447571, + "step": 620 + }, + { + "epoch": 0.16487830410887203, + "grad_norm": 6.250793933868408, + "learning_rate": 4.936591502957101e-06, + "logits/chosen": 0.9079286456108093, + "logits/rejected": 1.184887170791626, + "logps/chosen": -387.5869445800781, + "logps/rejected": -360.51885986328125, + "loss": 0.6415, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.17826662957668304, + "rewards/margins": 0.12794804573059082, + "rewards/rejected": -0.30621469020843506, + "step": 630 + }, + { + "epoch": 0.16749542004710807, + "grad_norm": 7.378194332122803, + "learning_rate": 4.931377370249946e-06, + "logits/chosen": 0.7079142332077026, + "logits/rejected": 0.9907251596450806, + "logps/chosen": -420.49560546875, + "logps/rejected": -371.4837951660156, + "loss": 0.657, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.23468086123466492, + "rewards/margins": 0.09351176023483276, + "rewards/rejected": -0.3281926214694977, + "step": 640 + }, + { + "epoch": 0.17011253598534415, + "grad_norm": 10.734910011291504, + "learning_rate": 4.925960218232073e-06, + "logits/chosen": 0.8249115943908691, + "logits/rejected": 1.072989583015442, + "logps/chosen": -398.7706604003906, + "logps/rejected": -382.7182922363281, + "loss": 0.6394, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.24996908009052277, + "rewards/margins": 0.1392596811056137, + "rewards/rejected": -0.3892287611961365, + "step": 650 + }, + { + "epoch": 0.17272965192358022, + "grad_norm": 6.875140190124512, + "learning_rate": 4.920340499234116e-06, + "logits/chosen": 0.8695880770683289, + "logits/rejected": 1.1250814199447632, + "logps/chosen": -394.7156982421875, + "logps/rejected": -351.32427978515625, + "loss": 0.639, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.18945911526679993, + "rewards/margins": 0.13935169577598572, + "rewards/rejected": -0.32881081104278564, + "step": 660 + }, + { + "epoch": 0.17534676786181627, + "grad_norm": 7.728440761566162, + "learning_rate": 4.914518682500995e-06, + "logits/chosen": 0.9572404623031616, + "logits/rejected": 1.0353472232818604, + "logps/chosen": -424.74468994140625, + "logps/rejected": -388.115234375, + "loss": 0.6416, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.19141948223114014, + "rewards/margins": 0.1302730143070221, + "rewards/rejected": -0.32169249653816223, + "step": 670 + }, + { + "epoch": 0.17796388380005235, + "grad_norm": 8.876616477966309, + "learning_rate": 4.9084952541527315e-06, + "logits/chosen": 0.7888752222061157, + "logits/rejected": 0.9395732879638672, + "logps/chosen": -428.37847900390625, + "logps/rejected": -380.4373779296875, + "loss": 0.6262, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.34512442350387573, + "rewards/margins": 0.17706379294395447, + "rewards/rejected": -0.5221882462501526, + "step": 680 + }, + { + "epoch": 0.1805809997382884, + "grad_norm": 7.472078800201416, + "learning_rate": 4.902270717143858e-06, + "logits/chosen": 1.0732382535934448, + "logits/rejected": 1.275301218032837, + "logps/chosen": -385.16851806640625, + "logps/rejected": -399.53167724609375, + "loss": 0.6284, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.3828611969947815, + "rewards/margins": 0.16961130499839783, + "rewards/rejected": -0.5524724721908569, + "step": 690 + }, + { + "epoch": 0.18319811567652447, + "grad_norm": 6.828216552734375, + "learning_rate": 4.895845591221427e-06, + "logits/chosen": 0.8568657636642456, + "logits/rejected": 0.9334108233451843, + "logps/chosen": -401.2918395996094, + "logps/rejected": -391.73291015625, + "loss": 0.6399, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.2788159251213074, + "rewards/margins": 0.14835360646247864, + "rewards/rejected": -0.4271695017814636, + "step": 700 + }, + { + "epoch": 0.18319811567652447, + "eval_logits/chosen": 0.7495799660682678, + "eval_logits/rejected": 0.9458017349243164, + "eval_logps/chosen": -412.70025634765625, + "eval_logps/rejected": -382.4018859863281, + "eval_loss": 0.6480182409286499, + "eval_rewards/accuracies": 0.640999972820282, + "eval_rewards/chosen": -0.24334919452667236, + "eval_rewards/margins": 0.13382813334465027, + "eval_rewards/rejected": -0.37717729806900024, + "eval_runtime": 232.1999, + "eval_samples_per_second": 8.613, + "eval_steps_per_second": 1.077, + "step": 700 + }, + { + "epoch": 0.18581523161476055, + "grad_norm": 7.924125671386719, + "learning_rate": 4.8892204128816e-06, + "logits/chosen": 0.7438673377037048, + "logits/rejected": 1.0076682567596436, + "logps/chosen": -439.7687072753906, + "logps/rejected": -405.40509033203125, + "loss": 0.6513, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.19243761897087097, + "rewards/margins": 0.12698553502559662, + "rewards/rejected": -0.3194231390953064, + "step": 710 + }, + { + "epoch": 0.1884323475529966, + "grad_norm": 9.067682266235352, + "learning_rate": 4.882395735324864e-06, + "logits/chosen": 0.6921774744987488, + "logits/rejected": 1.028187870979309, + "logps/chosen": -423.8224182128906, + "logps/rejected": -385.47412109375, + "loss": 0.66, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.3247512876987457, + "rewards/margins": 0.11575271934270859, + "rewards/rejected": -0.4405040144920349, + "step": 720 + }, + { + "epoch": 0.19104946349123267, + "grad_norm": 7.999166965484619, + "learning_rate": 4.87537212840983e-06, + "logits/chosen": 0.8577788472175598, + "logits/rejected": 1.1802966594696045, + "logps/chosen": -425.0936584472656, + "logps/rejected": -394.3600769042969, + "loss": 0.6348, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.457479327917099, + "rewards/margins": 0.16357269883155823, + "rewards/rejected": -0.6210519671440125, + "step": 730 + }, + { + "epoch": 0.19366657942946872, + "grad_norm": 8.635887145996094, + "learning_rate": 4.8681501786056545e-06, + "logits/chosen": 1.0259983539581299, + "logits/rejected": 1.2564369440078735, + "logps/chosen": -366.40948486328125, + "logps/rejected": -322.6649475097656, + "loss": 0.6061, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.2946811616420746, + "rewards/margins": 0.22826531529426575, + "rewards/rejected": -0.5229464769363403, + "step": 740 + }, + { + "epoch": 0.1962836953677048, + "grad_norm": 7.924373626708984, + "learning_rate": 4.860730488943068e-06, + "logits/chosen": 0.9873319864273071, + "logits/rejected": 1.1646772623062134, + "logps/chosen": -393.21209716796875, + "logps/rejected": -388.07452392578125, + "loss": 0.6309, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.27078738808631897, + "rewards/margins": 0.1725112944841385, + "rewards/rejected": -0.44329872727394104, + "step": 750 + }, + { + "epoch": 0.19890081130594087, + "grad_norm": 8.054422378540039, + "learning_rate": 4.853113678964022e-06, + "logits/chosen": 0.6722021102905273, + "logits/rejected": 0.8681543469429016, + "logps/chosen": -433.37750244140625, + "logps/rejected": -429.5255432128906, + "loss": 0.6333, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.43984508514404297, + "rewards/margins": 0.16787569224834442, + "rewards/rejected": -0.6077207326889038, + "step": 760 + }, + { + "epoch": 0.20151792724417691, + "grad_norm": 7.1459150314331055, + "learning_rate": 4.845300384669958e-06, + "logits/chosen": 0.7385894060134888, + "logits/rejected": 0.9126585721969604, + "logps/chosen": -414.93292236328125, + "logps/rejected": -372.8153381347656, + "loss": 0.6588, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.4202548861503601, + "rewards/margins": 0.1190139502286911, + "rewards/rejected": -0.5392688512802124, + "step": 770 + }, + { + "epoch": 0.204135043182413, + "grad_norm": 9.462651252746582, + "learning_rate": 4.837291258468701e-06, + "logits/chosen": 0.6594001650810242, + "logits/rejected": 0.8065937161445618, + "logps/chosen": -452.4710388183594, + "logps/rejected": -414.4306640625, + "loss": 0.6375, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.2922487258911133, + "rewards/margins": 0.16755884885787964, + "rewards/rejected": -0.4598075747489929, + "step": 780 + }, + { + "epoch": 0.20675215912064904, + "grad_norm": 9.359882354736328, + "learning_rate": 4.829086969119984e-06, + "logits/chosen": 0.8586047887802124, + "logits/rejected": 1.0241984128952026, + "logps/chosen": -411.26116943359375, + "logps/rejected": -411.509033203125, + "loss": 0.6571, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.47147512435913086, + "rewards/margins": 0.1179923564195633, + "rewards/rejected": -0.589467465877533, + "step": 790 + }, + { + "epoch": 0.2093692750588851, + "grad_norm": 10.965747833251953, + "learning_rate": 4.820688201679605e-06, + "logits/chosen": 0.6815871000289917, + "logits/rejected": 0.9268299341201782, + "logps/chosen": -432.5615234375, + "logps/rejected": -362.436767578125, + "loss": 0.624, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.5902279019355774, + "rewards/margins": 0.18446585536003113, + "rewards/rejected": -0.7746937870979309, + "step": 800 + }, + { + "epoch": 0.2093692750588851, + "eval_logits/chosen": 0.6242519021034241, + "eval_logits/rejected": 0.8198153972625732, + "eval_logps/chosen": -442.35064697265625, + "eval_logps/rejected": -415.1210632324219, + "eval_loss": 0.6389787793159485, + "eval_rewards/accuracies": 0.6514999866485596, + "eval_rewards/chosen": -0.5398533940315247, + "eval_rewards/margins": 0.16451531648635864, + "eval_rewards/rejected": -0.7043687105178833, + "eval_runtime": 232.5028, + "eval_samples_per_second": 8.602, + "eval_steps_per_second": 1.075, + "step": 800 + }, + { + "epoch": 0.21198639099712116, + "grad_norm": 9.325600624084473, + "learning_rate": 4.8120956574422315e-06, + "logits/chosen": 0.5215608477592468, + "logits/rejected": 0.7568296194076538, + "logps/chosen": -452.6517639160156, + "logps/rejected": -429.9583435058594, + "loss": 0.6763, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.4800487160682678, + "rewards/margins": 0.09066037833690643, + "rewards/rejected": -0.5707091093063354, + "step": 810 + }, + { + "epoch": 0.21460350693535724, + "grad_norm": 9.269844055175781, + "learning_rate": 4.803310053882831e-06, + "logits/chosen": 1.0071885585784912, + "logits/rejected": 1.0069457292556763, + "logps/chosen": -374.1867370605469, + "logps/rejected": -393.0777893066406, + "loss": 0.6512, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.38367071747779846, + "rewards/margins": 0.1374969780445099, + "rewards/rejected": -0.5211676955223083, + "step": 820 + }, + { + "epoch": 0.2172206228735933, + "grad_norm": 9.348451614379883, + "learning_rate": 4.794332124596775e-06, + "logits/chosen": 0.6886093616485596, + "logits/rejected": 0.855501651763916, + "logps/chosen": -426.2472229003906, + "logps/rejected": -433.111083984375, + "loss": 0.6489, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.3134116232395172, + "rewards/margins": 0.13843798637390137, + "rewards/rejected": -0.4518495500087738, + "step": 830 + }, + { + "epoch": 0.21983773881182936, + "grad_norm": 9.539639472961426, + "learning_rate": 4.785162619238575e-06, + "logits/chosen": 0.7601666450500488, + "logits/rejected": 0.965703010559082, + "logps/chosen": -439.4012756347656, + "logps/rejected": -400.94439697265625, + "loss": 0.632, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5403390526771545, + "rewards/margins": 0.17277175188064575, + "rewards/rejected": -0.7131107449531555, + "step": 840 + }, + { + "epoch": 0.22245485475006543, + "grad_norm": 8.668850898742676, + "learning_rate": 4.775802303459288e-06, + "logits/chosen": 0.8507216572761536, + "logits/rejected": 0.9332345724105835, + "logps/chosen": -440.80999755859375, + "logps/rejected": -420.0621643066406, + "loss": 0.6386, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.583116352558136, + "rewards/margins": 0.16593685746192932, + "rewards/rejected": -0.7490531206130981, + "step": 850 + }, + { + "epoch": 0.22507197068830148, + "grad_norm": 12.724005699157715, + "learning_rate": 4.766251958842589e-06, + "logits/chosen": 0.8601281046867371, + "logits/rejected": 0.9346219897270203, + "logps/chosen": -455.83502197265625, + "logps/rejected": -436.79815673828125, + "loss": 0.6329, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.6050734519958496, + "rewards/margins": 0.1777031421661377, + "rewards/rejected": -0.7827765941619873, + "step": 860 + }, + { + "epoch": 0.22768908662653756, + "grad_norm": 11.345113754272461, + "learning_rate": 4.7565123828395066e-06, + "logits/chosen": 0.7511667013168335, + "logits/rejected": 0.9087456464767456, + "logps/chosen": -427.225830078125, + "logps/rejected": -417.6124572753906, + "loss": 0.6469, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.47927623987197876, + "rewards/margins": 0.181664377450943, + "rewards/rejected": -0.6609406471252441, + "step": 870 + }, + { + "epoch": 0.23030620256477363, + "grad_norm": 11.099597930908203, + "learning_rate": 4.746584388701831e-06, + "logits/chosen": 0.8890976905822754, + "logits/rejected": 0.843266487121582, + "logps/chosen": -428.44091796875, + "logps/rejected": -412.8895568847656, + "loss": 0.6171, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.4830327033996582, + "rewards/margins": 0.21945062279701233, + "rewards/rejected": -0.7024833559989929, + "step": 880 + }, + { + "epoch": 0.23292331850300968, + "grad_norm": 11.448017120361328, + "learning_rate": 4.736468805414218e-06, + "logits/chosen": 0.9304903745651245, + "logits/rejected": 1.2368038892745972, + "logps/chosen": -408.22283935546875, + "logps/rejected": -436.41815185546875, + "loss": 0.6201, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.45535406470298767, + "rewards/margins": 0.22440317273139954, + "rewards/rejected": -0.6797571778297424, + "step": 890 + }, + { + "epoch": 0.23554043444124576, + "grad_norm": 14.195487976074219, + "learning_rate": 4.7261664776249595e-06, + "logits/chosen": 0.8509295582771301, + "logits/rejected": 1.094995141029358, + "logps/chosen": -421.479248046875, + "logps/rejected": -408.42083740234375, + "loss": 0.62, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.5554867386817932, + "rewards/margins": 0.23488807678222656, + "rewards/rejected": -0.7903748750686646, + "step": 900 + }, + { + "epoch": 0.23554043444124576, + "eval_logits/chosen": 0.7130433917045593, + "eval_logits/rejected": 0.9079583883285522, + "eval_logps/chosen": -454.4474792480469, + "eval_logps/rejected": -431.30230712890625, + "eval_loss": 0.6320670247077942, + "eval_rewards/accuracies": 0.6485000252723694, + "eval_rewards/chosen": -0.6608208417892456, + "eval_rewards/margins": 0.2053609937429428, + "eval_rewards/rejected": -0.8661818504333496, + "eval_runtime": 232.3653, + "eval_samples_per_second": 8.607, + "eval_steps_per_second": 1.076, + "step": 900 + }, + { + "epoch": 0.2381575503794818, + "grad_norm": 9.822765350341797, + "learning_rate": 4.715678265575463e-06, + "logits/chosen": 0.9885305166244507, + "logits/rejected": 0.9318181872367859, + "logps/chosen": -475.46728515625, + "logps/rejected": -405.56658935546875, + "loss": 0.6189, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.5485895872116089, + "rewards/margins": 0.2235954999923706, + "rewards/rejected": -0.7721850872039795, + "step": 910 + }, + { + "epoch": 0.24077466631771788, + "grad_norm": 8.151389122009277, + "learning_rate": 4.705005045028415e-06, + "logits/chosen": 0.8163010478019714, + "logits/rejected": 0.7590088844299316, + "logps/chosen": -439.08563232421875, + "logps/rejected": -413.9178771972656, + "loss": 0.6244, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.5029494166374207, + "rewards/margins": 0.2026442587375641, + "rewards/rejected": -0.7055937051773071, + "step": 920 + }, + { + "epoch": 0.24339178225595393, + "grad_norm": 12.819987297058105, + "learning_rate": 4.694147707194659e-06, + "logits/chosen": 0.6603835225105286, + "logits/rejected": 0.8101722598075867, + "logps/chosen": -468.04083251953125, + "logps/rejected": -449.27423095703125, + "loss": 0.595, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.5709540247917175, + "rewards/margins": 0.2979514002799988, + "rewards/rejected": -0.8689054250717163, + "step": 930 + }, + { + "epoch": 0.24600889819419, + "grad_norm": 11.70290470123291, + "learning_rate": 4.683107158658782e-06, + "logits/chosen": 0.6703850030899048, + "logits/rejected": 1.1179869174957275, + "logps/chosen": -492.76904296875, + "logps/rejected": -475.07244873046875, + "loss": 0.6061, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.8049885630607605, + "rewards/margins": 0.24929532408714294, + "rewards/rejected": -1.054283857345581, + "step": 940 + }, + { + "epoch": 0.24862601413242608, + "grad_norm": 9.882152557373047, + "learning_rate": 4.671884321303407e-06, + "logits/chosen": 0.8308131098747253, + "logits/rejected": 0.9495989084243774, + "logps/chosen": -425.8990173339844, + "logps/rejected": -416.7378845214844, + "loss": 0.6092, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.6653159856796265, + "rewards/margins": 0.2662131190299988, + "rewards/rejected": -0.9315292239189148, + "step": 950 + }, + { + "epoch": 0.2512431300706621, + "grad_norm": 10.33724308013916, + "learning_rate": 4.660480132232224e-06, + "logits/chosen": 0.7533870935440063, + "logits/rejected": 0.8427373766899109, + "logps/chosen": -445.78387451171875, + "logps/rejected": -416.93218994140625, + "loss": 0.6309, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.417325496673584, + "rewards/margins": 0.21080616116523743, + "rewards/rejected": -0.6281316876411438, + "step": 960 + }, + { + "epoch": 0.25386024600889817, + "grad_norm": 12.866530418395996, + "learning_rate": 4.6488955436917414e-06, + "logits/chosen": 0.6672025322914124, + "logits/rejected": 0.8596879243850708, + "logps/chosen": -435.473388671875, + "logps/rejected": -387.2782287597656, + "loss": 0.6097, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.4055888056755066, + "rewards/margins": 0.25641921162605286, + "rewards/rejected": -0.6620079278945923, + "step": 970 + }, + { + "epoch": 0.2564773619471343, + "grad_norm": 12.976202964782715, + "learning_rate": 4.6371315229917644e-06, + "logits/chosen": 0.6292930841445923, + "logits/rejected": 0.7828453183174133, + "logps/chosen": -468.067138671875, + "logps/rejected": -448.0206604003906, + "loss": 0.5999, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.5305265784263611, + "rewards/margins": 0.2725897431373596, + "rewards/rejected": -0.8031163215637207, + "step": 980 + }, + { + "epoch": 0.2590944778853703, + "grad_norm": 11.27432632446289, + "learning_rate": 4.625189052424638e-06, + "logits/chosen": 0.7714609503746033, + "logits/rejected": 1.057544469833374, + "logps/chosen": -416.63323974609375, + "logps/rejected": -403.56793212890625, + "loss": 0.5821, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.6991704702377319, + "rewards/margins": 0.3505772650241852, + "rewards/rejected": -1.0497477054595947, + "step": 990 + }, + { + "epoch": 0.26171159382360637, + "grad_norm": 11.419754981994629, + "learning_rate": 4.613069129183218e-06, + "logits/chosen": 0.7187921404838562, + "logits/rejected": 1.0341460704803467, + "logps/chosen": -520.6634521484375, + "logps/rejected": -485.7464904785156, + "loss": 0.6255, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.8484965562820435, + "rewards/margins": 0.24263262748718262, + "rewards/rejected": -1.091129183769226, + "step": 1000 + }, + { + "epoch": 0.26171159382360637, + "eval_logits/chosen": 0.5235105156898499, + "eval_logits/rejected": 0.711112916469574, + "eval_logps/chosen": -473.25457763671875, + "eval_logps/rejected": -453.2762756347656, + "eval_loss": 0.6269846558570862, + "eval_rewards/accuracies": 0.6455000042915344, + "eval_rewards/chosen": -0.848892092704773, + "eval_rewards/margins": 0.2370292991399765, + "eval_rewards/rejected": -1.0859214067459106, + "eval_runtime": 232.3711, + "eval_samples_per_second": 8.607, + "eval_steps_per_second": 1.076, + "step": 1000 + }, + { + "epoch": 0.2643287097618425, + "grad_norm": 21.2219295501709, + "learning_rate": 4.600772765277607e-06, + "logits/chosen": 0.5445064306259155, + "logits/rejected": 0.8773614764213562, + "logps/chosen": -415.87518310546875, + "logps/rejected": -414.38336181640625, + "loss": 0.6134, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.7228370904922485, + "rewards/margins": 0.2761087417602539, + "rewards/rejected": -0.9989458322525024, + "step": 1010 + }, + { + "epoch": 0.2669458257000785, + "grad_norm": 12.491921424865723, + "learning_rate": 4.588300987450652e-06, + "logits/chosen": 0.7395948767662048, + "logits/rejected": 1.0148208141326904, + "logps/chosen": -416.64801025390625, + "logps/rejected": -378.2071838378906, + "loss": 0.6405, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.36920469999313354, + "rewards/margins": 0.19941550493240356, + "rewards/rejected": -0.5686202645301819, + "step": 1020 + }, + { + "epoch": 0.26956294163831457, + "grad_norm": 11.955611228942871, + "learning_rate": 4.5756548370922136e-06, + "logits/chosen": 0.5796680450439453, + "logits/rejected": 0.7352942228317261, + "logps/chosen": -382.8646545410156, + "logps/rejected": -367.3566589355469, + "loss": 0.6535, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.2818133234977722, + "rewards/margins": 0.14685805141925812, + "rewards/rejected": -0.42867136001586914, + "step": 1030 + }, + { + "epoch": 0.2721800575765506, + "grad_norm": 12.265511512756348, + "learning_rate": 4.562835370152206e-06, + "logits/chosen": 0.3429097533226013, + "logits/rejected": 0.5369440913200378, + "logps/chosen": -484.8700256347656, + "logps/rejected": -457.33905029296875, + "loss": 0.5784, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.4302898943424225, + "rewards/margins": 0.34176695346832275, + "rewards/rejected": -0.7720568180084229, + "step": 1040 + }, + { + "epoch": 0.2747971735147867, + "grad_norm": 13.330986022949219, + "learning_rate": 4.54984365705243e-06, + "logits/chosen": 0.48474931716918945, + "logits/rejected": 0.6024073362350464, + "logps/chosen": -467.50018310546875, + "logps/rejected": -466.6949768066406, + "loss": 0.5833, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.7076524496078491, + "rewards/margins": 0.3223820626735687, + "rewards/rejected": -1.0300344228744507, + "step": 1050 + }, + { + "epoch": 0.27741428945302277, + "grad_norm": 18.709505081176758, + "learning_rate": 4.536680782597191e-06, + "logits/chosen": 0.4200347363948822, + "logits/rejected": 0.6729756593704224, + "logps/chosen": -439.62579345703125, + "logps/rejected": -422.578857421875, + "loss": 0.6394, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.8081327676773071, + "rewards/margins": 0.21756339073181152, + "rewards/rejected": -1.0256961584091187, + "step": 1060 + }, + { + "epoch": 0.2800314053912588, + "grad_norm": 14.954957962036133, + "learning_rate": 4.523347845882718e-06, + "logits/chosen": 0.44082099199295044, + "logits/rejected": 0.5635146498680115, + "logps/chosen": -464.96136474609375, + "logps/rejected": -430.59521484375, + "loss": 0.555, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -0.6111140847206116, + "rewards/margins": 0.41279348731040955, + "rewards/rejected": -1.0239075422286987, + "step": 1070 + }, + { + "epoch": 0.2826485213294949, + "grad_norm": 14.649702072143555, + "learning_rate": 4.50984596020539e-06, + "logits/chosen": 0.3772805631160736, + "logits/rejected": 0.5361444354057312, + "logps/chosen": -446.0318298339844, + "logps/rejected": -424.20208740234375, + "loss": 0.6135, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.5018765926361084, + "rewards/margins": 0.24910588562488556, + "rewards/rejected": -0.7509824633598328, + "step": 1080 + }, + { + "epoch": 0.28526563726773096, + "grad_norm": 12.78677749633789, + "learning_rate": 4.4961762529687745e-06, + "logits/chosen": 0.40019339323043823, + "logits/rejected": 0.5373650789260864, + "logps/chosen": -439.0340881347656, + "logps/rejected": -412.9305725097656, + "loss": 0.6463, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.6333376169204712, + "rewards/margins": 0.1645718812942505, + "rewards/rejected": -0.7979093790054321, + "step": 1090 + }, + { + "epoch": 0.287882753205967, + "grad_norm": 13.784514427185059, + "learning_rate": 4.482339865589492e-06, + "logits/chosen": 0.5023793578147888, + "logits/rejected": 0.6304869651794434, + "logps/chosen": -468.329345703125, + "logps/rejected": -410.9677734375, + "loss": 0.6257, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7563449144363403, + "rewards/margins": 0.21642132103443146, + "rewards/rejected": -0.9727662205696106, + "step": 1100 + }, + { + "epoch": 0.287882753205967, + "eval_logits/chosen": 0.2741233706474304, + "eval_logits/rejected": 0.45645061135292053, + "eval_logps/chosen": -476.776611328125, + "eval_logps/rejected": -455.3140563964844, + "eval_loss": 0.6249045133590698, + "eval_rewards/accuracies": 0.6539999842643738, + "eval_rewards/chosen": -0.8841127753257751, + "eval_rewards/margins": 0.22218641638755798, + "eval_rewards/rejected": -1.1062991619110107, + "eval_runtime": 232.1027, + "eval_samples_per_second": 8.617, + "eval_steps_per_second": 1.077, + "step": 1100 + }, + { + "epoch": 0.2904998691442031, + "grad_norm": 13.539247512817383, + "learning_rate": 4.468337953401909e-06, + "logits/chosen": 0.508640468120575, + "logits/rejected": 0.6552512049674988, + "logps/chosen": -493.59344482421875, + "logps/rejected": -498.8395080566406, + "loss": 0.6222, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.9703060984611511, + "rewards/margins": 0.22064876556396484, + "rewards/rejected": -1.1909549236297607, + "step": 1110 + }, + { + "epoch": 0.29311698508243916, + "grad_norm": 11.673506736755371, + "learning_rate": 4.45417168556166e-06, + "logits/chosen": 0.32166963815689087, + "logits/rejected": 0.6035802960395813, + "logps/chosen": -456.86944580078125, + "logps/rejected": -448.83294677734375, + "loss": 0.6182, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.8095690608024597, + "rewards/margins": 0.2476453334093094, + "rewards/rejected": -1.0572144985198975, + "step": 1120 + }, + { + "epoch": 0.2957341010206752, + "grad_norm": 9.967942237854004, + "learning_rate": 4.439842244948036e-06, + "logits/chosen": 0.2381783425807953, + "logits/rejected": 0.4971030354499817, + "logps/chosen": -444.052001953125, + "logps/rejected": -437.64495849609375, + "loss": 0.6464, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.6728368997573853, + "rewards/margins": 0.2183237075805664, + "rewards/rejected": -0.8911606073379517, + "step": 1130 + }, + { + "epoch": 0.29835121695891126, + "grad_norm": 12.411190032958984, + "learning_rate": 4.425350828065204e-06, + "logits/chosen": 0.4094735085964203, + "logits/rejected": 0.5186284184455872, + "logps/chosen": -472.64093017578125, + "logps/rejected": -419.64739990234375, + "loss": 0.5997, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.5493424534797668, + "rewards/margins": 0.3005039095878601, + "rewards/rejected": -0.849846363067627, + "step": 1140 + }, + { + "epoch": 0.30096833289714736, + "grad_norm": 17.98678207397461, + "learning_rate": 4.410698644942303e-06, + "logits/chosen": 0.19235338270664215, + "logits/rejected": 0.4098784029483795, + "logps/chosen": -451.133056640625, + "logps/rejected": -423.66448974609375, + "loss": 0.6083, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.474482923746109, + "rewards/margins": 0.28159815073013306, + "rewards/rejected": -0.7560810446739197, + "step": 1150 + }, + { + "epoch": 0.3035854488353834, + "grad_norm": 11.179768562316895, + "learning_rate": 4.395886919032406e-06, + "logits/chosen": 0.5191640853881836, + "logits/rejected": 0.6151641607284546, + "logps/chosen": -423.7428283691406, + "logps/rejected": -409.55303955078125, + "loss": 0.6194, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.469729483127594, + "rewards/margins": 0.2616717517375946, + "rewards/rejected": -0.7314012050628662, + "step": 1160 + }, + { + "epoch": 0.30620256477361946, + "grad_norm": 13.860902786254883, + "learning_rate": 4.380916887110366e-06, + "logits/chosen": 0.3877313733100891, + "logits/rejected": 0.25373178720474243, + "logps/chosen": -457.0462951660156, + "logps/rejected": -414.69024658203125, + "loss": 0.631, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.8331391215324402, + "rewards/margins": 0.2332063466310501, + "rewards/rejected": -1.066345453262329, + "step": 1170 + }, + { + "epoch": 0.30881968071185556, + "grad_norm": 10.883024215698242, + "learning_rate": 4.365789799169539e-06, + "logits/chosen": 0.5446051955223083, + "logits/rejected": 0.45434585213661194, + "logps/chosen": -477.3246154785156, + "logps/rejected": -486.14422607421875, + "loss": 0.6232, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.1283986568450928, + "rewards/margins": 0.22864672541618347, + "rewards/rejected": -1.3570451736450195, + "step": 1180 + }, + { + "epoch": 0.3114367966500916, + "grad_norm": 11.807137489318848, + "learning_rate": 4.350506918317416e-06, + "logits/chosen": 0.48607057332992554, + "logits/rejected": 0.6121966242790222, + "logps/chosen": -470.0353088378906, + "logps/rejected": -458.76055908203125, + "loss": 0.6312, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.05269455909729, + "rewards/margins": 0.19717691838741302, + "rewards/rejected": -1.2498712539672852, + "step": 1190 + }, + { + "epoch": 0.31405391258832765, + "grad_norm": 14.276663780212402, + "learning_rate": 4.335069520670149e-06, + "logits/chosen": 0.42629900574684143, + "logits/rejected": 0.4847659170627594, + "logps/chosen": -411.36407470703125, + "logps/rejected": -420.76239013671875, + "loss": 0.6512, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.7268288731575012, + "rewards/margins": 0.17092524468898773, + "rewards/rejected": -0.8977540731430054, + "step": 1200 + }, + { + "epoch": 0.31405391258832765, + "eval_logits/chosen": 0.18179067969322205, + "eval_logits/rejected": 0.35566091537475586, + "eval_logps/chosen": -452.5130920410156, + "eval_logps/rejected": -432.3995056152344, + "eval_loss": 0.6197048425674438, + "eval_rewards/accuracies": 0.6629999876022339, + "eval_rewards/chosen": -0.6414775252342224, + "eval_rewards/margins": 0.23567558825016022, + "eval_rewards/rejected": -0.8771531581878662, + "eval_runtime": 232.4956, + "eval_samples_per_second": 8.602, + "eval_steps_per_second": 1.075, + "step": 1200 + }, + { + "epoch": 0.3166710285265637, + "grad_norm": 9.877076148986816, + "learning_rate": 4.319478895246e-06, + "logits/chosen": 0.31032776832580566, + "logits/rejected": 0.4457179009914398, + "logps/chosen": -426.0264587402344, + "logps/rejected": -397.55316162109375, + "loss": 0.6026, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -0.5681090950965881, + "rewards/margins": 0.26915818452835083, + "rewards/rejected": -0.837267279624939, + "step": 1210 + }, + { + "epoch": 0.3192881444647998, + "grad_norm": 11.549198150634766, + "learning_rate": 4.303736343857704e-06, + "logits/chosen": 0.3116544485092163, + "logits/rejected": 0.5375791788101196, + "logps/chosen": -420.4837951660156, + "logps/rejected": -431.21929931640625, + "loss": 0.6428, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.3886328339576721, + "rewards/margins": 0.19474461674690247, + "rewards/rejected": -0.5833774209022522, + "step": 1220 + }, + { + "epoch": 0.32190526040303585, + "grad_norm": 11.519137382507324, + "learning_rate": 4.287843181003772e-06, + "logits/chosen": 0.19896575808525085, + "logits/rejected": 0.25549182295799255, + "logps/chosen": -476.87017822265625, + "logps/rejected": -413.09197998046875, + "loss": 0.6303, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.39009106159210205, + "rewards/margins": 0.20869462192058563, + "rewards/rejected": -0.5987856984138489, + "step": 1230 + }, + { + "epoch": 0.3245223763412719, + "grad_norm": 11.55118465423584, + "learning_rate": 4.27180073375873e-06, + "logits/chosen": 0.39684659242630005, + "logits/rejected": 0.3274468183517456, + "logps/chosen": -453.70587158203125, + "logps/rejected": -417.8851623535156, + "loss": 0.5939, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.4496288299560547, + "rewards/margins": 0.3144444525241852, + "rewards/rejected": -0.7640732526779175, + "step": 1240 + }, + { + "epoch": 0.327139492279508, + "grad_norm": 10.041382789611816, + "learning_rate": 4.255610341662304e-06, + "logits/chosen": 0.13150617480278015, + "logits/rejected": 0.4586234986782074, + "logps/chosen": -432.53570556640625, + "logps/rejected": -412.88726806640625, + "loss": 0.6162, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.56169593334198, + "rewards/margins": 0.2593781054019928, + "rewards/rejected": -0.8210738897323608, + "step": 1250 + }, + { + "epoch": 0.32975660821774405, + "grad_norm": 15.793495178222656, + "learning_rate": 4.2392733566075764e-06, + "logits/chosen": 0.2116355448961258, + "logits/rejected": 0.34284886717796326, + "logps/chosen": -430.3287658691406, + "logps/rejected": -423.35980224609375, + "loss": 0.6349, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.6224905252456665, + "rewards/margins": 0.2327694147825241, + "rewards/rejected": -0.8552600145339966, + "step": 1260 + }, + { + "epoch": 0.3323737241559801, + "grad_norm": 11.136332511901855, + "learning_rate": 4.2227911427280975e-06, + "logits/chosen": 0.19596245884895325, + "logits/rejected": 0.3888585865497589, + "logps/chosen": -438.890625, + "logps/rejected": -403.1433410644531, + "loss": 0.6353, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.6415246725082397, + "rewards/margins": 0.2291472852230072, + "rewards/rejected": -0.8706718683242798, + "step": 1270 + }, + { + "epoch": 0.33499084009421615, + "grad_norm": 20.09882926940918, + "learning_rate": 4.206165076283983e-06, + "logits/chosen": 0.2686145603656769, + "logits/rejected": 0.49140438437461853, + "logps/chosen": -447.68115234375, + "logps/rejected": -423.4256896972656, + "loss": 0.585, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.7579549551010132, + "rewards/margins": 0.31806570291519165, + "rewards/rejected": -1.07602059841156, + "step": 1280 + }, + { + "epoch": 0.33760795603245225, + "grad_norm": 15.394529342651367, + "learning_rate": 4.189396545546995e-06, + "logits/chosen": 0.0032246888149529696, + "logits/rejected": 0.3693595230579376, + "logps/chosen": -438.0142517089844, + "logps/rejected": -418.50775146484375, + "loss": 0.6328, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.6667572259902954, + "rewards/margins": 0.2417328804731369, + "rewards/rejected": -0.9084900617599487, + "step": 1290 + }, + { + "epoch": 0.3402250719706883, + "grad_norm": 17.138986587524414, + "learning_rate": 4.172486950684627e-06, + "logits/chosen": 0.1562187373638153, + "logits/rejected": 0.4008878171443939, + "logps/chosen": -412.3981018066406, + "logps/rejected": -425.25299072265625, + "loss": 0.5864, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.4970017075538635, + "rewards/margins": 0.32930120825767517, + "rewards/rejected": -0.8263028860092163, + "step": 1300 + }, + { + "epoch": 0.3402250719706883, + "eval_logits/chosen": 0.12899738550186157, + "eval_logits/rejected": 0.29827845096588135, + "eval_logps/chosen": -457.74737548828125, + "eval_logps/rejected": -441.21051025390625, + "eval_loss": 0.6130329370498657, + "eval_rewards/accuracies": 0.6735000014305115, + "eval_rewards/chosen": -0.693820059299469, + "eval_rewards/margins": 0.27144384384155273, + "eval_rewards/rejected": -0.965263843536377, + "eval_runtime": 232.1045, + "eval_samples_per_second": 8.617, + "eval_steps_per_second": 1.077, + "step": 1300 + }, + { + "epoch": 0.34284218790892435, + "grad_norm": 15.050552368164062, + "learning_rate": 4.155437703643182e-06, + "logits/chosen": 0.3422376215457916, + "logits/rejected": 0.3849483132362366, + "logps/chosen": -431.77166748046875, + "logps/rejected": -416.0318298339844, + "loss": 0.5837, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.7273216843605042, + "rewards/margins": 0.3494306206703186, + "rewards/rejected": -1.0767523050308228, + "step": 1310 + }, + { + "epoch": 0.34545930384716045, + "grad_norm": 16.636903762817383, + "learning_rate": 4.138250228029882e-06, + "logits/chosen": 0.0636025071144104, + "logits/rejected": 0.21294847130775452, + "logps/chosen": -461.449951171875, + "logps/rejected": -475.20361328125, + "loss": 0.6341, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.8283727765083313, + "rewards/margins": 0.2532418668270111, + "rewards/rejected": -1.0816147327423096, + "step": 1320 + }, + { + "epoch": 0.3480764197853965, + "grad_norm": 18.111169815063477, + "learning_rate": 4.120925958993994e-06, + "logits/chosen": 0.23635880649089813, + "logits/rejected": 0.25746363401412964, + "logps/chosen": -399.59442138671875, + "logps/rejected": -405.0094299316406, + "loss": 0.6418, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.571780800819397, + "rewards/margins": 0.21083179116249084, + "rewards/rejected": -0.7826126217842102, + "step": 1330 + }, + { + "epoch": 0.35069353572363254, + "grad_norm": 16.408119201660156, + "learning_rate": 4.103466343106999e-06, + "logits/chosen": 0.3088940680027008, + "logits/rejected": 0.4904406666755676, + "logps/chosen": -449.12713623046875, + "logps/rejected": -436.40716552734375, + "loss": 0.6052, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.4821189045906067, + "rewards/margins": 0.28531405329704285, + "rewards/rejected": -0.7674329280853271, + "step": 1340 + }, + { + "epoch": 0.35331065166186865, + "grad_norm": 13.181236267089844, + "learning_rate": 4.085872838241797e-06, + "logits/chosen": 0.26650765538215637, + "logits/rejected": 0.46960416436195374, + "logps/chosen": -435.21270751953125, + "logps/rejected": -414.5130310058594, + "loss": 0.6259, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.4914736747741699, + "rewards/margins": 0.24917948246002197, + "rewards/rejected": -0.7406532168388367, + "step": 1350 + }, + { + "epoch": 0.3559277676001047, + "grad_norm": 14.832420349121094, + "learning_rate": 4.06814691345098e-06, + "logits/chosen": 0.24336537718772888, + "logits/rejected": 0.30810093879699707, + "logps/chosen": -410.4248962402344, + "logps/rejected": -418.159423828125, + "loss": 0.5866, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.5495951771736145, + "rewards/margins": 0.3245389461517334, + "rewards/rejected": -0.8741341829299927, + "step": 1360 + }, + { + "epoch": 0.35854488353834074, + "grad_norm": 17.649641036987305, + "learning_rate": 4.050290048844171e-06, + "logits/chosen": 0.168039470911026, + "logits/rejected": 0.265805184841156, + "logps/chosen": -489.49468994140625, + "logps/rejected": -486.09100341796875, + "loss": 0.6079, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.8853768110275269, + "rewards/margins": 0.28231385350227356, + "rewards/rejected": -1.167690634727478, + "step": 1370 + }, + { + "epoch": 0.3611619994765768, + "grad_norm": 15.066794395446777, + "learning_rate": 4.032303735464422e-06, + "logits/chosen": 0.2061309516429901, + "logits/rejected": 0.3621533513069153, + "logps/chosen": -498.93890380859375, + "logps/rejected": -477.98974609375, + "loss": 0.5812, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.9270712733268738, + "rewards/margins": 0.3421303629875183, + "rewards/rejected": -1.2692015171051025, + "step": 1380 + }, + { + "epoch": 0.3637791154148129, + "grad_norm": 14.546058654785156, + "learning_rate": 4.014189475163727e-06, + "logits/chosen": 0.295467346906662, + "logits/rejected": 0.3572823405265808, + "logps/chosen": -464.5303649902344, + "logps/rejected": -447.1439514160156, + "loss": 0.6072, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.8640359044075012, + "rewards/margins": 0.27750641107559204, + "rewards/rejected": -1.1415421962738037, + "step": 1390 + }, + { + "epoch": 0.36639623135304894, + "grad_norm": 24.52462387084961, + "learning_rate": 3.995948780477605e-06, + "logits/chosen": 0.19618520140647888, + "logits/rejected": 0.33636996150016785, + "logps/chosen": -475.90826416015625, + "logps/rejected": -455.0912170410156, + "loss": 0.6226, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.7381902933120728, + "rewards/margins": 0.2615019679069519, + "rewards/rejected": -0.9996922612190247, + "step": 1400 + }, + { + "epoch": 0.36639623135304894, + "eval_logits/chosen": 0.1471870094537735, + "eval_logits/rejected": 0.3099009096622467, + "eval_logps/chosen": -460.8104553222656, + "eval_logps/rejected": -446.4751281738281, + "eval_loss": 0.608772337436676, + "eval_rewards/accuracies": 0.6790000200271606, + "eval_rewards/chosen": -0.7244512438774109, + "eval_rewards/margins": 0.29345834255218506, + "eval_rewards/rejected": -1.0179095268249512, + "eval_runtime": 232.5568, + "eval_samples_per_second": 8.6, + "eval_steps_per_second": 1.075, + "step": 1400 + }, + { + "epoch": 0.369013347291285, + "grad_norm": 18.559682846069336, + "learning_rate": 3.977583174498816e-06, + "logits/chosen": 0.2730047106742859, + "logits/rejected": 0.5137112140655518, + "logps/chosen": -473.2476501464844, + "logps/rejected": -456.55609130859375, + "loss": 0.5941, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.868022084236145, + "rewards/margins": 0.3129437267780304, + "rewards/rejected": -1.1809656620025635, + "step": 1410 + }, + { + "epoch": 0.3716304632295211, + "grad_norm": 15.352100372314453, + "learning_rate": 3.959094190750172e-06, + "logits/chosen": 0.22451026737689972, + "logits/rejected": 0.3914637267589569, + "logps/chosen": -482.42010498046875, + "logps/rejected": -452.3109436035156, + "loss": 0.607, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.7052127718925476, + "rewards/margins": 0.31599652767181396, + "rewards/rejected": -1.0212092399597168, + "step": 1420 + }, + { + "epoch": 0.37424757916775714, + "grad_norm": 20.098033905029297, + "learning_rate": 3.9404833730564975e-06, + "logits/chosen": 0.1833394318819046, + "logits/rejected": 0.30950406193733215, + "logps/chosen": -431.68438720703125, + "logps/rejected": -421.0174255371094, + "loss": 0.6279, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.5920284986495972, + "rewards/margins": 0.2607758641242981, + "rewards/rejected": -0.8528043627738953, + "step": 1430 + }, + { + "epoch": 0.3768646951059932, + "grad_norm": 16.98012351989746, + "learning_rate": 3.921752275415712e-06, + "logits/chosen": 0.39584654569625854, + "logits/rejected": 0.6170969605445862, + "logps/chosen": -442.19915771484375, + "logps/rejected": -441.6769104003906, + "loss": 0.5883, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.8056619763374329, + "rewards/margins": 0.3388480842113495, + "rewards/rejected": -1.1445101499557495, + "step": 1440 + }, + { + "epoch": 0.37948181104422923, + "grad_norm": 13.900838851928711, + "learning_rate": 3.902902461869079e-06, + "logits/chosen": 0.3026077151298523, + "logits/rejected": 0.5092092156410217, + "logps/chosen": -435.91571044921875, + "logps/rejected": -435.0750427246094, + "loss": 0.6001, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.8758746981620789, + "rewards/margins": 0.34648483991622925, + "rewards/rejected": -1.2223594188690186, + "step": 1450 + }, + { + "epoch": 0.38209892698246534, + "grad_norm": 17.466562271118164, + "learning_rate": 3.883935506370605e-06, + "logits/chosen": 0.2131034880876541, + "logits/rejected": 0.42597731947898865, + "logps/chosen": -423.911376953125, + "logps/rejected": -408.1385192871094, + "loss": 0.613, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.6358692049980164, + "rewards/margins": 0.3088337481021881, + "rewards/rejected": -0.9447029232978821, + "step": 1460 + }, + { + "epoch": 0.3847160429207014, + "grad_norm": 11.749993324279785, + "learning_rate": 3.864852992655617e-06, + "logits/chosen": 0.30351871252059937, + "logits/rejected": 0.3591347336769104, + "logps/chosen": -443.84735107421875, + "logps/rejected": -447.32763671875, + "loss": 0.5455, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.6212112903594971, + "rewards/margins": 0.44100433588027954, + "rewards/rejected": -1.0622155666351318, + "step": 1470 + }, + { + "epoch": 0.38733315885893743, + "grad_norm": 15.301169395446777, + "learning_rate": 3.845656514108516e-06, + "logits/chosen": 0.30313563346862793, + "logits/rejected": 0.3141325116157532, + "logps/chosen": -478.4485778808594, + "logps/rejected": -414.6806640625, + "loss": 0.6202, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.9370290040969849, + "rewards/margins": 0.28061023354530334, + "rewards/rejected": -1.2176392078399658, + "step": 1480 + }, + { + "epoch": 0.38995027479717354, + "grad_norm": 16.264097213745117, + "learning_rate": 3.826347673629738e-06, + "logits/chosen": 0.2156684696674347, + "logits/rejected": 0.3405511975288391, + "logps/chosen": -454.978515625, + "logps/rejected": -446.1961975097656, + "loss": 0.5828, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.8145572543144226, + "rewards/margins": 0.3832133412361145, + "rewards/rejected": -1.1977707147598267, + "step": 1490 + }, + { + "epoch": 0.3925673907354096, + "grad_norm": 17.33570098876953, + "learning_rate": 3.8069280835019062e-06, + "logits/chosen": 0.197922945022583, + "logits/rejected": 0.31282711029052734, + "logps/chosen": -465.8154296875, + "logps/rejected": -456.82562255859375, + "loss": 0.5748, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.8404709696769714, + "rewards/margins": 0.37942442297935486, + "rewards/rejected": -1.219895362854004, + "step": 1500 + }, + { + "epoch": 0.3925673907354096, + "eval_logits/chosen": 0.08960460871458054, + "eval_logits/rejected": 0.23796047270298004, + "eval_logps/chosen": -480.02130126953125, + "eval_logps/rejected": -468.2979431152344, + "eval_loss": 0.6048462986946106, + "eval_rewards/accuracies": 0.6754999756813049, + "eval_rewards/chosen": -0.9165594577789307, + "eval_rewards/margins": 0.3195783197879791, + "eval_rewards/rejected": -1.236137866973877, + "eval_runtime": 231.9563, + "eval_samples_per_second": 8.622, + "eval_steps_per_second": 1.078, + "step": 1500 + }, + { + "epoch": 0.39518450667364563, + "grad_norm": 14.863387107849121, + "learning_rate": 3.7873993652552077e-06, + "logits/chosen": 0.23345918953418732, + "logits/rejected": 0.28646907210350037, + "logps/chosen": -454.75897216796875, + "logps/rejected": -453.5748596191406, + "loss": 0.6597, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.0461337566375732, + "rewards/margins": 0.21926303207874298, + "rewards/rejected": -1.2653969526290894, + "step": 1510 + }, + { + "epoch": 0.39780162261188173, + "grad_norm": 16.0816593170166, + "learning_rate": 3.7677631495319953e-06, + "logits/chosen": 0.2003081738948822, + "logits/rejected": 0.36653000116348267, + "logps/chosen": -488.4042053222656, + "logps/rejected": -490.30853271484375, + "loss": 0.581, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.9618334770202637, + "rewards/margins": 0.38063231110572815, + "rewards/rejected": -1.3424657583236694, + "step": 1520 + }, + { + "epoch": 0.4004187385501178, + "grad_norm": 15.07925033569336, + "learning_rate": 3.748021075950633e-06, + "logits/chosen": -0.012658292427659035, + "logits/rejected": 0.16820164024829865, + "logps/chosen": -478.895751953125, + "logps/rejected": -465.083984375, + "loss": 0.6449, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.834586501121521, + "rewards/margins": 0.23068487644195557, + "rewards/rejected": -1.0652713775634766, + "step": 1530 + }, + { + "epoch": 0.40303585448835383, + "grad_norm": 13.859041213989258, + "learning_rate": 3.7281747929685824e-06, + "logits/chosen": 0.3084440231323242, + "logits/rejected": 0.4003655016422272, + "logps/chosen": -465.21844482421875, + "logps/rejected": -456.33465576171875, + "loss": 0.6143, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1239235401153564, + "rewards/margins": 0.2595583498477936, + "rewards/rejected": -1.3834818601608276, + "step": 1540 + }, + { + "epoch": 0.4056529704265899, + "grad_norm": 12.961313247680664, + "learning_rate": 3.7082259577447604e-06, + "logits/chosen": 0.20876283943653107, + "logits/rejected": 0.4126996099948883, + "logps/chosen": -499.909912109375, + "logps/rejected": -486.24749755859375, + "loss": 0.5952, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.9955336451530457, + "rewards/margins": 0.3382071256637573, + "rewards/rejected": -1.3337408304214478, + "step": 1550 + }, + { + "epoch": 0.408270086364826, + "grad_norm": 15.985527038574219, + "learning_rate": 3.6881762360011688e-06, + "logits/chosen": 0.12506040930747986, + "logits/rejected": 0.17536480724811554, + "logps/chosen": -477.6473083496094, + "logps/rejected": -436.26751708984375, + "loss": 0.6062, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.7066253423690796, + "rewards/margins": 0.3402264714241028, + "rewards/rejected": -1.0468518733978271, + "step": 1560 + }, + { + "epoch": 0.410887202303062, + "grad_norm": 22.8974666595459, + "learning_rate": 3.668027301883802e-06, + "logits/chosen": 0.0588788278400898, + "logits/rejected": 0.12763305008411407, + "logps/chosen": -440.4716796875, + "logps/rejected": -444.61834716796875, + "loss": 0.6024, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.7403033375740051, + "rewards/margins": 0.37406405806541443, + "rewards/rejected": -1.1143672466278076, + "step": 1570 + }, + { + "epoch": 0.4135043182412981, + "grad_norm": 15.58066463470459, + "learning_rate": 3.64778083782286e-06, + "logits/chosen": 0.27244722843170166, + "logits/rejected": 0.3991895318031311, + "logps/chosen": -443.55645751953125, + "logps/rejected": -494.92254638671875, + "loss": 0.5942, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.7955921292304993, + "rewards/margins": 0.36010387539863586, + "rewards/rejected": -1.1556960344314575, + "step": 1580 + }, + { + "epoch": 0.4161214341795342, + "grad_norm": 13.258322715759277, + "learning_rate": 3.627438534392268e-06, + "logits/chosen": -0.04510800167918205, + "logits/rejected": 0.03904765844345093, + "logps/chosen": -436.66802978515625, + "logps/rejected": -471.71142578125, + "loss": 0.5738, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.744075357913971, + "rewards/margins": 0.400721937417984, + "rewards/rejected": -1.1447973251342773, + "step": 1590 + }, + { + "epoch": 0.4187385501177702, + "grad_norm": 14.653417587280273, + "learning_rate": 3.607002090168506e-06, + "logits/chosen": 0.014941488392651081, + "logits/rejected": 0.006946629378944635, + "logps/chosen": -499.9811096191406, + "logps/rejected": -463.3321228027344, + "loss": 0.6615, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.9098555445671082, + "rewards/margins": 0.21712689101696014, + "rewards/rejected": -1.1269824504852295, + "step": 1600 + }, + { + "epoch": 0.4187385501177702, + "eval_logits/chosen": -0.08284498751163483, + "eval_logits/rejected": 0.05344332382082939, + "eval_logps/chosen": -487.8919982910156, + "eval_logps/rejected": -479.4829406738281, + "eval_loss": 0.6062743067741394, + "eval_rewards/accuracies": 0.6704999804496765, + "eval_rewards/chosen": -0.9952664971351624, + "eval_rewards/margins": 0.35272136330604553, + "eval_rewards/rejected": -1.3479877710342407, + "eval_runtime": 231.9207, + "eval_samples_per_second": 8.624, + "eval_steps_per_second": 1.078, + "step": 1600 + }, + { + "epoch": 0.4213556660560063, + "grad_norm": 14.061836242675781, + "learning_rate": 3.586473211588787e-06, + "logits/chosen": 0.043830014765262604, + "logits/rejected": 0.0922635942697525, + "logps/chosen": -452.1427307128906, + "logps/rejected": -491.6693420410156, + "loss": 0.5682, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.8795869946479797, + "rewards/margins": 0.42901507019996643, + "rewards/rejected": -1.3086020946502686, + "step": 1610 + }, + { + "epoch": 0.4239727819942423, + "grad_norm": 27.227012634277344, + "learning_rate": 3.5658536128085623e-06, + "logits/chosen": 0.06352569162845612, + "logits/rejected": 0.29197776317596436, + "logps/chosen": -464.83154296875, + "logps/rejected": -456.54644775390625, + "loss": 0.6573, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.966151237487793, + "rewards/margins": 0.261862576007843, + "rewards/rejected": -1.2280137538909912, + "step": 1620 + }, + { + "epoch": 0.4265898979324784, + "grad_norm": 17.129220962524414, + "learning_rate": 3.545145015558399e-06, + "logits/chosen": 0.09727749973535538, + "logits/rejected": 0.07895330339670181, + "logps/chosen": -418.5646057128906, + "logps/rejected": -414.50347900390625, + "loss": 0.635, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.9049051403999329, + "rewards/margins": 0.2770025134086609, + "rewards/rejected": -1.1819076538085938, + "step": 1630 + }, + { + "epoch": 0.42920701387071447, + "grad_norm": 13.82589340209961, + "learning_rate": 3.5243491490002056e-06, + "logits/chosen": -0.012812698259949684, + "logits/rejected": -0.03551667556166649, + "logps/chosen": -469.703857421875, + "logps/rejected": -462.33489990234375, + "loss": 0.6404, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.8767460584640503, + "rewards/margins": 0.29123008251190186, + "rewards/rejected": -1.1679762601852417, + "step": 1640 + }, + { + "epoch": 0.4318241298089505, + "grad_norm": 15.320180892944336, + "learning_rate": 3.503467749582857e-06, + "logits/chosen": 0.16704775393009186, + "logits/rejected": 0.20051440596580505, + "logps/chosen": -468.2303161621094, + "logps/rejected": -435.6004333496094, + "loss": 0.6469, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9747709035873413, + "rewards/margins": 0.2773420214653015, + "rewards/rejected": -1.2521127462387085, + "step": 1650 + }, + { + "epoch": 0.4344412457471866, + "grad_norm": 14.137253761291504, + "learning_rate": 3.4825025608971947e-06, + "logits/chosen": 0.20161625742912292, + "logits/rejected": 0.34558919072151184, + "logps/chosen": -457.50775146484375, + "logps/rejected": -459.1556091308594, + "loss": 0.6473, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.1610692739486694, + "rewards/margins": 0.21469669044017792, + "rewards/rejected": -1.3757660388946533, + "step": 1660 + }, + { + "epoch": 0.43705836168542267, + "grad_norm": 14.53886604309082, + "learning_rate": 3.4614553335304407e-06, + "logits/chosen": 0.08533845096826553, + "logits/rejected": 0.3017124533653259, + "logps/chosen": -496.14801025390625, + "logps/rejected": -463.46807861328125, + "loss": 0.601, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.9761697053909302, + "rewards/margins": 0.33948642015457153, + "rewards/rejected": -1.3156561851501465, + "step": 1670 + }, + { + "epoch": 0.4396754776236587, + "grad_norm": 24.965370178222656, + "learning_rate": 3.4403278249200222e-06, + "logits/chosen": 0.06942877918481827, + "logits/rejected": 0.1279851198196411, + "logps/chosen": -469.521728515625, + "logps/rejected": -445.25592041015625, + "loss": 0.5627, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.6999729871749878, + "rewards/margins": 0.4319036900997162, + "rewards/rejected": -1.1318767070770264, + "step": 1680 + }, + { + "epoch": 0.44229259356189476, + "grad_norm": 15.348676681518555, + "learning_rate": 3.4191217992068293e-06, + "logits/chosen": 0.06874585151672363, + "logits/rejected": 0.13581883907318115, + "logps/chosen": -501.0218200683594, + "logps/rejected": -451.73345947265625, + "loss": 0.6089, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.9231773614883423, + "rewards/margins": 0.31995171308517456, + "rewards/rejected": -1.243129014968872, + "step": 1690 + }, + { + "epoch": 0.44490970950013087, + "grad_norm": 19.28348731994629, + "learning_rate": 3.3978390270879056e-06, + "logits/chosen": 0.1793755143880844, + "logits/rejected": 0.4792613983154297, + "logps/chosen": -450.6305236816406, + "logps/rejected": -468.32086181640625, + "loss": 0.6395, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.2974417209625244, + "rewards/margins": 0.2531173825263977, + "rewards/rejected": -1.5505590438842773, + "step": 1700 + }, + { + "epoch": 0.44490970950013087, + "eval_logits/chosen": 0.0606597363948822, + "eval_logits/rejected": 0.20719292759895325, + "eval_logps/chosen": -515.066162109375, + "eval_logps/rejected": -504.28570556640625, + "eval_loss": 0.6020700931549072, + "eval_rewards/accuracies": 0.6744999885559082, + "eval_rewards/chosen": -1.2670079469680786, + "eval_rewards/margins": 0.32900768518447876, + "eval_rewards/rejected": -1.5960155725479126, + "eval_runtime": 232.1756, + "eval_samples_per_second": 8.614, + "eval_steps_per_second": 1.077, + "step": 1700 + }, + { + "epoch": 0.4475268254383669, + "grad_norm": 26.0466365814209, + "learning_rate": 3.3764812856685995e-06, + "logits/chosen": 0.11054261028766632, + "logits/rejected": 0.11302468925714493, + "logps/chosen": -463.68212890625, + "logps/rejected": -505.44464111328125, + "loss": 0.6036, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.1472562551498413, + "rewards/margins": 0.3336459994316101, + "rewards/rejected": -1.4809024333953857, + "step": 1710 + }, + { + "epoch": 0.45014394137660296, + "grad_norm": 12.329084396362305, + "learning_rate": 3.3550503583141726e-06, + "logits/chosen": 0.14257648587226868, + "logits/rejected": 0.2127263993024826, + "logps/chosen": -476.0755920410156, + "logps/rejected": -482.536376953125, + "loss": 0.5637, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.8113042712211609, + "rewards/margins": 0.41295966506004333, + "rewards/rejected": -1.2242640256881714, + "step": 1720 + }, + { + "epoch": 0.45276105731483907, + "grad_norm": 14.099928855895996, + "learning_rate": 3.3335480345008907e-06, + "logits/chosen": 0.11843159049749374, + "logits/rejected": 0.2315410077571869, + "logps/chosen": -438.8074645996094, + "logps/rejected": -451.05816650390625, + "loss": 0.6125, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.6928716897964478, + "rewards/margins": 0.3718397617340088, + "rewards/rejected": -1.064711570739746, + "step": 1730 + }, + { + "epoch": 0.4553781732530751, + "grad_norm": 12.286911010742188, + "learning_rate": 3.3119761096666055e-06, + "logits/chosen": 0.025634441524744034, + "logits/rejected": 0.12711207568645477, + "logps/chosen": -447.642578125, + "logps/rejected": -428.96246337890625, + "loss": 0.6114, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.6357846856117249, + "rewards/margins": 0.2916674017906189, + "rewards/rejected": -0.9274520874023438, + "step": 1740 + }, + { + "epoch": 0.45799528919131116, + "grad_norm": 14.236140251159668, + "learning_rate": 3.290336385060832e-06, + "logits/chosen": 0.07329438626766205, + "logits/rejected": 0.2325226366519928, + "logps/chosen": -458.25213623046875, + "logps/rejected": -445.90557861328125, + "loss": 0.5975, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.8362909555435181, + "rewards/margins": 0.3581870198249817, + "rewards/rejected": -1.1944780349731445, + "step": 1750 + }, + { + "epoch": 0.46061240512954726, + "grad_norm": 15.764031410217285, + "learning_rate": 3.268630667594348e-06, + "logits/chosen": 0.11211331933736801, + "logits/rejected": 0.12331026792526245, + "logps/chosen": -460.3636169433594, + "logps/rejected": -454.080078125, + "loss": 0.5906, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.8475208282470703, + "rewards/margins": 0.3742133677005768, + "rewards/rejected": -1.2217340469360352, + "step": 1760 + }, + { + "epoch": 0.4632295210677833, + "grad_norm": 23.4930362701416, + "learning_rate": 3.2468607696883147e-06, + "logits/chosen": 0.24953755736351013, + "logits/rejected": 0.34629741311073303, + "logps/chosen": -477.17791748046875, + "logps/rejected": -515.0440673828125, + "loss": 0.5613, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.0197489261627197, + "rewards/margins": 0.4735226035118103, + "rewards/rejected": -1.4932715892791748, + "step": 1770 + }, + { + "epoch": 0.46584663700601936, + "grad_norm": 13.870096206665039, + "learning_rate": 3.225028509122944e-06, + "logits/chosen": 0.12339513003826141, + "logits/rejected": 0.26224666833877563, + "logps/chosen": -495.1161193847656, + "logps/rejected": -489.4485778808594, + "loss": 0.613, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.2092769145965576, + "rewards/margins": 0.3337065279483795, + "rewards/rejected": -1.5429834127426147, + "step": 1780 + }, + { + "epoch": 0.4684637529442554, + "grad_norm": 19.58800506591797, + "learning_rate": 3.2031357088857083e-06, + "logits/chosen": 0.04813681170344353, + "logits/rejected": 0.21235807240009308, + "logps/chosen": -539.0248413085938, + "logps/rejected": -540.2000732421875, + "loss": 0.6242, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.3390930891036987, + "rewards/margins": 0.3306949734687805, + "rewards/rejected": -1.669788122177124, + "step": 1790 + }, + { + "epoch": 0.4710808688824915, + "grad_norm": 16.04212760925293, + "learning_rate": 3.181184197019127e-06, + "logits/chosen": 0.1568802297115326, + "logits/rejected": 0.3320377767086029, + "logps/chosen": -475.43487548828125, + "logps/rejected": -508.42083740234375, + "loss": 0.5924, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.2451696395874023, + "rewards/margins": 0.4058682918548584, + "rewards/rejected": -1.6510378122329712, + "step": 1800 + }, + { + "epoch": 0.4710808688824915, + "eval_logits/chosen": -0.0006541285547427833, + "eval_logits/rejected": 0.12322327494621277, + "eval_logps/chosen": -501.73223876953125, + "eval_logps/rejected": -494.2189636230469, + "eval_loss": 0.5999693870544434, + "eval_rewards/accuracies": 0.6654999852180481, + "eval_rewards/chosen": -1.133669137954712, + "eval_rewards/margins": 0.3616788983345032, + "eval_rewards/rejected": -1.4953482151031494, + "eval_runtime": 232.2077, + "eval_samples_per_second": 8.613, + "eval_steps_per_second": 1.077, + "step": 1800 + }, + { + "epoch": 0.47369798482072756, + "grad_norm": 16.250635147094727, + "learning_rate": 3.159175806468126e-06, + "logits/chosen": -0.059101611375808716, + "logits/rejected": 0.07854647934436798, + "logps/chosen": -468.9906311035156, + "logps/rejected": -467.6946716308594, + "loss": 0.5778, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.1028201580047607, + "rewards/margins": 0.41374215483665466, + "rewards/rejected": -1.5165622234344482, + "step": 1810 + }, + { + "epoch": 0.4763151007589636, + "grad_norm": 17.706560134887695, + "learning_rate": 3.1371123749269804e-06, + "logits/chosen": 0.029452210292220116, + "logits/rejected": 0.0658307746052742, + "logps/chosen": -503.1075134277344, + "logps/rejected": -495.166259765625, + "loss": 0.6576, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.9831246137619019, + "rewards/margins": 0.2420085221529007, + "rewards/rejected": -1.2251330614089966, + "step": 1820 + }, + { + "epoch": 0.4789322166971997, + "grad_norm": 11.094071388244629, + "learning_rate": 3.114995744685877e-06, + "logits/chosen": 0.15399818122386932, + "logits/rejected": 0.09563325345516205, + "logps/chosen": -425.6133728027344, + "logps/rejected": -413.8519592285156, + "loss": 0.6349, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.6909037828445435, + "rewards/margins": 0.21615329384803772, + "rewards/rejected": -0.907056987285614, + "step": 1830 + }, + { + "epoch": 0.48154933263543576, + "grad_norm": 13.76625919342041, + "learning_rate": 3.0928277624770743e-06, + "logits/chosen": 0.03457440435886383, + "logits/rejected": 0.3043617010116577, + "logps/chosen": -482.749267578125, + "logps/rejected": -466.3192443847656, + "loss": 0.575, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.6658245921134949, + "rewards/margins": 0.4141596257686615, + "rewards/rejected": -1.079984188079834, + "step": 1840 + }, + { + "epoch": 0.4841664485736718, + "grad_norm": 13.214118003845215, + "learning_rate": 3.070610279320708e-06, + "logits/chosen": 0.10331498086452484, + "logits/rejected": 0.20391520857810974, + "logps/chosen": -504.2576599121094, + "logps/rejected": -493.19635009765625, + "loss": 0.5699, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.9062315225601196, + "rewards/margins": 0.4142914414405823, + "rewards/rejected": -1.3205230236053467, + "step": 1850 + }, + { + "epoch": 0.48678356451190785, + "grad_norm": 15.12294864654541, + "learning_rate": 3.0483451503702264e-06, + "logits/chosen": 0.22254931926727295, + "logits/rejected": 0.15011247992515564, + "logps/chosen": -541.047119140625, + "logps/rejected": -546.2518310546875, + "loss": 0.5984, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.3822373151779175, + "rewards/margins": 0.3731249272823334, + "rewards/rejected": -1.7553622722625732, + "step": 1860 + }, + { + "epoch": 0.48940068045014395, + "grad_norm": 19.084123611450195, + "learning_rate": 3.0260342347574916e-06, + "logits/chosen": 0.16859467327594757, + "logits/rejected": 0.2447008639574051, + "logps/chosen": -518.3531494140625, + "logps/rejected": -513.4488525390625, + "loss": 0.567, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.2804909944534302, + "rewards/margins": 0.4136219620704651, + "rewards/rejected": -1.69411301612854, + "step": 1870 + }, + { + "epoch": 0.49201779638838, + "grad_norm": 15.792978286743164, + "learning_rate": 3.0036793954375358e-06, + "logits/chosen": 0.11333123594522476, + "logits/rejected": 0.27916672825813293, + "logps/chosen": -503.6768493652344, + "logps/rejected": -482.2354431152344, + "loss": 0.5638, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.0980170965194702, + "rewards/margins": 0.46111616492271423, + "rewards/rejected": -1.5591331720352173, + "step": 1880 + }, + { + "epoch": 0.49463491232661605, + "grad_norm": 16.74842071533203, + "learning_rate": 2.981282499033009e-06, + "logits/chosen": -0.023114752024412155, + "logits/rejected": 0.1362176537513733, + "logps/chosen": -517.9249267578125, + "logps/rejected": -498.3343200683594, + "loss": 0.6287, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.0479462146759033, + "rewards/margins": 0.3232496380805969, + "rewards/rejected": -1.3711960315704346, + "step": 1890 + }, + { + "epoch": 0.49725202826485215, + "grad_norm": 14.8408784866333, + "learning_rate": 2.9588454156783163e-06, + "logits/chosen": -0.03425337374210358, + "logits/rejected": 0.01658450812101364, + "logps/chosen": -510.95263671875, + "logps/rejected": -505.4061584472656, + "loss": 0.5875, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.101326584815979, + "rewards/margins": 0.37253618240356445, + "rewards/rejected": -1.4738627672195435, + "step": 1900 + }, + { + "epoch": 0.49725202826485215, + "eval_logits/chosen": 0.06292513012886047, + "eval_logits/rejected": 0.18740317225456238, + "eval_logps/chosen": -508.2808532714844, + "eval_logps/rejected": -502.21710205078125, + "eval_loss": 0.5985915660858154, + "eval_rewards/accuracies": 0.6744999885559082, + "eval_rewards/chosen": -1.1991546154022217, + "eval_rewards/margins": 0.3761745095252991, + "eval_rewards/rejected": -1.5753291845321655, + "eval_runtime": 232.6562, + "eval_samples_per_second": 8.596, + "eval_steps_per_second": 1.075, + "step": 1900 + }, + { + "epoch": 0.4998691442030882, + "grad_norm": 13.105058670043945, + "learning_rate": 2.9363700188634597e-06, + "logits/chosen": 0.08050940185785294, + "logits/rejected": 0.27998119592666626, + "logps/chosen": -500.57720947265625, + "logps/rejected": -478.83624267578125, + "loss": 0.5974, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.2193187475204468, + "rewards/margins": 0.34275728464126587, + "rewards/rejected": -1.5620760917663574, + "step": 1910 + }, + { + "epoch": 0.5024862601413242, + "grad_norm": 17.656320571899414, + "learning_rate": 2.9138581852776053e-06, + "logits/chosen": 0.2168809473514557, + "logits/rejected": 0.3105737566947937, + "logps/chosen": -496.070556640625, + "logps/rejected": -499.7674865722656, + "loss": 0.5806, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.1544568538665771, + "rewards/margins": 0.4054805636405945, + "rewards/rejected": -1.5599374771118164, + "step": 1920 + }, + { + "epoch": 0.5051033760795604, + "grad_norm": 14.08234977722168, + "learning_rate": 2.8913117946523805e-06, + "logits/chosen": 0.2844335436820984, + "logits/rejected": 0.31109169125556946, + "logps/chosen": -513.3364868164062, + "logps/rejected": -496.12200927734375, + "loss": 0.5716, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.3235498666763306, + "rewards/margins": 0.4011419713497162, + "rewards/rejected": -1.7246919870376587, + "step": 1930 + }, + { + "epoch": 0.5077204920177963, + "grad_norm": 15.062522888183594, + "learning_rate": 2.8687327296049126e-06, + "logits/chosen": 0.230653315782547, + "logits/rejected": 0.3991672396659851, + "logps/chosen": -498.00933837890625, + "logps/rejected": -510.3081970214844, + "loss": 0.5859, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.1839993000030518, + "rewards/margins": 0.4453356862068176, + "rewards/rejected": -1.6293350458145142, + "step": 1940 + }, + { + "epoch": 0.5103376079560324, + "grad_norm": 19.26543426513672, + "learning_rate": 2.8461228754806376e-06, + "logits/chosen": 0.14215265214443207, + "logits/rejected": 0.23736266791820526, + "logps/chosen": -510.92657470703125, + "logps/rejected": -494.1617126464844, + "loss": 0.5901, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.0615785121917725, + "rewards/margins": 0.360460102558136, + "rewards/rejected": -1.4220386743545532, + "step": 1950 + }, + { + "epoch": 0.5129547238942685, + "grad_norm": 13.139251708984375, + "learning_rate": 2.823484120195865e-06, + "logits/chosen": 0.15903696417808533, + "logits/rejected": 0.29216212034225464, + "logps/chosen": -534.9984741210938, + "logps/rejected": -512.8052978515625, + "loss": 0.5718, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.2044841051101685, + "rewards/margins": 0.41181907057762146, + "rewards/rejected": -1.6163032054901123, + "step": 1960 + }, + { + "epoch": 0.5155718398325045, + "grad_norm": 21.8537654876709, + "learning_rate": 2.8008183540801486e-06, + "logits/chosen": 0.16536223888397217, + "logits/rejected": 0.2516060173511505, + "logps/chosen": -516.1781005859375, + "logps/rejected": -486.567138671875, + "loss": 0.6009, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.140059471130371, + "rewards/margins": 0.38968387246131897, + "rewards/rejected": -1.5297433137893677, + "step": 1970 + }, + { + "epoch": 0.5181889557707406, + "grad_norm": 18.018861770629883, + "learning_rate": 2.7781274697184353e-06, + "logits/chosen": 0.16366654634475708, + "logits/rejected": 0.278939425945282, + "logps/chosen": -435.4315490722656, + "logps/rejected": -477.2860412597656, + "loss": 0.6252, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.983991265296936, + "rewards/margins": 0.28731080889701843, + "rewards/rejected": -1.2713019847869873, + "step": 1980 + }, + { + "epoch": 0.5208060717089767, + "grad_norm": 15.223612785339355, + "learning_rate": 2.7554133617930397e-06, + "logits/chosen": 0.05067938566207886, + "logits/rejected": 0.06571893393993378, + "logps/chosen": -452.55804443359375, + "logps/rejected": -454.023193359375, + "loss": 0.5889, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.8877269625663757, + "rewards/margins": 0.38499319553375244, + "rewards/rejected": -1.2727200984954834, + "step": 1990 + }, + { + "epoch": 0.5234231876472127, + "grad_norm": 17.48723602294922, + "learning_rate": 2.7326779269254363e-06, + "logits/chosen": -0.015536749735474586, + "logits/rejected": 0.16148407757282257, + "logps/chosen": -518.9827880859375, + "logps/rejected": -480.99163818359375, + "loss": 0.5849, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.1106268167495728, + "rewards/margins": 0.4065285623073578, + "rewards/rejected": -1.5171552896499634, + "step": 2000 + }, + { + "epoch": 0.5234231876472127, + "eval_logits/chosen": -0.0025373969692736864, + "eval_logits/rejected": 0.12474588304758072, + "eval_logps/chosen": -524.7777099609375, + "eval_logps/rejected": -517.0887451171875, + "eval_loss": 0.5969316959381104, + "eval_rewards/accuracies": 0.6819999814033508, + "eval_rewards/chosen": -1.3641233444213867, + "eval_rewards/margins": 0.3599224388599396, + "eval_rewards/rejected": -1.7240456342697144, + "eval_runtime": 232.6406, + "eval_samples_per_second": 8.597, + "eval_steps_per_second": 1.075, + "step": 2000 + }, + { + "epoch": 0.5260403035854488, + "grad_norm": 17.689542770385742, + "learning_rate": 2.7099230635178954e-06, + "logits/chosen": 0.2274688184261322, + "logits/rejected": 0.25372716784477234, + "logps/chosen": -526.2362060546875, + "logps/rejected": -533.8927001953125, + "loss": 0.5753, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.3567404747009277, + "rewards/margins": 0.4240929186344147, + "rewards/rejected": -1.7808334827423096, + "step": 2010 + }, + { + "epoch": 0.528657419523685, + "grad_norm": 15.211199760437012, + "learning_rate": 2.6871506715949608e-06, + "logits/chosen": 0.058562636375427246, + "logits/rejected": 0.18520574271678925, + "logps/chosen": -499.6806640625, + "logps/rejected": -491.3092346191406, + "loss": 0.5954, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.1945785284042358, + "rewards/margins": 0.3566603362560272, + "rewards/rejected": -1.551238775253296, + "step": 2020 + }, + { + "epoch": 0.5312745354619209, + "grad_norm": 15.991866111755371, + "learning_rate": 2.6643626526448063e-06, + "logits/chosen": -0.09116406738758087, + "logits/rejected": -0.05098678544163704, + "logps/chosen": -533.4771728515625, + "logps/rejected": -504.9336853027344, + "loss": 0.5447, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.0096272230148315, + "rewards/margins": 0.49922627210617065, + "rewards/rejected": -1.508853554725647, + "step": 2030 + }, + { + "epoch": 0.533891651400157, + "grad_norm": 15.529874801635742, + "learning_rate": 2.6415609094604562e-06, + "logits/chosen": 0.0014547407627105713, + "logits/rejected": -0.014313450083136559, + "logps/chosen": -489.4344177246094, + "logps/rejected": -481.03558349609375, + "loss": 0.6275, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.1068042516708374, + "rewards/margins": 0.29210469126701355, + "rewards/rejected": -1.3989089727401733, + "step": 2040 + }, + { + "epoch": 0.5365087673383931, + "grad_norm": 14.723438262939453, + "learning_rate": 2.618747345980904e-06, + "logits/chosen": 0.06219317764043808, + "logits/rejected": 0.29134300351142883, + "logps/chosen": -482.73651123046875, + "logps/rejected": -454.9242248535156, + "loss": 0.577, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.2507166862487793, + "rewards/margins": 0.40432801842689514, + "rewards/rejected": -1.6550447940826416, + "step": 2050 + }, + { + "epoch": 0.5391258832766291, + "grad_norm": 14.240797996520996, + "learning_rate": 2.595923867132136e-06, + "logits/chosen": -0.00904160737991333, + "logits/rejected": -0.042715176939964294, + "logps/chosen": -539.594482421875, + "logps/rejected": -535.2293701171875, + "loss": 0.5718, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.3310154676437378, + "rewards/margins": 0.48368293046951294, + "rewards/rejected": -1.814698576927185, + "step": 2060 + }, + { + "epoch": 0.5417429992148652, + "grad_norm": 14.852096557617188, + "learning_rate": 2.5730923786680672e-06, + "logits/chosen": -0.025297870859503746, + "logits/rejected": 0.19150254130363464, + "logps/chosen": -528.1922607421875, + "logps/rejected": -553.71142578125, + "loss": 0.587, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.5110130310058594, + "rewards/margins": 0.3934626281261444, + "rewards/rejected": -1.9044758081436157, + "step": 2070 + }, + { + "epoch": 0.5443601151531012, + "grad_norm": 16.273681640625, + "learning_rate": 2.5502547870114137e-06, + "logits/chosen": -0.0205762330442667, + "logits/rejected": 0.12017925083637238, + "logps/chosen": -543.7810668945312, + "logps/rejected": -533.4263916015625, + "loss": 0.6134, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.7067127227783203, + "rewards/margins": 0.3284439742565155, + "rewards/rejected": -2.035156726837158, + "step": 2080 + }, + { + "epoch": 0.5469772310913373, + "grad_norm": 23.068370819091797, + "learning_rate": 2.527412999094507e-06, + "logits/chosen": 0.06104808300733566, + "logits/rejected": 0.21988165378570557, + "logps/chosen": -568.421630859375, + "logps/rejected": -585.5074462890625, + "loss": 0.5597, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.414439082145691, + "rewards/margins": 0.4884655475616455, + "rewards/rejected": -1.902904748916626, + "step": 2090 + }, + { + "epoch": 0.5495943470295734, + "grad_norm": 18.669837951660156, + "learning_rate": 2.504568922200064e-06, + "logits/chosen": 0.03404618427157402, + "logits/rejected": 0.2708562910556793, + "logps/chosen": -476.51324462890625, + "logps/rejected": -472.4541931152344, + "loss": 0.6106, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.2663251161575317, + "rewards/margins": 0.32750216126441956, + "rewards/rejected": -1.5938273668289185, + "step": 2100 + }, + { + "epoch": 0.5495943470295734, + "eval_logits/chosen": -0.016485024243593216, + "eval_logits/rejected": 0.10226120799779892, + "eval_logps/chosen": -514.2800903320312, + "eval_logps/rejected": -508.7902526855469, + "eval_loss": 0.5930544137954712, + "eval_rewards/accuracies": 0.6834999918937683, + "eval_rewards/chosen": -1.2591471672058105, + "eval_rewards/margins": 0.38191384077072144, + "eval_rewards/rejected": -1.6410611867904663, + "eval_runtime": 232.4639, + "eval_samples_per_second": 8.603, + "eval_steps_per_second": 1.075, + "step": 2100 + }, + { + "epoch": 0.5522114629678094, + "grad_norm": 17.219968795776367, + "learning_rate": 2.4817244638019333e-06, + "logits/chosen": 0.07551795244216919, + "logits/rejected": 0.11292078346014023, + "logps/chosen": -530.4259033203125, + "logps/rejected": -491.468017578125, + "loss": 0.5957, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.1608909368515015, + "rewards/margins": 0.4003227651119232, + "rewards/rejected": -1.561213731765747, + "step": 2110 + }, + { + "epoch": 0.5548285789060455, + "grad_norm": 19.8704776763916, + "learning_rate": 2.4588815314058155e-06, + "logits/chosen": 0.16289404034614563, + "logits/rejected": 0.24848175048828125, + "logps/chosen": -452.452880859375, + "logps/rejected": -434.36090087890625, + "loss": 0.5882, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.9118096232414246, + "rewards/margins": 0.3754151463508606, + "rewards/rejected": -1.2872246503829956, + "step": 2120 + }, + { + "epoch": 0.5574456948442816, + "grad_norm": 15.798819541931152, + "learning_rate": 2.4360420323899922e-06, + "logits/chosen": 0.11843502521514893, + "logits/rejected": 0.15708817541599274, + "logps/chosen": -505.29168701171875, + "logps/rejected": -489.7765197753906, + "loss": 0.5713, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.04365873336792, + "rewards/margins": 0.4483584761619568, + "rewards/rejected": -1.492017149925232, + "step": 2130 + }, + { + "epoch": 0.5600628107825176, + "grad_norm": 17.67369842529297, + "learning_rate": 2.4132078738460585e-06, + "logits/chosen": 0.17071916162967682, + "logits/rejected": 0.209875226020813, + "logps/chosen": -521.3018798828125, + "logps/rejected": -482.3414001464844, + "loss": 0.5975, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.2791138887405396, + "rewards/margins": 0.3632059097290039, + "rewards/rejected": -1.642319679260254, + "step": 2140 + }, + { + "epoch": 0.5626799267207537, + "grad_norm": 21.954570770263672, + "learning_rate": 2.3903809624196826e-06, + "logits/chosen": 0.3113669753074646, + "logits/rejected": 0.2932060956954956, + "logps/chosen": -475.5943908691406, + "logps/rejected": -454.9044494628906, + "loss": 0.6099, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.2669929265975952, + "rewards/margins": 0.3418061137199402, + "rewards/rejected": -1.6087989807128906, + "step": 2150 + }, + { + "epoch": 0.5652970426589898, + "grad_norm": 25.539379119873047, + "learning_rate": 2.3675632041513978e-06, + "logits/chosen": 0.1442708671092987, + "logits/rejected": 0.23462197184562683, + "logps/chosen": -535.3931884765625, + "logps/rejected": -477.28369140625, + "loss": 0.567, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.161911964416504, + "rewards/margins": 0.46348389983177185, + "rewards/rejected": -1.6253957748413086, + "step": 2160 + }, + { + "epoch": 0.5679141585972258, + "grad_norm": 19.380157470703125, + "learning_rate": 2.3447565043174533e-06, + "logits/chosen": 0.21937327086925507, + "logits/rejected": 0.30190104246139526, + "logps/chosen": -508.240966796875, + "logps/rejected": -481.8168029785156, + "loss": 0.5906, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.4066482782363892, + "rewards/margins": 0.38738176226615906, + "rewards/rejected": -1.7940301895141602, + "step": 2170 + }, + { + "epoch": 0.5705312745354619, + "grad_norm": 18.660661697387695, + "learning_rate": 2.321962767270724e-06, + "logits/chosen": 0.22592106461524963, + "logits/rejected": 0.28465738892555237, + "logps/chosen": -527.2122802734375, + "logps/rejected": -483.3441467285156, + "loss": 0.6442, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.577532410621643, + "rewards/margins": 0.2664092779159546, + "rewards/rejected": -1.8439416885375977, + "step": 2180 + }, + { + "epoch": 0.573148390473698, + "grad_norm": 20.082679748535156, + "learning_rate": 2.299183896281692e-06, + "logits/chosen": 0.16324841976165771, + "logits/rejected": 0.29000192880630493, + "logps/chosen": -521.3306274414062, + "logps/rejected": -535.11328125, + "loss": 0.6251, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.5316604375839233, + "rewards/margins": 0.3131243586540222, + "rewards/rejected": -1.8447847366333008, + "step": 2190 + }, + { + "epoch": 0.575765506411934, + "grad_norm": 15.897833824157715, + "learning_rate": 2.2764217933795297e-06, + "logits/chosen": 0.2278885841369629, + "logits/rejected": 0.2919641137123108, + "logps/chosen": -514.6935424804688, + "logps/rejected": -508.80096435546875, + "loss": 0.5783, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.2934855222702026, + "rewards/margins": 0.41693955659866333, + "rewards/rejected": -1.7104251384735107, + "step": 2200 + }, + { + "epoch": 0.575765506411934, + "eval_logits/chosen": 0.14214755594730377, + "eval_logits/rejected": 0.2720797061920166, + "eval_logps/chosen": -512.494873046875, + "eval_logps/rejected": -506.34222412109375, + "eval_loss": 0.594093382358551, + "eval_rewards/accuracies": 0.6809999942779541, + "eval_rewards/chosen": -1.2412952184677124, + "eval_rewards/margins": 0.3752853572368622, + "eval_rewards/rejected": -1.616580605506897, + "eval_runtime": 232.5254, + "eval_samples_per_second": 8.601, + "eval_steps_per_second": 1.075, + "step": 2200 + }, + { + "epoch": 0.5783826223501701, + "grad_norm": 15.479647636413574, + "learning_rate": 2.2536783591932786e-06, + "logits/chosen": 0.140645831823349, + "logits/rejected": 0.33969563245773315, + "logps/chosen": -515.4466552734375, + "logps/rejected": -524.2593994140625, + "loss": 0.604, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.2855024337768555, + "rewards/margins": 0.35720211267471313, + "rewards/rejected": -1.6427046060562134, + "step": 2210 + }, + { + "epoch": 0.5809997382884062, + "grad_norm": 15.445068359375, + "learning_rate": 2.230955492793149e-06, + "logits/chosen": 0.2617935538291931, + "logits/rejected": 0.2599295973777771, + "logps/chosen": -544.3765869140625, + "logps/rejected": -532.53271484375, + "loss": 0.6289, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.3259150981903076, + "rewards/margins": 0.3112506568431854, + "rewards/rejected": -1.6371657848358154, + "step": 2220 + }, + { + "epoch": 0.5836168542266422, + "grad_norm": 16.804216384887695, + "learning_rate": 2.208255091531947e-06, + "logits/chosen": 0.33556073904037476, + "logits/rejected": 0.504830539226532, + "logps/chosen": -567.1902465820312, + "logps/rejected": -558.8784790039062, + "loss": 0.5847, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.4798108339309692, + "rewards/margins": 0.43180447816848755, + "rewards/rejected": -1.9116153717041016, + "step": 2230 + }, + { + "epoch": 0.5862339701648783, + "grad_norm": 15.775924682617188, + "learning_rate": 2.1855790508866435e-06, + "logits/chosen": 0.34685009717941284, + "logits/rejected": 0.377047598361969, + "logps/chosen": -602.8464965820312, + "logps/rejected": -586.4207763671875, + "loss": 0.6197, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.6576731204986572, + "rewards/margins": 0.36601829528808594, + "rewards/rejected": -2.0236916542053223, + "step": 2240 + }, + { + "epoch": 0.5888510861031143, + "grad_norm": 14.079179763793945, + "learning_rate": 2.162929264300107e-06, + "logits/chosen": 0.2139190137386322, + "logits/rejected": 0.42656293511390686, + "logps/chosen": -567.9656982421875, + "logps/rejected": -579.9403076171875, + "loss": 0.555, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.7237701416015625, + "rewards/margins": 0.4748326241970062, + "rewards/rejected": -2.1986026763916016, + "step": 2250 + }, + { + "epoch": 0.5914682020413504, + "grad_norm": 18.067665100097656, + "learning_rate": 2.1403076230230006e-06, + "logits/chosen": 0.4777112603187561, + "logits/rejected": 0.43725594878196716, + "logps/chosen": -557.2151489257812, + "logps/rejected": -545.1485595703125, + "loss": 0.6474, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.7369823455810547, + "rewards/margins": 0.29405802488327026, + "rewards/rejected": -2.0310404300689697, + "step": 2260 + }, + { + "epoch": 0.5940853179795865, + "grad_norm": 19.236621856689453, + "learning_rate": 2.11771601595586e-06, + "logits/chosen": 0.40373674035072327, + "logits/rejected": 0.3856516480445862, + "logps/chosen": -574.0790405273438, + "logps/rejected": -524.0260009765625, + "loss": 0.5974, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.5017083883285522, + "rewards/margins": 0.43927374482154846, + "rewards/rejected": -1.9409822225570679, + "step": 2270 + }, + { + "epoch": 0.5967024339178225, + "grad_norm": 16.725879669189453, + "learning_rate": 2.0951563294913737e-06, + "logits/chosen": 0.3718245029449463, + "logits/rejected": 0.4380251467227936, + "logps/chosen": -513.0089111328125, + "logps/rejected": -508.85760498046875, + "loss": 0.5285, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.3701801300048828, + "rewards/margins": 0.5219612717628479, + "rewards/rejected": -1.892141342163086, + "step": 2280 + }, + { + "epoch": 0.5993195498560586, + "grad_norm": 17.321361541748047, + "learning_rate": 2.0726304473568693e-06, + "logits/chosen": 0.25947481393814087, + "logits/rejected": 0.3379734754562378, + "logps/chosen": -519.076904296875, + "logps/rejected": -498.84307861328125, + "loss": 0.5948, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.3756697177886963, + "rewards/margins": 0.35469603538513184, + "rewards/rejected": -1.730365514755249, + "step": 2290 + }, + { + "epoch": 0.6019366657942947, + "grad_norm": 27.503204345703125, + "learning_rate": 2.050140250457023e-06, + "logits/chosen": 0.13081859052181244, + "logits/rejected": 0.22465069591999054, + "logps/chosen": -527.054931640625, + "logps/rejected": -524.3948974609375, + "loss": 0.574, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.3721132278442383, + "rewards/margins": 0.4912486672401428, + "rewards/rejected": -1.8633617162704468, + "step": 2300 + }, + { + "epoch": 0.6019366657942947, + "eval_logits/chosen": 0.20482583343982697, + "eval_logits/rejected": 0.3316424489021301, + "eval_logps/chosen": -533.3546752929688, + "eval_logps/rejected": -529.7435302734375, + "eval_loss": 0.5939305424690247, + "eval_rewards/accuracies": 0.6819999814033508, + "eval_rewards/chosen": -1.4498937129974365, + "eval_rewards/margins": 0.40070000290870667, + "eval_rewards/rejected": -1.8505936861038208, + "eval_runtime": 232.6476, + "eval_samples_per_second": 8.597, + "eval_steps_per_second": 1.075, + "step": 2300 + }, + { + "epoch": 0.6045537817325307, + "grad_norm": 14.945230484008789, + "learning_rate": 2.0276876167168042e-06, + "logits/chosen": 0.3673093914985657, + "logits/rejected": 0.3980174660682678, + "logps/chosen": -465.44775390625, + "logps/rejected": -453.03076171875, + "loss": 0.5977, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.4102133512496948, + "rewards/margins": 0.36651554703712463, + "rewards/rejected": -1.7767289876937866, + "step": 2310 + }, + { + "epoch": 0.6071708976707668, + "grad_norm": 14.364018440246582, + "learning_rate": 2.0052744209246682e-06, + "logits/chosen": 0.34174802899360657, + "logits/rejected": 0.3505280613899231, + "logps/chosen": -476.4820861816406, + "logps/rejected": -462.1893615722656, + "loss": 0.5789, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.1048295497894287, + "rewards/margins": 0.4239567816257477, + "rewards/rejected": -1.5287864208221436, + "step": 2320 + }, + { + "epoch": 0.6097880136090029, + "grad_norm": 19.915328979492188, + "learning_rate": 1.9829025345760127e-06, + "logits/chosen": 0.22554683685302734, + "logits/rejected": 0.3321411609649658, + "logps/chosen": -512.6522827148438, + "logps/rejected": -518.2416381835938, + "loss": 0.618, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.0541471242904663, + "rewards/margins": 0.31440818309783936, + "rewards/rejected": -1.3685553073883057, + "step": 2330 + }, + { + "epoch": 0.6124051295472389, + "grad_norm": 27.91488265991211, + "learning_rate": 1.9605738257169115e-06, + "logits/chosen": 0.38059157133102417, + "logits/rejected": 0.4925920367240906, + "logps/chosen": -471.82098388671875, + "logps/rejected": -469.058837890625, + "loss": 0.6262, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.1800696849822998, + "rewards/margins": 0.3158087134361267, + "rewards/rejected": -1.4958784580230713, + "step": 2340 + }, + { + "epoch": 0.615022245485475, + "grad_norm": 13.445463180541992, + "learning_rate": 1.9382901587881275e-06, + "logits/chosen": 0.22165732085704803, + "logits/rejected": 0.3755477964878082, + "logps/chosen": -512.750244140625, + "logps/rejected": -493.84869384765625, + "loss": 0.5605, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.2496296167373657, + "rewards/margins": 0.4642421305179596, + "rewards/rejected": -1.713871717453003, + "step": 2350 + }, + { + "epoch": 0.6176393614237111, + "grad_norm": 17.654233932495117, + "learning_rate": 1.916053394469437e-06, + "logits/chosen": 0.22464020550251007, + "logits/rejected": 0.412786066532135, + "logps/chosen": -504.1710510253906, + "logps/rejected": -522.4702758789062, + "loss": 0.5487, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2194569110870361, + "rewards/margins": 0.5263023972511292, + "rewards/rejected": -1.7457596063613892, + "step": 2360 + }, + { + "epoch": 0.6202564773619471, + "grad_norm": 14.313925743103027, + "learning_rate": 1.8938653895242604e-06, + "logits/chosen": 0.3136211335659027, + "logits/rejected": 0.38369834423065186, + "logps/chosen": -522.3583984375, + "logps/rejected": -514.4608154296875, + "loss": 0.5689, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2894423007965088, + "rewards/margins": 0.50419020652771, + "rewards/rejected": -1.7936325073242188, + "step": 2370 + }, + { + "epoch": 0.6228735933001832, + "grad_norm": 22.269620895385742, + "learning_rate": 1.8717279966446267e-06, + "logits/chosen": 0.28245988488197327, + "logits/rejected": 0.3623971939086914, + "logps/chosen": -478.1153869628906, + "logps/rejected": -492.8341369628906, + "loss": 0.6142, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.2947511672973633, + "rewards/margins": 0.3845621645450592, + "rewards/rejected": -1.6793134212493896, + "step": 2380 + }, + { + "epoch": 0.6254907092384192, + "grad_norm": 15.111832618713379, + "learning_rate": 1.8496430642964698e-06, + "logits/chosen": 0.28817129135131836, + "logits/rejected": 0.31284889578819275, + "logps/chosen": -510.97576904296875, + "logps/rejected": -507.06689453125, + "loss": 0.6027, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.2796354293823242, + "rewards/margins": 0.38231566548347473, + "rewards/rejected": -1.6619510650634766, + "step": 2390 + }, + { + "epoch": 0.6281078251766553, + "grad_norm": 17.842960357666016, + "learning_rate": 1.827612436565286e-06, + "logits/chosen": 0.2098797857761383, + "logits/rejected": 0.3998289704322815, + "logps/chosen": -493.5147399902344, + "logps/rejected": -490.57366943359375, + "loss": 0.581, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.1564563512802124, + "rewards/margins": 0.40797433257102966, + "rewards/rejected": -1.564430594444275, + "step": 2400 + }, + { + "epoch": 0.6281078251766553, + "eval_logits/chosen": 0.1449422538280487, + "eval_logits/rejected": 0.2640918791294098, + "eval_logps/chosen": -505.70361328125, + "eval_logps/rejected": -501.2297668457031, + "eval_loss": 0.5944039225578308, + "eval_rewards/accuracies": 0.6869999766349792, + "eval_rewards/chosen": -1.1733826398849487, + "eval_rewards/margins": 0.39207327365875244, + "eval_rewards/rejected": -1.5654560327529907, + "eval_runtime": 232.3606, + "eval_samples_per_second": 8.607, + "eval_steps_per_second": 1.076, + "step": 2400 + }, + { + "epoch": 0.6307249411148914, + "grad_norm": 17.78997802734375, + "learning_rate": 1.8056379530021492e-06, + "logits/chosen": 0.18455150723457336, + "logits/rejected": 0.3481082618236542, + "logps/chosen": -462.8374938964844, + "logps/rejected": -461.21051025390625, + "loss": 0.6006, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.1027500629425049, + "rewards/margins": 0.3634553551673889, + "rewards/rejected": -1.466205358505249, + "step": 2410 + }, + { + "epoch": 0.6333420570531274, + "grad_norm": 20.840116500854492, + "learning_rate": 1.7837214484701154e-06, + "logits/chosen": 0.2612837255001068, + "logits/rejected": 0.32097476720809937, + "logps/chosen": -474.12774658203125, + "logps/rejected": -475.11376953125, + "loss": 0.5625, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.1177818775177002, + "rewards/margins": 0.450967937707901, + "rewards/rejected": -1.5687499046325684, + "step": 2420 + }, + { + "epoch": 0.6359591729913635, + "grad_norm": 17.94804573059082, + "learning_rate": 1.7618647529910043e-06, + "logits/chosen": 0.24861130118370056, + "logits/rejected": 0.3079971969127655, + "logps/chosen": -495.91021728515625, + "logps/rejected": -495.01434326171875, + "loss": 0.5697, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.1746553182601929, + "rewards/margins": 0.4058234691619873, + "rewards/rejected": -1.5804787874221802, + "step": 2430 + }, + { + "epoch": 0.6385762889295996, + "grad_norm": 16.069711685180664, + "learning_rate": 1.7400696915925996e-06, + "logits/chosen": 0.05449223518371582, + "logits/rejected": 0.3324371874332428, + "logps/chosen": -504.40509033203125, + "logps/rejected": -470.92529296875, + "loss": 0.5925, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.2023115158081055, + "rewards/margins": 0.4562528729438782, + "rewards/rejected": -1.6585643291473389, + "step": 2440 + }, + { + "epoch": 0.6411934048678356, + "grad_norm": 23.202844619750977, + "learning_rate": 1.718338084156254e-06, + "logits/chosen": 0.07136271893978119, + "logits/rejected": 0.224413201212883, + "logps/chosen": -545.9983520507812, + "logps/rejected": -521.3710327148438, + "loss": 0.5708, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.2068856954574585, + "rewards/margins": 0.4760914742946625, + "rewards/rejected": -1.682977318763733, + "step": 2450 + }, + { + "epoch": 0.6438105208060717, + "grad_norm": 13.684337615966797, + "learning_rate": 1.6966717452649372e-06, + "logits/chosen": 0.2635645270347595, + "logits/rejected": 0.22264519333839417, + "logps/chosen": -511.80584716796875, + "logps/rejected": -482.666015625, + "loss": 0.5498, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1622530221939087, + "rewards/margins": 0.5128196477890015, + "rewards/rejected": -1.6750726699829102, + "step": 2460 + }, + { + "epoch": 0.6464276367443078, + "grad_norm": 16.29932975769043, + "learning_rate": 1.6750724840517103e-06, + "logits/chosen": 0.17669948935508728, + "logits/rejected": 0.28178560733795166, + "logps/chosen": -506.5672912597656, + "logps/rejected": -530.7100830078125, + "loss": 0.5961, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.264910340309143, + "rewards/margins": 0.4237841069698334, + "rewards/rejected": -1.6886943578720093, + "step": 2470 + }, + { + "epoch": 0.6490447526825438, + "grad_norm": 22.445497512817383, + "learning_rate": 1.6535421040486686e-06, + "logits/chosen": 0.25904372334480286, + "logits/rejected": 0.34389209747314453, + "logps/chosen": -510.6214294433594, + "logps/rejected": -497.93377685546875, + "loss": 0.5676, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.5339219570159912, + "rewards/margins": 0.4442078173160553, + "rewards/rejected": -1.9781297445297241, + "step": 2480 + }, + { + "epoch": 0.6516618686207799, + "grad_norm": 13.630302429199219, + "learning_rate": 1.6320824030363458e-06, + "logits/chosen": 0.07676917314529419, + "logits/rejected": 0.0776657983660698, + "logps/chosen": -501.290771484375, + "logps/rejected": -504.23504638671875, + "loss": 0.5775, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.433424711227417, + "rewards/margins": 0.43815937638282776, + "rewards/rejected": -1.871584177017212, + "step": 2490 + }, + { + "epoch": 0.654278984559016, + "grad_norm": 17.672494888305664, + "learning_rate": 1.6106951728936028e-06, + "logits/chosen": 0.06636019051074982, + "logits/rejected": 0.24429766833782196, + "logps/chosen": -503.6896057128906, + "logps/rejected": -528.3905029296875, + "loss": 0.5516, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.2013368606567383, + "rewards/margins": 0.5229911208152771, + "rewards/rejected": -1.724327802658081, + "step": 2500 + }, + { + "epoch": 0.654278984559016, + "eval_logits/chosen": 0.06589578092098236, + "eval_logits/rejected": 0.17816391587257385, + "eval_logps/chosen": -510.28302001953125, + "eval_logps/rejected": -507.39532470703125, + "eval_loss": 0.5968104004859924, + "eval_rewards/accuracies": 0.6775000095367432, + "eval_rewards/chosen": -1.219177007675171, + "eval_rewards/margins": 0.4079345464706421, + "eval_rewards/rejected": -1.627111554145813, + "eval_runtime": 232.5509, + "eval_samples_per_second": 8.6, + "eval_steps_per_second": 1.075, + "step": 2500 + }, + { + "epoch": 0.656896100497252, + "grad_norm": 21.569828033447266, + "learning_rate": 1.5893821994479996e-06, + "logits/chosen": 0.22845594584941864, + "logits/rejected": 0.29911938309669495, + "logps/chosen": -505.616943359375, + "logps/rejected": -486.45947265625, + "loss": 0.5867, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.05279541015625, + "rewards/margins": 0.43759018182754517, + "rewards/rejected": -1.4903854131698608, + "step": 2510 + }, + { + "epoch": 0.6595132164354881, + "grad_norm": 19.897043228149414, + "learning_rate": 1.5681452623266868e-06, + "logits/chosen": -0.006847086362540722, + "logits/rejected": 0.17560932040214539, + "logps/chosen": -525.3328857421875, + "logps/rejected": -500.6075134277344, + "loss": 0.5357, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.1965337991714478, + "rewards/margins": 0.5665737390518188, + "rewards/rejected": -1.7631075382232666, + "step": 2520 + }, + { + "epoch": 0.6621303323737242, + "grad_norm": 14.37187385559082, + "learning_rate": 1.5469861348078014e-06, + "logits/chosen": 0.15944847464561462, + "logits/rejected": 0.2913690209388733, + "logps/chosen": -473.12652587890625, + "logps/rejected": -505.186279296875, + "loss": 0.5359, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.2122926712036133, + "rewards/margins": 0.5305719375610352, + "rewards/rejected": -1.7428646087646484, + "step": 2530 + }, + { + "epoch": 0.6647474483119602, + "grad_norm": 14.125144958496094, + "learning_rate": 1.5259065836724035e-06, + "logits/chosen": 0.17984794080257416, + "logits/rejected": 0.2315172702074051, + "logps/chosen": -485.05938720703125, + "logps/rejected": -505.0470275878906, + "loss": 0.6083, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.2600443363189697, + "rewards/margins": 0.4279232621192932, + "rewards/rejected": -1.6879676580429077, + "step": 2540 + }, + { + "epoch": 0.6673645642501963, + "grad_norm": 27.50576400756836, + "learning_rate": 1.5049083690569456e-06, + "logits/chosen": 0.17406558990478516, + "logits/rejected": 0.3330245018005371, + "logps/chosen": -472.1449279785156, + "logps/rejected": -494.12677001953125, + "loss": 0.6191, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.262279987335205, + "rewards/margins": 0.39653074741363525, + "rewards/rejected": -1.6588106155395508, + "step": 2550 + }, + { + "epoch": 0.6699816801884323, + "grad_norm": 27.87693977355957, + "learning_rate": 1.4839932443063057e-06, + "logits/chosen": 0.2273554801940918, + "logits/rejected": 0.20852844417095184, + "logps/chosen": -547.5466918945312, + "logps/rejected": -501.408203125, + "loss": 0.5473, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.1462863683700562, + "rewards/margins": 0.48758673667907715, + "rewards/rejected": -1.6338729858398438, + "step": 2560 + }, + { + "epoch": 0.6725987961266684, + "grad_norm": 23.339811325073242, + "learning_rate": 1.4631629558273803e-06, + "logits/chosen": 0.1783367097377777, + "logits/rejected": 0.3085087239742279, + "logps/chosen": -486.72100830078125, + "logps/rejected": -489.66387939453125, + "loss": 0.6158, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.1978503465652466, + "rewards/margins": 0.3335895240306854, + "rewards/rejected": -1.5314397811889648, + "step": 2570 + }, + { + "epoch": 0.6752159120649045, + "grad_norm": 15.076952934265137, + "learning_rate": 1.4424192429432657e-06, + "logits/chosen": 0.16626477241516113, + "logits/rejected": 0.18546536564826965, + "logps/chosen": -478.89300537109375, + "logps/rejected": -508.7444763183594, + "loss": 0.5569, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.0055302381515503, + "rewards/margins": 0.4730769693851471, + "rewards/rejected": -1.478607177734375, + "step": 2580 + }, + { + "epoch": 0.6778330280031405, + "grad_norm": 27.09433364868164, + "learning_rate": 1.421763837748016e-06, + "logits/chosen": 0.2768346667289734, + "logits/rejected": 0.35653841495513916, + "logps/chosen": -494.996337890625, + "logps/rejected": -491.51116943359375, + "loss": 0.6019, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.140509009361267, + "rewards/margins": 0.38551202416419983, + "rewards/rejected": -1.5260212421417236, + "step": 2590 + }, + { + "epoch": 0.6804501439413766, + "grad_norm": 22.913333892822266, + "learning_rate": 1.401198464962021e-06, + "logits/chosen": 0.15527863800525665, + "logits/rejected": 0.21972744166851044, + "logps/chosen": -525.5530395507812, + "logps/rejected": -504.53814697265625, + "loss": 0.5515, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.2042077779769897, + "rewards/margins": 0.5288984179496765, + "rewards/rejected": -1.7331063747406006, + "step": 2600 + }, + { + "epoch": 0.6804501439413766, + "eval_logits/chosen": 0.08117599785327911, + "eval_logits/rejected": 0.1933600753545761, + "eval_logps/chosen": -520.3548583984375, + "eval_logps/rejected": -520.2850952148438, + "eval_loss": 0.5958514213562012, + "eval_rewards/accuracies": 0.6765000224113464, + "eval_rewards/chosen": -1.3198949098587036, + "eval_rewards/margins": 0.43611443042755127, + "eval_rewards/rejected": -1.7560093402862549, + "eval_runtime": 232.575, + "eval_samples_per_second": 8.599, + "eval_steps_per_second": 1.075, + "step": 2600 + }, + { + "epoch": 0.6830672598796127, + "grad_norm": 23.55072593688965, + "learning_rate": 1.3807248417879896e-06, + "logits/chosen": 0.03950003907084465, + "logits/rejected": 0.09041625261306763, + "logps/chosen": -530.7230224609375, + "logps/rejected": -526.6931762695312, + "loss": 0.5821, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.3074090480804443, + "rewards/margins": 0.4792531430721283, + "rewards/rejected": -1.7866621017456055, + "step": 2610 + }, + { + "epoch": 0.6856843758178487, + "grad_norm": 52.15616226196289, + "learning_rate": 1.3603446777675665e-06, + "logits/chosen": 0.2288985550403595, + "logits/rejected": 0.39459601044654846, + "logps/chosen": -510.23687744140625, + "logps/rejected": -519.2042236328125, + "loss": 0.5945, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.2479314804077148, + "rewards/margins": 0.4636309742927551, + "rewards/rejected": -1.7115623950958252, + "step": 2620 + }, + { + "epoch": 0.6883014917560848, + "grad_norm": 17.42203712463379, + "learning_rate": 1.3400596746385817e-06, + "logits/chosen": 0.1373758614063263, + "logits/rejected": 0.24959711730480194, + "logps/chosen": -511.2616271972656, + "logps/rejected": -500.71527099609375, + "loss": 0.6122, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.1762168407440186, + "rewards/margins": 0.40532511472702026, + "rewards/rejected": -1.5815417766571045, + "step": 2630 + }, + { + "epoch": 0.6909186076943209, + "grad_norm": 23.382965087890625, + "learning_rate": 1.3198715261929587e-06, + "logits/chosen": 0.24336513876914978, + "logits/rejected": 0.290175199508667, + "logps/chosen": -476.31549072265625, + "logps/rejected": -488.55010986328125, + "loss": 0.5516, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.297133207321167, + "rewards/margins": 0.47855645418167114, + "rewards/rejected": -1.775689721107483, + "step": 2640 + }, + { + "epoch": 0.6935357236325569, + "grad_norm": 27.794416427612305, + "learning_rate": 1.2997819181352823e-06, + "logits/chosen": 0.029675770550966263, + "logits/rejected": 0.17278780043125153, + "logps/chosen": -564.1571655273438, + "logps/rejected": -551.5831909179688, + "loss": 0.552, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2936184406280518, + "rewards/margins": 0.5748482942581177, + "rewards/rejected": -1.8684667348861694, + "step": 2650 + }, + { + "epoch": 0.696152839570793, + "grad_norm": 27.250629425048828, + "learning_rate": 1.2797925279420454e-06, + "logits/chosen": 0.1551249921321869, + "logits/rejected": 0.22874709963798523, + "logps/chosen": -531.533447265625, + "logps/rejected": -539.8661499023438, + "loss": 0.5713, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.3435332775115967, + "rewards/margins": 0.5102267265319824, + "rewards/rejected": -1.8537601232528687, + "step": 2660 + }, + { + "epoch": 0.6987699555090291, + "grad_norm": 17.067325592041016, + "learning_rate": 1.2599050247215764e-06, + "logits/chosen": 0.08357984572649002, + "logits/rejected": 0.1700626015663147, + "logps/chosen": -526.3895263671875, + "logps/rejected": -539.6080322265625, + "loss": 0.5181, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.2903873920440674, + "rewards/margins": 0.6351937651634216, + "rewards/rejected": -1.9255812168121338, + "step": 2670 + }, + { + "epoch": 0.7013870714472651, + "grad_norm": 21.607013702392578, + "learning_rate": 1.2401210690746705e-06, + "logits/chosen": 0.07735034078359604, + "logits/rejected": 0.2136649787425995, + "logps/chosen": -531.7027587890625, + "logps/rejected": -509.68701171875, + "loss": 0.6257, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3712577819824219, + "rewards/margins": 0.37780770659446716, + "rewards/rejected": -1.749065637588501, + "step": 2680 + }, + { + "epoch": 0.7040041873855012, + "grad_norm": 22.4666748046875, + "learning_rate": 1.2204423129559306e-06, + "logits/chosen": 0.19079174101352692, + "logits/rejected": 0.2903195023536682, + "logps/chosen": -524.7943725585938, + "logps/rejected": -556.605224609375, + "loss": 0.5842, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.4239646196365356, + "rewards/margins": 0.49202004075050354, + "rewards/rejected": -1.9159847497940063, + "step": 2690 + }, + { + "epoch": 0.7066213033237373, + "grad_norm": 26.54542350769043, + "learning_rate": 1.20087039953583e-06, + "logits/chosen": 0.18427929282188416, + "logits/rejected": 0.3345043957233429, + "logps/chosen": -519.188720703125, + "logps/rejected": -516.6758422851562, + "loss": 0.6139, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.3359040021896362, + "rewards/margins": 0.4217115342617035, + "rewards/rejected": -1.7576156854629517, + "step": 2700 + }, + { + "epoch": 0.7066213033237373, + "eval_logits/chosen": 0.08023716509342194, + "eval_logits/rejected": 0.194534033536911, + "eval_logps/chosen": -529.7188720703125, + "eval_logps/rejected": -528.66455078125, + "eval_loss": 0.594265878200531, + "eval_rewards/accuracies": 0.6784999966621399, + "eval_rewards/chosen": -1.4135349988937378, + "eval_rewards/margins": 0.4262690842151642, + "eval_rewards/rejected": -1.8398040533065796, + "eval_runtime": 232.327, + "eval_samples_per_second": 8.609, + "eval_steps_per_second": 1.076, + "step": 2700 + }, + { + "epoch": 0.7092384192619733, + "grad_norm": 21.013721466064453, + "learning_rate": 1.181406963063507e-06, + "logits/chosen": 0.23000892996788025, + "logits/rejected": 0.30790433287620544, + "logps/chosen": -526.0445556640625, + "logps/rejected": -554.0514526367188, + "loss": 0.594, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.3565577268600464, + "rewards/margins": 0.46391409635543823, + "rewards/rejected": -1.8204717636108398, + "step": 2710 + }, + { + "epoch": 0.7118555352002094, + "grad_norm": 18.518587112426758, + "learning_rate": 1.1620536287303052e-06, + "logits/chosen": 0.19795748591423035, + "logits/rejected": 0.2533331513404846, + "logps/chosen": -560.046630859375, + "logps/rejected": -535.3902587890625, + "loss": 0.653, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.4724786281585693, + "rewards/margins": 0.31183096766471863, + "rewards/rejected": -1.7843097448349, + "step": 2720 + }, + { + "epoch": 0.7144726511384454, + "grad_norm": 18.251569747924805, + "learning_rate": 1.1428120125340717e-06, + "logits/chosen": 0.37381523847579956, + "logits/rejected": 0.42608457803726196, + "logps/chosen": -497.0757751464844, + "logps/rejected": -489.75286865234375, + "loss": 0.5237, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.37063729763031, + "rewards/margins": 0.5976725816726685, + "rewards/rejected": -1.9683096408843994, + "step": 2730 + }, + { + "epoch": 0.7170897670766815, + "grad_norm": 33.466217041015625, + "learning_rate": 1.123683721144223e-06, + "logits/chosen": 0.33738958835601807, + "logits/rejected": 0.4564022123813629, + "logps/chosen": -546.6898193359375, + "logps/rejected": -542.0783081054688, + "loss": 0.5693, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.379553198814392, + "rewards/margins": 0.47975024580955505, + "rewards/rejected": -1.8593032360076904, + "step": 2740 + }, + { + "epoch": 0.7197068830149176, + "grad_norm": 16.89527702331543, + "learning_rate": 1.1046703517675848e-06, + "logits/chosen": 0.325847327709198, + "logits/rejected": 0.5274810791015625, + "logps/chosen": -478.98846435546875, + "logps/rejected": -527.6720581054688, + "loss": 0.5674, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.1666440963745117, + "rewards/margins": 0.46127814054489136, + "rewards/rejected": -1.6279222965240479, + "step": 2750 + }, + { + "epoch": 0.7223239989531536, + "grad_norm": 26.121501922607422, + "learning_rate": 1.085773492015028e-06, + "logits/chosen": 0.14125783741474152, + "logits/rejected": 0.25838667154312134, + "logps/chosen": -488.41827392578125, + "logps/rejected": -485.6160583496094, + "loss": 0.5577, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.267000436782837, + "rewards/margins": 0.5229637622833252, + "rewards/rejected": -1.7899643182754517, + "step": 2760 + }, + { + "epoch": 0.7249411148913897, + "grad_norm": 26.272478103637695, + "learning_rate": 1.0669947197689034e-06, + "logits/chosen": 0.3034301698207855, + "logits/rejected": 0.33288899064064026, + "logps/chosen": -546.6353149414062, + "logps/rejected": -535.7976684570312, + "loss": 0.5917, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.4825105667114258, + "rewards/margins": 0.4382646679878235, + "rewards/rejected": -1.920775055885315, + "step": 2770 + }, + { + "epoch": 0.7275582308296258, + "grad_norm": 24.98711585998535, + "learning_rate": 1.048335603051291e-06, + "logits/chosen": 0.2563607692718506, + "logits/rejected": 0.35739272832870483, + "logps/chosen": -576.2966918945312, + "logps/rejected": -583.8340454101562, + "loss": 0.5134, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4835859537124634, + "rewards/margins": 0.6780148148536682, + "rewards/rejected": -2.1616008281707764, + "step": 2780 + }, + { + "epoch": 0.7301753467678618, + "grad_norm": 28.010181427001953, + "learning_rate": 1.0297976998930665e-06, + "logits/chosen": 0.23113617300987244, + "logits/rejected": 0.324890673160553, + "logps/chosen": -531.7484130859375, + "logps/rejected": -531.4666748046875, + "loss": 0.5544, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.5267302989959717, + "rewards/margins": 0.5725789666175842, + "rewards/rejected": -2.0993094444274902, + "step": 2790 + }, + { + "epoch": 0.7327924627060979, + "grad_norm": 32.755393981933594, + "learning_rate": 1.0113825582038078e-06, + "logits/chosen": 0.2566104531288147, + "logits/rejected": 0.37072187662124634, + "logps/chosen": -558.6234741210938, + "logps/rejected": -554.4527587890625, + "loss": 0.5976, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.625914216041565, + "rewards/margins": 0.4005354344844818, + "rewards/rejected": -2.026449680328369, + "step": 2800 + }, + { + "epoch": 0.7327924627060979, + "eval_logits/chosen": 0.11360778659582138, + "eval_logits/rejected": 0.23131908476352692, + "eval_logps/chosen": -545.891845703125, + "eval_logps/rejected": -547.037109375, + "eval_loss": 0.5920617580413818, + "eval_rewards/accuracies": 0.6784999966621399, + "eval_rewards/chosen": -1.5752650499343872, + "eval_rewards/margins": 0.44826528429985046, + "eval_rewards/rejected": -2.0235302448272705, + "eval_runtime": 232.6397, + "eval_samples_per_second": 8.597, + "eval_steps_per_second": 1.075, + "step": 2800 + }, + { + "epoch": 0.735409578644334, + "grad_norm": 14.789016723632812, + "learning_rate": 9.930917156425477e-07, + "logits/chosen": 0.17329376935958862, + "logits/rejected": 0.3116983473300934, + "logps/chosen": -529.4105224609375, + "logps/rejected": -557.4210815429688, + "loss": 0.574, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.6027743816375732, + "rewards/margins": 0.5013204216957092, + "rewards/rejected": -2.104094982147217, + "step": 2810 + }, + { + "epoch": 0.73802669458257, + "grad_norm": 22.683055877685547, + "learning_rate": 9.749266994893756e-07, + "logits/chosen": 0.33349448442459106, + "logits/rejected": 0.34794288873672485, + "logps/chosen": -520.3793334960938, + "logps/rejected": -538.2870483398438, + "loss": 0.6172, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.6407073736190796, + "rewards/margins": 0.37328147888183594, + "rewards/rejected": -2.013988971710205, + "step": 2820 + }, + { + "epoch": 0.7406438105208061, + "grad_norm": 27.596086502075195, + "learning_rate": 9.56889026517913e-07, + "logits/chosen": 0.3088427484035492, + "logits/rejected": 0.48099619150161743, + "logps/chosen": -532.8345947265625, + "logps/rejected": -520.9815673828125, + "loss": 0.6012, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.565934181213379, + "rewards/margins": 0.41419801115989685, + "rewards/rejected": -1.9801323413848877, + "step": 2830 + }, + { + "epoch": 0.7432609264590422, + "grad_norm": 27.7423095703125, + "learning_rate": 9.389802028686617e-07, + "logits/chosen": 0.35369396209716797, + "logits/rejected": 0.2665908932685852, + "logps/chosen": -524.5148315429688, + "logps/rejected": -510.5908203125, + "loss": 0.616, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.4806346893310547, + "rewards/margins": 0.37274661660194397, + "rewards/rejected": -1.8533813953399658, + "step": 2840 + }, + { + "epoch": 0.7458780423972782, + "grad_norm": 16.51445770263672, + "learning_rate": 9.212017239232427e-07, + "logits/chosen": 0.1338292360305786, + "logits/rejected": 0.34962359070777893, + "logps/chosen": -557.58154296875, + "logps/rejected": -550.517578125, + "loss": 0.5427, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.4360605478286743, + "rewards/margins": 0.5636521577835083, + "rewards/rejected": -1.9997127056121826, + "step": 2850 + }, + { + "epoch": 0.7484951583355143, + "grad_norm": 25.57477378845215, + "learning_rate": 9.03555074179533e-07, + "logits/chosen": 0.12778687477111816, + "logits/rejected": 0.31376713514328003, + "logps/chosen": -526.8626098632812, + "logps/rejected": -553.7708740234375, + "loss": 0.5618, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.3927412033081055, + "rewards/margins": 0.521297812461853, + "rewards/rejected": -1.9140390157699585, + "step": 2860 + }, + { + "epoch": 0.7511122742737504, + "grad_norm": 18.769468307495117, + "learning_rate": 8.860417271277067e-07, + "logits/chosen": 0.10975948721170425, + "logits/rejected": 0.36845940351486206, + "logps/chosen": -543.9274291992188, + "logps/rejected": -546.5499267578125, + "loss": 0.6186, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.3972618579864502, + "rewards/margins": 0.3473066985607147, + "rewards/rejected": -1.7445685863494873, + "step": 2870 + }, + { + "epoch": 0.7537293902119864, + "grad_norm": 22.453828811645508, + "learning_rate": 8.686631451272029e-07, + "logits/chosen": 0.16480425000190735, + "logits/rejected": 0.2969481647014618, + "logps/chosen": -519.5490112304688, + "logps/rejected": -512.5626831054688, + "loss": 0.6033, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.5151922702789307, + "rewards/margins": 0.39446836709976196, + "rewards/rejected": -1.9096605777740479, + "step": 2880 + }, + { + "epoch": 0.7563465061502225, + "grad_norm": 20.046194076538086, + "learning_rate": 8.514207792846168e-07, + "logits/chosen": 0.31565916538238525, + "logits/rejected": 0.39657875895500183, + "logps/chosen": -525.2118530273438, + "logps/rejected": -516.3604736328125, + "loss": 0.5859, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.533144235610962, + "rewards/margins": 0.43004053831100464, + "rewards/rejected": -1.9631845951080322, + "step": 2890 + }, + { + "epoch": 0.7589636220884585, + "grad_norm": 19.951534271240234, + "learning_rate": 8.343160693325356e-07, + "logits/chosen": 0.24349746108055115, + "logits/rejected": 0.3447602689266205, + "logps/chosen": -516.9713134765625, + "logps/rejected": -535.793212890625, + "loss": 0.586, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.4326432943344116, + "rewards/margins": 0.43857187032699585, + "rewards/rejected": -1.8712152242660522, + "step": 2900 + }, + { + "epoch": 0.7589636220884585, + "eval_logits/chosen": 0.12212979793548584, + "eval_logits/rejected": 0.24171759188175201, + "eval_logps/chosen": -535.29541015625, + "eval_logps/rejected": -534.5630493164062, + "eval_loss": 0.5905064940452576, + "eval_rewards/accuracies": 0.6779999732971191, + "eval_rewards/chosen": -1.4693007469177246, + "eval_rewards/margins": 0.4294882118701935, + "eval_rewards/rejected": -1.8987890481948853, + "eval_runtime": 232.2009, + "eval_samples_per_second": 8.613, + "eval_steps_per_second": 1.077, + "step": 2900 + }, + { + "epoch": 0.7615807380266946, + "grad_norm": 15.65670394897461, + "learning_rate": 8.173504435093174e-07, + "logits/chosen": 0.3191450834274292, + "logits/rejected": 0.452759325504303, + "logps/chosen": -491.33831787109375, + "logps/rejected": -494.2073669433594, + "loss": 0.5618, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.4124243259429932, + "rewards/margins": 0.518468976020813, + "rewards/rejected": -1.9308933019638062, + "step": 2910 + }, + { + "epoch": 0.7641978539649307, + "grad_norm": 17.081989288330078, + "learning_rate": 8.00525318439836e-07, + "logits/chosen": 0.25279700756073, + "logits/rejected": 0.3255782127380371, + "logps/chosen": -534.6649169921875, + "logps/rejected": -551.4168701171875, + "loss": 0.6125, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.3950837850570679, + "rewards/margins": 0.39895501732826233, + "rewards/rejected": -1.7940387725830078, + "step": 2920 + }, + { + "epoch": 0.7668149699031667, + "grad_norm": 21.055017471313477, + "learning_rate": 7.838420990171927e-07, + "logits/chosen": 0.20293910801410675, + "logits/rejected": 0.274809867143631, + "logps/chosen": -526.1995849609375, + "logps/rejected": -538.4735717773438, + "loss": 0.5446, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.3020565509796143, + "rewards/margins": 0.5154568552970886, + "rewards/rejected": -1.8175132274627686, + "step": 2930 + }, + { + "epoch": 0.7694320858414028, + "grad_norm": 17.768627166748047, + "learning_rate": 7.673021782854084e-07, + "logits/chosen": 0.3839934468269348, + "logits/rejected": 0.3556897044181824, + "logps/chosen": -529.2373046875, + "logps/rejected": -502.952880859375, + "loss": 0.5787, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.4653526544570923, + "rewards/margins": 0.48818325996398926, + "rewards/rejected": -1.953536033630371, + "step": 2940 + }, + { + "epoch": 0.7720492017796389, + "grad_norm": 19.30914878845215, + "learning_rate": 7.509069373231039e-07, + "logits/chosen": 0.23126430809497833, + "logits/rejected": 0.22901423275470734, + "logps/chosen": -515.2665405273438, + "logps/rejected": -515.766357421875, + "loss": 0.5974, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.515284538269043, + "rewards/margins": 0.42790335416793823, + "rewards/rejected": -1.9431880712509155, + "step": 2950 + }, + { + "epoch": 0.7746663177178749, + "grad_norm": 20.416574478149414, + "learning_rate": 7.346577451281822e-07, + "logits/chosen": 0.31689321994781494, + "logits/rejected": 0.37642043828964233, + "logps/chosen": -539.6643676757812, + "logps/rejected": -538.9885864257812, + "loss": 0.5562, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.518949270248413, + "rewards/margins": 0.5324884653091431, + "rewards/rejected": -2.0514376163482666, + "step": 2960 + }, + { + "epoch": 0.777283433656111, + "grad_norm": 28.220792770385742, + "learning_rate": 7.185559585035138e-07, + "logits/chosen": 0.07478724420070648, + "logits/rejected": 0.2702252268791199, + "logps/chosen": -551.8621215820312, + "logps/rejected": -564.0684814453125, + "loss": 0.5573, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.43454909324646, + "rewards/margins": 0.5544053912162781, + "rewards/rejected": -1.988954782485962, + "step": 2970 + }, + { + "epoch": 0.7799005495943471, + "grad_norm": 18.90873146057129, + "learning_rate": 7.026029219436504e-07, + "logits/chosen": 0.16967260837554932, + "logits/rejected": 0.33861225843429565, + "logps/chosen": -534.6746215820312, + "logps/rejected": -526.2569580078125, + "loss": 0.6105, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.4762811660766602, + "rewards/margins": 0.38691508769989014, + "rewards/rejected": -1.8631963729858398, + "step": 2980 + }, + { + "epoch": 0.7825176655325831, + "grad_norm": 15.61099910736084, + "learning_rate": 6.867999675225523e-07, + "logits/chosen": 0.2207004576921463, + "logits/rejected": 0.2613358795642853, + "logps/chosen": -489.48529052734375, + "logps/rejected": -498.1441955566406, + "loss": 0.5601, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.4658935070037842, + "rewards/margins": 0.5164823532104492, + "rewards/rejected": -1.9823758602142334, + "step": 2990 + }, + { + "epoch": 0.7851347814708192, + "grad_norm": 29.250232696533203, + "learning_rate": 6.711484147823663e-07, + "logits/chosen": 0.1829605996608734, + "logits/rejected": 0.3020482361316681, + "logps/chosen": -495.75927734375, + "logps/rejected": -530.975830078125, + "loss": 0.5671, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.4143750667572021, + "rewards/margins": 0.502804696559906, + "rewards/rejected": -1.9171797037124634, + "step": 3000 + }, + { + "epoch": 0.7851347814708192, + "eval_logits/chosen": 0.1228410005569458, + "eval_logits/rejected": 0.24235635995864868, + "eval_logps/chosen": -534.0778198242188, + "eval_logps/rejected": -533.9715576171875, + "eval_loss": 0.5899218916893005, + "eval_rewards/accuracies": 0.6794999837875366, + "eval_rewards/chosen": -1.4571242332458496, + "eval_rewards/margins": 0.4357497990131378, + "eval_rewards/rejected": -1.8928741216659546, + "eval_runtime": 232.4162, + "eval_samples_per_second": 8.605, + "eval_steps_per_second": 1.076, + "step": 3000 + }, + { + "epoch": 0.7877518974090553, + "grad_norm": 21.612812042236328, + "learning_rate": 6.556495706232413e-07, + "logits/chosen": 0.28465574979782104, + "logits/rejected": 0.3168385922908783, + "logps/chosen": -515.8154296875, + "logps/rejected": -536.3228759765625, + "loss": 0.5725, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.4145126342773438, + "rewards/margins": 0.49681711196899414, + "rewards/rejected": -1.9113296270370483, + "step": 3010 + }, + { + "epoch": 0.7903690133472913, + "grad_norm": 19.919925689697266, + "learning_rate": 6.403047291942057e-07, + "logits/chosen": 0.25551754236221313, + "logits/rejected": 0.3729092478752136, + "logps/chosen": -478.81451416015625, + "logps/rejected": -482.24725341796875, + "loss": 0.5659, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.4676393270492554, + "rewards/margins": 0.47457486391067505, + "rewards/rejected": -1.9422142505645752, + "step": 3020 + }, + { + "epoch": 0.7929861292855274, + "grad_norm": 25.699249267578125, + "learning_rate": 6.251151717851023e-07, + "logits/chosen": 0.3613748848438263, + "logits/rejected": 0.464524507522583, + "logps/chosen": -483.13775634765625, + "logps/rejected": -510.97576904296875, + "loss": 0.6078, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.4159023761749268, + "rewards/margins": 0.4669385850429535, + "rewards/rejected": -1.882840871810913, + "step": 3030 + }, + { + "epoch": 0.7956032452237635, + "grad_norm": 18.529563903808594, + "learning_rate": 6.100821667196041e-07, + "logits/chosen": 0.1572251319885254, + "logits/rejected": 0.26168403029441833, + "logps/chosen": -538.1602783203125, + "logps/rejected": -491.727783203125, + "loss": 0.5572, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.387502908706665, + "rewards/margins": 0.5043372511863708, + "rewards/rejected": -1.8918402194976807, + "step": 3040 + }, + { + "epoch": 0.7982203611619995, + "grad_norm": 18.66206169128418, + "learning_rate": 5.952069692493062e-07, + "logits/chosen": 0.15295560657978058, + "logits/rejected": 0.2732795178890228, + "logps/chosen": -466.65350341796875, + "logps/rejected": -510.0077209472656, + "loss": 0.5483, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.4113075733184814, + "rewards/margins": 0.5156591534614563, + "rewards/rejected": -1.926966667175293, + "step": 3050 + }, + { + "epoch": 0.8008374771002356, + "grad_norm": 32.57720947265625, + "learning_rate": 5.80490821448918e-07, + "logits/chosen": 0.18402081727981567, + "logits/rejected": 0.25058144330978394, + "logps/chosen": -528.1961059570312, + "logps/rejected": -616.8970947265625, + "loss": 0.5587, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.4876148700714111, + "rewards/margins": 0.5890025496482849, + "rewards/rejected": -2.0766172409057617, + "step": 3060 + }, + { + "epoch": 0.8034545930384716, + "grad_norm": 22.268369674682617, + "learning_rate": 5.659349521125459e-07, + "logits/chosen": 0.08234192430973053, + "logits/rejected": 0.09182853996753693, + "logps/chosen": -555.3732299804688, + "logps/rejected": -554.161865234375, + "loss": 0.596, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.4457889795303345, + "rewards/margins": 0.41241535544395447, + "rewards/rejected": -1.8582042455673218, + "step": 3070 + }, + { + "epoch": 0.8060717089767077, + "grad_norm": 18.810224533081055, + "learning_rate": 5.5154057665109e-07, + "logits/chosen": 0.11081822216510773, + "logits/rejected": 0.25048089027404785, + "logps/chosen": -493.99493408203125, + "logps/rejected": -511.19464111328125, + "loss": 0.5361, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3138765096664429, + "rewards/margins": 0.5303700566291809, + "rewards/rejected": -1.844246506690979, + "step": 3080 + }, + { + "epoch": 0.8086888249149438, + "grad_norm": 20.452537536621094, + "learning_rate": 5.373088969907586e-07, + "logits/chosen": 0.1712993085384369, + "logits/rejected": 0.28175634145736694, + "logps/chosen": -530.8388671875, + "logps/rejected": -525.1220703125, + "loss": 0.5466, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.279170274734497, + "rewards/margins": 0.5140555500984192, + "rewards/rejected": -1.793225884437561, + "step": 3090 + }, + { + "epoch": 0.8113059408531798, + "grad_norm": 18.429285049438477, + "learning_rate": 5.23241101472709e-07, + "logits/chosen": 0.1318766474723816, + "logits/rejected": 0.12211046367883682, + "logps/chosen": -517.6220092773438, + "logps/rejected": -529.6044921875, + "loss": 0.56, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.189562201499939, + "rewards/margins": 0.5340698957443237, + "rewards/rejected": -1.7236320972442627, + "step": 3100 + }, + { + "epoch": 0.8113059408531798, + "eval_logits/chosen": 0.08669886738061905, + "eval_logits/rejected": 0.2020561546087265, + "eval_logps/chosen": -520.4529418945312, + "eval_logps/rejected": -520.4334106445312, + "eval_loss": 0.5915951132774353, + "eval_rewards/accuracies": 0.6809999942779541, + "eval_rewards/chosen": -1.3208762407302856, + "eval_rewards/margins": 0.43661609292030334, + "eval_rewards/rejected": -1.757492184638977, + "eval_runtime": 232.359, + "eval_samples_per_second": 8.607, + "eval_steps_per_second": 1.076, + "step": 3100 + }, + { + "epoch": 0.8139230567914159, + "grad_norm": 21.272666931152344, + "learning_rate": 5.09338364753818e-07, + "logits/chosen": 0.22215643525123596, + "logits/rejected": 0.2583310008049011, + "logps/chosen": -535.0568237304688, + "logps/rejected": -544.3440551757812, + "loss": 0.5716, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.234817624092102, + "rewards/margins": 0.4773196280002594, + "rewards/rejected": -1.71213698387146, + "step": 3110 + }, + { + "epoch": 0.816540172729652, + "grad_norm": 26.98297882080078, + "learning_rate": 4.956018477086005e-07, + "logits/chosen": 0.21868661046028137, + "logits/rejected": 0.32554829120635986, + "logps/chosen": -545.9993896484375, + "logps/rejected": -515.1287231445312, + "loss": 0.6281, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.3831281661987305, + "rewards/margins": 0.3917158544063568, + "rewards/rejected": -1.7748441696166992, + "step": 3120 + }, + { + "epoch": 0.819157288667888, + "grad_norm": 19.549560546875, + "learning_rate": 4.820326973322764e-07, + "logits/chosen": 0.072993703186512, + "logits/rejected": 0.19295060634613037, + "logps/chosen": -512.5438842773438, + "logps/rejected": -527.2860107421875, + "loss": 0.5642, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.34549880027771, + "rewards/margins": 0.4567781090736389, + "rewards/rejected": -1.8022769689559937, + "step": 3130 + }, + { + "epoch": 0.821774404606124, + "grad_norm": 22.311878204345703, + "learning_rate": 4.686320466449981e-07, + "logits/chosen": 0.20619972050189972, + "logits/rejected": 0.35505905747413635, + "logps/chosen": -488.4278869628906, + "logps/rejected": -507.36785888671875, + "loss": 0.5786, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.3550437688827515, + "rewards/margins": 0.4867793619632721, + "rewards/rejected": -1.841822862625122, + "step": 3140 + }, + { + "epoch": 0.8243915205443602, + "grad_norm": 16.643848419189453, + "learning_rate": 4.554010145972418e-07, + "logits/chosen": 0.21577072143554688, + "logits/rejected": 0.33193427324295044, + "logps/chosen": -521.0413208007812, + "logps/rejected": -540.6106567382812, + "loss": 0.5785, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.3828619718551636, + "rewards/margins": 0.4862847924232483, + "rewards/rejected": -1.869146704673767, + "step": 3150 + }, + { + "epoch": 0.8270086364825961, + "grad_norm": 17.693086624145508, + "learning_rate": 4.4234070597637455e-07, + "logits/chosen": 0.11184321343898773, + "logits/rejected": 0.19088369607925415, + "logps/chosen": -536.9678955078125, + "logps/rejected": -554.2827758789062, + "loss": 0.5834, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.369461178779602, + "rewards/margins": 0.48122233152389526, + "rewards/rejected": -1.850683569908142, + "step": 3160 + }, + { + "epoch": 0.8296257524208323, + "grad_norm": 16.465007781982422, + "learning_rate": 4.2945221131440783e-07, + "logits/chosen": 0.24646055698394775, + "logits/rejected": 0.306671679019928, + "logps/chosen": -524.2401123046875, + "logps/rejected": -516.0563354492188, + "loss": 0.5793, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.3744683265686035, + "rewards/margins": 0.4835619032382965, + "rewards/rejected": -1.8580303192138672, + "step": 3170 + }, + { + "epoch": 0.8322428683590684, + "grad_norm": 22.869001388549805, + "learning_rate": 4.167366067969381e-07, + "logits/chosen": 0.07770199328660965, + "logits/rejected": 0.25347238779067993, + "logps/chosen": -455.5186462402344, + "logps/rejected": -507.84356689453125, + "loss": 0.5827, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.2709424495697021, + "rewards/margins": 0.4785892367362976, + "rewards/rejected": -1.7495317459106445, + "step": 3180 + }, + { + "epoch": 0.8348599842973043, + "grad_norm": 20.31063461303711, + "learning_rate": 4.041949541732826e-07, + "logits/chosen": 0.24958959221839905, + "logits/rejected": 0.255808025598526, + "logps/chosen": -538.5, + "logps/rejected": -554.1825561523438, + "loss": 0.5788, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.4217197895050049, + "rewards/margins": 0.490082323551178, + "rewards/rejected": -1.9118019342422485, + "step": 3190 + }, + { + "epoch": 0.8374771002355405, + "grad_norm": 17.181446075439453, + "learning_rate": 3.9182830066782614e-07, + "logits/chosen": 0.1832321137189865, + "logits/rejected": 0.20756061375141144, + "logps/chosen": -523.9074096679688, + "logps/rejected": -562.7293701171875, + "loss": 0.5796, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.473734736442566, + "rewards/margins": 0.465187132358551, + "rewards/rejected": -1.9389216899871826, + "step": 3200 + }, + { + "epoch": 0.8374771002355405, + "eval_logits/chosen": 0.07104705274105072, + "eval_logits/rejected": 0.18436755239963531, + "eval_logps/chosen": -532.9956665039062, + "eval_logps/rejected": -534.0466918945312, + "eval_loss": 0.590150773525238, + "eval_rewards/accuracies": 0.684499979019165, + "eval_rewards/chosen": -1.4463036060333252, + "eval_rewards/margins": 0.44732123613357544, + "eval_rewards/rejected": -1.8936247825622559, + "eval_runtime": 232.4834, + "eval_samples_per_second": 8.603, + "eval_steps_per_second": 1.075, + "step": 3200 + }, + { + "epoch": 0.8400942161737766, + "grad_norm": 15.772056579589844, + "learning_rate": 3.796376788925771e-07, + "logits/chosen": 0.17947904765605927, + "logits/rejected": 0.38756972551345825, + "logps/chosen": -536.089599609375, + "logps/rejected": -513.5861206054688, + "loss": 0.6027, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.4111121892929077, + "rewards/margins": 0.3696535527706146, + "rewards/rejected": -1.7807658910751343, + "step": 3210 + }, + { + "epoch": 0.8427113321120125, + "grad_norm": 22.664976119995117, + "learning_rate": 3.676241067609465e-07, + "logits/chosen": 0.21588890254497528, + "logits/rejected": 0.31630846858024597, + "logps/chosen": -577.00146484375, + "logps/rejected": -551.9093017578125, + "loss": 0.5948, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.4189621210098267, + "rewards/margins": 0.4751489758491516, + "rewards/rejected": -1.8941110372543335, + "step": 3220 + }, + { + "epoch": 0.8453284480502486, + "grad_norm": 30.950883865356445, + "learning_rate": 3.5578858740274976e-07, + "logits/chosen": 0.08771739155054092, + "logits/rejected": 0.16890794038772583, + "logps/chosen": -532.2547607421875, + "logps/rejected": -527.4913330078125, + "loss": 0.6527, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.4851154088974, + "rewards/margins": 0.3038921654224396, + "rewards/rejected": -1.789007544517517, + "step": 3230 + }, + { + "epoch": 0.8479455639884846, + "grad_norm": 22.29638671875, + "learning_rate": 3.44132109080447e-07, + "logits/chosen": 0.05612761899828911, + "logits/rejected": 0.13626542687416077, + "logps/chosen": -520.4820556640625, + "logps/rejected": -517.2299194335938, + "loss": 0.554, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.3912557363510132, + "rewards/margins": 0.4790167808532715, + "rewards/rejected": -1.8702728748321533, + "step": 3240 + }, + { + "epoch": 0.8505626799267207, + "grad_norm": 18.589750289916992, + "learning_rate": 3.3265564510662344e-07, + "logits/chosen": 0.04296109080314636, + "logits/rejected": 0.2635151743888855, + "logps/chosen": -563.4909057617188, + "logps/rejected": -560.6986694335938, + "loss": 0.538, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.3561357259750366, + "rewards/margins": 0.6071338653564453, + "rewards/rejected": -1.963269591331482, + "step": 3250 + }, + { + "epoch": 0.8531797958649568, + "grad_norm": 23.977201461791992, + "learning_rate": 3.213601537627195e-07, + "logits/chosen": 0.2044192999601364, + "logits/rejected": 0.20763865113258362, + "logps/chosen": -533.2398681640625, + "logps/rejected": -539.3159790039062, + "loss": 0.6017, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.5438867807388306, + "rewards/margins": 0.37856417894363403, + "rewards/rejected": -1.9224510192871094, + "step": 3260 + }, + { + "epoch": 0.8557969118031928, + "grad_norm": 19.610824584960938, + "learning_rate": 3.1024657821901063e-07, + "logits/chosen": 0.09625023603439331, + "logits/rejected": 0.09076298773288727, + "logps/chosen": -519.69287109375, + "logps/rejected": -526.6182861328125, + "loss": 0.5844, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.3638288974761963, + "rewards/margins": 0.4889647364616394, + "rewards/rejected": -1.8527934551239014, + "step": 3270 + }, + { + "epoch": 0.8584140277414289, + "grad_norm": 17.58568000793457, + "learning_rate": 2.9931584645585654e-07, + "logits/chosen": 0.24739189445972443, + "logits/rejected": 0.3013184666633606, + "logps/chosen": -541.0255126953125, + "logps/rejected": -576.6962890625, + "loss": 0.5773, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.3129136562347412, + "rewards/margins": 0.46883121132850647, + "rewards/rejected": -1.781745195388794, + "step": 3280 + }, + { + "epoch": 0.861031143679665, + "grad_norm": 12.26266860961914, + "learning_rate": 2.885688711862136e-07, + "logits/chosen": 0.14454945921897888, + "logits/rejected": 0.2927783131599426, + "logps/chosen": -537.5690307617188, + "logps/rejected": -562.3256225585938, + "loss": 0.5883, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.4469431638717651, + "rewards/margins": 0.4720059335231781, + "rewards/rejected": -1.9189491271972656, + "step": 3290 + }, + { + "epoch": 0.863648259617901, + "grad_norm": 20.302330017089844, + "learning_rate": 2.7800654977942486e-07, + "logits/chosen": 0.06817831099033356, + "logits/rejected": 0.22290131449699402, + "logps/chosen": -533.7552490234375, + "logps/rejected": -575.96875, + "loss": 0.55, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.4392116069793701, + "rewards/margins": 0.5773890018463135, + "rewards/rejected": -2.0166003704071045, + "step": 3300 + }, + { + "epoch": 0.863648259617901, + "eval_logits/chosen": 0.06103089079260826, + "eval_logits/rejected": 0.17368435859680176, + "eval_logps/chosen": -540.1639404296875, + "eval_logps/rejected": -541.0390625, + "eval_loss": 0.5897097587585449, + "eval_rewards/accuracies": 0.6834999918937683, + "eval_rewards/chosen": -1.5179859399795532, + "eval_rewards/margins": 0.44556280970573425, + "eval_rewards/rejected": -1.9635487794876099, + "eval_runtime": 232.0365, + "eval_samples_per_second": 8.619, + "eval_steps_per_second": 1.077, + "step": 3300 + }, + { + "epoch": 0.8662653755561371, + "grad_norm": 45.810630798339844, + "learning_rate": 2.6762976418628797e-07, + "logits/chosen": 0.32271671295166016, + "logits/rejected": 0.39709824323654175, + "logps/chosen": -474.7618103027344, + "logps/rejected": -456.5228576660156, + "loss": 0.5846, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.407684087753296, + "rewards/margins": 0.46905022859573364, + "rewards/rejected": -1.8767343759536743, + "step": 3310 + }, + { + "epoch": 0.8688824914943732, + "grad_norm": 20.996036529541016, + "learning_rate": 2.5743938086541354e-07, + "logits/chosen": 0.15398995578289032, + "logits/rejected": 0.3004111349582672, + "logps/chosen": -523.9410400390625, + "logps/rejected": -531.103759765625, + "loss": 0.5832, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.478339672088623, + "rewards/margins": 0.49801668524742126, + "rewards/rejected": -1.9763562679290771, + "step": 3320 + }, + { + "epoch": 0.8714996074326092, + "grad_norm": 21.416170120239258, + "learning_rate": 2.4743625071087574e-07, + "logits/chosen": 0.09900239109992981, + "logits/rejected": 0.1008148342370987, + "logps/chosen": -542.8681640625, + "logps/rejected": -543.7512817382812, + "loss": 0.562, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.3520573377609253, + "rewards/margins": 0.5512968301773071, + "rewards/rejected": -1.9033544063568115, + "step": 3330 + }, + { + "epoch": 0.8741167233708453, + "grad_norm": 20.47162437438965, + "learning_rate": 2.3762120898116498e-07, + "logits/chosen": 0.07932907342910767, + "logits/rejected": 0.2061280757188797, + "logps/chosen": -550.5972290039062, + "logps/rejected": -576.6636962890625, + "loss": 0.5604, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.505156397819519, + "rewards/margins": 0.4955100119113922, + "rewards/rejected": -2.000666379928589, + "step": 3340 + }, + { + "epoch": 0.8767338393090814, + "grad_norm": 18.11985969543457, + "learning_rate": 2.2799507522944048e-07, + "logits/chosen": 0.16951636970043182, + "logits/rejected": 0.21532103419303894, + "logps/chosen": -531.1329345703125, + "logps/rejected": -555.5535278320312, + "loss": 0.5539, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.3905164003372192, + "rewards/margins": 0.5168679356575012, + "rewards/rejected": -1.9073841571807861, + "step": 3350 + }, + { + "epoch": 0.8793509552473174, + "grad_norm": 23.173076629638672, + "learning_rate": 2.1855865323510056e-07, + "logits/chosen": 0.10670924186706543, + "logits/rejected": 0.25408512353897095, + "logps/chosen": -534.140625, + "logps/rejected": -571.8042602539062, + "loss": 0.5092, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.2978779077529907, + "rewards/margins": 0.7049534320831299, + "rewards/rejected": -2.00283145904541, + "step": 3360 + }, + { + "epoch": 0.8819680711855535, + "grad_norm": 26.09234619140625, + "learning_rate": 2.0931273093666575e-07, + "logits/chosen": 0.1856442391872406, + "logits/rejected": 0.2775765657424927, + "logps/chosen": -504.78472900390625, + "logps/rejected": -510.97772216796875, + "loss": 0.5706, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.4099186658859253, + "rewards/margins": 0.48924511671066284, + "rewards/rejected": -1.899163842201233, + "step": 3370 + }, + { + "epoch": 0.8845851871237895, + "grad_norm": 18.44064712524414, + "learning_rate": 2.002580803659873e-07, + "logits/chosen": 0.18279746174812317, + "logits/rejected": 0.25130945444107056, + "logps/chosen": -501.90191650390625, + "logps/rejected": -510.632080078125, + "loss": 0.5763, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.326120138168335, + "rewards/margins": 0.5073062777519226, + "rewards/rejected": -1.8334262371063232, + "step": 3380 + }, + { + "epoch": 0.8872023030620256, + "grad_norm": 20.44449234008789, + "learning_rate": 1.913954575837826e-07, + "logits/chosen": 0.26278918981552124, + "logits/rejected": 0.3742792010307312, + "logps/chosen": -547.27490234375, + "logps/rejected": -502.9208984375, + "loss": 0.569, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.3170632123947144, + "rewards/margins": 0.5174871683120728, + "rewards/rejected": -1.8345504999160767, + "step": 3390 + }, + { + "epoch": 0.8898194190002617, + "grad_norm": 23.813940048217773, + "learning_rate": 1.827256026165028e-07, + "logits/chosen": 0.15093761682510376, + "logits/rejected": 0.15371516346931458, + "logps/chosen": -576.1170654296875, + "logps/rejected": -542.66357421875, + "loss": 0.5519, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.2780691385269165, + "rewards/margins": 0.5459933280944824, + "rewards/rejected": -1.8240623474121094, + "step": 3400 + }, + { + "epoch": 0.8898194190002617, + "eval_logits/chosen": 0.049354203045368195, + "eval_logits/rejected": 0.16207368671894073, + "eval_logps/chosen": -524.5573120117188, + "eval_logps/rejected": -524.9038696289062, + "eval_loss": 0.5905484557151794, + "eval_rewards/accuracies": 0.684499979019165, + "eval_rewards/chosen": -1.3619197607040405, + "eval_rewards/margins": 0.4402773082256317, + "eval_rewards/rejected": -1.8021970987319946, + "eval_runtime": 232.2898, + "eval_samples_per_second": 8.61, + "eval_steps_per_second": 1.076, + "step": 3400 + }, + { + "epoch": 0.8924365349384977, + "grad_norm": 22.652015686035156, + "learning_rate": 1.7424923939454274e-07, + "logits/chosen": 0.09193596988916397, + "logits/rejected": 0.12269000709056854, + "logps/chosen": -560.306884765625, + "logps/rejected": -547.3897705078125, + "loss": 0.5522, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.3574447631835938, + "rewards/margins": 0.512332558631897, + "rewards/rejected": -1.8697776794433594, + "step": 3410 + }, + { + "epoch": 0.8950536508767338, + "grad_norm": 20.60724449157715, + "learning_rate": 1.6596707569179304e-07, + "logits/chosen": 0.11068376153707504, + "logits/rejected": 0.2603934407234192, + "logps/chosen": -541.2059326171875, + "logps/rejected": -526.3267211914062, + "loss": 0.5604, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.222874402999878, + "rewards/margins": 0.528007447719574, + "rewards/rejected": -1.7508817911148071, + "step": 3420 + }, + { + "epoch": 0.8976707668149699, + "grad_norm": 19.67418098449707, + "learning_rate": 1.578798030665385e-07, + "logits/chosen": 0.07019755989313126, + "logits/rejected": 0.25848856568336487, + "logps/chosen": -518.75146484375, + "logps/rejected": -542.2596435546875, + "loss": 0.528, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.2500635385513306, + "rewards/margins": 0.5684723854064941, + "rewards/rejected": -1.8185360431671143, + "step": 3430 + }, + { + "epoch": 0.9002878827532059, + "grad_norm": 21.947776794433594, + "learning_rate": 1.499880968037165e-07, + "logits/chosen": 0.2768905758857727, + "logits/rejected": 0.3939592242240906, + "logps/chosen": -490.6131896972656, + "logps/rejected": -477.58380126953125, + "loss": 0.5807, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.2620429992675781, + "rewards/margins": 0.421647846698761, + "rewards/rejected": -1.6836907863616943, + "step": 3440 + }, + { + "epoch": 0.902904998691442, + "grad_norm": 23.7869930267334, + "learning_rate": 1.4229261585852805e-07, + "logits/chosen": 0.28201746940612793, + "logits/rejected": 0.20732636749744415, + "logps/chosen": -523.0221557617188, + "logps/rejected": -520.0089721679688, + "loss": 0.5617, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.2637730836868286, + "rewards/margins": 0.4759213328361511, + "rewards/rejected": -1.739694595336914, + "step": 3450 + }, + { + "epoch": 0.9055221146296781, + "grad_norm": 24.20222282409668, + "learning_rate": 1.3479400280141886e-07, + "logits/chosen": 0.11287301778793335, + "logits/rejected": 0.17427489161491394, + "logps/chosen": -487.00262451171875, + "logps/rejected": -517.9505615234375, + "loss": 0.5644, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.300456166267395, + "rewards/margins": 0.5554144382476807, + "rewards/rejected": -1.8558704853057861, + "step": 3460 + }, + { + "epoch": 0.9081392305679141, + "grad_norm": 23.366870880126953, + "learning_rate": 1.2749288376442044e-07, + "logits/chosen": 0.08213616907596588, + "logits/rejected": 0.1927454173564911, + "logps/chosen": -560.6731567382812, + "logps/rejected": -532.0469970703125, + "loss": 0.5432, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.3204230070114136, + "rewards/margins": 0.5710306763648987, + "rewards/rejected": -1.891453742980957, + "step": 3470 + }, + { + "epoch": 0.9107563465061502, + "grad_norm": 21.51507568359375, + "learning_rate": 1.203898683888713e-07, + "logits/chosen": 0.22756421566009521, + "logits/rejected": 0.25787419080734253, + "logps/chosen": -500.9013671875, + "logps/rejected": -513.2033081054688, + "loss": 0.6123, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.4208608865737915, + "rewards/margins": 0.40193843841552734, + "rewards/rejected": -1.8227994441986084, + "step": 3480 + }, + { + "epoch": 0.9133734624443863, + "grad_norm": 22.2541446685791, + "learning_rate": 1.1348554977451132e-07, + "logits/chosen": 0.16574542224407196, + "logits/rejected": 0.22871682047843933, + "logps/chosen": -550.3922119140625, + "logps/rejected": -538.7455444335938, + "loss": 0.5858, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.4138902425765991, + "rewards/margins": 0.44176238775253296, + "rewards/rejected": -1.8556525707244873, + "step": 3490 + }, + { + "epoch": 0.9159905783826223, + "grad_norm": 17.759902954101562, + "learning_rate": 1.0678050442995802e-07, + "logits/chosen": 0.006184411235153675, + "logits/rejected": 0.05719981715083122, + "logps/chosen": -541.510009765625, + "logps/rejected": -514.5938110351562, + "loss": 0.5939, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.4001209735870361, + "rewards/margins": 0.437242329120636, + "rewards/rejected": -1.8373632431030273, + "step": 3500 + }, + { + "epoch": 0.9159905783826223, + "eval_logits/chosen": 0.056149620562791824, + "eval_logits/rejected": 0.16871705651283264, + "eval_logps/chosen": -528.9203491210938, + "eval_logps/rejected": -529.6995239257812, + "eval_loss": 0.589903712272644, + "eval_rewards/accuracies": 0.6800000071525574, + "eval_rewards/chosen": -1.405550241470337, + "eval_rewards/margins": 0.44460350275039673, + "eval_rewards/rejected": -1.8501536846160889, + "eval_runtime": 232.5763, + "eval_samples_per_second": 8.599, + "eval_steps_per_second": 1.075, + "step": 3500 + }, + { + "epoch": 0.9186076943208584, + "grad_norm": 16.307315826416016, + "learning_rate": 1.0027529222456755e-07, + "logits/chosen": 0.26652759313583374, + "logits/rejected": 0.27262991666793823, + "logps/chosen": -505.27471923828125, + "logps/rejected": -525.823974609375, + "loss": 0.5144, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.2464582920074463, + "rewards/margins": 0.6129791140556335, + "rewards/rejected": -1.859437346458435, + "step": 3510 + }, + { + "epoch": 0.9212248102590945, + "grad_norm": 19.756351470947266, + "learning_rate": 9.397045634168766e-08, + "logits/chosen": 0.0008102863794192672, + "logits/rejected": 0.19065120816230774, + "logps/chosen": -516.8560791015625, + "logps/rejected": -553.0278930664062, + "loss": 0.5415, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.2412116527557373, + "rewards/margins": 0.5996343493461609, + "rewards/rejected": -1.8408458232879639, + "step": 3520 + }, + { + "epoch": 0.9238419261973305, + "grad_norm": 20.353652954101562, + "learning_rate": 8.78665232332998e-08, + "logits/chosen": 0.12386944144964218, + "logits/rejected": 0.27118274569511414, + "logps/chosen": -481.002685546875, + "logps/rejected": -510.55230712890625, + "loss": 0.5669, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.342132806777954, + "rewards/margins": 0.4541274905204773, + "rewards/rejected": -1.7962604761123657, + "step": 3530 + }, + { + "epoch": 0.9264590421355666, + "grad_norm": 21.46174430847168, + "learning_rate": 8.196400257606208e-08, + "logits/chosen": 0.1211334615945816, + "logits/rejected": 0.4012266993522644, + "logps/chosen": -564.7858276367188, + "logps/rejected": -576.843994140625, + "loss": 0.5587, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.3764750957489014, + "rewards/margins": 0.5135365724563599, + "rewards/rejected": -1.8900115489959717, + "step": 3540 + }, + { + "epoch": 0.9290761580738026, + "grad_norm": 21.205127716064453, + "learning_rate": 7.626338722875076e-08, + "logits/chosen": 0.26407915353775024, + "logits/rejected": 0.28373825550079346, + "logps/chosen": -528.8353271484375, + "logps/rejected": -558.7482299804688, + "loss": 0.6023, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.4041508436203003, + "rewards/margins": 0.4070183336734772, + "rewards/rejected": -1.8111692667007446, + "step": 3550 + }, + { + "epoch": 0.9316932740120387, + "grad_norm": 19.687849044799805, + "learning_rate": 7.076515319110688e-08, + "logits/chosen": 0.2693914771080017, + "logits/rejected": 0.3067939281463623, + "logps/chosen": -492.1351623535156, + "logps/rejected": -476.29388427734375, + "loss": 0.5713, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.274938941001892, + "rewards/margins": 0.5187603235244751, + "rewards/rejected": -1.7936992645263672, + "step": 3560 + }, + { + "epoch": 0.9343103899502748, + "grad_norm": 21.987916946411133, + "learning_rate": 6.54697595640899e-08, + "logits/chosen": 0.05399775505065918, + "logits/rejected": 0.1626650094985962, + "logps/chosen": -542.186767578125, + "logps/rejected": -550.5693359375, + "loss": 0.5266, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.230676293373108, + "rewards/margins": 0.5804111361503601, + "rewards/rejected": -1.8110872507095337, + "step": 3570 + }, + { + "epoch": 0.9369275058885108, + "grad_norm": 13.982370376586914, + "learning_rate": 6.037764851154426e-08, + "logits/chosen": 0.10340269654989243, + "logits/rejected": 0.30877891182899475, + "logps/chosen": -515.5814819335938, + "logps/rejected": -562.509521484375, + "loss": 0.5631, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.2640832662582397, + "rewards/margins": 0.5411828756332397, + "rewards/rejected": -1.8052661418914795, + "step": 3580 + }, + { + "epoch": 0.9395446218267469, + "grad_norm": 18.1751708984375, + "learning_rate": 5.548924522327748e-08, + "logits/chosen": 0.06784630566835403, + "logits/rejected": 0.2033408135175705, + "logps/chosen": -519.9273681640625, + "logps/rejected": -530.5051879882812, + "loss": 0.5538, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.2889801263809204, + "rewards/margins": 0.5127965211868286, + "rewards/rejected": -1.801776647567749, + "step": 3590 + }, + { + "epoch": 0.942161737764983, + "grad_norm": 20.21278953552246, + "learning_rate": 5.0804957879556915e-08, + "logits/chosen": 0.056394852697849274, + "logits/rejected": 0.23709776997566223, + "logps/chosen": -462.7340393066406, + "logps/rejected": -502.6092834472656, + "loss": 0.5632, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.2453665733337402, + "rewards/margins": 0.4758334755897522, + "rewards/rejected": -1.7211999893188477, + "step": 3600 + }, + { + "epoch": 0.942161737764983, + "eval_logits/chosen": 0.051600489765405655, + "eval_logits/rejected": 0.1634536236524582, + "eval_logps/chosen": -525.2161254882812, + "eval_logps/rejected": -525.723388671875, + "eval_loss": 0.5906327962875366, + "eval_rewards/accuracies": 0.6815000176429749, + "eval_rewards/chosen": -1.3685081005096436, + "eval_rewards/margins": 0.4418841004371643, + "eval_rewards/rejected": -1.8103920221328735, + "eval_runtime": 232.6085, + "eval_samples_per_second": 8.598, + "eval_steps_per_second": 1.075, + "step": 3600 + }, + { + "epoch": 0.944778853703219, + "grad_norm": 24.784046173095703, + "learning_rate": 4.632517761702815e-08, + "logits/chosen": 0.1571771204471588, + "logits/rejected": 0.32081273198127747, + "logps/chosen": -489.2625427246094, + "logps/rejected": -496.1024475097656, + "loss": 0.5707, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.3370298147201538, + "rewards/margins": 0.49337905645370483, + "rewards/rejected": -1.830409049987793, + "step": 3610 + }, + { + "epoch": 0.9473959696414551, + "grad_norm": 31.509357452392578, + "learning_rate": 4.205027849605359e-08, + "logits/chosen": 0.29533377289772034, + "logits/rejected": 0.265569269657135, + "logps/chosen": -500.92803955078125, + "logps/rejected": -486.66973876953125, + "loss": 0.6225, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.3463305234909058, + "rewards/margins": 0.3861822783946991, + "rewards/rejected": -1.7325128316879272, + "step": 3620 + }, + { + "epoch": 0.9500130855796912, + "grad_norm": 21.457324981689453, + "learning_rate": 3.798061746947995e-08, + "logits/chosen": 0.2158433496952057, + "logits/rejected": 0.3043590486049652, + "logps/chosen": -524.4801025390625, + "logps/rejected": -508.20953369140625, + "loss": 0.5838, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.4179335832595825, + "rewards/margins": 0.46254149079322815, + "rewards/rejected": -1.8804748058319092, + "step": 3630 + }, + { + "epoch": 0.9526302015179272, + "grad_norm": 25.418264389038086, + "learning_rate": 3.411653435283158e-08, + "logits/chosen": 0.06112390756607056, + "logits/rejected": 0.14306578040122986, + "logps/chosen": -532.900390625, + "logps/rejected": -495.4185485839844, + "loss": 0.5824, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.300320029258728, + "rewards/margins": 0.4503448009490967, + "rewards/rejected": -1.7506647109985352, + "step": 3640 + }, + { + "epoch": 0.9552473174561633, + "grad_norm": 17.69974708557129, + "learning_rate": 3.04583517959367e-08, + "logits/chosen": 0.006334272213280201, + "logits/rejected": 0.10581526905298233, + "logps/chosen": -490.5302734375, + "logps/rejected": -496.6221618652344, + "loss": 0.5431, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.2481968402862549, + "rewards/margins": 0.5572769045829773, + "rewards/rejected": -1.8054739236831665, + "step": 3650 + }, + { + "epoch": 0.9578644333943994, + "grad_norm": 21.300947189331055, + "learning_rate": 2.7006375255985984e-08, + "logits/chosen": 0.26631540060043335, + "logits/rejected": 0.2855226993560791, + "logps/chosen": -517.7824096679688, + "logps/rejected": -547.991455078125, + "loss": 0.6044, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.3668075799942017, + "rewards/margins": 0.38124316930770874, + "rewards/rejected": -1.7480506896972656, + "step": 3660 + }, + { + "epoch": 0.9604815493326354, + "grad_norm": 20.375226974487305, + "learning_rate": 2.3760892972027328e-08, + "logits/chosen": 0.05325336381793022, + "logits/rejected": 0.11790470033884048, + "logps/chosen": -533.8929443359375, + "logps/rejected": -533.8286743164062, + "loss": 0.5873, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.4277513027191162, + "rewards/margins": 0.46243423223495483, + "rewards/rejected": -1.8901855945587158, + "step": 3670 + }, + { + "epoch": 0.9630986652708715, + "grad_norm": 22.46816062927246, + "learning_rate": 2.072217594089765e-08, + "logits/chosen": 0.17507997155189514, + "logits/rejected": 0.20632827281951904, + "logps/chosen": -517.567138671875, + "logps/rejected": -549.365234375, + "loss": 0.543, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.3189188241958618, + "rewards/margins": 0.5807808637619019, + "rewards/rejected": -1.8996999263763428, + "step": 3680 + }, + { + "epoch": 0.9657157812091076, + "grad_norm": 17.85133934020996, + "learning_rate": 1.789047789459375e-08, + "logits/chosen": 0.027010198682546616, + "logits/rejected": 0.21749186515808105, + "logps/chosen": -594.4285278320312, + "logps/rejected": -558.7322387695312, + "loss": 0.5585, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.2619202136993408, + "rewards/margins": 0.5537473559379578, + "rewards/rejected": -1.8156675100326538, + "step": 3690 + }, + { + "epoch": 0.9683328971473436, + "grad_norm": 28.183317184448242, + "learning_rate": 1.5266035279088708e-08, + "logits/chosen": 0.10956914722919464, + "logits/rejected": 0.19487416744232178, + "logps/chosen": -572.9104614257812, + "logps/rejected": -580.4852905273438, + "loss": 0.5488, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.4588168859481812, + "rewards/margins": 0.5328525304794312, + "rewards/rejected": -1.9916694164276123, + "step": 3700 + }, + { + "epoch": 0.9683328971473436, + "eval_logits/chosen": 0.05304437875747681, + "eval_logits/rejected": 0.1655142903327942, + "eval_logps/chosen": -525.91552734375, + "eval_logps/rejected": -526.5010375976562, + "eval_loss": 0.5903691649436951, + "eval_rewards/accuracies": 0.6834999918937683, + "eval_rewards/chosen": -1.375501275062561, + "eval_rewards/margins": 0.4426679015159607, + "eval_rewards/rejected": -1.818169116973877, + "eval_runtime": 232.1562, + "eval_samples_per_second": 8.615, + "eval_steps_per_second": 1.077, + "step": 3700 + }, + { + "epoch": 0.9709500130855797, + "grad_norm": 33.79894256591797, + "learning_rate": 1.2849067234584623e-08, + "logits/chosen": 0.24655885994434357, + "logits/rejected": 0.3253239095211029, + "logps/chosen": -476.681396484375, + "logps/rejected": -499.00494384765625, + "loss": 0.6091, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.3145527839660645, + "rewards/margins": 0.4425618648529053, + "rewards/rejected": -1.7571147680282593, + "step": 3710 + }, + { + "epoch": 0.9735671290238157, + "grad_norm": 20.961793899536133, + "learning_rate": 1.0639775577218625e-08, + "logits/chosen": 0.16103433072566986, + "logits/rejected": 0.22914746403694153, + "logps/chosen": -490.27777099609375, + "logps/rejected": -488.0320739746094, + "loss": 0.5535, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.3744248151779175, + "rewards/margins": 0.5473370552062988, + "rewards/rejected": -1.9217618703842163, + "step": 3720 + }, + { + "epoch": 0.9761842449620518, + "grad_norm": 18.385028839111328, + "learning_rate": 8.638344782207486e-09, + "logits/chosen": 0.1808285266160965, + "logits/rejected": 0.2911062240600586, + "logps/chosen": -493.48321533203125, + "logps/rejected": -501.9202575683594, + "loss": 0.56, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.277074933052063, + "rewards/margins": 0.5030657649040222, + "rewards/rejected": -1.7801406383514404, + "step": 3730 + }, + { + "epoch": 0.9788013609002879, + "grad_norm": 19.80834197998047, + "learning_rate": 6.84494196844715e-09, + "logits/chosen": 0.11059341579675674, + "logits/rejected": 0.3094932436943054, + "logps/chosen": -525.2141723632812, + "logps/rejected": -548.9354248046875, + "loss": 0.5209, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.2858692407608032, + "rewards/margins": 0.6285649538040161, + "rewards/rejected": -1.9144340753555298, + "step": 3740 + }, + { + "epoch": 0.9814184768385239, + "grad_norm": 17.796337127685547, + "learning_rate": 5.259716884556121e-09, + "logits/chosen": 0.08546547591686249, + "logits/rejected": 0.2145168036222458, + "logps/chosen": -523.2117919921875, + "logps/rejected": -539.4160766601562, + "loss": 0.5475, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.280202865600586, + "rewards/margins": 0.5271276831626892, + "rewards/rejected": -1.8073304891586304, + "step": 3750 + }, + { + "epoch": 0.98403559277676, + "grad_norm": 19.8670654296875, + "learning_rate": 3.882801896372967e-09, + "logits/chosen": 0.17666058242321014, + "logits/rejected": 0.23271194100379944, + "logps/chosen": -512.79736328125, + "logps/rejected": -507.11505126953125, + "loss": 0.6112, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.3305553197860718, + "rewards/margins": 0.4106437563896179, + "rewards/rejected": -1.7411991357803345, + "step": 3760 + }, + { + "epoch": 0.9866527087149961, + "grad_norm": 21.29877281188965, + "learning_rate": 2.7143119759026614e-09, + "logits/chosen": 0.22256436944007874, + "logits/rejected": 0.329804003238678, + "logps/chosen": -550.2042236328125, + "logps/rejected": -548.9179077148438, + "loss": 0.5132, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.2466299533843994, + "rewards/margins": 0.5815029144287109, + "rewards/rejected": -1.8281329870224, + "step": 3770 + }, + { + "epoch": 0.9892698246532321, + "grad_norm": 21.455854415893555, + "learning_rate": 1.754344691717591e-09, + "logits/chosen": 0.06916506588459015, + "logits/rejected": 0.15464463829994202, + "logps/chosen": -504.8829040527344, + "logps/rejected": -547.6551513671875, + "loss": 0.6246, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.3374298810958862, + "rewards/margins": 0.33050769567489624, + "rewards/rejected": -1.6679375171661377, + "step": 3780 + }, + { + "epoch": 0.9918869405914682, + "grad_norm": 24.640779495239258, + "learning_rate": 1.0029802008096335e-09, + "logits/chosen": 0.07244547456502914, + "logits/rejected": 0.21235093474388123, + "logps/chosen": -528.9513549804688, + "logps/rejected": -539.2667236328125, + "loss": 0.5648, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.3598108291625977, + "rewards/margins": 0.5505877733230591, + "rewards/rejected": -1.9103988409042358, + "step": 3790 + }, + { + "epoch": 0.9945040565297043, + "grad_norm": 23.488046646118164, + "learning_rate": 4.602812418974534e-10, + "logits/chosen": 0.006186048500239849, + "logits/rejected": 0.1412689983844757, + "logps/chosen": -535.95263671875, + "logps/rejected": -533.4951782226562, + "loss": 0.5674, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.274775505065918, + "rewards/margins": 0.5218435525894165, + "rewards/rejected": -1.7966190576553345, + "step": 3800 + }, + { + "epoch": 0.9945040565297043, + "eval_logits/chosen": 0.052397292107343674, + "eval_logits/rejected": 0.16461612284183502, + "eval_logps/chosen": -525.971923828125, + "eval_logps/rejected": -526.6827392578125, + "eval_loss": 0.5901351571083069, + "eval_rewards/accuracies": 0.684499979019165, + "eval_rewards/chosen": -1.3760651350021362, + "eval_rewards/margins": 0.44392091035842896, + "eval_rewards/rejected": -1.81998610496521, + "eval_runtime": 232.5667, + "eval_samples_per_second": 8.6, + "eval_steps_per_second": 1.075, + "step": 3800 + }, + { + "epoch": 0.9971211724679403, + "grad_norm": 21.794204711914062, + "learning_rate": 1.2629313018819312e-10, + "logits/chosen": 0.02779226377606392, + "logits/rejected": 0.21272841095924377, + "logps/chosen": -511.3211364746094, + "logps/rejected": -515.0205688476562, + "loss": 0.5489, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2764627933502197, + "rewards/margins": 0.537833571434021, + "rewards/rejected": -1.8142963647842407, + "step": 3810 + }, + { + "epoch": 0.9997382884061764, + "grad_norm": 21.642515182495117, + "learning_rate": 1.0437535929996855e-12, + "logits/chosen": 0.1357661634683609, + "logits/rejected": 0.14389568567276, + "logps/chosen": -550.05810546875, + "logps/rejected": -537.67041015625, + "loss": 0.543, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.337266206741333, + "rewards/margins": 0.5570909380912781, + "rewards/rejected": -1.8943573236465454, + "step": 3820 + }, + { + "epoch": 1.0, + "step": 3821, + "total_flos": 0.0, + "train_loss": 0.6070349644347967, + "train_runtime": 21926.9285, + "train_samples_per_second": 2.788, + "train_steps_per_second": 0.174 + } + ], + "logging_steps": 10, + "max_steps": 3821, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}