{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9980364656381484, "eval_steps": 100, "global_step": 2004, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.04488078541374474, "grad_norm": 4.790558815002441, "learning_rate": 9.850299401197606e-05, "logits/chosen": -3.3742988109588623, "logits/rejected": -3.0817112922668457, "logps/chosen": -273.48614501953125, "logps/rejected": -234.3329315185547, "loss": 0.6831, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": 0.0712718814611435, "rewards/margins": 0.024287192150950432, "rewards/rejected": 0.04698468744754791, "step": 30 }, { "epoch": 0.08976157082748948, "grad_norm": 5.551278591156006, "learning_rate": 9.700598802395209e-05, "logits/chosen": -3.378220558166504, "logits/rejected": -3.129826307296753, "logps/chosen": -267.0759582519531, "logps/rejected": -238.60873413085938, "loss": 0.6691, "rewards/accuracies": 0.590624988079071, "rewards/chosen": 0.21243497729301453, "rewards/margins": 0.07040555775165558, "rewards/rejected": 0.14202943444252014, "step": 60 }, { "epoch": 0.13464235624123422, "grad_norm": 7.342463493347168, "learning_rate": 9.550898203592816e-05, "logits/chosen": -3.3940162658691406, "logits/rejected": -3.142778158187866, "logps/chosen": -267.77581787109375, "logps/rejected": -233.1001434326172, "loss": 0.6586, "rewards/accuracies": 0.6145833134651184, "rewards/chosen": 0.2707298994064331, "rewards/margins": 0.12286876887083054, "rewards/rejected": 0.14786113798618317, "step": 90 }, { "epoch": 0.1496026180458158, "eval_logits/chosen": -3.403446674346924, "eval_logits/rejected": -3.1215860843658447, "eval_logps/chosen": -268.9533386230469, "eval_logps/rejected": -229.84756469726562, "eval_loss": 0.6490052342414856, "eval_rewards/accuracies": 0.6090127229690552, "eval_rewards/chosen": 0.2928723990917206, "eval_rewards/margins": 0.17060735821723938, "eval_rewards/rejected": 0.12226507067680359, "eval_runtime": 1689.4226, "eval_samples_per_second": 3.166, "eval_steps_per_second": 3.166, "step": 100 }, { "epoch": 0.17952314165497896, "grad_norm": 7.352132797241211, "learning_rate": 9.40119760479042e-05, "logits/chosen": -3.416881561279297, "logits/rejected": -3.130246877670288, "logps/chosen": -271.2243347167969, "logps/rejected": -233.06614685058594, "loss": 0.6385, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 0.295549601316452, "rewards/margins": 0.20651350915431976, "rewards/rejected": 0.08903612196445465, "step": 120 }, { "epoch": 0.2244039270687237, "grad_norm": 7.124303340911865, "learning_rate": 9.251497005988024e-05, "logits/chosen": -3.3911712169647217, "logits/rejected": -3.1303460597991943, "logps/chosen": -265.11651611328125, "logps/rejected": -234.63478088378906, "loss": 0.6436, "rewards/accuracies": 0.6322916746139526, "rewards/chosen": 0.23116879165172577, "rewards/margins": 0.23007448017597198, "rewards/rejected": 0.0010943154338747263, "step": 150 }, { "epoch": 0.26928471248246844, "grad_norm": 6.228757381439209, "learning_rate": 9.101796407185628e-05, "logits/chosen": -3.4022183418273926, "logits/rejected": -3.151683807373047, "logps/chosen": -267.1882019042969, "logps/rejected": -232.48471069335938, "loss": 0.6485, "rewards/accuracies": 0.6322916746139526, "rewards/chosen": 0.3013507127761841, "rewards/margins": 0.20289267599582672, "rewards/rejected": 0.09845803678035736, "step": 180 }, { "epoch": 0.2992052360916316, "eval_logits/chosen": -3.4198923110961914, "eval_logits/rejected": -3.143324613571167, "eval_logps/chosen": -268.99163818359375, "eval_logps/rejected": -230.40538024902344, "eval_loss": 0.6396481394767761, "eval_rewards/accuracies": 0.6142483353614807, "eval_rewards/chosen": 0.2890413999557495, "eval_rewards/margins": 0.22255805134773254, "eval_rewards/rejected": 0.06648338586091995, "eval_runtime": 1688.7699, "eval_samples_per_second": 3.167, "eval_steps_per_second": 3.167, "step": 200 }, { "epoch": 0.3141654978962132, "grad_norm": 6.4461493492126465, "learning_rate": 8.952095808383235e-05, "logits/chosen": -3.434596300125122, "logits/rejected": -3.132395029067993, "logps/chosen": -267.54437255859375, "logps/rejected": -224.43540954589844, "loss": 0.638, "rewards/accuracies": 0.6197916865348816, "rewards/chosen": 0.2697771489620209, "rewards/margins": 0.2309388816356659, "rewards/rejected": 0.03883826732635498, "step": 210 }, { "epoch": 0.3590462833099579, "grad_norm": 4.610179424285889, "learning_rate": 8.80239520958084e-05, "logits/chosen": -3.428438901901245, "logits/rejected": -3.169167995452881, "logps/chosen": -265.898193359375, "logps/rejected": -230.3724822998047, "loss": 0.6406, "rewards/accuracies": 0.6166666746139526, "rewards/chosen": 0.28266531229019165, "rewards/margins": 0.23549547791481018, "rewards/rejected": 0.04716984182596207, "step": 240 }, { "epoch": 0.40392706872370265, "grad_norm": 5.838982582092285, "learning_rate": 8.652694610778443e-05, "logits/chosen": -3.4263041019439697, "logits/rejected": -3.1684255599975586, "logps/chosen": -267.129150390625, "logps/rejected": -233.6484832763672, "loss": 0.6251, "rewards/accuracies": 0.6302083134651184, "rewards/chosen": 0.1708066761493683, "rewards/margins": 0.28462281823158264, "rewards/rejected": -0.11381613463163376, "step": 270 }, { "epoch": 0.4488078541374474, "grad_norm": 5.045168876647949, "learning_rate": 8.502994011976048e-05, "logits/chosen": -3.439959764480591, "logits/rejected": -3.173079490661621, "logps/chosen": -273.4431457519531, "logps/rejected": -236.24371337890625, "loss": 0.6327, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 0.17748431861400604, "rewards/margins": 0.269964337348938, "rewards/rejected": -0.09247999638319016, "step": 300 }, { "epoch": 0.4488078541374474, "eval_logits/chosen": -3.424139976501465, "eval_logits/rejected": -3.150641918182373, "eval_logps/chosen": -269.5113525390625, "eval_logps/rejected": -231.47146606445312, "eval_loss": 0.6353974938392639, "eval_rewards/accuracies": 0.6181750297546387, "eval_rewards/chosen": 0.23706810176372528, "eval_rewards/margins": 0.2771916091442108, "eval_rewards/rejected": -0.04012349247932434, "eval_runtime": 1688.647, "eval_samples_per_second": 3.167, "eval_steps_per_second": 3.167, "step": 300 }, { "epoch": 0.49368863955119213, "grad_norm": 5.068808555603027, "learning_rate": 8.353293413173653e-05, "logits/chosen": -3.4144017696380615, "logits/rejected": -3.1683106422424316, "logps/chosen": -272.5228271484375, "logps/rejected": -239.20681762695312, "loss": 0.646, "rewards/accuracies": 0.6208333373069763, "rewards/chosen": 0.22192205488681793, "rewards/margins": 0.24012483656406403, "rewards/rejected": -0.018202781677246094, "step": 330 }, { "epoch": 0.5385694249649369, "grad_norm": 6.0258941650390625, "learning_rate": 8.203592814371259e-05, "logits/chosen": -3.4079012870788574, "logits/rejected": -3.1440396308898926, "logps/chosen": -276.3011474609375, "logps/rejected": -235.62054443359375, "loss": 0.6228, "rewards/accuracies": 0.6270833611488342, "rewards/chosen": 0.09719991683959961, "rewards/margins": 0.2948659658432007, "rewards/rejected": -0.19766603410243988, "step": 360 }, { "epoch": 0.5834502103786816, "grad_norm": 5.713747024536133, "learning_rate": 8.053892215568862e-05, "logits/chosen": -3.3723533153533936, "logits/rejected": -3.1148476600646973, "logps/chosen": -274.2776794433594, "logps/rejected": -234.34136962890625, "loss": 0.6342, "rewards/accuracies": 0.6270833611488342, "rewards/chosen": 0.17114956676959991, "rewards/margins": 0.29201894998550415, "rewards/rejected": -0.12086938321590424, "step": 390 }, { "epoch": 0.5984104721832632, "eval_logits/chosen": -3.390080451965332, "eval_logits/rejected": -3.111392021179199, "eval_logps/chosen": -268.8110656738281, "eval_logps/rejected": -230.63925170898438, "eval_loss": 0.6309967041015625, "eval_rewards/accuracies": 0.630142092704773, "eval_rewards/chosen": 0.30709749460220337, "eval_rewards/margins": 0.2640005946159363, "eval_rewards/rejected": 0.04309689626097679, "eval_runtime": 1688.4508, "eval_samples_per_second": 3.167, "eval_steps_per_second": 3.167, "step": 400 }, { "epoch": 0.6283309957924264, "grad_norm": 5.401741027832031, "learning_rate": 7.904191616766467e-05, "logits/chosen": -3.389784574508667, "logits/rejected": -3.113330602645874, "logps/chosen": -270.7179260253906, "logps/rejected": -234.7329864501953, "loss": 0.6352, "rewards/accuracies": 0.6260416507720947, "rewards/chosen": 0.2668881416320801, "rewards/margins": 0.25750893354415894, "rewards/rejected": 0.009379198774695396, "step": 420 }, { "epoch": 0.6732117812061711, "grad_norm": 6.569407939910889, "learning_rate": 7.754491017964072e-05, "logits/chosen": -3.420933246612549, "logits/rejected": -3.107722520828247, "logps/chosen": -279.6606750488281, "logps/rejected": -232.16326904296875, "loss": 0.6152, "rewards/accuracies": 0.6395833492279053, "rewards/chosen": 0.23483921587467194, "rewards/margins": 0.30834510922431946, "rewards/rejected": -0.07350588589906693, "step": 450 }, { "epoch": 0.7180925666199158, "grad_norm": 4.889843463897705, "learning_rate": 7.604790419161677e-05, "logits/chosen": -3.4380805492401123, "logits/rejected": -3.1253116130828857, "logps/chosen": -279.8207092285156, "logps/rejected": -233.9474639892578, "loss": 0.612, "rewards/accuracies": 0.6697916388511658, "rewards/chosen": 0.06458248198032379, "rewards/margins": 0.378538578748703, "rewards/rejected": -0.3139561414718628, "step": 480 }, { "epoch": 0.748013090229079, "eval_logits/chosen": -3.419874429702759, "eval_logits/rejected": -3.1424779891967773, "eval_logps/chosen": -270.2199401855469, "eval_logps/rejected": -232.4933319091797, "eval_loss": 0.6269693374633789, "eval_rewards/accuracies": 0.627711296081543, "eval_rewards/chosen": 0.16621026396751404, "eval_rewards/margins": 0.3085208237171173, "eval_rewards/rejected": -0.14231054484844208, "eval_runtime": 1688.3228, "eval_samples_per_second": 3.168, "eval_steps_per_second": 3.168, "step": 500 }, { "epoch": 0.7629733520336606, "grad_norm": 4.36655330657959, "learning_rate": 7.455089820359282e-05, "logits/chosen": -3.4343178272247314, "logits/rejected": -3.1612308025360107, "logps/chosen": -272.99578857421875, "logps/rejected": -234.4145965576172, "loss": 0.629, "rewards/accuracies": 0.6427083611488342, "rewards/chosen": 0.13686993718147278, "rewards/margins": 0.3045249283313751, "rewards/rejected": -0.16765499114990234, "step": 510 }, { "epoch": 0.8078541374474053, "grad_norm": 4.971264839172363, "learning_rate": 7.305389221556886e-05, "logits/chosen": -3.4246087074279785, "logits/rejected": -3.172884464263916, "logps/chosen": -267.14556884765625, "logps/rejected": -233.85691833496094, "loss": 0.6269, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": 0.22364649176597595, "rewards/margins": 0.28497254848480225, "rewards/rejected": -0.06132606416940689, "step": 540 }, { "epoch": 0.85273492286115, "grad_norm": 5.077197074890137, "learning_rate": 7.155688622754491e-05, "logits/chosen": -3.4349772930145264, "logits/rejected": -3.1722497940063477, "logps/chosen": -268.02630615234375, "logps/rejected": -231.99020385742188, "loss": 0.63, "rewards/accuracies": 0.621874988079071, "rewards/chosen": 0.2208840399980545, "rewards/margins": 0.2860158383846283, "rewards/rejected": -0.06513180583715439, "step": 570 }, { "epoch": 0.8976157082748948, "grad_norm": 4.760651111602783, "learning_rate": 7.005988023952096e-05, "logits/chosen": -3.4018094539642334, "logits/rejected": -3.1606853008270264, "logps/chosen": -268.86090087890625, "logps/rejected": -233.84007263183594, "loss": 0.6432, "rewards/accuracies": 0.6208333373069763, "rewards/chosen": 0.25363439321517944, "rewards/margins": 0.2553554177284241, "rewards/rejected": -0.0017210314981639385, "step": 600 }, { "epoch": 0.8976157082748948, "eval_logits/chosen": -3.4228434562683105, "eval_logits/rejected": -3.145069122314453, "eval_logps/chosen": -269.40740966796875, "eval_logps/rejected": -231.58685302734375, "eval_loss": 0.6246524453163147, "eval_rewards/accuracies": 0.6312640309333801, "eval_rewards/chosen": 0.24746553599834442, "eval_rewards/margins": 0.2991257905960083, "eval_rewards/rejected": -0.051660239696502686, "eval_runtime": 1688.5162, "eval_samples_per_second": 3.167, "eval_steps_per_second": 3.167, "step": 600 }, { "epoch": 0.9424964936886395, "grad_norm": 5.144285678863525, "learning_rate": 6.856287425149701e-05, "logits/chosen": -3.4329488277435303, "logits/rejected": -3.1452276706695557, "logps/chosen": -269.5411376953125, "logps/rejected": -228.07046508789062, "loss": 0.6185, "rewards/accuracies": 0.6260416507720947, "rewards/chosen": 0.26634886860847473, "rewards/margins": 0.3162167966365814, "rewards/rejected": -0.049867913126945496, "step": 630 }, { "epoch": 0.9873772791023843, "grad_norm": 4.551113128662109, "learning_rate": 6.706586826347305e-05, "logits/chosen": -3.435673713684082, "logits/rejected": -3.1743414402008057, "logps/chosen": -273.6510314941406, "logps/rejected": -241.5527801513672, "loss": 0.6236, "rewards/accuracies": 0.6364583373069763, "rewards/chosen": 0.11331641674041748, "rewards/margins": 0.32483571767807007, "rewards/rejected": -0.2115192860364914, "step": 660 }, { "epoch": 1.032258064516129, "grad_norm": 4.672567844390869, "learning_rate": 6.55688622754491e-05, "logits/chosen": -3.4276251792907715, "logits/rejected": -3.1490509510040283, "logps/chosen": -269.5851135253906, "logps/rejected": -237.02989196777344, "loss": 0.5554, "rewards/accuracies": 0.7302083373069763, "rewards/chosen": 0.08210794627666473, "rewards/margins": 0.49889788031578064, "rewards/rejected": -0.4167899191379547, "step": 690 }, { "epoch": 1.0472183263207107, "eval_logits/chosen": -3.4185428619384766, "eval_logits/rejected": -3.1414475440979004, "eval_logps/chosen": -270.3139343261719, "eval_logps/rejected": -232.80120849609375, "eval_loss": 0.6221644282341003, "eval_rewards/accuracies": 0.6338818073272705, "eval_rewards/chosen": 0.15681201219558716, "eval_rewards/margins": 0.3299100995063782, "eval_rewards/rejected": -0.17309808731079102, "eval_runtime": 1688.7954, "eval_samples_per_second": 3.167, "eval_steps_per_second": 3.167, "step": 700 }, { "epoch": 1.0771388499298737, "grad_norm": 4.2797369956970215, "learning_rate": 6.407185628742515e-05, "logits/chosen": -3.445012092590332, "logits/rejected": -3.1330997943878174, "logps/chosen": -266.95782470703125, "logps/rejected": -227.27879333496094, "loss": 0.5249, "rewards/accuracies": 0.7635416388511658, "rewards/chosen": 0.25232160091400146, "rewards/margins": 0.5462218523025513, "rewards/rejected": -0.2939002215862274, "step": 720 }, { "epoch": 1.1220196353436185, "grad_norm": 5.101881980895996, "learning_rate": 6.25748502994012e-05, "logits/chosen": -3.425431728363037, "logits/rejected": -3.1551194190979004, "logps/chosen": -271.9197082519531, "logps/rejected": -233.2086639404297, "loss": 0.5308, "rewards/accuracies": 0.7385416626930237, "rewards/chosen": 0.2957630157470703, "rewards/margins": 0.5949270129203796, "rewards/rejected": -0.29916396737098694, "step": 750 }, { "epoch": 1.1669004207573632, "grad_norm": 5.360141754150391, "learning_rate": 6.107784431137725e-05, "logits/chosen": -3.4009079933166504, "logits/rejected": -3.1200320720672607, "logps/chosen": -272.1022644042969, "logps/rejected": -236.18499755859375, "loss": 0.5226, "rewards/accuracies": 0.7520833611488342, "rewards/chosen": 0.30986490845680237, "rewards/margins": 0.59710294008255, "rewards/rejected": -0.2872380018234253, "step": 780 }, { "epoch": 1.1968209443665265, "eval_logits/chosen": -3.405834674835205, "eval_logits/rejected": -3.1333200931549072, "eval_logps/chosen": -272.1116638183594, "eval_logps/rejected": -235.3762664794922, "eval_loss": 0.6281805038452148, "eval_rewards/accuracies": 0.6335078477859497, "eval_rewards/chosen": -0.022958112880587578, "eval_rewards/margins": 0.40764307975769043, "eval_rewards/rejected": -0.43060120940208435, "eval_runtime": 1688.3487, "eval_samples_per_second": 3.168, "eval_steps_per_second": 3.168, "step": 800 }, { "epoch": 1.211781206171108, "grad_norm": 5.8285088539123535, "learning_rate": 5.95808383233533e-05, "logits/chosen": -3.3979651927948, "logits/rejected": -3.1520321369171143, "logps/chosen": -274.0641174316406, "logps/rejected": -240.42205810546875, "loss": 0.5402, "rewards/accuracies": 0.7354166507720947, "rewards/chosen": 0.1310122311115265, "rewards/margins": 0.5752567052841187, "rewards/rejected": -0.44424447417259216, "step": 810 }, { "epoch": 1.2566619915848527, "grad_norm": 5.5216851234436035, "learning_rate": 5.808383233532935e-05, "logits/chosen": -3.4025676250457764, "logits/rejected": -3.1448330879211426, "logps/chosen": -274.1934509277344, "logps/rejected": -243.01490783691406, "loss": 0.5201, "rewards/accuracies": 0.7552083134651184, "rewards/chosen": 0.14752289652824402, "rewards/margins": 0.626766562461853, "rewards/rejected": -0.479243665933609, "step": 840 }, { "epoch": 1.3015427769985974, "grad_norm": 5.673742294311523, "learning_rate": 5.6586826347305385e-05, "logits/chosen": -3.3895277976989746, "logits/rejected": -3.1401288509368896, "logps/chosen": -273.1130676269531, "logps/rejected": -241.987060546875, "loss": 0.5497, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.02567141316831112, "rewards/margins": 0.5613437294960022, "rewards/rejected": -0.5356722474098206, "step": 870 }, { "epoch": 1.3464235624123422, "grad_norm": 6.6557440757751465, "learning_rate": 5.508982035928144e-05, "logits/chosen": -3.3836898803710938, "logits/rejected": -3.1433603763580322, "logps/chosen": -266.1312561035156, "logps/rejected": -238.70309448242188, "loss": 0.5474, "rewards/accuracies": 0.7322916388511658, "rewards/chosen": 0.016369260847568512, "rewards/margins": 0.563011109828949, "rewards/rejected": -0.5466418862342834, "step": 900 }, { "epoch": 1.3464235624123422, "eval_logits/chosen": -3.3754773139953613, "eval_logits/rejected": -3.106959819793701, "eval_logps/chosen": -271.97869873046875, "eval_logps/rejected": -234.78985595703125, "eval_loss": 0.629611074924469, "eval_rewards/accuracies": 0.6299551129341125, "eval_rewards/chosen": -0.009662697091698647, "eval_rewards/margins": 0.3622985780239105, "eval_rewards/rejected": -0.3719612658023834, "eval_runtime": 1688.3293, "eval_samples_per_second": 3.168, "eval_steps_per_second": 3.168, "step": 900 }, { "epoch": 1.391304347826087, "grad_norm": 5.2359724044799805, "learning_rate": 5.359281437125748e-05, "logits/chosen": -3.3651976585388184, "logits/rejected": -3.123444080352783, "logps/chosen": -271.6989440917969, "logps/rejected": -236.84417724609375, "loss": 0.54, "rewards/accuracies": 0.7416666746139526, "rewards/chosen": 0.1108192428946495, "rewards/margins": 0.5575817823410034, "rewards/rejected": -0.4467625319957733, "step": 930 }, { "epoch": 1.4361851332398317, "grad_norm": 5.669713497161865, "learning_rate": 5.209580838323354e-05, "logits/chosen": -3.3611793518066406, "logits/rejected": -3.099807024002075, "logps/chosen": -274.7862243652344, "logps/rejected": -237.52139282226562, "loss": 0.5405, "rewards/accuracies": 0.7260416746139526, "rewards/chosen": 0.027847904711961746, "rewards/margins": 0.5692722797393799, "rewards/rejected": -0.5414243936538696, "step": 960 }, { "epoch": 1.4810659186535764, "grad_norm": 6.500946044921875, "learning_rate": 5.059880239520959e-05, "logits/chosen": -3.3827614784240723, "logits/rejected": -3.09436297416687, "logps/chosen": -276.1948547363281, "logps/rejected": -238.23243713378906, "loss": 0.5235, "rewards/accuracies": 0.7593749761581421, "rewards/chosen": -0.051430441439151764, "rewards/margins": 0.6151652932167053, "rewards/rejected": -0.6665957570075989, "step": 990 }, { "epoch": 1.496026180458158, "eval_logits/chosen": -3.366751194000244, "eval_logits/rejected": -3.1013710498809814, "eval_logps/chosen": -272.0386047363281, "eval_logps/rejected": -234.93768310546875, "eval_loss": 0.628265380859375, "eval_rewards/accuracies": 0.6325729489326477, "eval_rewards/chosen": -0.01565566658973694, "eval_rewards/margins": 0.37109050154685974, "eval_rewards/rejected": -0.38674619793891907, "eval_runtime": 1688.5815, "eval_samples_per_second": 3.167, "eval_steps_per_second": 3.167, "step": 1000 }, { "epoch": 1.5259467040673211, "grad_norm": 5.458423614501953, "learning_rate": 4.910179640718563e-05, "logits/chosen": -3.3471567630767822, "logits/rejected": -3.1277146339416504, "logps/chosen": -269.085205078125, "logps/rejected": -243.0161895751953, "loss": 0.5338, "rewards/accuracies": 0.7489583492279053, "rewards/chosen": 0.08274559676647186, "rewards/margins": 0.5906849503517151, "rewards/rejected": -0.5079393982887268, "step": 1020 }, { "epoch": 1.5708274894810659, "grad_norm": 7.57076358795166, "learning_rate": 4.7604790419161675e-05, "logits/chosen": -3.3720383644104004, "logits/rejected": -3.0789096355438232, "logps/chosen": -266.96270751953125, "logps/rejected": -228.08006286621094, "loss": 0.5331, "rewards/accuracies": 0.731249988079071, "rewards/chosen": 0.014148912392556667, "rewards/margins": 0.5797646045684814, "rewards/rejected": -0.5656156539916992, "step": 1050 }, { "epoch": 1.6157082748948106, "grad_norm": 5.330569744110107, "learning_rate": 4.610778443113773e-05, "logits/chosen": -3.359792947769165, "logits/rejected": -3.09041166305542, "logps/chosen": -276.38226318359375, "logps/rejected": -243.1844024658203, "loss": 0.5232, "rewards/accuracies": 0.7395833134651184, "rewards/chosen": 0.06125294789671898, "rewards/margins": 0.6237131953239441, "rewards/rejected": -0.56246018409729, "step": 1080 }, { "epoch": 1.645628798503974, "eval_logits/chosen": -3.3514223098754883, "eval_logits/rejected": -3.0870354175567627, "eval_logps/chosen": -271.9643859863281, "eval_logps/rejected": -234.57383728027344, "eval_loss": 0.6333222389221191, "eval_rewards/accuracies": 0.620792806148529, "eval_rewards/chosen": -0.008234047330915928, "eval_rewards/margins": 0.34212782979011536, "eval_rewards/rejected": -0.3503618538379669, "eval_runtime": 1688.4687, "eval_samples_per_second": 3.167, "eval_steps_per_second": 3.167, "step": 1100 }, { "epoch": 1.6605890603085554, "grad_norm": 6.506576061248779, "learning_rate": 4.4610778443113777e-05, "logits/chosen": -3.3484690189361572, "logits/rejected": -3.130286455154419, "logps/chosen": -273.8847961425781, "logps/rejected": -247.9475860595703, "loss": 0.5384, "rewards/accuracies": 0.7322916388511658, "rewards/chosen": 0.10436714440584183, "rewards/margins": 0.5445392727851868, "rewards/rejected": -0.44017213582992554, "step": 1110 }, { "epoch": 1.7054698457223, "grad_norm": 6.028050422668457, "learning_rate": 4.311377245508982e-05, "logits/chosen": -3.3555943965911865, "logits/rejected": -3.125157356262207, "logps/chosen": -267.73516845703125, "logps/rejected": -236.43356323242188, "loss": 0.5549, "rewards/accuracies": 0.7354166507720947, "rewards/chosen": -0.052730146795511246, "rewards/margins": 0.5250240564346313, "rewards/rejected": -0.5777541995048523, "step": 1140 }, { "epoch": 1.7503506311360448, "grad_norm": 6.8205647468566895, "learning_rate": 4.161676646706587e-05, "logits/chosen": -3.383364677429199, "logits/rejected": -3.1156816482543945, "logps/chosen": -273.9105529785156, "logps/rejected": -237.84432983398438, "loss": 0.523, "rewards/accuracies": 0.746874988079071, "rewards/chosen": -0.06969426572322845, "rewards/margins": 0.6354466676712036, "rewards/rejected": -0.705141007900238, "step": 1170 }, { "epoch": 1.7952314165497896, "grad_norm": 6.074138641357422, "learning_rate": 4.0119760479041915e-05, "logits/chosen": -3.391815185546875, "logits/rejected": -3.1276473999023438, "logps/chosen": -279.9575500488281, "logps/rejected": -244.89010620117188, "loss": 0.5156, "rewards/accuracies": 0.7479166388511658, "rewards/chosen": -0.07707042992115021, "rewards/margins": 0.6421669125556946, "rewards/rejected": -0.7192373871803284, "step": 1200 }, { "epoch": 1.7952314165497896, "eval_logits/chosen": -3.379970073699951, "eval_logits/rejected": -3.1176722049713135, "eval_logps/chosen": -274.0703430175781, "eval_logps/rejected": -237.28604125976562, "eval_loss": 0.6306100487709045, "eval_rewards/accuracies": 0.6350037455558777, "eval_rewards/chosen": -0.21883098781108856, "eval_rewards/margins": 0.4027484953403473, "eval_rewards/rejected": -0.6215794086456299, "eval_runtime": 1688.6398, "eval_samples_per_second": 3.167, "eval_steps_per_second": 3.167, "step": 1200 }, { "epoch": 1.8401122019635343, "grad_norm": 5.888485431671143, "learning_rate": 3.8622754491017966e-05, "logits/chosen": -3.3851656913757324, "logits/rejected": -3.1222336292266846, "logps/chosen": -272.52117919921875, "logps/rejected": -237.61158752441406, "loss": 0.5372, "rewards/accuracies": 0.7416666746139526, "rewards/chosen": -0.14584079384803772, "rewards/margins": 0.6132307052612305, "rewards/rejected": -0.7590714693069458, "step": 1230 }, { "epoch": 1.884992987377279, "grad_norm": 6.371288299560547, "learning_rate": 3.712574850299401e-05, "logits/chosen": -3.379087209701538, "logits/rejected": -3.1119582653045654, "logps/chosen": -273.9693603515625, "logps/rejected": -238.9442901611328, "loss": 0.5142, "rewards/accuracies": 0.7614583373069763, "rewards/chosen": -0.13823945820331573, "rewards/margins": 0.6301066279411316, "rewards/rejected": -0.7683460116386414, "step": 1260 }, { "epoch": 1.9298737727910238, "grad_norm": 6.35048246383667, "learning_rate": 3.562874251497006e-05, "logits/chosen": -3.3981616497039795, "logits/rejected": -3.162132740020752, "logps/chosen": -268.2223205566406, "logps/rejected": -237.77635192871094, "loss": 0.5352, "rewards/accuracies": 0.7364583611488342, "rewards/chosen": -0.19809262454509735, "rewards/margins": 0.6094833016395569, "rewards/rejected": -0.8075758814811707, "step": 1290 }, { "epoch": 1.9448340345956054, "eval_logits/chosen": -3.3784019947052, "eval_logits/rejected": -3.116711378097534, "eval_logps/chosen": -274.5598449707031, "eval_logps/rejected": -237.6013946533203, "eval_loss": 0.6299869418144226, "eval_rewards/accuracies": 0.6327599287033081, "eval_rewards/chosen": -0.2677817940711975, "eval_rewards/margins": 0.38533419370651245, "eval_rewards/rejected": -0.6531160473823547, "eval_runtime": 1688.4156, "eval_samples_per_second": 3.167, "eval_steps_per_second": 3.167, "step": 1300 }, { "epoch": 1.9747545582047685, "grad_norm": 6.9590959548950195, "learning_rate": 3.413173652694611e-05, "logits/chosen": -3.376148223876953, "logits/rejected": -3.1122324466705322, "logps/chosen": -282.5127258300781, "logps/rejected": -247.69212341308594, "loss": 0.5232, "rewards/accuracies": 0.7333333492279053, "rewards/chosen": -0.10552702099084854, "rewards/margins": 0.6293079257011414, "rewards/rejected": -0.7348350286483765, "step": 1320 }, { "epoch": 2.0196353436185133, "grad_norm": 5.992002010345459, "learning_rate": 3.263473053892216e-05, "logits/chosen": -3.397113800048828, "logits/rejected": -3.136545419692993, "logps/chosen": -278.75390625, "logps/rejected": -246.4496307373047, "loss": 0.5015, "rewards/accuracies": 0.765625, "rewards/chosen": -0.11011376976966858, "rewards/margins": 0.6908566355705261, "rewards/rejected": -0.8009704351425171, "step": 1350 }, { "epoch": 2.064516129032258, "grad_norm": 7.003468990325928, "learning_rate": 3.1137724550898205e-05, "logits/chosen": -3.370246410369873, "logits/rejected": -3.093019723892212, "logps/chosen": -279.3021240234375, "logps/rejected": -242.58258056640625, "loss": 0.446, "rewards/accuracies": 0.8114583492279053, "rewards/chosen": -0.05044478550553322, "rewards/margins": 0.8847902417182922, "rewards/rejected": -0.9352350234985352, "step": 1380 }, { "epoch": 2.0944366526414213, "eval_logits/chosen": -3.3703560829162598, "eval_logits/rejected": -3.111078977584839, "eval_logps/chosen": -274.6524353027344, "eval_logps/rejected": -237.6984405517578, "eval_loss": 0.6312919855117798, "eval_rewards/accuracies": 0.6325729489326477, "eval_rewards/chosen": -0.27703869342803955, "eval_rewards/margins": 0.38578376173973083, "eval_rewards/rejected": -0.6628224849700928, "eval_runtime": 1685.7321, "eval_samples_per_second": 3.173, "eval_steps_per_second": 3.173, "step": 1400 }, { "epoch": 2.1093969144460027, "grad_norm": 5.607975959777832, "learning_rate": 2.9640718562874252e-05, "logits/chosen": -3.365170955657959, "logits/rejected": -3.1272387504577637, "logps/chosen": -271.7861022949219, "logps/rejected": -242.59573364257812, "loss": 0.4698, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -0.10354464501142502, "rewards/margins": 0.7553961277008057, "rewards/rejected": -0.8589407801628113, "step": 1410 }, { "epoch": 2.1542776998597475, "grad_norm": 5.758065223693848, "learning_rate": 2.81437125748503e-05, "logits/chosen": -3.362076997756958, "logits/rejected": -3.1057257652282715, "logps/chosen": -267.41705322265625, "logps/rejected": -235.0531463623047, "loss": 0.4586, "rewards/accuracies": 0.8166666626930237, "rewards/chosen": -0.11131696403026581, "rewards/margins": 0.785390317440033, "rewards/rejected": -0.89670729637146, "step": 1440 }, { "epoch": 2.1991584852734922, "grad_norm": 6.8294501304626465, "learning_rate": 2.6646706586826347e-05, "logits/chosen": -3.368708610534668, "logits/rejected": -3.0964319705963135, "logps/chosen": -269.26409912109375, "logps/rejected": -237.1984405517578, "loss": 0.4496, "rewards/accuracies": 0.8145833611488342, "rewards/chosen": -0.10699882358312607, "rewards/margins": 0.8314520120620728, "rewards/rejected": -0.938450813293457, "step": 1470 }, { "epoch": 2.244039270687237, "grad_norm": 6.356990337371826, "learning_rate": 2.5149700598802394e-05, "logits/chosen": -3.374453067779541, "logits/rejected": -3.129500389099121, "logps/chosen": -271.7542724609375, "logps/rejected": -241.45423889160156, "loss": 0.4552, "rewards/accuracies": 0.8135416507720947, "rewards/chosen": -0.20287248492240906, "rewards/margins": 0.8104608058929443, "rewards/rejected": -1.0133334398269653, "step": 1500 }, { "epoch": 2.244039270687237, "eval_logits/chosen": -3.360283613204956, "eval_logits/rejected": -3.1080915927886963, "eval_logps/chosen": -276.3040466308594, "eval_logps/rejected": -239.7833251953125, "eval_loss": 0.6368128657341003, "eval_rewards/accuracies": 0.6351907253265381, "eval_rewards/chosen": -0.44220101833343506, "eval_rewards/margins": 0.4291093647480011, "eval_rewards/rejected": -0.871310293674469, "eval_runtime": 1686.3289, "eval_samples_per_second": 3.171, "eval_steps_per_second": 3.171, "step": 1500 }, { "epoch": 2.2889200561009817, "grad_norm": 6.016663551330566, "learning_rate": 2.3652694610778445e-05, "logits/chosen": -3.3569111824035645, "logits/rejected": -3.123525857925415, "logps/chosen": -274.6582946777344, "logps/rejected": -241.02252197265625, "loss": 0.4577, "rewards/accuracies": 0.7947916388511658, "rewards/chosen": -0.20317865908145905, "rewards/margins": 0.8162151575088501, "rewards/rejected": -1.0193939208984375, "step": 1530 }, { "epoch": 2.3338008415147264, "grad_norm": 5.684780120849609, "learning_rate": 2.2155688622754492e-05, "logits/chosen": -3.3533644676208496, "logits/rejected": -3.146190881729126, "logps/chosen": -271.4990234375, "logps/rejected": -242.20486450195312, "loss": 0.4674, "rewards/accuracies": 0.7989583611488342, "rewards/chosen": -0.12297000735998154, "rewards/margins": 0.8095114827156067, "rewards/rejected": -0.9324816465377808, "step": 1560 }, { "epoch": 2.378681626928471, "grad_norm": 7.419367790222168, "learning_rate": 2.065868263473054e-05, "logits/chosen": -3.364116907119751, "logits/rejected": -3.092254638671875, "logps/chosen": -270.5090026855469, "logps/rejected": -237.64964294433594, "loss": 0.4443, "rewards/accuracies": 0.8177083134651184, "rewards/chosen": -0.22570447623729706, "rewards/margins": 0.84433513879776, "rewards/rejected": -1.0700395107269287, "step": 1590 }, { "epoch": 2.393641888733053, "eval_logits/chosen": -3.354207992553711, "eval_logits/rejected": -3.103837013244629, "eval_logps/chosen": -276.166015625, "eval_logps/rejected": -239.59542846679688, "eval_loss": 0.6390828490257263, "eval_rewards/accuracies": 0.6344428062438965, "eval_rewards/chosen": -0.4283973276615143, "eval_rewards/margins": 0.42412257194519043, "eval_rewards/rejected": -0.8525198101997375, "eval_runtime": 1685.5337, "eval_samples_per_second": 3.173, "eval_steps_per_second": 3.173, "step": 1600 }, { "epoch": 2.423562412342216, "grad_norm": 7.774267196655273, "learning_rate": 1.916167664670659e-05, "logits/chosen": -3.355332851409912, "logits/rejected": -3.1059510707855225, "logps/chosen": -277.3658752441406, "logps/rejected": -247.025146484375, "loss": 0.4466, "rewards/accuracies": 0.8052083253860474, "rewards/chosen": -0.20841935276985168, "rewards/margins": 0.860171377658844, "rewards/rejected": -1.0685906410217285, "step": 1620 }, { "epoch": 2.4684431977559607, "grad_norm": 7.505038738250732, "learning_rate": 1.7664670658682637e-05, "logits/chosen": -3.345045804977417, "logits/rejected": -3.1149702072143555, "logps/chosen": -278.6654968261719, "logps/rejected": -250.63827514648438, "loss": 0.4452, "rewards/accuracies": 0.828125, "rewards/chosen": -0.24601925909519196, "rewards/margins": 0.8955973982810974, "rewards/rejected": -1.141616702079773, "step": 1650 }, { "epoch": 2.5133239831697054, "grad_norm": 7.055671215057373, "learning_rate": 1.6167664670658684e-05, "logits/chosen": -3.3556442260742188, "logits/rejected": -3.086568593978882, "logps/chosen": -275.1831359863281, "logps/rejected": -240.82598876953125, "loss": 0.4564, "rewards/accuracies": 0.8052083253860474, "rewards/chosen": -0.29922303557395935, "rewards/margins": 0.8765833377838135, "rewards/rejected": -1.1758064031600952, "step": 1680 }, { "epoch": 2.5432445067788687, "eval_logits/chosen": -3.349193811416626, "eval_logits/rejected": -3.098674774169922, "eval_logps/chosen": -277.01483154296875, "eval_logps/rejected": -240.51805114746094, "eval_loss": 0.642005980014801, "eval_rewards/accuracies": 0.6297681331634521, "eval_rewards/chosen": -0.5132736563682556, "eval_rewards/margins": 0.4315095543861389, "eval_rewards/rejected": -0.9447831511497498, "eval_runtime": 1686.0549, "eval_samples_per_second": 3.172, "eval_steps_per_second": 3.172, "step": 1700 }, { "epoch": 2.55820476858345, "grad_norm": 7.247310638427734, "learning_rate": 1.467065868263473e-05, "logits/chosen": -3.3303916454315186, "logits/rejected": -3.118966817855835, "logps/chosen": -276.04510498046875, "logps/rejected": -250.57984924316406, "loss": 0.4615, "rewards/accuracies": 0.8072916865348816, "rewards/chosen": -0.27220281958580017, "rewards/margins": 0.8313066363334656, "rewards/rejected": -1.103509545326233, "step": 1710 }, { "epoch": 2.603085553997195, "grad_norm": 6.719433784484863, "learning_rate": 1.317365269461078e-05, "logits/chosen": -3.3551132678985596, "logits/rejected": -3.1186678409576416, "logps/chosen": -277.4861755371094, "logps/rejected": -251.39437866210938, "loss": 0.455, "rewards/accuracies": 0.8031250238418579, "rewards/chosen": -0.23053057491779327, "rewards/margins": 0.8569380640983582, "rewards/rejected": -1.0874686241149902, "step": 1740 }, { "epoch": 2.6479663394109396, "grad_norm": 6.049899101257324, "learning_rate": 1.1676646706586828e-05, "logits/chosen": -3.3462002277374268, "logits/rejected": -3.0950281620025635, "logps/chosen": -279.08447265625, "logps/rejected": -243.8069305419922, "loss": 0.4414, "rewards/accuracies": 0.8072916865348816, "rewards/chosen": -0.24068358540534973, "rewards/margins": 0.8998420238494873, "rewards/rejected": -1.1405255794525146, "step": 1770 }, { "epoch": 2.6928471248246844, "grad_norm": 7.545809268951416, "learning_rate": 1.0179640718562875e-05, "logits/chosen": -3.346256732940674, "logits/rejected": -3.112372875213623, "logps/chosen": -270.18499755859375, "logps/rejected": -240.69723510742188, "loss": 0.4603, "rewards/accuracies": 0.8083333373069763, "rewards/chosen": -0.25966814160346985, "rewards/margins": 0.8120385408401489, "rewards/rejected": -1.071706771850586, "step": 1800 }, { "epoch": 2.6928471248246844, "eval_logits/chosen": -3.3438971042633057, "eval_logits/rejected": -3.0931475162506104, "eval_logps/chosen": -276.7391662597656, "eval_logps/rejected": -240.1473388671875, "eval_loss": 0.6427502036094666, "eval_rewards/accuracies": 0.6297681331634521, "eval_rewards/chosen": -0.48571139574050903, "eval_rewards/margins": 0.4220017194747925, "eval_rewards/rejected": -0.9077131152153015, "eval_runtime": 1686.025, "eval_samples_per_second": 3.172, "eval_steps_per_second": 3.172, "step": 1800 }, { "epoch": 2.737727910238429, "grad_norm": 5.611355304718018, "learning_rate": 8.682634730538922e-06, "logits/chosen": -3.347557306289673, "logits/rejected": -3.109966993331909, "logps/chosen": -275.6930236816406, "logps/rejected": -247.47406005859375, "loss": 0.4457, "rewards/accuracies": 0.8291666507720947, "rewards/chosen": -0.3010416030883789, "rewards/margins": 0.8664290308952332, "rewards/rejected": -1.1674706935882568, "step": 1830 }, { "epoch": 2.782608695652174, "grad_norm": 8.53209114074707, "learning_rate": 7.18562874251497e-06, "logits/chosen": -3.3400204181671143, "logits/rejected": -3.103865623474121, "logps/chosen": -285.2029724121094, "logps/rejected": -255.09078979492188, "loss": 0.4524, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -0.22924675047397614, "rewards/margins": 0.8349610567092896, "rewards/rejected": -1.0642077922821045, "step": 1860 }, { "epoch": 2.8274894810659186, "grad_norm": 7.011772155761719, "learning_rate": 5.688622754491018e-06, "logits/chosen": -3.3375208377838135, "logits/rejected": -3.0882365703582764, "logps/chosen": -269.7694091796875, "logps/rejected": -238.83042907714844, "loss": 0.4511, "rewards/accuracies": 0.8031250238418579, "rewards/chosen": -0.3288494944572449, "rewards/margins": 0.8713601231575012, "rewards/rejected": -1.2002094984054565, "step": 1890 }, { "epoch": 2.8424497428705005, "eval_logits/chosen": -3.3421826362609863, "eval_logits/rejected": -3.0923619270324707, "eval_logps/chosen": -277.226806640625, "eval_logps/rejected": -240.6798858642578, "eval_loss": 0.6432516574859619, "eval_rewards/accuracies": 0.6295811533927917, "eval_rewards/chosen": -0.5344744324684143, "eval_rewards/margins": 0.42649218440055847, "eval_rewards/rejected": -0.9609667062759399, "eval_runtime": 1686.3389, "eval_samples_per_second": 3.171, "eval_steps_per_second": 3.171, "step": 1900 }, { "epoch": 2.8723702664796633, "grad_norm": 7.099593162536621, "learning_rate": 4.191616766467066e-06, "logits/chosen": -3.359609365463257, "logits/rejected": -3.0945444107055664, "logps/chosen": -280.75030517578125, "logps/rejected": -245.13504028320312, "loss": 0.4418, "rewards/accuracies": 0.8197916746139526, "rewards/chosen": -0.30151474475860596, "rewards/margins": 0.8953721523284912, "rewards/rejected": -1.1968867778778076, "step": 1920 }, { "epoch": 2.917251051893408, "grad_norm": 7.788060188293457, "learning_rate": 2.6946107784431138e-06, "logits/chosen": -3.3403496742248535, "logits/rejected": -3.091184139251709, "logps/chosen": -280.9390869140625, "logps/rejected": -247.351806640625, "loss": 0.444, "rewards/accuracies": 0.8302083611488342, "rewards/chosen": -0.2591714859008789, "rewards/margins": 0.885311484336853, "rewards/rejected": -1.1444830894470215, "step": 1950 }, { "epoch": 2.962131837307153, "grad_norm": 7.973437786102295, "learning_rate": 1.1976047904191619e-06, "logits/chosen": -3.328507900238037, "logits/rejected": -3.088214635848999, "logps/chosen": -271.0534362792969, "logps/rejected": -242.70079040527344, "loss": 0.4531, "rewards/accuracies": 0.815625011920929, "rewards/chosen": -0.357342392206192, "rewards/margins": 0.865265429019928, "rewards/rejected": -1.2226077318191528, "step": 1980 }, { "epoch": 2.992052360916316, "eval_logits/chosen": -3.342743158340454, "eval_logits/rejected": -3.0929155349731445, "eval_logps/chosen": -277.3058166503906, "eval_logps/rejected": -240.79270935058594, "eval_loss": 0.6429719924926758, "eval_rewards/accuracies": 0.6299551129341125, "eval_rewards/chosen": -0.5423800349235535, "eval_rewards/margins": 0.42986956238746643, "eval_rewards/rejected": -0.9722495079040527, "eval_runtime": 1684.4785, "eval_samples_per_second": 3.175, "eval_steps_per_second": 3.175, "step": 2000 } ], "logging_steps": 30, "max_steps": 2004, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }