zephyr-7b-dpo-full / trainer_state.json
lewtun's picture
lewtun HF staff
Model save
1461795
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 100,
"global_step": 478,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0,
"learning_rate": 1.0416666666666666e-08,
"logits/chosen": -2.6023898124694824,
"logits/rejected": -2.49088191986084,
"logps/chosen": -330.5306396484375,
"logps/rejected": -275.0410461425781,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 1
},
{
"epoch": 0.02,
"learning_rate": 1.0416666666666667e-07,
"logits/chosen": -2.624011516571045,
"logits/rejected": -2.59273624420166,
"logps/chosen": -247.91769409179688,
"logps/rejected": -215.07041931152344,
"loss": 0.6932,
"rewards/accuracies": 0.3541666567325592,
"rewards/chosen": -0.00047609664034098387,
"rewards/margins": -0.0011458636727184057,
"rewards/rejected": 0.0006697670323774219,
"step": 10
},
{
"epoch": 0.04,
"learning_rate": 2.0833333333333333e-07,
"logits/chosen": -2.614908218383789,
"logits/rejected": -2.573396682739258,
"logps/chosen": -273.2959289550781,
"logps/rejected": -251.2639617919922,
"loss": 0.6925,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": 0.0009359431569464505,
"rewards/margins": 0.002007069531828165,
"rewards/rejected": -0.0010711264330893755,
"step": 20
},
{
"epoch": 0.06,
"learning_rate": 3.1249999999999997e-07,
"logits/chosen": -2.6856637001037598,
"logits/rejected": -2.6220130920410156,
"logps/chosen": -284.86114501953125,
"logps/rejected": -277.53057861328125,
"loss": 0.6886,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.00352325732819736,
"rewards/margins": 0.007650823798030615,
"rewards/rejected": -0.0041275653056800365,
"step": 30
},
{
"epoch": 0.08,
"learning_rate": 4.1666666666666667e-07,
"logits/chosen": -2.579878807067871,
"logits/rejected": -2.5135815143585205,
"logps/chosen": -292.1109619140625,
"logps/rejected": -274.44683837890625,
"loss": 0.6756,
"rewards/accuracies": 0.65625,
"rewards/chosen": 0.02378256432712078,
"rewards/margins": 0.03553395718336105,
"rewards/rejected": -0.011751385405659676,
"step": 40
},
{
"epoch": 0.1,
"learning_rate": 4.999733114418725e-07,
"logits/chosen": -2.5302300453186035,
"logits/rejected": -2.4865477085113525,
"logps/chosen": -315.3640441894531,
"logps/rejected": -310.5618591308594,
"loss": 0.6601,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -0.014850592240691185,
"rewards/margins": 0.06933780014514923,
"rewards/rejected": -0.08418838679790497,
"step": 50
},
{
"epoch": 0.13,
"learning_rate": 4.990398100856366e-07,
"logits/chosen": -2.461594820022583,
"logits/rejected": -2.393406867980957,
"logps/chosen": -264.4418640136719,
"logps/rejected": -252.02163696289062,
"loss": 0.6391,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": -0.06258662045001984,
"rewards/margins": 0.1386002004146576,
"rewards/rejected": -0.20118682086467743,
"step": 60
},
{
"epoch": 0.15,
"learning_rate": 4.967775735898179e-07,
"logits/chosen": -2.5176403522491455,
"logits/rejected": -2.444599151611328,
"logps/chosen": -308.10845947265625,
"logps/rejected": -298.1520690917969,
"loss": 0.6219,
"rewards/accuracies": 0.65625,
"rewards/chosen": -0.26666340231895447,
"rewards/margins": 0.21313416957855225,
"rewards/rejected": -0.4797976016998291,
"step": 70
},
{
"epoch": 0.17,
"learning_rate": 4.931986719649298e-07,
"logits/chosen": -2.4516353607177734,
"logits/rejected": -2.4085216522216797,
"logps/chosen": -298.8356018066406,
"logps/rejected": -325.5304260253906,
"loss": 0.611,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.2911642789840698,
"rewards/margins": 0.20117318630218506,
"rewards/rejected": -0.49233752489089966,
"step": 80
},
{
"epoch": 0.19,
"learning_rate": 4.883222001996351e-07,
"logits/chosen": -2.426361560821533,
"logits/rejected": -2.3368563652038574,
"logps/chosen": -293.616943359375,
"logps/rejected": -308.7396545410156,
"loss": 0.5867,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": -0.2579975724220276,
"rewards/margins": 0.30983540415763855,
"rewards/rejected": -0.5678330063819885,
"step": 90
},
{
"epoch": 0.21,
"learning_rate": 4.821741763807186e-07,
"logits/chosen": -2.488579034805298,
"logits/rejected": -2.3800113201141357,
"logps/chosen": -328.0105285644531,
"logps/rejected": -337.8644104003906,
"loss": 0.5723,
"rewards/accuracies": 0.65625,
"rewards/chosen": -0.4366111755371094,
"rewards/margins": 0.3044855296611786,
"rewards/rejected": -0.7410967350006104,
"step": 100
},
{
"epoch": 0.21,
"eval_logits/chosen": -2.4070217609405518,
"eval_logits/rejected": -2.3494362831115723,
"eval_logps/chosen": -304.3812255859375,
"eval_logps/rejected": -350.8694763183594,
"eval_loss": 0.5851432681083679,
"eval_rewards/accuracies": 0.703125,
"eval_rewards/chosen": -0.4096587896347046,
"eval_rewards/margins": 0.46554654836654663,
"eval_rewards/rejected": -0.8752052783966064,
"eval_runtime": 91.1907,
"eval_samples_per_second": 21.932,
"eval_steps_per_second": 0.351,
"step": 100
},
{
"epoch": 0.23,
"learning_rate": 4.747874028753375e-07,
"logits/chosen": -2.0290980339050293,
"logits/rejected": -1.8976500034332275,
"logps/chosen": -374.5489807128906,
"logps/rejected": -375.1778869628906,
"loss": 0.5723,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -0.5513430833816528,
"rewards/margins": 0.49042654037475586,
"rewards/rejected": -1.0417697429656982,
"step": 110
},
{
"epoch": 0.25,
"learning_rate": 4.662012913161997e-07,
"logits/chosen": -0.8261772990226746,
"logits/rejected": -0.4543725550174713,
"logps/chosen": -370.54437255859375,
"logps/rejected": -376.8744201660156,
"loss": 0.546,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -0.893993079662323,
"rewards/margins": 0.5693421363830566,
"rewards/rejected": -1.4633351564407349,
"step": 120
},
{
"epoch": 0.27,
"learning_rate": 4.5646165232345103e-07,
"logits/chosen": -0.5733903050422668,
"logits/rejected": -0.41144052147865295,
"logps/chosen": -331.88458251953125,
"logps/rejected": -418.39404296875,
"loss": 0.5492,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.6849642395973206,
"rewards/margins": 0.5858219265937805,
"rewards/rejected": -1.2707862854003906,
"step": 130
},
{
"epoch": 0.29,
"learning_rate": 4.456204510851956e-07,
"logits/chosen": -0.7106949687004089,
"logits/rejected": -0.2236645519733429,
"logps/chosen": -367.40484619140625,
"logps/rejected": -390.296142578125,
"loss": 0.5335,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -0.7277344465255737,
"rewards/margins": 0.6220408082008362,
"rewards/rejected": -1.3497753143310547,
"step": 140
},
{
"epoch": 0.31,
"learning_rate": 4.337355301007335e-07,
"logits/chosen": -0.2654598355293274,
"logits/rejected": 0.43950486183166504,
"logps/chosen": -385.2984924316406,
"logps/rejected": -397.6144714355469,
"loss": 0.5356,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.9714946746826172,
"rewards/margins": 0.61899733543396,
"rewards/rejected": -1.5904920101165771,
"step": 150
},
{
"epoch": 0.33,
"learning_rate": 4.2087030056579986e-07,
"logits/chosen": 0.1484789103269577,
"logits/rejected": 0.8263363838195801,
"logps/chosen": -369.7867736816406,
"logps/rejected": -436.39373779296875,
"loss": 0.5065,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.8267679214477539,
"rewards/margins": 0.8252193331718445,
"rewards/rejected": -1.6519873142242432,
"step": 160
},
{
"epoch": 0.36,
"learning_rate": 4.070934040463998e-07,
"logits/chosen": 0.2387746274471283,
"logits/rejected": 0.7541650533676147,
"logps/chosen": -330.07525634765625,
"logps/rejected": -366.41204833984375,
"loss": 0.5659,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -0.8212235569953918,
"rewards/margins": 0.529572606086731,
"rewards/rejected": -1.3507962226867676,
"step": 170
},
{
"epoch": 0.38,
"learning_rate": 3.9247834624635404e-07,
"logits/chosen": 0.45646604895591736,
"logits/rejected": 0.8084599375724792,
"logps/chosen": -366.8728942871094,
"logps/rejected": -432.2496032714844,
"loss": 0.5249,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.6927820444107056,
"rewards/margins": 0.8015207052230835,
"rewards/rejected": -1.4943029880523682,
"step": 180
},
{
"epoch": 0.4,
"learning_rate": 3.7710310482256523e-07,
"logits/chosen": 1.0517617464065552,
"logits/rejected": 1.6709725856781006,
"logps/chosen": -378.12396240234375,
"logps/rejected": -458.1866149902344,
"loss": 0.5056,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -0.9326898455619812,
"rewards/margins": 0.9154269099235535,
"rewards/rejected": -1.8481168746948242,
"step": 190
},
{
"epoch": 0.42,
"learning_rate": 3.610497133404795e-07,
"logits/chosen": 0.9935806393623352,
"logits/rejected": 1.650398850440979,
"logps/chosen": -391.5450744628906,
"logps/rejected": -418.3558654785156,
"loss": 0.5084,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": -1.0861790180206299,
"rewards/margins": 0.634604275226593,
"rewards/rejected": -1.7207832336425781,
"step": 200
},
{
"epoch": 0.42,
"eval_logits/chosen": 0.9247687458992004,
"eval_logits/rejected": 1.3918358087539673,
"eval_logps/chosen": -354.5789794921875,
"eval_logps/rejected": -438.0662536621094,
"eval_loss": 0.5251370072364807,
"eval_rewards/accuracies": 0.7421875,
"eval_rewards/chosen": -0.9116362929344177,
"eval_rewards/margins": 0.8355368375778198,
"eval_rewards/rejected": -1.7471731901168823,
"eval_runtime": 91.7577,
"eval_samples_per_second": 21.797,
"eval_steps_per_second": 0.349,
"step": 200
},
{
"epoch": 0.44,
"learning_rate": 3.4440382358952115e-07,
"logits/chosen": 1.0475047826766968,
"logits/rejected": 1.849473237991333,
"logps/chosen": -367.184814453125,
"logps/rejected": -398.2117614746094,
"loss": 0.5251,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -0.8909347653388977,
"rewards/margins": 0.6959229707717896,
"rewards/rejected": -1.586857557296753,
"step": 210
},
{
"epoch": 0.46,
"learning_rate": 3.272542485937368e-07,
"logits/chosen": 1.6884968280792236,
"logits/rejected": 2.2008445262908936,
"logps/chosen": -353.2514343261719,
"logps/rejected": -404.71221923828125,
"loss": 0.5269,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.7567670345306396,
"rewards/margins": 0.8415945768356323,
"rewards/rejected": -1.5983617305755615,
"step": 220
},
{
"epoch": 0.48,
"learning_rate": 3.096924887558854e-07,
"logits/chosen": 1.460933804512024,
"logits/rejected": 1.9314343929290771,
"logps/chosen": -351.2489318847656,
"logps/rejected": -453.9790954589844,
"loss": 0.519,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -0.7599745988845825,
"rewards/margins": 0.8532025218009949,
"rewards/rejected": -1.6131770610809326,
"step": 230
},
{
"epoch": 0.5,
"learning_rate": 2.9181224366319943e-07,
"logits/chosen": 1.796936273574829,
"logits/rejected": 2.389878988265991,
"logps/chosen": -351.67498779296875,
"logps/rejected": -421.3821716308594,
"loss": 0.5261,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -0.9927783012390137,
"rewards/margins": 0.786289632320404,
"rewards/rejected": -1.7790677547454834,
"step": 240
},
{
"epoch": 0.52,
"learning_rate": 2.7370891215954565e-07,
"logits/chosen": 1.5744327306747437,
"logits/rejected": 2.3407230377197266,
"logps/chosen": -358.4691467285156,
"logps/rejected": -418.01031494140625,
"loss": 0.5134,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.903947651386261,
"rewards/margins": 0.6940609216690063,
"rewards/rejected": -1.5980085134506226,
"step": 250
},
{
"epoch": 0.54,
"learning_rate": 2.55479083351317e-07,
"logits/chosen": 1.873732566833496,
"logits/rejected": 2.9474740028381348,
"logps/chosen": -371.85552978515625,
"logps/rejected": -420.95904541015625,
"loss": 0.4922,
"rewards/accuracies": 0.8062499761581421,
"rewards/chosen": -0.9079627990722656,
"rewards/margins": 0.8738547563552856,
"rewards/rejected": -1.7818174362182617,
"step": 260
},
{
"epoch": 0.56,
"learning_rate": 2.3722002126275822e-07,
"logits/chosen": 2.415181875228882,
"logits/rejected": 3.162013530731201,
"logps/chosen": -388.0815734863281,
"logps/rejected": -478.11785888671875,
"loss": 0.498,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -1.128756046295166,
"rewards/margins": 1.0180633068084717,
"rewards/rejected": -2.146819591522217,
"step": 270
},
{
"epoch": 0.59,
"learning_rate": 2.19029145890313e-07,
"logits/chosen": 1.9844467639923096,
"logits/rejected": 2.9561781883239746,
"logps/chosen": -369.2903747558594,
"logps/rejected": -419.6259765625,
"loss": 0.5207,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -0.9253425598144531,
"rewards/margins": 0.8587535619735718,
"rewards/rejected": -1.784096121788025,
"step": 280
},
{
"epoch": 0.61,
"learning_rate": 2.0100351342479216e-07,
"logits/chosen": 1.8705106973648071,
"logits/rejected": 2.6589739322662354,
"logps/chosen": -380.0862731933594,
"logps/rejected": -439.79168701171875,
"loss": 0.515,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.9231119155883789,
"rewards/margins": 0.735679030418396,
"rewards/rejected": -1.6587913036346436,
"step": 290
},
{
"epoch": 0.63,
"learning_rate": 1.8323929841460178e-07,
"logits/chosen": 1.3944432735443115,
"logits/rejected": 2.3618969917297363,
"logps/chosen": -389.6896057128906,
"logps/rejected": -470.2090759277344,
"loss": 0.5059,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.8919968605041504,
"rewards/margins": 0.6746976971626282,
"rewards/rejected": -1.5666944980621338,
"step": 300
},
{
"epoch": 0.63,
"eval_logits/chosen": 1.2558308839797974,
"eval_logits/rejected": 2.033073902130127,
"eval_logps/chosen": -349.8758239746094,
"eval_logps/rejected": -438.77349853515625,
"eval_loss": 0.5130496621131897,
"eval_rewards/accuracies": 0.75,
"eval_rewards/chosen": -0.8646047711372375,
"eval_rewards/margins": 0.8896409273147583,
"eval_rewards/rejected": -1.7542455196380615,
"eval_runtime": 92.0798,
"eval_samples_per_second": 21.72,
"eval_steps_per_second": 0.348,
"step": 300
},
{
"epoch": 0.65,
"learning_rate": 1.6583128063291573e-07,
"logits/chosen": 1.2974698543548584,
"logits/rejected": 2.6388087272644043,
"logps/chosen": -382.4002990722656,
"logps/rejected": -406.01153564453125,
"loss": 0.4978,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.0794718265533447,
"rewards/margins": 0.7805131673812866,
"rewards/rejected": -1.8599849939346313,
"step": 310
},
{
"epoch": 0.67,
"learning_rate": 1.488723393865766e-07,
"logits/chosen": 1.9306262731552124,
"logits/rejected": 2.9958901405334473,
"logps/chosen": -357.4389953613281,
"logps/rejected": -452.7220764160156,
"loss": 0.5064,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.0856704711914062,
"rewards/margins": 1.057279109954834,
"rewards/rejected": -2.1429495811462402,
"step": 320
},
{
"epoch": 0.69,
"learning_rate": 1.3245295796480788e-07,
"logits/chosen": 1.4244121313095093,
"logits/rejected": 2.2654335498809814,
"logps/chosen": -404.91082763671875,
"logps/rejected": -450.8277893066406,
"loss": 0.5096,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.1859899759292603,
"rewards/margins": 0.7777279019355774,
"rewards/rejected": -1.9637176990509033,
"step": 330
},
{
"epoch": 0.71,
"learning_rate": 1.1666074087171627e-07,
"logits/chosen": 1.5507278442382812,
"logits/rejected": 2.3268961906433105,
"logps/chosen": -363.16473388671875,
"logps/rejected": -420.6800231933594,
"loss": 0.5173,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.0014616250991821,
"rewards/margins": 0.7089160680770874,
"rewards/rejected": -1.7103776931762695,
"step": 340
},
{
"epoch": 0.73,
"learning_rate": 1.0157994641835734e-07,
"logits/chosen": 1.5202906131744385,
"logits/rejected": 2.6713767051696777,
"logps/chosen": -359.4294128417969,
"logps/rejected": -433.394287109375,
"loss": 0.4787,
"rewards/accuracies": 0.8062499761581421,
"rewards/chosen": -1.0805784463882446,
"rewards/margins": 0.9193571209907532,
"rewards/rejected": -1.999935507774353,
"step": 350
},
{
"epoch": 0.75,
"learning_rate": 8.729103716819111e-08,
"logits/chosen": 1.5974103212356567,
"logits/rejected": 3.016284942626953,
"logps/chosen": -435.1712951660156,
"logps/rejected": -469.9830017089844,
"loss": 0.4902,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.1632494926452637,
"rewards/margins": 0.9136824607849121,
"rewards/rejected": -2.0769317150115967,
"step": 360
},
{
"epoch": 0.77,
"learning_rate": 7.387025063449081e-08,
"logits/chosen": 1.7092777490615845,
"logits/rejected": 2.965677261352539,
"logps/chosen": -423.5621643066406,
"logps/rejected": -466.57196044921875,
"loss": 0.5002,
"rewards/accuracies": 0.8187500238418579,
"rewards/chosen": -1.1220273971557617,
"rewards/margins": 0.9678171277046204,
"rewards/rejected": -2.0898444652557373,
"step": 370
},
{
"epoch": 0.79,
"learning_rate": 6.138919252022435e-08,
"logits/chosen": 1.7860336303710938,
"logits/rejected": 2.569241523742676,
"logps/chosen": -395.4902648925781,
"logps/rejected": -483.0901794433594,
"loss": 0.4772,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.1515331268310547,
"rewards/margins": 0.8898499608039856,
"rewards/rejected": -2.0413832664489746,
"step": 380
},
{
"epoch": 0.82,
"learning_rate": 4.991445467064689e-08,
"logits/chosen": 2.0826852321624756,
"logits/rejected": 2.8060660362243652,
"logps/chosen": -398.78375244140625,
"logps/rejected": -471.2264099121094,
"loss": 0.5066,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -1.2567694187164307,
"rewards/margins": 0.7216086983680725,
"rewards/rejected": -1.9783780574798584,
"step": 390
},
{
"epoch": 0.84,
"learning_rate": 3.9507259776993954e-08,
"logits/chosen": 1.802354097366333,
"logits/rejected": 2.5923492908477783,
"logps/chosen": -446.500244140625,
"logps/rejected": -510.20269775390625,
"loss": 0.4853,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -1.2238214015960693,
"rewards/margins": 0.9289990663528442,
"rewards/rejected": -2.152820587158203,
"step": 400
},
{
"epoch": 0.84,
"eval_logits/chosen": 1.8194458484649658,
"eval_logits/rejected": 2.592175245285034,
"eval_logps/chosen": -372.7066650390625,
"eval_logps/rejected": -474.1963195800781,
"eval_loss": 0.5050143003463745,
"eval_rewards/accuracies": 0.75390625,
"eval_rewards/chosen": -1.0929131507873535,
"eval_rewards/margins": 1.0155609846115112,
"eval_rewards/rejected": -2.108474016189575,
"eval_runtime": 90.5801,
"eval_samples_per_second": 22.08,
"eval_steps_per_second": 0.353,
"step": 400
},
{
"epoch": 0.86,
"learning_rate": 3.022313472693447e-08,
"logits/chosen": 2.2372403144836426,
"logits/rejected": 3.196664333343506,
"logps/chosen": -370.81719970703125,
"logps/rejected": -452.06549072265625,
"loss": 0.5086,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": -1.1016533374786377,
"rewards/margins": 0.9261430501937866,
"rewards/rejected": -2.0277962684631348,
"step": 410
},
{
"epoch": 0.88,
"learning_rate": 2.2111614344599684e-08,
"logits/chosen": 1.831080675125122,
"logits/rejected": 2.4410791397094727,
"logps/chosen": -385.7922058105469,
"logps/rejected": -492.590576171875,
"loss": 0.5061,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -1.084149956703186,
"rewards/margins": 0.9615718722343445,
"rewards/rejected": -2.0457215309143066,
"step": 420
},
{
"epoch": 0.9,
"learning_rate": 1.521597710086439e-08,
"logits/chosen": 1.4260971546173096,
"logits/rejected": 2.3162856101989746,
"logps/chosen": -407.1165466308594,
"logps/rejected": -454.90374755859375,
"loss": 0.5059,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.0966671705245972,
"rewards/margins": 0.9018322229385376,
"rewards/rejected": -1.9984995126724243,
"step": 430
},
{
"epoch": 0.92,
"learning_rate": 9.57301420397924e-09,
"logits/chosen": 1.783463716506958,
"logits/rejected": 2.5885117053985596,
"logps/chosen": -373.5993347167969,
"logps/rejected": -458.12091064453125,
"loss": 0.487,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -1.1429402828216553,
"rewards/margins": 0.8700854182243347,
"rewards/rejected": -2.0130257606506348,
"step": 440
},
{
"epoch": 0.94,
"learning_rate": 5.212833302556258e-09,
"logits/chosen": 1.8070141077041626,
"logits/rejected": 2.747885227203369,
"logps/chosen": -355.58221435546875,
"logps/rejected": -426.42584228515625,
"loss": 0.5082,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -1.0844265222549438,
"rewards/margins": 0.8474240303039551,
"rewards/rejected": -1.9318506717681885,
"step": 450
},
{
"epoch": 0.96,
"learning_rate": 2.158697848236607e-09,
"logits/chosen": 2.0278899669647217,
"logits/rejected": 3.022653818130493,
"logps/chosen": -362.0993347167969,
"logps/rejected": -428.6521911621094,
"loss": 0.4861,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.1751190423965454,
"rewards/margins": 0.813240647315979,
"rewards/rejected": -1.9883596897125244,
"step": 460
},
{
"epoch": 0.98,
"learning_rate": 4.269029751107489e-10,
"logits/chosen": 1.3355131149291992,
"logits/rejected": 2.729475736618042,
"logps/chosen": -406.28033447265625,
"logps/rejected": -480.8604431152344,
"loss": 0.4807,
"rewards/accuracies": 0.8125,
"rewards/chosen": -1.0706168413162231,
"rewards/margins": 1.0933626890182495,
"rewards/rejected": -2.1639795303344727,
"step": 470
},
{
"epoch": 1.0,
"step": 478,
"total_flos": 0.0,
"train_loss": 0.5379065808890754,
"train_runtime": 5396.8094,
"train_samples_per_second": 11.328,
"train_steps_per_second": 0.089
}
],
"logging_steps": 10,
"max_steps": 478,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 100,
"total_flos": 0.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}