Gemma-7B-It-ORPO-SALT / trainer_state.json
chchen's picture
End of training
045b08b verified
{
"best_metric": 1.265723466873169,
"best_model_checkpoint": "saves/Gemma-7B-It/lora/orpo-salt/checkpoint-1500",
"epoch": 2.9969690846635686,
"eval_steps": 500,
"global_step": 1854,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.01616488179430188,
"grad_norm": 4.377878189086914,
"learning_rate": 4.999648198770648e-06,
"logits/chosen": 209.9345245361328,
"logits/rejected": 210.6967315673828,
"logps/chosen": -2.4765946865081787,
"logps/rejected": -2.9186055660247803,
"loss": 2.5449,
"odds_ratio_loss": 0.6828715205192566,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": -0.24765947461128235,
"rewards/margins": 0.04420109838247299,
"rewards/rejected": -0.29186058044433594,
"sft_loss": 2.4765946865081787,
"step": 10
},
{
"epoch": 0.03232976358860376,
"grad_norm": 2.781564950942993,
"learning_rate": 4.998578646361359e-06,
"logits/chosen": 210.4038543701172,
"logits/rejected": 212.20718383789062,
"logps/chosen": -2.4702863693237305,
"logps/rejected": -2.504176616668701,
"loss": 2.564,
"odds_ratio_loss": 0.9375804662704468,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": -0.2470286339521408,
"rewards/margins": 0.0033890369813889265,
"rewards/rejected": -0.25041764974594116,
"sft_loss": 2.4702863693237305,
"step": 20
},
{
"epoch": 0.04849464538290564,
"grad_norm": 5.785957336425781,
"learning_rate": 4.996791614004449e-06,
"logits/chosen": 209.83865356445312,
"logits/rejected": 212.08535766601562,
"logps/chosen": -2.6004161834716797,
"logps/rejected": -2.695502758026123,
"loss": 2.6963,
"odds_ratio_loss": 0.9585107564926147,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.26004162430763245,
"rewards/margins": 0.009508667513728142,
"rewards/rejected": -0.26955026388168335,
"sft_loss": 2.6004161834716797,
"step": 30
},
{
"epoch": 0.06465952717720752,
"grad_norm": 7.009506702423096,
"learning_rate": 4.994287614855618e-06,
"logits/chosen": 210.0410614013672,
"logits/rejected": 211.40286254882812,
"logps/chosen": -2.6340386867523193,
"logps/rejected": -2.6249070167541504,
"loss": 2.7374,
"odds_ratio_loss": 1.0338027477264404,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": -0.26340389251708984,
"rewards/margins": -0.0009131729602813721,
"rewards/rejected": -0.2624906897544861,
"sft_loss": 2.6340386867523193,
"step": 40
},
{
"epoch": 0.0808244089715094,
"grad_norm": 4.594735145568848,
"learning_rate": 4.991067367951343e-06,
"logits/chosen": 219.71932983398438,
"logits/rejected": 219.6745147705078,
"logps/chosen": -2.3416378498077393,
"logps/rejected": -2.4940619468688965,
"loss": 2.4215,
"odds_ratio_loss": 0.7983426451683044,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -0.2341637909412384,
"rewards/margins": 0.015242427587509155,
"rewards/rejected": -0.24940618872642517,
"sft_loss": 2.3416378498077393,
"step": 50
},
{
"epoch": 0.09698929076581128,
"grad_norm": 2.953855276107788,
"learning_rate": 4.987131798002389e-06,
"logits/chosen": 217.3623504638672,
"logits/rejected": 218.24862670898438,
"logps/chosen": -2.2888264656066895,
"logps/rejected": -2.6409952640533447,
"loss": 2.3829,
"odds_ratio_loss": 0.940882682800293,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": -0.22888264060020447,
"rewards/margins": 0.035216934978961945,
"rewards/rejected": -0.2640995383262634,
"sft_loss": 2.2888264656066895,
"step": 60
},
{
"epoch": 0.11315417256011315,
"grad_norm": 4.224141597747803,
"learning_rate": 4.982482035128285e-06,
"logits/chosen": 217.9263458251953,
"logits/rejected": 218.54122924804688,
"logps/chosen": -2.326590061187744,
"logps/rejected": -2.605003833770752,
"loss": 2.4191,
"odds_ratio_loss": 0.9254364967346191,
"rewards/accuracies": 0.53125,
"rewards/chosen": -0.2326590120792389,
"rewards/margins": 0.027841363102197647,
"rewards/rejected": -0.26050037145614624,
"sft_loss": 2.326590061187744,
"step": 70
},
{
"epoch": 0.12931905435441504,
"grad_norm": 7.0222883224487305,
"learning_rate": 4.9771194145328e-06,
"logits/chosen": 224.885986328125,
"logits/rejected": 225.7215576171875,
"logps/chosen": -1.8678385019302368,
"logps/rejected": -2.1334400177001953,
"loss": 1.9429,
"odds_ratio_loss": 0.7510749697685242,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.1867838352918625,
"rewards/margins": 0.026560146361589432,
"rewards/rejected": -0.213344007730484,
"sft_loss": 1.8678385019302368,
"step": 80
},
{
"epoch": 0.1454839361487169,
"grad_norm": 9.777688026428223,
"learning_rate": 4.971045476120532e-06,
"logits/chosen": 226.64450073242188,
"logits/rejected": 227.11874389648438,
"logps/chosen": -1.9129263162612915,
"logps/rejected": -2.10162091255188,
"loss": 1.9975,
"odds_ratio_loss": 0.8461491465568542,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": -0.1912926286458969,
"rewards/margins": 0.01886945590376854,
"rewards/rejected": -0.21016211807727814,
"sft_loss": 1.9129263162612915,
"step": 90
},
{
"epoch": 0.1616488179430188,
"grad_norm": 3.441721200942993,
"learning_rate": 4.964261964054713e-06,
"logits/chosen": 230.32669067382812,
"logits/rejected": 231.39498901367188,
"logps/chosen": -1.8438594341278076,
"logps/rejected": -2.1114680767059326,
"loss": 1.923,
"odds_ratio_loss": 0.7917153239250183,
"rewards/accuracies": 0.543749988079071,
"rewards/chosen": -0.18438595533370972,
"rewards/margins": 0.026760881766676903,
"rewards/rejected": -0.21114683151245117,
"sft_loss": 1.8438594341278076,
"step": 100
},
{
"epoch": 0.17781369973732067,
"grad_norm": 3.7387969493865967,
"learning_rate": 4.956770826256372e-06,
"logits/chosen": 233.9343719482422,
"logits/rejected": 234.51516723632812,
"logps/chosen": -1.6228179931640625,
"logps/rejected": -1.8143441677093506,
"loss": 1.6988,
"odds_ratio_loss": 0.7598803043365479,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.1622818112373352,
"rewards/margins": 0.01915261521935463,
"rewards/rejected": -0.18143442273139954,
"sft_loss": 1.6228179931640625,
"step": 110
},
{
"epoch": 0.19397858153162256,
"grad_norm": 2.157771110534668,
"learning_rate": 4.94857421384497e-06,
"logits/chosen": 235.01248168945312,
"logits/rejected": 235.390869140625,
"logps/chosen": -1.6021674871444702,
"logps/rejected": -1.885866403579712,
"loss": 1.6762,
"odds_ratio_loss": 0.7404050230979919,
"rewards/accuracies": 0.543749988079071,
"rewards/chosen": -0.1602167785167694,
"rewards/margins": 0.02836987003684044,
"rewards/rejected": -0.18858662247657776,
"sft_loss": 1.6021674871444702,
"step": 120
},
{
"epoch": 0.21014346332592443,
"grad_norm": 2.794867515563965,
"learning_rate": 4.939674480520701e-06,
"logits/chosen": 236.7910614013672,
"logits/rejected": 237.41806030273438,
"logps/chosen": -1.5707839727401733,
"logps/rejected": -1.6964565515518188,
"loss": 1.6508,
"odds_ratio_loss": 0.8003607988357544,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": -0.15707840025424957,
"rewards/margins": 0.01256726123392582,
"rewards/rejected": -0.16964565217494965,
"sft_loss": 1.5707839727401733,
"step": 130
},
{
"epoch": 0.2263083451202263,
"grad_norm": 1.2237716913223267,
"learning_rate": 4.930074181888613e-06,
"logits/chosen": 240.5333251953125,
"logits/rejected": 241.0712432861328,
"logps/chosen": -1.6245830059051514,
"logps/rejected": -1.83035409450531,
"loss": 1.6912,
"odds_ratio_loss": 0.6659940481185913,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": -0.16245830059051514,
"rewards/margins": 0.02057710848748684,
"rewards/rejected": -0.18303541839122772,
"sft_loss": 1.6245830059051514,
"step": 140
},
{
"epoch": 0.2424732269145282,
"grad_norm": 3.366241693496704,
"learning_rate": 4.91977607472475e-06,
"logits/chosen": 240.38449096679688,
"logits/rejected": 241.23794555664062,
"logps/chosen": -1.5312227010726929,
"logps/rejected": -1.6705970764160156,
"loss": 1.6035,
"odds_ratio_loss": 0.7224593162536621,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -0.15312227606773376,
"rewards/margins": 0.013937436044216156,
"rewards/rejected": -0.16705971956253052,
"sft_loss": 1.5312227010726929,
"step": 150
},
{
"epoch": 0.2586381087088301,
"grad_norm": 2.1750807762145996,
"learning_rate": 4.908783116184534e-06,
"logits/chosen": 240.67446899414062,
"logits/rejected": 241.75424194335938,
"logps/chosen": -1.4731028079986572,
"logps/rejected": -1.7678340673446655,
"loss": 1.5362,
"odds_ratio_loss": 0.6307698488235474,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -0.1473102867603302,
"rewards/margins": 0.029473140835762024,
"rewards/rejected": -0.17678341269493103,
"sft_loss": 1.4731028079986572,
"step": 160
},
{
"epoch": 0.27480299050313195,
"grad_norm": 2.9354147911071777,
"learning_rate": 4.897098462953598e-06,
"logits/chosen": 243.85806274414062,
"logits/rejected": 244.68405151367188,
"logps/chosen": -1.3806183338165283,
"logps/rejected": -1.7258154153823853,
"loss": 1.4423,
"odds_ratio_loss": 0.6170833706855774,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.13806185126304626,
"rewards/margins": 0.03451969474554062,
"rewards/rejected": -0.1725815385580063,
"sft_loss": 1.3806183338165283,
"step": 170
},
{
"epoch": 0.2909678722974338,
"grad_norm": 1.4452638626098633,
"learning_rate": 4.884725470341331e-06,
"logits/chosen": 242.984619140625,
"logits/rejected": 243.6776580810547,
"logps/chosen": -1.2990996837615967,
"logps/rejected": -1.616987943649292,
"loss": 1.3597,
"odds_ratio_loss": 0.6057690382003784,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.12990999221801758,
"rewards/margins": 0.031788814812898636,
"rewards/rejected": -0.16169880330562592,
"sft_loss": 1.2990996837615967,
"step": 180
},
{
"epoch": 0.3071327540917357,
"grad_norm": 4.690347194671631,
"learning_rate": 4.871667691317377e-06,
"logits/chosen": 244.59634399414062,
"logits/rejected": 244.5352325439453,
"logps/chosen": -1.4848819971084595,
"logps/rejected": -1.573250412940979,
"loss": 1.5639,
"odds_ratio_loss": 0.7902374267578125,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.14848819375038147,
"rewards/margins": 0.008836844936013222,
"rewards/rejected": -0.15732502937316895,
"sft_loss": 1.4848819971084595,
"step": 190
},
{
"epoch": 0.3232976358860376,
"grad_norm": 7.527270317077637,
"learning_rate": 4.857928875491392e-06,
"logits/chosen": 243.60494995117188,
"logits/rejected": 244.3643035888672,
"logps/chosen": -1.3324997425079346,
"logps/rejected": -1.5205867290496826,
"loss": 1.402,
"odds_ratio_loss": 0.6946145296096802,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.13324996829032898,
"rewards/margins": 0.018808716908097267,
"rewards/rejected": -0.1520586758852005,
"sft_loss": 1.3324997425079346,
"step": 200
},
{
"epoch": 0.33946251768033947,
"grad_norm": 2.1978328227996826,
"learning_rate": 4.843512968036314e-06,
"logits/chosen": 244.3915557861328,
"logits/rejected": 244.58700561523438,
"logps/chosen": -1.3562281131744385,
"logps/rejected": -1.4892576932907104,
"loss": 1.4274,
"odds_ratio_loss": 0.7121940851211548,
"rewards/accuracies": 0.53125,
"rewards/chosen": -0.1356228142976761,
"rewards/margins": 0.01330297440290451,
"rewards/rejected": -0.1489257663488388,
"sft_loss": 1.3562281131744385,
"step": 210
},
{
"epoch": 0.35562739947464134,
"grad_norm": 6.31206750869751,
"learning_rate": 4.828424108555486e-06,
"logits/chosen": 246.1901092529297,
"logits/rejected": 246.36703491210938,
"logps/chosen": -1.5392124652862549,
"logps/rejected": -1.7705978155136108,
"loss": 1.6086,
"odds_ratio_loss": 0.6943382024765015,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": -0.1539212465286255,
"rewards/margins": 0.023138541728258133,
"rewards/rejected": -0.17705979943275452,
"sft_loss": 1.5392124652862549,
"step": 220
},
{
"epoch": 0.3717922812689432,
"grad_norm": 1.1257890462875366,
"learning_rate": 4.812666629893957e-06,
"logits/chosen": 246.37399291992188,
"logits/rejected": 246.72891235351562,
"logps/chosen": -1.3704453706741333,
"logps/rejected": -1.4485595226287842,
"loss": 1.4433,
"odds_ratio_loss": 0.7287623882293701,
"rewards/accuracies": 0.53125,
"rewards/chosen": -0.1370445340871811,
"rewards/margins": 0.007811415940523148,
"rewards/rejected": -0.14485594630241394,
"sft_loss": 1.3704453706741333,
"step": 230
},
{
"epoch": 0.3879571630632451,
"grad_norm": 1.9700157642364502,
"learning_rate": 4.796245056894273e-06,
"logits/chosen": 244.54165649414062,
"logits/rejected": 244.89407348632812,
"logps/chosen": -1.4429550170898438,
"logps/rejected": -1.5743396282196045,
"loss": 1.5184,
"odds_ratio_loss": 0.7547486424446106,
"rewards/accuracies": 0.48750001192092896,
"rewards/chosen": -0.14429552853107452,
"rewards/margins": 0.013138455338776112,
"rewards/rejected": -0.15743397176265717,
"sft_loss": 1.4429550170898438,
"step": 240
},
{
"epoch": 0.404122044857547,
"grad_norm": 1.5832947492599487,
"learning_rate": 4.779164105097148e-06,
"logits/chosen": 246.41659545898438,
"logits/rejected": 246.4707489013672,
"logps/chosen": -1.3124094009399414,
"logps/rejected": -1.5739551782608032,
"loss": 1.3768,
"odds_ratio_loss": 0.6443756818771362,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -0.13124093413352966,
"rewards/margins": 0.026154566556215286,
"rewards/rejected": -0.15739551186561584,
"sft_loss": 1.3124094009399414,
"step": 250
},
{
"epoch": 0.42028692665184886,
"grad_norm": 2.2224152088165283,
"learning_rate": 4.761428679387373e-06,
"logits/chosen": 247.0335235595703,
"logits/rejected": 247.7626953125,
"logps/chosen": -1.2735482454299927,
"logps/rejected": -1.5084031820297241,
"loss": 1.3358,
"odds_ratio_loss": 0.6226388216018677,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": -0.12735481560230255,
"rewards/margins": 0.02348550595343113,
"rewards/rejected": -0.15084032714366913,
"sft_loss": 1.2735482454299927,
"step": 260
},
{
"epoch": 0.4364518084461507,
"grad_norm": 2.0271799564361572,
"learning_rate": 4.7430438725853515e-06,
"logits/chosen": 247.60205078125,
"logits/rejected": 247.61654663085938,
"logps/chosen": -1.3570277690887451,
"logps/rejected": -1.714133858680725,
"loss": 1.4226,
"odds_ratio_loss": 0.6552777290344238,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.13570277392864227,
"rewards/margins": 0.035710614174604416,
"rewards/rejected": -0.171413391828537,
"sft_loss": 1.3570277690887451,
"step": 270
},
{
"epoch": 0.4526166902404526,
"grad_norm": 2.142329216003418,
"learning_rate": 4.724014963984669e-06,
"logits/chosen": 248.28439331054688,
"logits/rejected": 249.0177459716797,
"logps/chosen": -1.3674625158309937,
"logps/rejected": -1.6127933263778687,
"loss": 1.435,
"odds_ratio_loss": 0.6751004457473755,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": -0.13674625754356384,
"rewards/margins": 0.02453308179974556,
"rewards/rejected": -0.1612793505191803,
"sft_loss": 1.3674625158309937,
"step": 280
},
{
"epoch": 0.4687815720347545,
"grad_norm": 2.8357582092285156,
"learning_rate": 4.704347417836116e-06,
"logits/chosen": 247.2007598876953,
"logits/rejected": 247.60107421875,
"logps/chosen": -1.2728191614151,
"logps/rejected": -1.5069888830184937,
"loss": 1.3382,
"odds_ratio_loss": 0.6542028784751892,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -0.12728191912174225,
"rewards/margins": 0.023416969925165176,
"rewards/rejected": -0.15069888532161713,
"sft_loss": 1.2728191614151,
"step": 290
},
{
"epoch": 0.4849464538290564,
"grad_norm": 3.075584888458252,
"learning_rate": 4.684046881778603e-06,
"logits/chosen": 247.69580078125,
"logits/rejected": 247.7963409423828,
"logps/chosen": -1.3267529010772705,
"logps/rejected": -1.46425461769104,
"loss": 1.3929,
"odds_ratio_loss": 0.6614553332328796,
"rewards/accuracies": 0.59375,
"rewards/chosen": -0.13267529010772705,
"rewards/margins": 0.013750175014138222,
"rewards/rejected": -0.14642547070980072,
"sft_loss": 1.3267529010772705,
"step": 300
},
{
"epoch": 0.5011113356233583,
"grad_norm": 1.1745957136154175,
"learning_rate": 4.663119185217409e-06,
"logits/chosen": 247.5077667236328,
"logits/rejected": 247.80752563476562,
"logps/chosen": -1.2750051021575928,
"logps/rejected": -1.5364891290664673,
"loss": 1.3385,
"odds_ratio_loss": 0.6352204084396362,
"rewards/accuracies": 0.53125,
"rewards/chosen": -0.1275005042552948,
"rewards/margins": 0.026148397475481033,
"rewards/rejected": -0.15364892780780792,
"sft_loss": 1.2750051021575928,
"step": 310
},
{
"epoch": 0.5172762174176602,
"grad_norm": 1.1816167831420898,
"learning_rate": 4.641570337650232e-06,
"logits/chosen": 248.5536651611328,
"logits/rejected": 248.5113067626953,
"logps/chosen": -1.1914936304092407,
"logps/rejected": -1.4479808807373047,
"loss": 1.2531,
"odds_ratio_loss": 0.615585446357727,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.11914938688278198,
"rewards/margins": 0.025648722425103188,
"rewards/rejected": -0.14479808509349823,
"sft_loss": 1.1914936304092407,
"step": 320
},
{
"epoch": 0.533441099211962,
"grad_norm": 6.805661678314209,
"learning_rate": 4.61940652694154e-06,
"logits/chosen": 246.8784637451172,
"logits/rejected": 247.60842895507812,
"logps/chosen": -1.371927261352539,
"logps/rejected": -1.4951013326644897,
"loss": 1.444,
"odds_ratio_loss": 0.7208858728408813,
"rewards/accuracies": 0.4937500059604645,
"rewards/chosen": -0.1371927261352539,
"rewards/margins": 0.012317392975091934,
"rewards/rejected": -0.14951011538505554,
"sft_loss": 1.371927261352539,
"step": 330
},
{
"epoch": 0.5496059810062639,
"grad_norm": 2.8288872241973877,
"learning_rate": 4.596634117545689e-06,
"logits/chosen": 248.96542358398438,
"logits/rejected": 249.38369750976562,
"logps/chosen": -1.3861172199249268,
"logps/rejected": -1.6291033029556274,
"loss": 1.4514,
"odds_ratio_loss": 0.6529659032821655,
"rewards/accuracies": 0.543749988079071,
"rewards/chosen": -0.13861171901226044,
"rewards/margins": 0.02429860271513462,
"rewards/rejected": -0.1629103422164917,
"sft_loss": 1.3861172199249268,
"step": 340
},
{
"epoch": 0.5657708628005658,
"grad_norm": 2.343557834625244,
"learning_rate": 4.573259648679335e-06,
"logits/chosen": 247.5604248046875,
"logits/rejected": 247.8214111328125,
"logps/chosen": -1.3334286212921143,
"logps/rejected": -1.642163634300232,
"loss": 1.3937,
"odds_ratio_loss": 0.603044331073761,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -0.13334286212921143,
"rewards/margins": 0.030873507261276245,
"rewards/rejected": -0.16421635448932648,
"sft_loss": 1.3334286212921143,
"step": 350
},
{
"epoch": 0.5819357445948676,
"grad_norm": 6.341250896453857,
"learning_rate": 4.549289832443663e-06,
"logits/chosen": 249.6760711669922,
"logits/rejected": 249.2826385498047,
"logps/chosen": -1.2829958200454712,
"logps/rejected": -1.5420135259628296,
"loss": 1.351,
"odds_ratio_loss": 0.6805331110954285,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": -0.12829959392547607,
"rewards/margins": 0.025901764631271362,
"rewards/rejected": -0.15420134365558624,
"sft_loss": 1.2829958200454712,
"step": 360
},
{
"epoch": 0.5981006263891695,
"grad_norm": 1.415165901184082,
"learning_rate": 4.524731551896978e-06,
"logits/chosen": 247.46142578125,
"logits/rejected": 247.42459106445312,
"logps/chosen": -1.2169711589813232,
"logps/rejected": -1.3963136672973633,
"loss": 1.2853,
"odds_ratio_loss": 0.6832239031791687,
"rewards/accuracies": 0.5062500238418579,
"rewards/chosen": -0.12169712781906128,
"rewards/margins": 0.017934244126081467,
"rewards/rejected": -0.13963137567043304,
"sft_loss": 1.2169711589813232,
"step": 370
},
{
"epoch": 0.6142655081834714,
"grad_norm": 2.7373573780059814,
"learning_rate": 4.4995918590781925e-06,
"logits/chosen": 250.4862518310547,
"logits/rejected": 250.1310272216797,
"logps/chosen": -1.2185784578323364,
"logps/rejected": -1.435258150100708,
"loss": 1.2834,
"odds_ratio_loss": 0.6484395265579224,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.12185785919427872,
"rewards/margins": 0.021667957305908203,
"rewards/rejected": -0.14352580904960632,
"sft_loss": 1.2185784578323364,
"step": 380
},
{
"epoch": 0.6304303899777733,
"grad_norm": 1.0431718826293945,
"learning_rate": 4.473877972981797e-06,
"logits/chosen": 247.82730102539062,
"logits/rejected": 248.197998046875,
"logps/chosen": -1.3133275508880615,
"logps/rejected": -1.566138505935669,
"loss": 1.378,
"odds_ratio_loss": 0.6465703248977661,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -0.13133276998996735,
"rewards/margins": 0.02528109773993492,
"rewards/rejected": -0.15661385655403137,
"sft_loss": 1.3133275508880615,
"step": 390
},
{
"epoch": 0.6465952717720752,
"grad_norm": 2.605905771255493,
"learning_rate": 4.447597277484894e-06,
"logits/chosen": 248.4889678955078,
"logits/rejected": 248.0493927001953,
"logps/chosen": -1.1982879638671875,
"logps/rejected": -1.3909344673156738,
"loss": 1.2662,
"odds_ratio_loss": 0.6788827180862427,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": -0.11982879787683487,
"rewards/margins": 0.01926465705037117,
"rewards/rejected": -0.13909344375133514,
"sft_loss": 1.1982879638671875,
"step": 400
},
{
"epoch": 0.6627601535663771,
"grad_norm": 2.7441976070404053,
"learning_rate": 4.42075731922687e-06,
"logits/chosen": 250.9984893798828,
"logits/rejected": 250.879150390625,
"logps/chosen": -1.3381072282791138,
"logps/rejected": -1.476546049118042,
"loss": 1.4062,
"odds_ratio_loss": 0.6813501119613647,
"rewards/accuracies": 0.543749988079071,
"rewards/chosen": -0.13381072878837585,
"rewards/margins": 0.013843873515725136,
"rewards/rejected": -0.14765460789203644,
"sft_loss": 1.3381072282791138,
"step": 410
},
{
"epoch": 0.6789250353606789,
"grad_norm": 3.2034897804260254,
"learning_rate": 4.3933658054423465e-06,
"logits/chosen": 249.34951782226562,
"logits/rejected": 249.37582397460938,
"logps/chosen": -1.2343724966049194,
"logps/rejected": -1.4455711841583252,
"loss": 1.2964,
"odds_ratio_loss": 0.6205655932426453,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -0.12343724817037582,
"rewards/margins": 0.021119873970746994,
"rewards/rejected": -0.1445571333169937,
"sft_loss": 1.2343724966049194,
"step": 420
},
{
"epoch": 0.6950899171549808,
"grad_norm": 2.552898645401001,
"learning_rate": 4.365430601748003e-06,
"logits/chosen": 247.7689208984375,
"logits/rejected": 247.95205688476562,
"logps/chosen": -1.3558423519134521,
"logps/rejected": -1.4942983388900757,
"loss": 1.4265,
"odds_ratio_loss": 0.7065616250038147,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -0.13558423519134521,
"rewards/margins": 0.013845594599843025,
"rewards/rejected": -0.1494298279285431,
"sft_loss": 1.3558423519134521,
"step": 430
},
{
"epoch": 0.7112547989492827,
"grad_norm": 7.701834201812744,
"learning_rate": 4.336959729883925e-06,
"logits/chosen": 248.16812133789062,
"logits/rejected": 248.47384643554688,
"logps/chosen": -1.2508445978164673,
"logps/rejected": -1.3401494026184082,
"loss": 1.3242,
"odds_ratio_loss": 0.7333300113677979,
"rewards/accuracies": 0.518750011920929,
"rewards/chosen": -0.12508445978164673,
"rewards/margins": 0.008930487558245659,
"rewards/rejected": -0.13401496410369873,
"sft_loss": 1.2508445978164673,
"step": 440
},
{
"epoch": 0.7274196807435845,
"grad_norm": 1.3677743673324585,
"learning_rate": 4.307961365410118e-06,
"logits/chosen": 249.19546508789062,
"logits/rejected": 249.5364990234375,
"logps/chosen": -1.2851893901824951,
"logps/rejected": -1.4277610778808594,
"loss": 1.3525,
"odds_ratio_loss": 0.6732175946235657,
"rewards/accuracies": 0.53125,
"rewards/chosen": -0.12851892411708832,
"rewards/margins": 0.014257180504500866,
"rewards/rejected": -0.14277611672878265,
"sft_loss": 1.2851893901824951,
"step": 450
},
{
"epoch": 0.7435845625378864,
"grad_norm": 3.3310444355010986,
"learning_rate": 4.278443835358854e-06,
"logits/chosen": 249.6570281982422,
"logits/rejected": 249.6079864501953,
"logps/chosen": -1.1893627643585205,
"logps/rejected": -1.4945032596588135,
"loss": 1.2482,
"odds_ratio_loss": 0.5885173082351685,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -0.11893627792596817,
"rewards/margins": 0.030514035373926163,
"rewards/rejected": -0.14945031702518463,
"sft_loss": 1.1893627643585205,
"step": 460
},
{
"epoch": 0.7597494443321883,
"grad_norm": 2.5770180225372314,
"learning_rate": 4.248415615843523e-06,
"logits/chosen": 249.5537567138672,
"logits/rejected": 249.5972442626953,
"logps/chosen": -1.2710212469100952,
"logps/rejected": -1.4037456512451172,
"loss": 1.3415,
"odds_ratio_loss": 0.7046067714691162,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": -0.12710212171077728,
"rewards/margins": 0.01327243447303772,
"rewards/rejected": -0.140374556183815,
"sft_loss": 1.2710212469100952,
"step": 470
},
{
"epoch": 0.7759143261264903,
"grad_norm": 9.182385444641113,
"learning_rate": 4.217885329624666e-06,
"logits/chosen": 249.1255645751953,
"logits/rejected": 249.16854858398438,
"logps/chosen": -1.1571811437606812,
"logps/rejected": -1.4825657606124878,
"loss": 1.2175,
"odds_ratio_loss": 0.6027355194091797,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.11571812629699707,
"rewards/margins": 0.032538462430238724,
"rewards/rejected": -0.1482565701007843,
"sft_loss": 1.1571811437606812,
"step": 480
},
{
"epoch": 0.7920792079207921,
"grad_norm": 2.0430970191955566,
"learning_rate": 4.186861743633911e-06,
"logits/chosen": 248.51168823242188,
"logits/rejected": 248.83370971679688,
"logps/chosen": -1.216133713722229,
"logps/rejected": -1.4709254503250122,
"loss": 1.2856,
"odds_ratio_loss": 0.694364070892334,
"rewards/accuracies": 0.543749988079071,
"rewards/chosen": -0.12161336094141006,
"rewards/margins": 0.025479182600975037,
"rewards/rejected": -0.1470925360918045,
"sft_loss": 1.216133713722229,
"step": 490
},
{
"epoch": 0.808244089715094,
"grad_norm": 2.13413143157959,
"learning_rate": 4.155353766456497e-06,
"logits/chosen": 252.05142211914062,
"logits/rejected": 251.9636993408203,
"logps/chosen": -1.3067327737808228,
"logps/rejected": -1.4753313064575195,
"loss": 1.374,
"odds_ratio_loss": 0.6729229688644409,
"rewards/accuracies": 0.5062500238418579,
"rewards/chosen": -0.13067328929901123,
"rewards/margins": 0.01685984991490841,
"rewards/rejected": -0.147533118724823,
"sft_loss": 1.3067327737808228,
"step": 500
},
{
"epoch": 0.808244089715094,
"eval_logits/chosen": 249.61227416992188,
"eval_logits/rejected": 249.90635681152344,
"eval_logps/chosen": -1.2762008905410767,
"eval_logps/rejected": -1.5033098459243774,
"eval_loss": 1.3435848951339722,
"eval_odds_ratio_loss": 0.6738389730453491,
"eval_rewards/accuracies": 0.5672727227210999,
"eval_rewards/chosen": -0.12762011587619781,
"eval_rewards/margins": 0.0227108895778656,
"eval_rewards/rejected": -0.15033100545406342,
"eval_runtime": 221.4313,
"eval_samples_per_second": 4.968,
"eval_sft_loss": 1.2762008905410767,
"eval_steps_per_second": 2.484,
"step": 500
},
{
"epoch": 0.8244089715093958,
"grad_norm": 2.4113426208496094,
"learning_rate": 4.123370445773134e-06,
"logits/chosen": 250.3010711669922,
"logits/rejected": 250.5003662109375,
"logps/chosen": -1.2399041652679443,
"logps/rejected": -1.3490018844604492,
"loss": 1.3103,
"odds_ratio_loss": 0.7042885422706604,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.12399041652679443,
"rewards/margins": 0.010909780859947205,
"rewards/rejected": -0.13490018248558044,
"sft_loss": 1.2399041652679443,
"step": 510
},
{
"epoch": 0.8405738533036977,
"grad_norm": 4.632988452911377,
"learning_rate": 4.090920965761906e-06,
"logits/chosen": 249.44210815429688,
"logits/rejected": 249.96994018554688,
"logps/chosen": -1.2807283401489258,
"logps/rejected": -1.4942976236343384,
"loss": 1.3484,
"odds_ratio_loss": 0.6771414875984192,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.1280728280544281,
"rewards/margins": 0.02135692723095417,
"rewards/rejected": -0.14942976832389832,
"sft_loss": 1.2807283401489258,
"step": 520
},
{
"epoch": 0.8567387350979996,
"grad_norm": 9.196592330932617,
"learning_rate": 4.058014644460991e-06,
"logits/chosen": 250.1853790283203,
"logits/rejected": 250.6529083251953,
"logps/chosen": -1.2633569240570068,
"logps/rejected": -1.4294393062591553,
"loss": 1.329,
"odds_ratio_loss": 0.6560603976249695,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -0.12633569538593292,
"rewards/margins": 0.01660825125873089,
"rewards/rejected": -0.14294394850730896,
"sft_loss": 1.2633569240570068,
"step": 530
},
{
"epoch": 0.8729036168923014,
"grad_norm": 1.8403383493423462,
"learning_rate": 4.024660931092939e-06,
"logits/chosen": 250.708251953125,
"logits/rejected": 251.0161895751953,
"logps/chosen": -1.287913203239441,
"logps/rejected": -1.553476095199585,
"loss": 1.3531,
"odds_ratio_loss": 0.6521779298782349,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": -0.12879131734371185,
"rewards/margins": 0.026556288823485374,
"rewards/rejected": -0.15534761548042297,
"sft_loss": 1.287913203239441,
"step": 540
},
{
"epoch": 0.8890684986866033,
"grad_norm": 7.186382293701172,
"learning_rate": 3.990869403351272e-06,
"logits/chosen": 251.8153839111328,
"logits/rejected": 251.8900604248047,
"logps/chosen": -1.268169641494751,
"logps/rejected": -1.511528491973877,
"loss": 1.3283,
"odds_ratio_loss": 0.6014903783798218,
"rewards/accuracies": 0.65625,
"rewards/chosen": -0.12681695818901062,
"rewards/margins": 0.024335889145731926,
"rewards/rejected": -0.1511528491973877,
"sft_loss": 1.268169641494751,
"step": 550
},
{
"epoch": 0.9052333804809052,
"grad_norm": 2.923800230026245,
"learning_rate": 3.956649764650206e-06,
"logits/chosen": 250.7381591796875,
"logits/rejected": 250.7707061767578,
"logps/chosen": -1.2698795795440674,
"logps/rejected": -1.4995825290679932,
"loss": 1.3379,
"odds_ratio_loss": 0.6804186105728149,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -0.12698796391487122,
"rewards/margins": 0.0229702927172184,
"rewards/rejected": -0.14995825290679932,
"sft_loss": 1.2698795795440674,
"step": 560
},
{
"epoch": 0.9213982622752072,
"grad_norm": 6.1557416915893555,
"learning_rate": 3.92201184133826e-06,
"logits/chosen": 250.94808959960938,
"logits/rejected": 251.642822265625,
"logps/chosen": -1.2907052040100098,
"logps/rejected": -1.54143226146698,
"loss": 1.3538,
"odds_ratio_loss": 0.6311336755752563,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -0.12907053530216217,
"rewards/margins": 0.025072699412703514,
"rewards/rejected": -0.15414324402809143,
"sft_loss": 1.2907052040100098,
"step": 570
},
{
"epoch": 0.937563144069509,
"grad_norm": 2.1665000915527344,
"learning_rate": 3.886965579876572e-06,
"logits/chosen": 252.3577423095703,
"logits/rejected": 252.0865478515625,
"logps/chosen": -1.2575817108154297,
"logps/rejected": -1.3686320781707764,
"loss": 1.3293,
"odds_ratio_loss": 0.7170482873916626,
"rewards/accuracies": 0.5062500238418579,
"rewards/chosen": -0.12575815618038177,
"rewards/margins": 0.011105048470199108,
"rewards/rejected": -0.13686320185661316,
"sft_loss": 1.2575817108154297,
"step": 580
},
{
"epoch": 0.9537280258638109,
"grad_norm": 2.289733648300171,
"learning_rate": 3.851521043982716e-06,
"logits/chosen": 251.7819061279297,
"logits/rejected": 251.52749633789062,
"logps/chosen": -1.2542387247085571,
"logps/rejected": -1.3954790830612183,
"loss": 1.3206,
"odds_ratio_loss": 0.6639243960380554,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.1254238784313202,
"rewards/margins": 0.014124047942459583,
"rewards/rejected": -0.1395479142665863,
"sft_loss": 1.2542387247085571,
"step": 590
},
{
"epoch": 0.9698929076581128,
"grad_norm": 2.7564313411712646,
"learning_rate": 3.81568841174086e-06,
"logits/chosen": 251.03280639648438,
"logits/rejected": 251.2174835205078,
"logps/chosen": -1.2807530164718628,
"logps/rejected": -1.5129293203353882,
"loss": 1.3482,
"odds_ratio_loss": 0.674277663230896,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -0.12807528674602509,
"rewards/margins": 0.023217635229229927,
"rewards/rejected": -0.15129292011260986,
"sft_loss": 1.2807530164718628,
"step": 600
},
{
"epoch": 0.9860577894524146,
"grad_norm": 2.1846888065338135,
"learning_rate": 3.7794779726790664e-06,
"logits/chosen": 249.8391571044922,
"logits/rejected": 250.3789520263672,
"logps/chosen": -1.1555012464523315,
"logps/rejected": -1.3768011331558228,
"loss": 1.2212,
"odds_ratio_loss": 0.6573610305786133,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": -0.11555011570453644,
"rewards/margins": 0.022129978984594345,
"rewards/rejected": -0.13768009841442108,
"sft_loss": 1.1555012464523315,
"step": 610
},
{
"epoch": 1.0022226712467166,
"grad_norm": 2.2191011905670166,
"learning_rate": 3.7429001248146096e-06,
"logits/chosen": 250.8198699951172,
"logits/rejected": 251.24819946289062,
"logps/chosen": -1.272541880607605,
"logps/rejected": -1.5292177200317383,
"loss": 1.3338,
"odds_ratio_loss": 0.6125348806381226,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.1272541880607605,
"rewards/margins": 0.0256675872951746,
"rewards/rejected": -0.15292176604270935,
"sft_loss": 1.272541880607605,
"step": 620
},
{
"epoch": 1.0183875530410185,
"grad_norm": 1.6834843158721924,
"learning_rate": 3.7059653716681227e-06,
"logits/chosen": 250.3338623046875,
"logits/rejected": 250.6593780517578,
"logps/chosen": -1.2664134502410889,
"logps/rejected": -1.469812035560608,
"loss": 1.3343,
"odds_ratio_loss": 0.6792756915092468,
"rewards/accuracies": 0.53125,
"rewards/chosen": -0.12664134800434113,
"rewards/margins": 0.020339861512184143,
"rewards/rejected": -0.14698120951652527,
"sft_loss": 1.2664134502410889,
"step": 630
},
{
"epoch": 1.0345524348353203,
"grad_norm": 5.188844203948975,
"learning_rate": 3.668684319247463e-06,
"logits/chosen": 249.46969604492188,
"logits/rejected": 250.1366729736328,
"logps/chosen": -1.1969501972198486,
"logps/rejected": -1.5598738193511963,
"loss": 1.2558,
"odds_ratio_loss": 0.588589072227478,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.11969500780105591,
"rewards/margins": 0.03629238158464432,
"rewards/rejected": -0.15598741173744202,
"sft_loss": 1.1969501972198486,
"step": 640
},
{
"epoch": 1.0507173166296222,
"grad_norm": 1.8501890897750854,
"learning_rate": 3.6310676730021373e-06,
"logits/chosen": 250.78857421875,
"logits/rejected": 250.8007354736328,
"logps/chosen": -1.2203996181488037,
"logps/rejected": -1.3524749279022217,
"loss": 1.2867,
"odds_ratio_loss": 0.662962794303894,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.12203995883464813,
"rewards/margins": 0.013207539916038513,
"rewards/rejected": -0.13524749875068665,
"sft_loss": 1.2203996181488037,
"step": 650
},
{
"epoch": 1.066882198423924,
"grad_norm": 3.5492091178894043,
"learning_rate": 3.593126234749178e-06,
"logits/chosen": 250.8761749267578,
"logits/rejected": 251.28622436523438,
"logps/chosen": -1.2661250829696655,
"logps/rejected": -1.455129861831665,
"loss": 1.3334,
"odds_ratio_loss": 0.6727336645126343,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -0.12661252915859222,
"rewards/margins": 0.01890045776963234,
"rewards/rejected": -0.14551296830177307,
"sft_loss": 1.2661250829696655,
"step": 660
},
{
"epoch": 1.083047080218226,
"grad_norm": 3.5715062618255615,
"learning_rate": 3.554870899571343e-06,
"logits/chosen": 252.4844512939453,
"logits/rejected": 252.82699584960938,
"logps/chosen": -1.2469182014465332,
"logps/rejected": -1.4401594400405884,
"loss": 1.3131,
"odds_ratio_loss": 0.6617658734321594,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.12469182908535004,
"rewards/margins": 0.019324112683534622,
"rewards/rejected": -0.14401593804359436,
"sft_loss": 1.2469182014465332,
"step": 670
},
{
"epoch": 1.0992119620125278,
"grad_norm": 4.318095684051514,
"learning_rate": 3.5163126526885373e-06,
"logits/chosen": 252.0143585205078,
"logits/rejected": 251.80081176757812,
"logps/chosen": -1.1914775371551514,
"logps/rejected": -1.4009137153625488,
"loss": 1.2573,
"odds_ratio_loss": 0.6579803824424744,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -0.11914775520563126,
"rewards/margins": 0.02094360813498497,
"rewards/rejected": -0.14009135961532593,
"sft_loss": 1.1914775371551514,
"step": 680
},
{
"epoch": 1.1153768438068297,
"grad_norm": 2.403775930404663,
"learning_rate": 3.4774625663033484e-06,
"logits/chosen": 251.2095184326172,
"logits/rejected": 251.48507690429688,
"logps/chosen": -1.2048381567001343,
"logps/rejected": -1.3877532482147217,
"loss": 1.27,
"odds_ratio_loss": 0.6517833471298218,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": -0.12048381567001343,
"rewards/margins": 0.018291514366865158,
"rewards/rejected": -0.13877533376216888,
"sft_loss": 1.2048381567001343,
"step": 690
},
{
"epoch": 1.1315417256011315,
"grad_norm": 1.7898093461990356,
"learning_rate": 3.4383317964216067e-06,
"logits/chosen": 252.33316040039062,
"logits/rejected": 252.1842498779297,
"logps/chosen": -1.1471569538116455,
"logps/rejected": -1.306755781173706,
"loss": 1.2157,
"odds_ratio_loss": 0.6855098009109497,
"rewards/accuracies": 0.53125,
"rewards/chosen": -0.11471569538116455,
"rewards/margins": 0.01595989242196083,
"rewards/rejected": -0.13067558407783508,
"sft_loss": 1.1471569538116455,
"step": 700
},
{
"epoch": 1.1477066073954334,
"grad_norm": 3.209373712539673,
"learning_rate": 3.398931579648877e-06,
"logits/chosen": 251.15170288085938,
"logits/rejected": 251.59976196289062,
"logps/chosen": -1.239712119102478,
"logps/rejected": -1.5323327779769897,
"loss": 1.3045,
"odds_ratio_loss": 0.6475890874862671,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.12397120893001556,
"rewards/margins": 0.029262065887451172,
"rewards/rejected": -0.15323328971862793,
"sft_loss": 1.239712119102478,
"step": 710
},
{
"epoch": 1.1638714891897353,
"grad_norm": 2.6601579189300537,
"learning_rate": 3.359273229963813e-06,
"logits/chosen": 250.285400390625,
"logits/rejected": 250.47323608398438,
"logps/chosen": -1.2064179182052612,
"logps/rejected": -1.3739216327667236,
"loss": 1.2742,
"odds_ratio_loss": 0.6774007081985474,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": -0.12064179033041,
"rewards/margins": 0.01675037480890751,
"rewards/rejected": -0.13739216327667236,
"sft_loss": 1.2064179182052612,
"step": 720
},
{
"epoch": 1.1800363709840371,
"grad_norm": 1.836297631263733,
"learning_rate": 3.319368135469285e-06,
"logits/chosen": 251.77334594726562,
"logits/rejected": 252.28701782226562,
"logps/chosen": -1.2479230165481567,
"logps/rejected": -1.4433178901672363,
"loss": 1.3175,
"odds_ratio_loss": 0.6954701542854309,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.12479230016469955,
"rewards/margins": 0.019539497792720795,
"rewards/rejected": -0.14433178305625916,
"sft_loss": 1.2479230165481567,
"step": 730
},
{
"epoch": 1.196201252778339,
"grad_norm": 3.1846110820770264,
"learning_rate": 3.279227755122228e-06,
"logits/chosen": 252.08438110351562,
"logits/rejected": 252.65792846679688,
"logps/chosen": -1.196380376815796,
"logps/rejected": -1.5170973539352417,
"loss": 1.2585,
"odds_ratio_loss": 0.6215213537216187,
"rewards/accuracies": 0.65625,
"rewards/chosen": -0.11963804066181183,
"rewards/margins": 0.032071683555841446,
"rewards/rejected": -0.15170973539352417,
"sft_loss": 1.196380376815796,
"step": 740
},
{
"epoch": 1.2123661345726409,
"grad_norm": 3.024951934814453,
"learning_rate": 3.2388636154431417e-06,
"logits/chosen": 253.1087646484375,
"logits/rejected": 253.23635864257812,
"logps/chosen": -1.3020392656326294,
"logps/rejected": -1.5343925952911377,
"loss": 1.3675,
"odds_ratio_loss": 0.654754638671875,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": -0.13020391762256622,
"rewards/margins": 0.02323536016047001,
"rewards/rejected": -0.1534392535686493,
"sft_loss": 1.3020392656326294,
"step": 750
},
{
"epoch": 1.2285310163669427,
"grad_norm": 2.166121482849121,
"learning_rate": 3.198287307206192e-06,
"logits/chosen": 251.711669921875,
"logits/rejected": 251.5684356689453,
"logps/chosen": -1.1889938116073608,
"logps/rejected": -1.4522913694381714,
"loss": 1.2499,
"odds_ratio_loss": 0.60938560962677,
"rewards/accuracies": 0.59375,
"rewards/chosen": -0.1188993826508522,
"rewards/margins": 0.02632974646985531,
"rewards/rejected": -0.14522913098335266,
"sft_loss": 1.1889938116073608,
"step": 760
},
{
"epoch": 1.2446958981612446,
"grad_norm": 1.8584887981414795,
"learning_rate": 3.157510482110856e-06,
"logits/chosen": 252.8727569580078,
"logits/rejected": 253.4295654296875,
"logps/chosen": -1.2046940326690674,
"logps/rejected": -1.360910177230835,
"loss": 1.2735,
"odds_ratio_loss": 0.6879505515098572,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -0.12046940624713898,
"rewards/margins": 0.01562163233757019,
"rewards/rejected": -0.13609102368354797,
"sft_loss": 1.2046940326690674,
"step": 770
},
{
"epoch": 1.2608607799555465,
"grad_norm": 1.6219208240509033,
"learning_rate": 3.116544849436077e-06,
"logits/chosen": 251.80764770507812,
"logits/rejected": 251.75509643554688,
"logps/chosen": -1.3175479173660278,
"logps/rejected": -1.6150630712509155,
"loss": 1.3813,
"odds_ratio_loss": 0.6378855109214783,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -0.1317548006772995,
"rewards/margins": 0.029751509428024292,
"rewards/rejected": -0.1615062952041626,
"sft_loss": 1.3175479173660278,
"step": 780
},
{
"epoch": 1.2770256617498483,
"grad_norm": 2.1420071125030518,
"learning_rate": 3.0754021726778848e-06,
"logits/chosen": 252.167724609375,
"logits/rejected": 251.9316864013672,
"logps/chosen": -1.1495087146759033,
"logps/rejected": -1.426129937171936,
"loss": 1.2132,
"odds_ratio_loss": 0.6372426748275757,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.11495087295770645,
"rewards/margins": 0.02766209840774536,
"rewards/rejected": -0.1426129937171936,
"sft_loss": 1.1495087146759033,
"step": 790
},
{
"epoch": 1.2931905435441502,
"grad_norm": 1.3823323249816895,
"learning_rate": 3.0340942661714463e-06,
"logits/chosen": 252.6959686279297,
"logits/rejected": 252.73464965820312,
"logps/chosen": -1.2912076711654663,
"logps/rejected": -1.4657213687896729,
"loss": 1.3573,
"odds_ratio_loss": 0.6610310673713684,
"rewards/accuracies": 0.543749988079071,
"rewards/chosen": -0.12912078201770782,
"rewards/margins": 0.017451368272304535,
"rewards/rejected": -0.14657214283943176,
"sft_loss": 1.2912076711654663,
"step": 800
},
{
"epoch": 1.3093554253384523,
"grad_norm": 3.4516756534576416,
"learning_rate": 2.992632991698512e-06,
"logits/chosen": 250.41928100585938,
"logits/rejected": 250.66513061523438,
"logps/chosen": -1.219699501991272,
"logps/rejected": -1.483235239982605,
"loss": 1.2828,
"odds_ratio_loss": 0.6311507821083069,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.12196997553110123,
"rewards/margins": 0.02635357342660427,
"rewards/rejected": -0.14832353591918945,
"sft_loss": 1.219699501991272,
"step": 810
},
{
"epoch": 1.3255203071327541,
"grad_norm": 2.465632677078247,
"learning_rate": 2.9510302550812537e-06,
"logits/chosen": 251.94296264648438,
"logits/rejected": 252.61587524414062,
"logps/chosen": -1.144325852394104,
"logps/rejected": -1.4354488849639893,
"loss": 1.2042,
"odds_ratio_loss": 0.5983381271362305,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -0.11443258821964264,
"rewards/margins": 0.02911229059100151,
"rewards/rejected": -0.14354488253593445,
"sft_loss": 1.144325852394104,
"step": 820
},
{
"epoch": 1.341685188927056,
"grad_norm": 3.969513416290283,
"learning_rate": 2.9092980027634325e-06,
"logits/chosen": 251.37832641601562,
"logits/rejected": 251.625244140625,
"logps/chosen": -1.1136391162872314,
"logps/rejected": -1.3801125288009644,
"loss": 1.1766,
"odds_ratio_loss": 0.6293498277664185,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.11136390268802643,
"rewards/margins": 0.026647353544831276,
"rewards/rejected": -0.13801124691963196,
"sft_loss": 1.1136391162872314,
"step": 830
},
{
"epoch": 1.3578500707213579,
"grad_norm": 1.7552839517593384,
"learning_rate": 2.867448218379927e-06,
"logits/chosen": 252.9868621826172,
"logits/rejected": 253.2499542236328,
"logps/chosen": -1.249079704284668,
"logps/rejected": -1.4685295820236206,
"loss": 1.3139,
"odds_ratio_loss": 0.6482545733451843,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -0.1249079704284668,
"rewards/margins": 0.02194499969482422,
"rewards/rejected": -0.14685297012329102,
"sft_loss": 1.249079704284668,
"step": 840
},
{
"epoch": 1.3740149525156597,
"grad_norm": 5.6061906814575195,
"learning_rate": 2.825492919315559e-06,
"logits/chosen": 252.72372436523438,
"logits/rejected": 252.32168579101562,
"logps/chosen": -1.2922828197479248,
"logps/rejected": -1.4327681064605713,
"loss": 1.3613,
"odds_ratio_loss": 0.6897528767585754,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -0.12922829389572144,
"rewards/margins": 0.01404851209372282,
"rewards/rejected": -0.14327679574489594,
"sft_loss": 1.2922828197479248,
"step": 850
},
{
"epoch": 1.3901798343099616,
"grad_norm": 2.2057290077209473,
"learning_rate": 2.7834441532542482e-06,
"logits/chosen": 251.51272583007812,
"logits/rejected": 251.97573852539062,
"logps/chosen": -1.1630654335021973,
"logps/rejected": -1.4224598407745361,
"loss": 1.2262,
"odds_ratio_loss": 0.6317997574806213,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -0.11630652844905853,
"rewards/margins": 0.025939440354704857,
"rewards/rejected": -0.14224597811698914,
"sft_loss": 1.1630654335021973,
"step": 860
},
{
"epoch": 1.4063447161042635,
"grad_norm": 2.0599286556243896,
"learning_rate": 2.74131399471945e-06,
"logits/chosen": 252.7571258544922,
"logits/rejected": 253.06008911132812,
"logps/chosen": -1.2314178943634033,
"logps/rejected": -1.404909372329712,
"loss": 1.297,
"odds_ratio_loss": 0.6555390357971191,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.12314176559448242,
"rewards/margins": 0.01734915003180504,
"rewards/rejected": -0.14049093425273895,
"sft_loss": 1.2314178943634033,
"step": 870
},
{
"epoch": 1.4225095978985653,
"grad_norm": 3.7026567459106445,
"learning_rate": 2.6991145416068947e-06,
"logits/chosen": 252.689697265625,
"logits/rejected": 252.87332153320312,
"logps/chosen": -1.2634754180908203,
"logps/rejected": -1.376312255859375,
"loss": 1.3339,
"odds_ratio_loss": 0.7037913799285889,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": -0.12634754180908203,
"rewards/margins": 0.011283671483397484,
"rewards/rejected": -0.13763120770454407,
"sft_loss": 1.2634754180908203,
"step": 880
},
{
"epoch": 1.4386744796928672,
"grad_norm": 2.7741122245788574,
"learning_rate": 2.6568579117106143e-06,
"logits/chosen": 251.893310546875,
"logits/rejected": 251.9792022705078,
"logps/chosen": -1.1909462213516235,
"logps/rejected": -1.444592833518982,
"loss": 1.257,
"odds_ratio_loss": 0.6606670618057251,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.11909462511539459,
"rewards/margins": 0.025364672765135765,
"rewards/rejected": -0.1444592922925949,
"sft_loss": 1.1909462213516235,
"step": 890
},
{
"epoch": 1.454839361487169,
"grad_norm": 1.2793887853622437,
"learning_rate": 2.6145562392432544e-06,
"logits/chosen": 253.50723266601562,
"logits/rejected": 253.42764282226562,
"logps/chosen": -1.2168656587600708,
"logps/rejected": -1.336360216140747,
"loss": 1.2887,
"odds_ratio_loss": 0.7181415557861328,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": -0.12168655544519424,
"rewards/margins": 0.01194946188479662,
"rewards/rejected": -0.13363602757453918,
"sft_loss": 1.2168656587600708,
"step": 900
},
{
"epoch": 1.471004243281471,
"grad_norm": 2.857558012008667,
"learning_rate": 2.5722216713516682e-06,
"logits/chosen": 252.78237915039062,
"logits/rejected": 253.788330078125,
"logps/chosen": -1.1416139602661133,
"logps/rejected": -1.3757555484771729,
"loss": 1.2043,
"odds_ratio_loss": 0.6271349787712097,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": -0.11416139453649521,
"rewards/margins": 0.023414146155118942,
"rewards/rejected": -0.13757555186748505,
"sft_loss": 1.1416139602661133,
"step": 910
},
{
"epoch": 1.4871691250757728,
"grad_norm": 2.625776529312134,
"learning_rate": 2.5298663646288064e-06,
"logits/chosen": 253.61221313476562,
"logits/rejected": 254.0120391845703,
"logps/chosen": -1.1546480655670166,
"logps/rejected": -1.4036109447479248,
"loss": 1.2201,
"odds_ratio_loss": 0.6542297601699829,
"rewards/accuracies": 0.543749988079071,
"rewards/chosen": -0.11546480655670166,
"rewards/margins": 0.024896297603845596,
"rewards/rejected": -0.14036110043525696,
"sft_loss": 1.1546480655670166,
"step": 920
},
{
"epoch": 1.503334006870075,
"grad_norm": 3.928030014038086,
"learning_rate": 2.487502481622879e-06,
"logits/chosen": 252.84619140625,
"logits/rejected": 253.71127319335938,
"logps/chosen": -1.2712576389312744,
"logps/rejected": -1.42746901512146,
"loss": 1.3413,
"odds_ratio_loss": 0.7003083229064941,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.12712574005126953,
"rewards/margins": 0.01562117226421833,
"rewards/rejected": -0.1427469402551651,
"sft_loss": 1.2712576389312744,
"step": 930
},
{
"epoch": 1.5194988886643768,
"grad_norm": 2.4900426864624023,
"learning_rate": 2.4451421873448253e-06,
"logits/chosen": 252.51846313476562,
"logits/rejected": 253.07400512695312,
"logps/chosen": -1.193199634552002,
"logps/rejected": -1.3677222728729248,
"loss": 1.2601,
"odds_ratio_loss": 0.6688076257705688,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": -0.11931997537612915,
"rewards/margins": 0.01745227724313736,
"rewards/rejected": -0.1367722451686859,
"sft_loss": 1.193199634552002,
"step": 940
},
{
"epoch": 1.5356637704586786,
"grad_norm": 6.85699987411499,
"learning_rate": 2.40279764577506e-06,
"logits/chosen": 253.85693359375,
"logits/rejected": 253.9010467529297,
"logps/chosen": -1.304840087890625,
"logps/rejected": -1.417873501777649,
"loss": 1.3741,
"odds_ratio_loss": 0.6923686861991882,
"rewards/accuracies": 0.543749988079071,
"rewards/chosen": -0.13048401474952698,
"rewards/margins": 0.011303339153528214,
"rewards/rejected": -0.1417873501777649,
"sft_loss": 1.304840087890625,
"step": 950
},
{
"epoch": 1.5518286522529805,
"grad_norm": 2.3570547103881836,
"learning_rate": 2.3604810163705242e-06,
"logits/chosen": 253.90060424804688,
"logits/rejected": 254.25430297851562,
"logps/chosen": -1.1358963251113892,
"logps/rejected": -1.3512394428253174,
"loss": 1.1966,
"odds_ratio_loss": 0.6068128943443298,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.11358962953090668,
"rewards/margins": 0.021534323692321777,
"rewards/rejected": -0.13512396812438965,
"sft_loss": 1.1358963251113892,
"step": 960
},
{
"epoch": 1.5679935340472824,
"grad_norm": 1.6715513467788696,
"learning_rate": 2.3182044505730364e-06,
"logits/chosen": 252.765380859375,
"logits/rejected": 252.7443389892578,
"logps/chosen": -1.0937732458114624,
"logps/rejected": -1.302191972732544,
"loss": 1.1567,
"odds_ratio_loss": 0.6288636922836304,
"rewards/accuracies": 0.543749988079071,
"rewards/chosen": -0.10937733948230743,
"rewards/margins": 0.020841870456933975,
"rewards/rejected": -0.13021919131278992,
"sft_loss": 1.0937732458114624,
"step": 970
},
{
"epoch": 1.5841584158415842,
"grad_norm": 1.8489584922790527,
"learning_rate": 2.275980088319941e-06,
"logits/chosen": 253.30712890625,
"logits/rejected": 253.5155487060547,
"logps/chosen": -1.149460792541504,
"logps/rejected": -1.2745110988616943,
"loss": 1.2198,
"odds_ratio_loss": 0.7036079168319702,
"rewards/accuracies": 0.5062500238418579,
"rewards/chosen": -0.11494608223438263,
"rewards/margins": 0.012505029328167439,
"rewards/rejected": -0.1274511069059372,
"sft_loss": 1.149460792541504,
"step": 980
},
{
"epoch": 1.600323297635886,
"grad_norm": 2.3143341541290283,
"learning_rate": 2.2338200545580577e-06,
"logits/chosen": 253.9146728515625,
"logits/rejected": 254.3609619140625,
"logps/chosen": -1.1358720064163208,
"logps/rejected": -1.409860372543335,
"loss": 1.203,
"odds_ratio_loss": 0.6715231537818909,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": -0.11358718574047089,
"rewards/margins": 0.02739885076880455,
"rewards/rejected": -0.14098605513572693,
"sft_loss": 1.1358720064163208,
"step": 990
},
{
"epoch": 1.616488179430188,
"grad_norm": 2.5078933238983154,
"learning_rate": 2.191736455761947e-06,
"logits/chosen": 252.4419708251953,
"logits/rejected": 252.6824493408203,
"logps/chosen": -1.102782964706421,
"logps/rejected": -1.295693039894104,
"loss": 1.1628,
"odds_ratio_loss": 0.5999386310577393,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": -0.11027830839157104,
"rewards/margins": 0.019290992990136147,
"rewards/rejected": -0.12956929206848145,
"sft_loss": 1.102782964706421,
"step": 1000
},
{
"epoch": 1.616488179430188,
"eval_logits/chosen": 252.82716369628906,
"eval_logits/rejected": 253.18104553222656,
"eval_logps/chosen": -1.2153432369232178,
"eval_logps/rejected": -1.446128010749817,
"eval_loss": 1.2833058834075928,
"eval_odds_ratio_loss": 0.6796271204948425,
"eval_rewards/accuracies": 0.5618181824684143,
"eval_rewards/chosen": -0.1215343102812767,
"eval_rewards/margins": 0.023078490048646927,
"eval_rewards/rejected": -0.14461281895637512,
"eval_runtime": 221.4361,
"eval_samples_per_second": 4.968,
"eval_sft_loss": 1.2153432369232178,
"eval_steps_per_second": 2.484,
"step": 1000
},
{
"epoch": 1.6326530612244898,
"grad_norm": 1.7511672973632812,
"learning_rate": 2.1497413764574673e-06,
"logits/chosen": 253.8401336669922,
"logits/rejected": 253.7457733154297,
"logps/chosen": -1.2121939659118652,
"logps/rejected": -1.4931201934814453,
"loss": 1.2703,
"odds_ratio_loss": 0.5808267593383789,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": -0.12121939659118652,
"rewards/margins": 0.028092628344893456,
"rewards/rejected": -0.14931201934814453,
"sft_loss": 1.2121939659118652,
"step": 1010
},
{
"epoch": 1.6488179430187917,
"grad_norm": 2.1624321937561035,
"learning_rate": 2.1078468757516395e-06,
"logits/chosen": 252.7372589111328,
"logits/rejected": 253.10342407226562,
"logps/chosen": -1.1226885318756104,
"logps/rejected": -1.302170991897583,
"loss": 1.1845,
"odds_ratio_loss": 0.6178861856460571,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": -0.11226886510848999,
"rewards/margins": 0.017948249354958534,
"rewards/rejected": -0.13021712005138397,
"sft_loss": 1.1226885318756104,
"step": 1020
},
{
"epoch": 1.6649828248130936,
"grad_norm": 2.5826563835144043,
"learning_rate": 2.0660649838698145e-06,
"logits/chosen": 255.34326171875,
"logits/rejected": 255.65859985351562,
"logps/chosen": -1.1558864116668701,
"logps/rejected": -1.3295384645462036,
"loss": 1.2211,
"odds_ratio_loss": 0.6525439023971558,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": -0.11558864265680313,
"rewards/margins": 0.01736520044505596,
"rewards/rejected": -0.13295385241508484,
"sft_loss": 1.1558864116668701,
"step": 1030
},
{
"epoch": 1.6811477066073954,
"grad_norm": 1.975549340248108,
"learning_rate": 2.0244076987011284e-06,
"logits/chosen": 255.1981964111328,
"logits/rejected": 255.7158966064453,
"logps/chosen": -1.2127221822738647,
"logps/rejected": -1.4685566425323486,
"loss": 1.2727,
"odds_ratio_loss": 0.6000550389289856,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.1212722510099411,
"rewards/margins": 0.025583425536751747,
"rewards/rejected": -0.1468556672334671,
"sft_loss": 1.2127221822738647,
"step": 1040
},
{
"epoch": 1.6973125884016973,
"grad_norm": 2.224191904067993,
"learning_rate": 1.982886982353251e-06,
"logits/chosen": 252.6818389892578,
"logits/rejected": 252.80859375,
"logps/chosen": -1.193681240081787,
"logps/rejected": -1.44672691822052,
"loss": 1.2608,
"odds_ratio_loss": 0.6715336441993713,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.11936812102794647,
"rewards/margins": 0.025304565206170082,
"rewards/rejected": -0.144672691822052,
"sft_loss": 1.193681240081787,
"step": 1050
},
{
"epoch": 1.7134774701959992,
"grad_norm": 2.571403980255127,
"learning_rate": 1.941514757717392e-06,
"logits/chosen": 253.2911376953125,
"logits/rejected": 254.0371551513672,
"logps/chosen": -1.2021260261535645,
"logps/rejected": -1.443331003189087,
"loss": 1.2653,
"odds_ratio_loss": 0.6321113705635071,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -0.1202125996351242,
"rewards/margins": 0.02412049099802971,
"rewards/rejected": -0.1443330943584442,
"sft_loss": 1.2021260261535645,
"step": 1060
},
{
"epoch": 1.729642351990301,
"grad_norm": 4.061903476715088,
"learning_rate": 1.9003029050445953e-06,
"logits/chosen": 254.00650024414062,
"logits/rejected": 254.38876342773438,
"logps/chosen": -1.2242114543914795,
"logps/rejected": -1.4163745641708374,
"loss": 1.2891,
"odds_ratio_loss": 0.6486276984214783,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.12242114543914795,
"rewards/margins": 0.019216306507587433,
"rewards/rejected": -0.14163745939731598,
"sft_loss": 1.2242114543914795,
"step": 1070
},
{
"epoch": 1.745807233784603,
"grad_norm": 2.371570110321045,
"learning_rate": 1.8592632585342523e-06,
"logits/chosen": 254.29745483398438,
"logits/rejected": 254.67745971679688,
"logps/chosen": -1.1612073183059692,
"logps/rejected": -1.4247183799743652,
"loss": 1.2246,
"odds_ratio_loss": 0.6341406106948853,
"rewards/accuracies": 0.543749988079071,
"rewards/chosen": -0.11612071841955185,
"rewards/margins": 0.026351114735007286,
"rewards/rejected": -0.1424718201160431,
"sft_loss": 1.1612073183059692,
"step": 1080
},
{
"epoch": 1.7619721155789048,
"grad_norm": 8.819137573242188,
"learning_rate": 1.8184076029358527e-06,
"logits/chosen": 253.06661987304688,
"logits/rejected": 252.38119506835938,
"logps/chosen": -1.161278486251831,
"logps/rejected": -1.2557886838912964,
"loss": 1.2286,
"odds_ratio_loss": 0.6734786033630371,
"rewards/accuracies": 0.53125,
"rewards/chosen": -0.1161278635263443,
"rewards/margins": 0.009451002813875675,
"rewards/rejected": -0.1255788505077362,
"sft_loss": 1.161278486251831,
"step": 1090
},
{
"epoch": 1.7781369973732066,
"grad_norm": 1.7281618118286133,
"learning_rate": 1.7777476701649318e-06,
"logits/chosen": 251.3661651611328,
"logits/rejected": 252.11477661132812,
"logps/chosen": -1.1861474514007568,
"logps/rejected": -1.3936630487442017,
"loss": 1.2518,
"odds_ratio_loss": 0.6561599373817444,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.11861474812030792,
"rewards/margins": 0.020751552656292915,
"rewards/rejected": -0.1393662989139557,
"sft_loss": 1.1861474514007568,
"step": 1100
},
{
"epoch": 1.7943018791675085,
"grad_norm": 3.3538103103637695,
"learning_rate": 1.7372951359341925e-06,
"logits/chosen": 253.0167236328125,
"logits/rejected": 253.53396606445312,
"logps/chosen": -1.137481451034546,
"logps/rejected": -1.2894176244735718,
"loss": 1.2055,
"odds_ratio_loss": 0.6805364489555359,
"rewards/accuracies": 0.518750011920929,
"rewards/chosen": -0.11374815553426743,
"rewards/margins": 0.01519359927624464,
"rewards/rejected": -0.12894175946712494,
"sft_loss": 1.137481451034546,
"step": 1110
},
{
"epoch": 1.8104667609618104,
"grad_norm": 3.6225833892822266,
"learning_rate": 1.6970616164007547e-06,
"logits/chosen": 252.6470489501953,
"logits/rejected": 252.9353485107422,
"logps/chosen": -1.1084340810775757,
"logps/rejected": -1.3413022756576538,
"loss": 1.1728,
"odds_ratio_loss": 0.6432042717933655,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.11084340512752533,
"rewards/margins": 0.02328682318329811,
"rewards/rejected": -0.13413023948669434,
"sft_loss": 1.1084340810775757,
"step": 1120
},
{
"epoch": 1.8266316427561122,
"grad_norm": 4.332692623138428,
"learning_rate": 1.6570586648305276e-06,
"logits/chosen": 253.6255645751953,
"logits/rejected": 253.76217651367188,
"logps/chosen": -1.1925103664398193,
"logps/rejected": -1.4342319965362549,
"loss": 1.2579,
"odds_ratio_loss": 0.6541949510574341,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -0.11925105005502701,
"rewards/margins": 0.024172160774469376,
"rewards/rejected": -0.14342321455478668,
"sft_loss": 1.1925103664398193,
"step": 1130
},
{
"epoch": 1.842796524550414,
"grad_norm": 3.238105535507202,
"learning_rate": 1.6172977682806151e-06,
"logits/chosen": 253.7568817138672,
"logits/rejected": 254.8863525390625,
"logps/chosen": -1.2200841903686523,
"logps/rejected": -1.4592787027359009,
"loss": 1.2837,
"odds_ratio_loss": 0.6365170478820801,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -0.12200842052698135,
"rewards/margins": 0.023919429630041122,
"rewards/rejected": -0.14592786133289337,
"sft_loss": 1.2200841903686523,
"step": 1140
},
{
"epoch": 1.858961406344716,
"grad_norm": 2.5219290256500244,
"learning_rate": 1.5777903443007586e-06,
"logits/chosen": 253.631103515625,
"logits/rejected": 253.82504272460938,
"logps/chosen": -1.235215425491333,
"logps/rejected": -1.4535129070281982,
"loss": 1.3023,
"odds_ratio_loss": 0.6708552241325378,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -0.12352155148983002,
"rewards/margins": 0.021829739212989807,
"rewards/rejected": -0.14535130560398102,
"sft_loss": 1.235215425491333,
"step": 1150
},
{
"epoch": 1.8751262881390178,
"grad_norm": 3.190958261489868,
"learning_rate": 1.5385477376547226e-06,
"logits/chosen": 255.1521759033203,
"logits/rejected": 255.2525634765625,
"logps/chosen": -1.229001760482788,
"logps/rejected": -1.4793845415115356,
"loss": 1.2891,
"odds_ratio_loss": 0.601111888885498,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -0.12290020287036896,
"rewards/margins": 0.02503824792802334,
"rewards/rejected": -0.14793843030929565,
"sft_loss": 1.229001760482788,
"step": 1160
},
{
"epoch": 1.89129116993332,
"grad_norm": 2.217510461807251,
"learning_rate": 1.4995812170625845e-06,
"logits/chosen": 253.1751251220703,
"logits/rejected": 253.72238159179688,
"logps/chosen": -1.2252581119537354,
"logps/rejected": -1.5921032428741455,
"loss": 1.2851,
"odds_ratio_loss": 0.598137617111206,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -0.12252581119537354,
"rewards/margins": 0.036684513092041016,
"rewards/rejected": -0.15921030938625336,
"sft_loss": 1.2252581119537354,
"step": 1170
},
{
"epoch": 1.9074560517276218,
"grad_norm": 3.0452287197113037,
"learning_rate": 1.4609019719648666e-06,
"logits/chosen": 254.07901000976562,
"logits/rejected": 254.59957885742188,
"logps/chosen": -1.2207356691360474,
"logps/rejected": -1.4706141948699951,
"loss": 1.2826,
"odds_ratio_loss": 0.6183902025222778,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": -0.12207356840372086,
"rewards/margins": 0.024987850338220596,
"rewards/rejected": -0.14706142246723175,
"sft_loss": 1.2207356691360474,
"step": 1180
},
{
"epoch": 1.9236209335219236,
"grad_norm": 4.679479122161865,
"learning_rate": 1.42252110930943e-06,
"logits/chosen": 252.7305450439453,
"logits/rejected": 252.6374969482422,
"logps/chosen": -1.064835786819458,
"logps/rejected": -1.2910759449005127,
"loss": 1.1283,
"odds_ratio_loss": 0.6346200704574585,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.1064835786819458,
"rewards/margins": 0.02262401580810547,
"rewards/rejected": -0.12910759449005127,
"sft_loss": 1.064835786819458,
"step": 1190
},
{
"epoch": 1.9397858153162255,
"grad_norm": 3.286461353302002,
"learning_rate": 1.3844496503620493e-06,
"logits/chosen": 253.310302734375,
"logits/rejected": 253.27023315429688,
"logps/chosen": -1.2112998962402344,
"logps/rejected": -1.3967139720916748,
"loss": 1.2737,
"odds_ratio_loss": 0.6242542862892151,
"rewards/accuracies": 0.543749988079071,
"rewards/chosen": -0.12112998962402344,
"rewards/margins": 0.01854141615331173,
"rewards/rejected": -0.1396714150905609,
"sft_loss": 1.2112998962402344,
"step": 1200
},
{
"epoch": 1.9559506971105274,
"grad_norm": 2.8077545166015625,
"learning_rate": 1.3466985275416081e-06,
"logits/chosen": 254.2769775390625,
"logits/rejected": 254.47360229492188,
"logps/chosen": -1.2563018798828125,
"logps/rejected": -1.4514508247375488,
"loss": 1.3239,
"odds_ratio_loss": 0.6757391691207886,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.1256301999092102,
"rewards/margins": 0.019514882937073708,
"rewards/rejected": -0.14514507353305817,
"sft_loss": 1.2563018798828125,
"step": 1210
},
{
"epoch": 1.9721155789048292,
"grad_norm": 2.275397777557373,
"learning_rate": 1.309278581280791e-06,
"logits/chosen": 253.5750274658203,
"logits/rejected": 253.99935913085938,
"logps/chosen": -1.1356334686279297,
"logps/rejected": -1.4314597845077515,
"loss": 1.1934,
"odds_ratio_loss": 0.5772610902786255,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.11356334388256073,
"rewards/margins": 0.029582645744085312,
"rewards/rejected": -0.14314597845077515,
"sft_loss": 1.1356334686279297,
"step": 1220
},
{
"epoch": 1.9882804606991311,
"grad_norm": 1.454276204109192,
"learning_rate": 1.272200556913199e-06,
"logits/chosen": 254.544677734375,
"logits/rejected": 254.67251586914062,
"logps/chosen": -1.1884077787399292,
"logps/rejected": -1.395115613937378,
"loss": 1.2599,
"odds_ratio_loss": 0.7147720456123352,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.1188407689332962,
"rewards/margins": 0.020670795813202858,
"rewards/rejected": -0.1395115852355957,
"sft_loss": 1.1884077787399292,
"step": 1230
},
{
"epoch": 2.004445342493433,
"grad_norm": 3.6475422382354736,
"learning_rate": 1.2354751015877698e-06,
"logits/chosen": 252.74777221679688,
"logits/rejected": 253.66641235351562,
"logps/chosen": -1.1167339086532593,
"logps/rejected": -1.4450454711914062,
"loss": 1.1791,
"odds_ratio_loss": 0.6237870454788208,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.11167339980602264,
"rewards/margins": 0.03283114731311798,
"rewards/rejected": -0.14450454711914062,
"sft_loss": 1.1167339086532593,
"step": 1240
},
{
"epoch": 2.020610224287735,
"grad_norm": 3.52698016166687,
"learning_rate": 1.1991127612113945e-06,
"logits/chosen": 254.6741943359375,
"logits/rejected": 254.9825897216797,
"logps/chosen": -1.1792643070220947,
"logps/rejected": -1.4326034784317017,
"loss": 1.2387,
"odds_ratio_loss": 0.5942111611366272,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -0.11792641878128052,
"rewards/margins": 0.02533392235636711,
"rewards/rejected": -0.14326035976409912,
"sft_loss": 1.1792643070220947,
"step": 1250
},
{
"epoch": 2.036775106082037,
"grad_norm": 3.579160690307617,
"learning_rate": 1.1631239774206035e-06,
"logits/chosen": 253.5153350830078,
"logits/rejected": 253.659912109375,
"logps/chosen": -1.1673438549041748,
"logps/rejected": -1.4458467960357666,
"loss": 1.2314,
"odds_ratio_loss": 0.6410170793533325,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -0.11673440039157867,
"rewards/margins": 0.027850273996591568,
"rewards/rejected": -0.14458468556404114,
"sft_loss": 1.1673438549041748,
"step": 1260
},
{
"epoch": 2.052939987876339,
"grad_norm": 3.0812463760375977,
"learning_rate": 1.1275190845831978e-06,
"logits/chosen": 254.5985870361328,
"logits/rejected": 254.2525177001953,
"logps/chosen": -1.1342524290084839,
"logps/rejected": -1.3948237895965576,
"loss": 1.1925,
"odds_ratio_loss": 0.5824798345565796,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.11342523247003555,
"rewards/margins": 0.026057133451104164,
"rewards/rejected": -0.13948237895965576,
"sft_loss": 1.1342524290084839,
"step": 1270
},
{
"epoch": 2.0691048696706407,
"grad_norm": 2.4549710750579834,
"learning_rate": 1.0923083068306778e-06,
"logits/chosen": 254.7982635498047,
"logits/rejected": 255.1488494873047,
"logps/chosen": -1.1482160091400146,
"logps/rejected": -1.4850049018859863,
"loss": 1.2055,
"odds_ratio_loss": 0.5724589824676514,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -0.11482159048318863,
"rewards/margins": 0.033678896725177765,
"rewards/rejected": -0.1485004872083664,
"sft_loss": 1.1482160091400146,
"step": 1280
},
{
"epoch": 2.0852697514649425,
"grad_norm": 1.778605580329895,
"learning_rate": 1.0575017551223348e-06,
"logits/chosen": 253.03524780273438,
"logits/rejected": 253.53366088867188,
"logps/chosen": -1.087461233139038,
"logps/rejected": -1.321656584739685,
"loss": 1.1524,
"odds_ratio_loss": 0.6496065855026245,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": -0.10874611139297485,
"rewards/margins": 0.023419544100761414,
"rewards/rejected": -0.13216565549373627,
"sft_loss": 1.087461233139038,
"step": 1290
},
{
"epoch": 2.1014346332592444,
"grad_norm": 1.522445797920227,
"learning_rate": 1.023109424341833e-06,
"logits/chosen": 254.5054168701172,
"logits/rejected": 255.06100463867188,
"logps/chosen": -1.2142359018325806,
"logps/rejected": -1.448194146156311,
"loss": 1.2781,
"odds_ratio_loss": 0.6386287808418274,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.12142357975244522,
"rewards/margins": 0.023395827040076256,
"rewards/rejected": -0.14481940865516663,
"sft_loss": 1.2142359018325806,
"step": 1300
},
{
"epoch": 2.1175995150535463,
"grad_norm": 2.577580690383911,
"learning_rate": 9.891411904271273e-07,
"logits/chosen": 254.14779663085938,
"logits/rejected": 254.121826171875,
"logps/chosen": -1.100303292274475,
"logps/rejected": -1.3276548385620117,
"loss": 1.1632,
"odds_ratio_loss": 0.628852128982544,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.11003033071756363,
"rewards/margins": 0.02273516170680523,
"rewards/rejected": -0.1327655017375946,
"sft_loss": 1.100303292274475,
"step": 1310
},
{
"epoch": 2.133764396847848,
"grad_norm": 1.5676363706588745,
"learning_rate": 9.556068075345363e-07,
"logits/chosen": 255.0603485107422,
"logits/rejected": 255.1541748046875,
"logps/chosen": -1.1494947671890259,
"logps/rejected": -1.3283547163009644,
"loss": 1.2112,
"odds_ratio_loss": 0.6172733306884766,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -0.11494947969913483,
"rewards/margins": 0.017885997891426086,
"rewards/rejected": -0.1328354775905609,
"sft_loss": 1.1494947671890259,
"step": 1320
},
{
"epoch": 2.14992927864215,
"grad_norm": 1.964956521987915,
"learning_rate": 9.225159052377838e-07,
"logits/chosen": 254.16183471679688,
"logits/rejected": 254.3532257080078,
"logps/chosen": -1.1823852062225342,
"logps/rejected": -1.4285701513290405,
"loss": 1.2468,
"odds_ratio_loss": 0.6443654894828796,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.11823852360248566,
"rewards/margins": 0.024618491530418396,
"rewards/rejected": -0.14285701513290405,
"sft_loss": 1.1823852062225342,
"step": 1330
},
{
"epoch": 2.166094160436452,
"grad_norm": 4.320827484130859,
"learning_rate": 8.898779857628184e-07,
"logits/chosen": 253.94775390625,
"logits/rejected": 253.83328247070312,
"logps/chosen": -1.0813744068145752,
"logps/rejected": -1.289052963256836,
"loss": 1.1442,
"odds_ratio_loss": 0.6285432577133179,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": -0.10813745111227036,
"rewards/margins": 0.020767847076058388,
"rewards/rejected": -0.1289052963256836,
"sft_loss": 1.0813744068145752,
"step": 1340
},
{
"epoch": 2.1822590422307537,
"grad_norm": 1.9166721105575562,
"learning_rate": 8.577024212591975e-07,
"logits/chosen": 255.42715454101562,
"logits/rejected": 255.6570281982422,
"logps/chosen": -1.2112232446670532,
"logps/rejected": -1.4022705554962158,
"loss": 1.2754,
"odds_ratio_loss": 0.6421025991439819,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -0.1211223155260086,
"rewards/margins": 0.019104719161987305,
"rewards/rejected": -0.1402270495891571,
"sft_loss": 1.2112232446670532,
"step": 1350
},
{
"epoch": 2.1984239240250556,
"grad_norm": 2.231593370437622,
"learning_rate": 8.259984511088276e-07,
"logits/chosen": 252.95217895507812,
"logits/rejected": 253.24972534179688,
"logps/chosen": -1.1978521347045898,
"logps/rejected": -1.4085915088653564,
"loss": 1.2643,
"odds_ratio_loss": 0.6643570065498352,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.11978521198034286,
"rewards/margins": 0.021073944866657257,
"rewards/rejected": -0.14085917174816132,
"sft_loss": 1.1978521347045898,
"step": 1360
},
{
"epoch": 2.2145888058193575,
"grad_norm": 1.891438364982605,
"learning_rate": 7.947751792728237e-07,
"logits/chosen": 252.89163208007812,
"logits/rejected": 252.9368133544922,
"logps/chosen": -1.1386100053787231,
"logps/rejected": -1.3931357860565186,
"loss": 1.2001,
"odds_ratio_loss": 0.6149393320083618,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -0.11386100947856903,
"rewards/margins": 0.02545258030295372,
"rewards/rejected": -0.13931360840797424,
"sft_loss": 1.1386100053787231,
"step": 1370
},
{
"epoch": 2.2307536876136593,
"grad_norm": 11.893668174743652,
"learning_rate": 7.640415716772626e-07,
"logits/chosen": 254.87893676757812,
"logits/rejected": 254.9901123046875,
"logps/chosen": -1.2301312685012817,
"logps/rejected": -1.4715359210968018,
"loss": 1.2969,
"odds_ratio_loss": 0.6676316857337952,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.12301311641931534,
"rewards/margins": 0.02414046786725521,
"rewards/rejected": -0.1471536010503769,
"sft_loss": 1.2301312685012817,
"step": 1380
},
{
"epoch": 2.246918569407961,
"grad_norm": 1.648759365081787,
"learning_rate": 7.338064536385722e-07,
"logits/chosen": 253.27536010742188,
"logits/rejected": 253.39450073242188,
"logps/chosen": -1.172890543937683,
"logps/rejected": -1.4573774337768555,
"loss": 1.2306,
"odds_ratio_loss": 0.5775946974754333,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.11728904396295547,
"rewards/margins": 0.02844870649278164,
"rewards/rejected": -0.14573773741722107,
"sft_loss": 1.172890543937683,
"step": 1390
},
{
"epoch": 2.263083451202263,
"grad_norm": 2.644590377807617,
"learning_rate": 7.040785073292883e-07,
"logits/chosen": 254.41006469726562,
"logits/rejected": 254.5663299560547,
"logps/chosen": -1.243436336517334,
"logps/rejected": -1.455594778060913,
"loss": 1.3115,
"odds_ratio_loss": 0.6802859902381897,
"rewards/accuracies": 0.543749988079071,
"rewards/chosen": -0.12434364855289459,
"rewards/margins": 0.021215861663222313,
"rewards/rejected": -0.14555948972702026,
"sft_loss": 1.243436336517334,
"step": 1400
},
{
"epoch": 2.279248332996565,
"grad_norm": 3.2420878410339355,
"learning_rate": 6.748662692849297e-07,
"logits/chosen": 253.18417358398438,
"logits/rejected": 254.11279296875,
"logps/chosen": -1.1471028327941895,
"logps/rejected": -1.5119264125823975,
"loss": 1.2055,
"odds_ratio_loss": 0.5840214490890503,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.11471028625965118,
"rewards/margins": 0.03648235648870468,
"rewards/rejected": -0.15119265019893646,
"sft_loss": 1.1471028327941895,
"step": 1410
},
{
"epoch": 2.295413214790867,
"grad_norm": 4.394900798797607,
"learning_rate": 6.46178127952686e-07,
"logits/chosen": 254.48062133789062,
"logits/rejected": 254.9115753173828,
"logps/chosen": -1.1684550046920776,
"logps/rejected": -1.38850998878479,
"loss": 1.2286,
"odds_ratio_loss": 0.601581871509552,
"rewards/accuracies": 0.59375,
"rewards/chosen": -0.1168455109000206,
"rewards/margins": 0.022005509585142136,
"rewards/rejected": -0.13885101675987244,
"sft_loss": 1.1684550046920776,
"step": 1420
},
{
"epoch": 2.3115780965851687,
"grad_norm": 2.162309169769287,
"learning_rate": 6.180223212826289e-07,
"logits/chosen": 253.58633422851562,
"logits/rejected": 253.8734588623047,
"logps/chosen": -1.1496318578720093,
"logps/rejected": -1.364654779434204,
"loss": 1.213,
"odds_ratio_loss": 0.633824348449707,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -0.11496319621801376,
"rewards/margins": 0.021502288058400154,
"rewards/rejected": -0.13646547496318817,
"sft_loss": 1.1496318578720093,
"step": 1430
},
{
"epoch": 2.3277429783794705,
"grad_norm": 1.522935152053833,
"learning_rate": 5.904069343621443e-07,
"logits/chosen": 255.19082641601562,
"logits/rejected": 255.11474609375,
"logps/chosen": -1.1330249309539795,
"logps/rejected": -1.386264443397522,
"loss": 1.195,
"odds_ratio_loss": 0.6193984746932983,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.11330248415470123,
"rewards/margins": 0.025323981419205666,
"rewards/rejected": -0.13862647116184235,
"sft_loss": 1.1330249309539795,
"step": 1440
},
{
"epoch": 2.3439078601737724,
"grad_norm": 2.982042074203491,
"learning_rate": 5.633398970942544e-07,
"logits/chosen": 254.9903564453125,
"logits/rejected": 255.1248321533203,
"logps/chosen": -1.1471365690231323,
"logps/rejected": -1.3323371410369873,
"loss": 1.2137,
"odds_ratio_loss": 0.6657688617706299,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.114713653922081,
"rewards/margins": 0.01852005161345005,
"rewards/rejected": -0.1332337111234665,
"sft_loss": 1.1471365690231323,
"step": 1450
},
{
"epoch": 2.3600727419680743,
"grad_norm": 3.2461607456207275,
"learning_rate": 5.368289819205069e-07,
"logits/chosen": 254.27847290039062,
"logits/rejected": 255.0779571533203,
"logps/chosen": -1.11297607421875,
"logps/rejected": -1.3122992515563965,
"loss": 1.1805,
"odds_ratio_loss": 0.6752298474311829,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": -0.1112975925207138,
"rewards/margins": 0.019932324066758156,
"rewards/rejected": -0.1312299221754074,
"sft_loss": 1.11297607421875,
"step": 1460
},
{
"epoch": 2.376237623762376,
"grad_norm": 2.7223591804504395,
"learning_rate": 5.108818015890785e-07,
"logits/chosen": 255.47216796875,
"logits/rejected": 255.6534423828125,
"logps/chosen": -1.2367959022521973,
"logps/rejected": -1.4369171857833862,
"loss": 1.3029,
"odds_ratio_loss": 0.6609222888946533,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": -0.12367959320545197,
"rewards/margins": 0.020012129098176956,
"rewards/rejected": -0.14369171857833862,
"sft_loss": 1.2367959022521973,
"step": 1470
},
{
"epoch": 2.392402505556678,
"grad_norm": 2.912327527999878,
"learning_rate": 4.855058069687291e-07,
"logits/chosen": 253.0759735107422,
"logits/rejected": 253.6752471923828,
"logps/chosen": -1.111169695854187,
"logps/rejected": -1.4328919649124146,
"loss": 1.1697,
"odds_ratio_loss": 0.5851289629936218,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": -0.11111698299646378,
"rewards/margins": 0.03217221051454544,
"rewards/rejected": -0.1432892084121704,
"sft_loss": 1.111169695854187,
"step": 1480
},
{
"epoch": 2.40856738735098,
"grad_norm": 2.995020627975464,
"learning_rate": 4.607082849092523e-07,
"logits/chosen": 253.9425811767578,
"logits/rejected": 254.0526580810547,
"logps/chosen": -1.2607060670852661,
"logps/rejected": -1.4026780128479004,
"loss": 1.3291,
"odds_ratio_loss": 0.6835006475448608,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.12607058882713318,
"rewards/margins": 0.014197212643921375,
"rewards/rejected": -0.14026781916618347,
"sft_loss": 1.2607060670852661,
"step": 1490
},
{
"epoch": 2.4247322691452817,
"grad_norm": 3.760835886001587,
"learning_rate": 4.3649635614901405e-07,
"logits/chosen": 254.07601928710938,
"logits/rejected": 254.50985717773438,
"logps/chosen": -1.1233417987823486,
"logps/rejected": -1.2859314680099487,
"loss": 1.1874,
"odds_ratio_loss": 0.6403074860572815,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -0.11233416944742203,
"rewards/margins": 0.01625899039208889,
"rewards/rejected": -0.12859316170215607,
"sft_loss": 1.1233417987823486,
"step": 1500
},
{
"epoch": 2.4247322691452817,
"eval_logits/chosen": 253.6037139892578,
"eval_logits/rejected": 253.95994567871094,
"eval_logps/chosen": -1.1982638835906982,
"eval_logps/rejected": -1.4377323389053345,
"eval_loss": 1.265723466873169,
"eval_odds_ratio_loss": 0.6745957732200623,
"eval_rewards/accuracies": 0.5699999928474426,
"eval_rewards/chosen": -0.11982638388872147,
"eval_rewards/margins": 0.023946860805153847,
"eval_rewards/rejected": -0.14377322793006897,
"eval_runtime": 221.0804,
"eval_samples_per_second": 4.976,
"eval_sft_loss": 1.1982638835906982,
"eval_steps_per_second": 2.488,
"step": 1500
},
{
"epoch": 2.4408971509395836,
"grad_norm": 2.074381113052368,
"learning_rate": 4.128769732701973e-07,
"logits/chosen": 254.7397918701172,
"logits/rejected": 254.7943572998047,
"logps/chosen": -1.1791332960128784,
"logps/rejected": -1.4214551448822021,
"loss": 1.2445,
"odds_ratio_loss": 0.6540807485580444,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": -0.11791334301233292,
"rewards/margins": 0.02423218823969364,
"rewards/rejected": -0.14214551448822021,
"sft_loss": 1.1791332960128784,
"step": 1510
},
{
"epoch": 2.4570620327338855,
"grad_norm": 3.3741965293884277,
"learning_rate": 3.8985691870233046e-07,
"logits/chosen": 254.44534301757812,
"logits/rejected": 254.93307495117188,
"logps/chosen": -1.209789514541626,
"logps/rejected": -1.4953523874282837,
"loss": 1.2744,
"odds_ratio_loss": 0.645904004573822,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -0.1209789514541626,
"rewards/margins": 0.02855629101395607,
"rewards/rejected": -0.14953525364398956,
"sft_loss": 1.209789514541626,
"step": 1520
},
{
"epoch": 2.4732269145281873,
"grad_norm": 4.19984245300293,
"learning_rate": 3.6744280277467904e-07,
"logits/chosen": 253.19186401367188,
"logits/rejected": 253.959228515625,
"logps/chosen": -1.1898527145385742,
"logps/rejected": -1.418869972229004,
"loss": 1.2551,
"odds_ratio_loss": 0.6521813273429871,
"rewards/accuracies": 0.543749988079071,
"rewards/chosen": -0.11898528039455414,
"rewards/margins": 0.022901728749275208,
"rewards/rejected": -0.14188699424266815,
"sft_loss": 1.1898527145385742,
"step": 1530
},
{
"epoch": 2.489391796322489,
"grad_norm": 3.145732879638672,
"learning_rate": 3.456410618180503e-07,
"logits/chosen": 252.92282104492188,
"logits/rejected": 253.6022491455078,
"logps/chosen": -1.0681793689727783,
"logps/rejected": -1.446299433708191,
"loss": 1.1295,
"odds_ratio_loss": 0.6130428910255432,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": -0.10681793838739395,
"rewards/margins": 0.0378120057284832,
"rewards/rejected": -0.14462995529174805,
"sft_loss": 1.0681793689727783,
"step": 1540
},
{
"epoch": 2.5055566781167915,
"grad_norm": 2.4530389308929443,
"learning_rate": 3.244579563165753e-07,
"logits/chosen": 252.8817138671875,
"logits/rejected": 253.0416259765625,
"logps/chosen": -1.1208980083465576,
"logps/rejected": -1.4661897420883179,
"loss": 1.1805,
"odds_ratio_loss": 0.5958081483840942,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": -0.11208979785442352,
"rewards/margins": 0.034529171884059906,
"rewards/rejected": -0.14661899209022522,
"sft_loss": 1.1208980083465576,
"step": 1550
},
{
"epoch": 2.521721559911093,
"grad_norm": 1.9151691198349,
"learning_rate": 3.038995691099697e-07,
"logits/chosen": 252.8979949951172,
"logits/rejected": 253.34262084960938,
"logps/chosen": -1.2310686111450195,
"logps/rejected": -1.493896484375,
"loss": 1.2954,
"odds_ratio_loss": 0.6432778239250183,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.12310687452554703,
"rewards/margins": 0.02628278359770775,
"rewards/rejected": -0.14938965439796448,
"sft_loss": 1.2310686111450195,
"step": 1560
},
{
"epoch": 2.5378864417053952,
"grad_norm": 4.0270304679870605,
"learning_rate": 2.839718036468192e-07,
"logits/chosen": 255.1189422607422,
"logits/rejected": 255.6085662841797,
"logps/chosen": -1.2376972436904907,
"logps/rejected": -1.4336402416229248,
"loss": 1.306,
"odds_ratio_loss": 0.6828280091285706,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -0.12376973778009415,
"rewards/margins": 0.019594285637140274,
"rewards/rejected": -0.14336401224136353,
"sft_loss": 1.2376972436904907,
"step": 1570
},
{
"epoch": 2.5540513234996967,
"grad_norm": 2.8464980125427246,
"learning_rate": 2.646803822893723e-07,
"logits/chosen": 254.5944366455078,
"logits/rejected": 254.6045379638672,
"logps/chosen": -1.1911519765853882,
"logps/rejected": -1.4335906505584717,
"loss": 1.255,
"odds_ratio_loss": 0.6387141346931458,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.1191151887178421,
"rewards/margins": 0.024243878200650215,
"rewards/rejected": -0.14335909485816956,
"sft_loss": 1.1911519765853882,
"step": 1580
},
{
"epoch": 2.570216205293999,
"grad_norm": 2.3468587398529053,
"learning_rate": 2.460308446703341e-07,
"logits/chosen": 255.0273895263672,
"logits/rejected": 255.33963012695312,
"logps/chosen": -1.1678217649459839,
"logps/rejected": -1.3280802965164185,
"loss": 1.2319,
"odds_ratio_loss": 0.6405949592590332,
"rewards/accuracies": 0.59375,
"rewards/chosen": -0.11678217351436615,
"rewards/margins": 0.016025854274630547,
"rewards/rejected": -0.13280804455280304,
"sft_loss": 1.1678217649459839,
"step": 1590
},
{
"epoch": 2.5863810870883004,
"grad_norm": 2.52286434173584,
"learning_rate": 2.2802854610213143e-07,
"logits/chosen": 253.905517578125,
"logits/rejected": 254.2148895263672,
"logps/chosen": -1.0991406440734863,
"logps/rejected": -1.5185635089874268,
"loss": 1.1551,
"odds_ratio_loss": 0.5599113702774048,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": -0.10991404950618744,
"rewards/margins": 0.041942298412323,
"rewards/rejected": -0.15185634791851044,
"sft_loss": 1.0991406440734863,
"step": 1600
},
{
"epoch": 2.6025459688826027,
"grad_norm": 6.791391849517822,
"learning_rate": 2.106786560391072e-07,
"logits/chosen": 253.763671875,
"logits/rejected": 253.8026580810547,
"logps/chosen": -1.2003083229064941,
"logps/rejected": -1.4066941738128662,
"loss": 1.2644,
"odds_ratio_loss": 0.6404808163642883,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.12003083527088165,
"rewards/margins": 0.020638594403862953,
"rewards/rejected": -0.14066943526268005,
"sft_loss": 1.2003083229064941,
"step": 1610
},
{
"epoch": 2.6187108506769046,
"grad_norm": 2.421247720718384,
"learning_rate": 1.9398615659308255e-07,
"logits/chosen": 254.5044403076172,
"logits/rejected": 255.14804077148438,
"logps/chosen": -1.145989179611206,
"logps/rejected": -1.2900917530059814,
"loss": 1.2124,
"odds_ratio_loss": 0.663682222366333,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": -0.11459891498088837,
"rewards/margins": 0.014410244300961494,
"rewards/rejected": -0.1290091574192047,
"sft_loss": 1.145989179611206,
"step": 1620
},
{
"epoch": 2.6348757324712064,
"grad_norm": 2.5792062282562256,
"learning_rate": 1.7795584110272184e-07,
"logits/chosen": 254.8577117919922,
"logits/rejected": 254.72903442382812,
"logps/chosen": -1.1733181476593018,
"logps/rejected": -1.3872268199920654,
"loss": 1.2372,
"odds_ratio_loss": 0.6392764449119568,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.11733181774616241,
"rewards/margins": 0.021390849724411964,
"rewards/rejected": -0.13872265815734863,
"sft_loss": 1.1733181476593018,
"step": 1630
},
{
"epoch": 2.6510406142655083,
"grad_norm": 4.67177152633667,
"learning_rate": 1.6259231275709636e-07,
"logits/chosen": 254.6771697998047,
"logits/rejected": 254.81906127929688,
"logps/chosen": -1.1654198169708252,
"logps/rejected": -1.331291913986206,
"loss": 1.2329,
"odds_ratio_loss": 0.6746650338172913,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.11654196679592133,
"rewards/margins": 0.016587218269705772,
"rewards/rejected": -0.13312919437885284,
"sft_loss": 1.1654198169708252,
"step": 1640
},
{
"epoch": 2.66720549605981,
"grad_norm": 2.415234327316284,
"learning_rate": 1.478999832738548e-07,
"logits/chosen": 253.84658813476562,
"logits/rejected": 254.2846221923828,
"logps/chosen": -1.1557317972183228,
"logps/rejected": -1.4208415746688843,
"loss": 1.2199,
"odds_ratio_loss": 0.6416669487953186,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": -0.11557319015264511,
"rewards/margins": 0.026510965079069138,
"rewards/rejected": -0.14208415150642395,
"sft_loss": 1.1557317972183228,
"step": 1650
},
{
"epoch": 2.683370377854112,
"grad_norm": 2.4805846214294434,
"learning_rate": 1.338830716323769e-07,
"logits/chosen": 253.04605102539062,
"logits/rejected": 253.3199005126953,
"logps/chosen": -1.1211105585098267,
"logps/rejected": -1.3149446249008179,
"loss": 1.1846,
"odds_ratio_loss": 0.6351101994514465,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": -0.11211104691028595,
"rewards/margins": 0.019383419305086136,
"rewards/rejected": -0.13149447739124298,
"sft_loss": 1.1211105585098267,
"step": 1660
},
{
"epoch": 2.699535259648414,
"grad_norm": 4.832085609436035,
"learning_rate": 1.205456028622723e-07,
"logits/chosen": 254.3978729248047,
"logits/rejected": 254.5135498046875,
"logps/chosen": -1.0987465381622314,
"logps/rejected": -1.4824903011322021,
"loss": 1.157,
"odds_ratio_loss": 0.582126259803772,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.1098746508359909,
"rewards/margins": 0.038374386727809906,
"rewards/rejected": -0.1482490599155426,
"sft_loss": 1.0987465381622314,
"step": 1670
},
{
"epoch": 2.7157001414427158,
"grad_norm": 1.8406291007995605,
"learning_rate": 1.0789140688756805e-07,
"logits/chosen": 254.9768524169922,
"logits/rejected": 255.4145965576172,
"logps/chosen": -1.1487500667572021,
"logps/rejected": -1.416771650314331,
"loss": 1.2068,
"odds_ratio_loss": 0.5807241201400757,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.11487500369548798,
"rewards/margins": 0.026802152395248413,
"rewards/rejected": -0.14167717099189758,
"sft_loss": 1.1487500667572021,
"step": 1680
},
{
"epoch": 2.7318650232370176,
"grad_norm": 9.125133514404297,
"learning_rate": 9.592411742693098e-08,
"logits/chosen": 253.77490234375,
"logits/rejected": 253.99368286132812,
"logps/chosen": -1.2172834873199463,
"logps/rejected": -1.371734857559204,
"loss": 1.2881,
"odds_ratio_loss": 0.7082632780075073,
"rewards/accuracies": 0.518750011920929,
"rewards/chosen": -0.12172834575176239,
"rewards/margins": 0.015445133671164513,
"rewards/rejected": -0.13717348873615265,
"sft_loss": 1.2172834873199463,
"step": 1690
},
{
"epoch": 2.7480299050313195,
"grad_norm": 1.7780921459197998,
"learning_rate": 8.464717095022168e-08,
"logits/chosen": 251.9024658203125,
"logits/rejected": 253.18222045898438,
"logps/chosen": -1.1575608253479004,
"logps/rejected": -1.4255679845809937,
"loss": 1.2219,
"odds_ratio_loss": 0.6432427167892456,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.115756094455719,
"rewards/margins": 0.026800716295838356,
"rewards/rejected": -0.1425568014383316,
"sft_loss": 1.1575608253479004,
"step": 1700
},
{
"epoch": 2.7641947868256214,
"grad_norm": 2.707679271697998,
"learning_rate": 7.406380569169841e-08,
"logits/chosen": 254.63046264648438,
"logits/rejected": 255.2324676513672,
"logps/chosen": -1.21084725856781,
"logps/rejected": -1.339179277420044,
"loss": 1.278,
"odds_ratio_loss": 0.6712638139724731,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": -0.12108473479747772,
"rewards/margins": 0.012833192944526672,
"rewards/rejected": -0.1339179426431656,
"sft_loss": 1.21084725856781,
"step": 1710
},
{
"epoch": 2.7803596686199232,
"grad_norm": 9.321467399597168,
"learning_rate": 6.417706072013808e-08,
"logits/chosen": 255.14419555664062,
"logits/rejected": 255.5975341796875,
"logps/chosen": -1.1439273357391357,
"logps/rejected": -1.3444888591766357,
"loss": 1.2084,
"odds_ratio_loss": 0.644874095916748,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -0.1143927350640297,
"rewards/margins": 0.020056165754795074,
"rewards/rejected": -0.13444891571998596,
"sft_loss": 1.1439273357391357,
"step": 1720
},
{
"epoch": 2.796524550414225,
"grad_norm": 4.651902675628662,
"learning_rate": 5.498977506615294e-08,
"logits/chosen": 254.2846221923828,
"logits/rejected": 255.0900421142578,
"logps/chosen": -1.2078436613082886,
"logps/rejected": -1.3565863370895386,
"loss": 1.2795,
"odds_ratio_loss": 0.7164067029953003,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.12078437954187393,
"rewards/margins": 0.014874264597892761,
"rewards/rejected": -0.1356586515903473,
"sft_loss": 1.2078436613082886,
"step": 1730
},
{
"epoch": 2.812689432208527,
"grad_norm": 2.109281301498413,
"learning_rate": 4.6504586906947756e-08,
"logits/chosen": 255.70327758789062,
"logits/rejected": 255.75326538085938,
"logps/chosen": -1.204192876815796,
"logps/rejected": -1.3763076066970825,
"loss": 1.2661,
"odds_ratio_loss": 0.6194810271263123,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -0.12041930109262466,
"rewards/margins": 0.01721145212650299,
"rewards/rejected": -0.13763076066970825,
"sft_loss": 1.204192876815796,
"step": 1740
},
{
"epoch": 2.828854314002829,
"grad_norm": 11.28996467590332,
"learning_rate": 3.8723932808754914e-08,
"logits/chosen": 254.12826538085938,
"logits/rejected": 254.1260528564453,
"logps/chosen": -1.2869278192520142,
"logps/rejected": -1.4129821062088013,
"loss": 1.3563,
"odds_ratio_loss": 0.6939128637313843,
"rewards/accuracies": 0.518750011920929,
"rewards/chosen": -0.12869277596473694,
"rewards/margins": 0.012605440802872181,
"rewards/rejected": -0.14129820466041565,
"sft_loss": 1.2869278192520142,
"step": 1750
},
{
"epoch": 2.8450191957971307,
"grad_norm": 4.297213077545166,
"learning_rate": 3.1650047027158014e-08,
"logits/chosen": 254.026123046875,
"logits/rejected": 254.0368194580078,
"logps/chosen": -1.1498368978500366,
"logps/rejected": -1.3505172729492188,
"loss": 1.212,
"odds_ratio_loss": 0.6218072175979614,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -0.11498367786407471,
"rewards/margins": 0.020068055018782616,
"rewards/rejected": -0.13505175709724426,
"sft_loss": 1.1498368978500366,
"step": 1760
},
{
"epoch": 2.8611840775914326,
"grad_norm": 3.5106639862060547,
"learning_rate": 2.5284960865517848e-08,
"logits/chosen": 253.19677734375,
"logits/rejected": 253.49380493164062,
"logps/chosen": -1.0654290914535522,
"logps/rejected": -1.4208238124847412,
"loss": 1.1222,
"odds_ratio_loss": 0.5680567026138306,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.1065429076552391,
"rewards/margins": 0.035539474338293076,
"rewards/rejected": -0.14208237826824188,
"sft_loss": 1.0654290914535522,
"step": 1770
},
{
"epoch": 2.8773489593857344,
"grad_norm": 1.9635688066482544,
"learning_rate": 1.9630502091670388e-08,
"logits/chosen": 254.31576538085938,
"logits/rejected": 254.73184204101562,
"logps/chosen": -1.16495680809021,
"logps/rejected": -1.438716173171997,
"loss": 1.2222,
"odds_ratio_loss": 0.5726233720779419,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.11649568378925323,
"rewards/margins": 0.027375921607017517,
"rewards/rejected": -0.14387162029743195,
"sft_loss": 1.16495680809021,
"step": 1780
},
{
"epoch": 2.8935138411800363,
"grad_norm": 7.4126458168029785,
"learning_rate": 1.4688294413074677e-08,
"logits/chosen": 253.7817840576172,
"logits/rejected": 254.4253692626953,
"logps/chosen": -1.081469178199768,
"logps/rejected": -1.3936357498168945,
"loss": 1.1414,
"odds_ratio_loss": 0.5996376872062683,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -0.10814690589904785,
"rewards/margins": 0.03121664747595787,
"rewards/rejected": -0.13936355710029602,
"sft_loss": 1.081469178199768,
"step": 1790
},
{
"epoch": 2.909678722974338,
"grad_norm": 1.8516889810562134,
"learning_rate": 1.0459757010556626e-08,
"logits/chosen": 252.57345581054688,
"logits/rejected": 252.3190155029297,
"logps/chosen": -1.173514723777771,
"logps/rejected": -1.303662657737732,
"loss": 1.2401,
"odds_ratio_loss": 0.6658231019973755,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": -0.1173514723777771,
"rewards/margins": 0.013014810159802437,
"rewards/rejected": -0.13036629557609558,
"sft_loss": 1.173514723777771,
"step": 1800
},
{
"epoch": 2.92584360476864,
"grad_norm": 7.74896764755249,
"learning_rate": 6.94610413078306e-09,
"logits/chosen": 253.01602172851562,
"logits/rejected": 253.71841430664062,
"logps/chosen": -1.2020865678787231,
"logps/rejected": -1.5033478736877441,
"loss": 1.2671,
"odds_ratio_loss": 0.6497219204902649,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.12020864337682724,
"rewards/margins": 0.030126124620437622,
"rewards/rejected": -0.15033479034900665,
"sft_loss": 1.2020865678787231,
"step": 1810
},
{
"epoch": 2.942008486562942,
"grad_norm": 2.0265376567840576,
"learning_rate": 4.14834473758563e-09,
"logits/chosen": 252.50656127929688,
"logits/rejected": 252.5145721435547,
"logps/chosen": -1.0974245071411133,
"logps/rejected": -1.4048916101455688,
"loss": 1.1558,
"odds_ratio_loss": 0.5838974714279175,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": -0.10974244773387909,
"rewards/margins": 0.030746713280677795,
"rewards/rejected": -0.14048916101455688,
"sft_loss": 1.0974245071411133,
"step": 1820
},
{
"epoch": 2.9581733683572438,
"grad_norm": 2.291332721710205,
"learning_rate": 2.067282222230349e-09,
"logits/chosen": 254.31192016601562,
"logits/rejected": 254.5823974609375,
"logps/chosen": -1.1228351593017578,
"logps/rejected": -1.4580986499786377,
"loss": 1.1841,
"odds_ratio_loss": 0.6124657392501831,
"rewards/accuracies": 0.59375,
"rewards/chosen": -0.11228351294994354,
"rewards/margins": 0.03352636098861694,
"rewards/rejected": -0.1458098590373993,
"sft_loss": 1.1228351593017578,
"step": 1830
},
{
"epoch": 2.9743382501515456,
"grad_norm": 8.377201080322266,
"learning_rate": 7.035141727212979e-10,
"logits/chosen": 252.5790252685547,
"logits/rejected": 253.33993530273438,
"logps/chosen": -1.062239646911621,
"logps/rejected": -1.3180148601531982,
"loss": 1.1223,
"odds_ratio_loss": 0.6001896858215332,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -0.10622396320104599,
"rewards/margins": 0.025577524676918983,
"rewards/rejected": -0.13180148601531982,
"sft_loss": 1.062239646911621,
"step": 1840
},
{
"epoch": 2.9905031319458475,
"grad_norm": 5.447793006896973,
"learning_rate": 5.743220219761592e-11,
"logits/chosen": 254.24691772460938,
"logits/rejected": 254.78369140625,
"logps/chosen": -1.195462942123413,
"logps/rejected": -1.4066945314407349,
"loss": 1.2627,
"odds_ratio_loss": 0.6723325252532959,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.1195463091135025,
"rewards/margins": 0.021123168990015984,
"rewards/rejected": -0.14066946506500244,
"sft_loss": 1.195462942123413,
"step": 1850
},
{
"epoch": 2.9969690846635686,
"step": 1854,
"total_flos": 2.1935611788745114e+18,
"train_loss": 1.3469306265266197,
"train_runtime": 24131.5713,
"train_samples_per_second": 1.231,
"train_steps_per_second": 0.077
}
],
"logging_steps": 10,
"max_steps": 1854,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"total_flos": 2.1935611788745114e+18,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}