llama3-L1-SFT-L2-KTO / trainer_state.json
EllieS's picture
Model save
e224aaf verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9998751404669747,
"eval_steps": 1000,
"global_step": 2002,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.000499438132101386,
"grad_norm": 0.22265625,
"learning_rate": 2.4875621890547265e-08,
"logits/chosen": -0.3009346127510071,
"logits/rejected": -0.224898099899292,
"logps/chosen": -43.235816955566406,
"logps/rejected": -65.95542907714844,
"loss": 0.5,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 1
},
{
"epoch": 0.00499438132101386,
"grad_norm": 0.1904296875,
"learning_rate": 2.4875621890547267e-07,
"logits/chosen": -0.4162670373916626,
"logits/rejected": -0.31764352321624756,
"logps/chosen": -43.73904037475586,
"logps/rejected": -88.3354263305664,
"loss": 0.4999,
"rewards/accuracies": 0.5277777910232544,
"rewards/chosen": 0.0004759904695674777,
"rewards/margins": 0.0009994357824325562,
"rewards/rejected": -0.0005234453710727394,
"step": 10
},
{
"epoch": 0.00998876264202772,
"grad_norm": 0.16796875,
"learning_rate": 4.975124378109453e-07,
"logits/chosen": -0.41128048300743103,
"logits/rejected": -0.3287343382835388,
"logps/chosen": -43.18193054199219,
"logps/rejected": -69.37371063232422,
"loss": 0.4999,
"rewards/accuracies": 0.625,
"rewards/chosen": 8.649445953778923e-05,
"rewards/margins": 0.000692047062329948,
"rewards/rejected": -0.0006055526318959892,
"step": 20
},
{
"epoch": 0.014983143963041578,
"grad_norm": 0.25,
"learning_rate": 7.462686567164179e-07,
"logits/chosen": -0.4024788439273834,
"logits/rejected": -0.3096240162849426,
"logps/chosen": -42.980751037597656,
"logps/rejected": -73.10075378417969,
"loss": 0.4999,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": 0.0004937038174830377,
"rewards/margins": 0.0007297725533135235,
"rewards/rejected": -0.00023606869217474014,
"step": 30
},
{
"epoch": 0.01997752528405544,
"grad_norm": 0.208984375,
"learning_rate": 9.950248756218907e-07,
"logits/chosen": -0.41356319189071655,
"logits/rejected": -0.34054869413375854,
"logps/chosen": -43.257789611816406,
"logps/rejected": -69.32649230957031,
"loss": 0.4998,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": 0.0002575941034592688,
"rewards/margins": 0.0018243074882775545,
"rewards/rejected": -0.0015667133266106248,
"step": 40
},
{
"epoch": 0.024971906605069295,
"grad_norm": 0.26953125,
"learning_rate": 1.2437810945273632e-06,
"logits/chosen": -0.4217616021633148,
"logits/rejected": -0.3440130352973938,
"logps/chosen": -44.67601776123047,
"logps/rejected": -78.74809265136719,
"loss": 0.4995,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": 0.0004624166467692703,
"rewards/margins": 0.002841049339622259,
"rewards/rejected": -0.00237863278016448,
"step": 50
},
{
"epoch": 0.029966287926083156,
"grad_norm": 0.158203125,
"learning_rate": 1.4925373134328358e-06,
"logits/chosen": -0.4335503578186035,
"logits/rejected": -0.3408567011356354,
"logps/chosen": -43.363746643066406,
"logps/rejected": -77.2335433959961,
"loss": 0.4991,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": 0.0020871085580438375,
"rewards/margins": 0.0050177304074168205,
"rewards/rejected": -0.0029306220822036266,
"step": 60
},
{
"epoch": 0.034960669247097016,
"grad_norm": 0.1572265625,
"learning_rate": 1.7412935323383088e-06,
"logits/chosen": -0.4507155418395996,
"logits/rejected": -0.35845330357551575,
"logps/chosen": -42.748069763183594,
"logps/rejected": -73.00779724121094,
"loss": 0.4982,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": 0.004088289104402065,
"rewards/margins": 0.009887892752885818,
"rewards/rejected": -0.005799603182822466,
"step": 70
},
{
"epoch": 0.03995505056811088,
"grad_norm": 0.1943359375,
"learning_rate": 1.9900497512437813e-06,
"logits/chosen": -0.41265735030174255,
"logits/rejected": -0.32930153608322144,
"logps/chosen": -42.023521423339844,
"logps/rejected": -82.02639770507812,
"loss": 0.497,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.008388923481106758,
"rewards/margins": 0.015186095610260963,
"rewards/rejected": -0.006797172129154205,
"step": 80
},
{
"epoch": 0.04494943188912474,
"grad_norm": 0.181640625,
"learning_rate": 2.238805970149254e-06,
"logits/chosen": -0.42406344413757324,
"logits/rejected": -0.32654517889022827,
"logps/chosen": -43.199241638183594,
"logps/rejected": -79.2525405883789,
"loss": 0.4952,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.015419301576912403,
"rewards/margins": 0.025097712874412537,
"rewards/rejected": -0.009678413160145283,
"step": 90
},
{
"epoch": 0.04994381321013859,
"grad_norm": 0.232421875,
"learning_rate": 2.4875621890547264e-06,
"logits/chosen": -0.4186275601387024,
"logits/rejected": -0.31876617670059204,
"logps/chosen": -41.526851654052734,
"logps/rejected": -73.03739929199219,
"loss": 0.4934,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.020998705178499222,
"rewards/margins": 0.03922630846500397,
"rewards/rejected": -0.018227603286504745,
"step": 100
},
{
"epoch": 0.05493819453115245,
"grad_norm": 0.1630859375,
"learning_rate": 2.736318407960199e-06,
"logits/chosen": -0.3820754289627075,
"logits/rejected": -0.3049188256263733,
"logps/chosen": -40.626625061035156,
"logps/rejected": -77.20478057861328,
"loss": 0.4909,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.026516741141676903,
"rewards/margins": 0.0446377769112587,
"rewards/rejected": -0.018121037632226944,
"step": 110
},
{
"epoch": 0.05993257585216631,
"grad_norm": 0.1455078125,
"learning_rate": 2.9850746268656716e-06,
"logits/chosen": -0.4118029475212097,
"logits/rejected": -0.3379635214805603,
"logps/chosen": -40.367244720458984,
"logps/rejected": -72.1778564453125,
"loss": 0.4878,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.034462034702301025,
"rewards/margins": 0.07104991376399994,
"rewards/rejected": -0.036587879061698914,
"step": 120
},
{
"epoch": 0.06492695717318017,
"grad_norm": 0.1416015625,
"learning_rate": 3.233830845771145e-06,
"logits/chosen": -0.38510891795158386,
"logits/rejected": -0.2871672511100769,
"logps/chosen": -39.84120559692383,
"logps/rejected": -77.8514175415039,
"loss": 0.4845,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.033992547541856766,
"rewards/margins": 0.09460695832967758,
"rewards/rejected": -0.06061442568898201,
"step": 130
},
{
"epoch": 0.06992133849419403,
"grad_norm": 0.2197265625,
"learning_rate": 3.4825870646766175e-06,
"logits/chosen": -0.3648582696914673,
"logits/rejected": -0.2659669816493988,
"logps/chosen": -40.6865234375,
"logps/rejected": -84.98823547363281,
"loss": 0.4808,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.04041652753949165,
"rewards/margins": 0.11247670650482178,
"rewards/rejected": -0.07206018269062042,
"step": 140
},
{
"epoch": 0.07491571981520789,
"grad_norm": 0.173828125,
"learning_rate": 3.73134328358209e-06,
"logits/chosen": -0.4185262620449066,
"logits/rejected": -0.31970107555389404,
"logps/chosen": -40.132545471191406,
"logps/rejected": -80.09419250488281,
"loss": 0.4784,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.04452138394117355,
"rewards/margins": 0.12835349142551422,
"rewards/rejected": -0.08383210748434067,
"step": 150
},
{
"epoch": 0.07991010113622175,
"grad_norm": 0.130859375,
"learning_rate": 3.980099502487563e-06,
"logits/chosen": -0.37403732538223267,
"logits/rejected": -0.27564138174057007,
"logps/chosen": -39.31542205810547,
"logps/rejected": -90.21852111816406,
"loss": 0.4747,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.044510699808597565,
"rewards/margins": 0.15536533296108246,
"rewards/rejected": -0.11085464060306549,
"step": 160
},
{
"epoch": 0.08490448245723561,
"grad_norm": 0.1572265625,
"learning_rate": 4.228855721393035e-06,
"logits/chosen": -0.35230112075805664,
"logits/rejected": -0.2606234848499298,
"logps/chosen": -38.46342086791992,
"logps/rejected": -85.07556915283203,
"loss": 0.4689,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.04909727722406387,
"rewards/margins": 0.20509441196918488,
"rewards/rejected": -0.1559971272945404,
"step": 170
},
{
"epoch": 0.08989886377824947,
"grad_norm": 0.2421875,
"learning_rate": 4.477611940298508e-06,
"logits/chosen": -0.3421555161476135,
"logits/rejected": -0.2503698766231537,
"logps/chosen": -39.706153869628906,
"logps/rejected": -85.4745101928711,
"loss": 0.4643,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.0517905056476593,
"rewards/margins": 0.2295013666152954,
"rewards/rejected": -0.1777108609676361,
"step": 180
},
{
"epoch": 0.09489324509926333,
"grad_norm": 0.2197265625,
"learning_rate": 4.72636815920398e-06,
"logits/chosen": -0.2977878451347351,
"logits/rejected": -0.17351695895195007,
"logps/chosen": -38.165069580078125,
"logps/rejected": -104.08354187011719,
"loss": 0.4516,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.0498540997505188,
"rewards/margins": 0.33853739500045776,
"rewards/rejected": -0.28868329524993896,
"step": 190
},
{
"epoch": 0.09988762642027718,
"grad_norm": 0.484375,
"learning_rate": 4.975124378109453e-06,
"logits/chosen": -0.2946663498878479,
"logits/rejected": -0.17343321442604065,
"logps/chosen": -37.89108657836914,
"logps/rejected": -119.83811950683594,
"loss": 0.4244,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.06007291004061699,
"rewards/margins": 0.5597599148750305,
"rewards/rejected": -0.49968695640563965,
"step": 200
},
{
"epoch": 0.10488200774129104,
"grad_norm": 0.271484375,
"learning_rate": 4.999691923599309e-06,
"logits/chosen": -0.24224761128425598,
"logits/rejected": -0.10646134614944458,
"logps/chosen": -38.592735290527344,
"logps/rejected": -158.98190307617188,
"loss": 0.3761,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.052448518574237823,
"rewards/margins": 0.9627677202224731,
"rewards/rejected": -0.9103191494941711,
"step": 210
},
{
"epoch": 0.1098763890623049,
"grad_norm": 0.369140625,
"learning_rate": 4.998627065620946e-06,
"logits/chosen": -0.20557060837745667,
"logits/rejected": -0.019889693707227707,
"logps/chosen": -39.04503631591797,
"logps/rejected": -309.40240478515625,
"loss": 0.3162,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.05903823301196098,
"rewards/margins": 2.2989630699157715,
"rewards/rejected": -2.239924907684326,
"step": 220
},
{
"epoch": 0.11487077038331876,
"grad_norm": 0.1435546875,
"learning_rate": 4.996801946581365e-06,
"logits/chosen": -0.08062759786844254,
"logits/rejected": 0.12686052918434143,
"logps/chosen": -38.807472229003906,
"logps/rejected": -391.716552734375,
"loss": 0.2979,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.03988034278154373,
"rewards/margins": 3.2561073303222656,
"rewards/rejected": -3.216226577758789,
"step": 230
},
{
"epoch": 0.11986515170433262,
"grad_norm": 0.173828125,
"learning_rate": 4.99421712181231e-06,
"logits/chosen": -0.09656897932291031,
"logits/rejected": 0.20183369517326355,
"logps/chosen": -40.211158752441406,
"logps/rejected": -563.9237060546875,
"loss": 0.2787,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.03454852104187012,
"rewards/margins": 4.958992958068848,
"rewards/rejected": -4.924445152282715,
"step": 240
},
{
"epoch": 0.12485953302534648,
"grad_norm": 0.19140625,
"learning_rate": 4.990873377802351e-06,
"logits/chosen": -0.04213310405611992,
"logits/rejected": 0.26729267835617065,
"logps/chosen": -38.64299011230469,
"logps/rejected": -672.4085693359375,
"loss": 0.2738,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.04478804022073746,
"rewards/margins": 6.028790473937988,
"rewards/rejected": -5.984002113342285,
"step": 250
},
{
"epoch": 0.12985391434636034,
"grad_norm": 0.1591796875,
"learning_rate": 4.986771731957569e-06,
"logits/chosen": -0.013924488797783852,
"logits/rejected": 0.32576116919517517,
"logps/chosen": -38.04896926879883,
"logps/rejected": -677.6256713867188,
"loss": 0.269,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.04615269601345062,
"rewards/margins": 6.120830535888672,
"rewards/rejected": -6.07467794418335,
"step": 260
},
{
"epoch": 0.1348482956673742,
"grad_norm": 0.1240234375,
"learning_rate": 4.981913432291989e-06,
"logits/chosen": -0.0022221256513148546,
"logits/rejected": 0.3353291451931,
"logps/chosen": -36.26408004760742,
"logps/rejected": -657.3780517578125,
"loss": 0.2603,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.07836371660232544,
"rewards/margins": 5.838679790496826,
"rewards/rejected": -5.760315895080566,
"step": 270
},
{
"epoch": 0.13984267698838806,
"grad_norm": 0.138671875,
"learning_rate": 4.976299957047846e-06,
"logits/chosen": -0.008776476606726646,
"logits/rejected": 0.35888582468032837,
"logps/chosen": -34.863895416259766,
"logps/rejected": -787.3309326171875,
"loss": 0.2619,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.0779188945889473,
"rewards/margins": 7.239710330963135,
"rewards/rejected": -7.1617913246154785,
"step": 280
},
{
"epoch": 0.1448370583094019,
"grad_norm": 0.2373046875,
"learning_rate": 4.9699330142458e-06,
"logits/chosen": -0.005239410791546106,
"logits/rejected": 0.3836653232574463,
"logps/chosen": -30.54348373413086,
"logps/rejected": -689.7301635742188,
"loss": 0.2517,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.1276303231716156,
"rewards/margins": 6.348529815673828,
"rewards/rejected": -6.2208991050720215,
"step": 290
},
{
"epoch": 0.14983143963041579,
"grad_norm": 0.15625,
"learning_rate": 4.96281454116523e-06,
"logits/chosen": -0.018074408173561096,
"logits/rejected": 0.34884509444236755,
"logps/chosen": -19.059213638305664,
"logps/rejected": -759.7054443359375,
"loss": 0.2355,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.24109096825122833,
"rewards/margins": 7.104301452636719,
"rewards/rejected": -6.863211154937744,
"step": 300
},
{
"epoch": 0.15482582095142963,
"grad_norm": 0.14453125,
"learning_rate": 4.954946703754777e-06,
"logits/chosen": -0.022661946713924408,
"logits/rejected": 0.36587223410606384,
"logps/chosen": -14.688285827636719,
"logps/rejected": -652.6716918945312,
"loss": 0.2303,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.2812207341194153,
"rewards/margins": 6.1413116455078125,
"rewards/rejected": -5.860090255737305,
"step": 310
},
{
"epoch": 0.1598202022724435,
"grad_norm": 0.06591796875,
"learning_rate": 4.946331895973308e-06,
"logits/chosen": 0.027700275182724,
"logits/rejected": 0.4779927134513855,
"logps/chosen": -13.264841079711914,
"logps/rejected": -853.7429809570312,
"loss": 0.2306,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.29832005500793457,
"rewards/margins": 7.9851837158203125,
"rewards/rejected": -7.686862945556641,
"step": 320
},
{
"epoch": 0.16481458359345735,
"grad_norm": 0.1298828125,
"learning_rate": 4.936972739061503e-06,
"logits/chosen": 0.028876056894659996,
"logits/rejected": 0.4520367980003357,
"logps/chosen": -14.747647285461426,
"logps/rejected": -819.2418212890625,
"loss": 0.2308,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.29899871349334717,
"rewards/margins": 7.518294334411621,
"rewards/rejected": -7.219296455383301,
"step": 330
},
{
"epoch": 0.16980896491447123,
"grad_norm": 0.2353515625,
"learning_rate": 4.926872080744284e-06,
"logits/chosen": 0.09099732339382172,
"logits/rejected": 0.6329769492149353,
"logps/chosen": -14.6112699508667,
"logps/rejected": -978.19482421875,
"loss": 0.2205,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.290539413690567,
"rewards/margins": 9.353235244750977,
"rewards/rejected": -9.06269645690918,
"step": 340
},
{
"epoch": 0.17480334623548507,
"grad_norm": 0.0673828125,
"learning_rate": 4.9160329943643335e-06,
"logits/chosen": 0.10238673537969589,
"logits/rejected": 0.6276119947433472,
"logps/chosen": -13.135534286499023,
"logps/rejected": -862.1193237304688,
"loss": 0.2233,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.2938804030418396,
"rewards/margins": 8.220571517944336,
"rewards/rejected": -7.92669153213501,
"step": 350
},
{
"epoch": 0.17979772755649895,
"grad_norm": 0.064453125,
"learning_rate": 4.904458777946967e-06,
"logits/chosen": 0.023329418152570724,
"logits/rejected": 0.6091981530189514,
"logps/chosen": -13.777295112609863,
"logps/rejected": -1096.3214111328125,
"loss": 0.221,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.2997492849826813,
"rewards/margins": 10.409059524536133,
"rewards/rejected": -10.1093111038208,
"step": 360
},
{
"epoch": 0.1847921088775128,
"grad_norm": 0.1201171875,
"learning_rate": 4.892152953196633e-06,
"logits/chosen": 0.029097210615873337,
"logits/rejected": 0.650887131690979,
"logps/chosen": -13.820713996887207,
"logps/rejected": -1171.0504150390625,
"loss": 0.2235,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.2969459295272827,
"rewards/margins": 11.168619155883789,
"rewards/rejected": -10.871672630310059,
"step": 370
},
{
"epoch": 0.18978649019852667,
"grad_norm": 0.06640625,
"learning_rate": 4.879119264425366e-06,
"logits/chosen": 0.11170516163110733,
"logits/rejected": 0.7552271485328674,
"logps/chosen": -13.031651496887207,
"logps/rejected": -990.927734375,
"loss": 0.2191,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.30227065086364746,
"rewards/margins": 9.518648147583008,
"rewards/rejected": -9.216377258300781,
"step": 380
},
{
"epoch": 0.19478087151954052,
"grad_norm": 0.0272216796875,
"learning_rate": 4.865361677413489e-06,
"logits/chosen": 0.10295484960079193,
"logits/rejected": 0.6912266612052917,
"logps/chosen": -14.148368835449219,
"logps/rejected": -973.99365234375,
"loss": 0.2236,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.30019253492355347,
"rewards/margins": 9.242959022521973,
"rewards/rejected": -8.942765235900879,
"step": 390
},
{
"epoch": 0.19977525284055436,
"grad_norm": 0.048583984375,
"learning_rate": 4.850884378202947e-06,
"logits/chosen": 0.12218449264764786,
"logits/rejected": 0.7848892211914062,
"logps/chosen": -13.857281684875488,
"logps/rejected": -1093.4356689453125,
"loss": 0.2224,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.30381980538368225,
"rewards/margins": 10.443190574645996,
"rewards/rejected": -10.139370918273926,
"step": 400
},
{
"epoch": 0.20476963416156824,
"grad_norm": 0.10888671875,
"learning_rate": 4.8356917718236125e-06,
"logits/chosen": 0.16129298508167267,
"logits/rejected": 0.83033287525177,
"logps/chosen": -13.215472221374512,
"logps/rejected": -1056.891357421875,
"loss": 0.2254,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.30162861943244934,
"rewards/margins": 10.163104057312012,
"rewards/rejected": -9.861475944519043,
"step": 410
},
{
"epoch": 0.20976401548258208,
"grad_norm": 0.05712890625,
"learning_rate": 4.8197884809529575e-06,
"logits/chosen": 0.18466398119926453,
"logits/rejected": 0.8971255421638489,
"logps/chosen": -14.178210258483887,
"logps/rejected": -1095.867431640625,
"loss": 0.2192,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.29163575172424316,
"rewards/margins": 10.56762409210205,
"rewards/rejected": -10.275988578796387,
"step": 420
},
{
"epoch": 0.21475839680359596,
"grad_norm": 0.078125,
"learning_rate": 4.803179344509505e-06,
"logits/chosen": 0.17180819809436798,
"logits/rejected": 0.9859398603439331,
"logps/chosen": -14.1710844039917,
"logps/rejected": -1132.571044921875,
"loss": 0.2206,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.2903401851654053,
"rewards/margins": 10.91908073425293,
"rewards/rejected": -10.628740310668945,
"step": 430
},
{
"epoch": 0.2197527781246098,
"grad_norm": 0.0341796875,
"learning_rate": 4.785869416180489e-06,
"logits/chosen": 0.18128976225852966,
"logits/rejected": 0.9951160550117493,
"logps/chosen": -13.373617172241211,
"logps/rejected": -1239.9248046875,
"loss": 0.2185,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.3014771342277527,
"rewards/margins": 11.977083206176758,
"rewards/rejected": -11.675604820251465,
"step": 440
},
{
"epoch": 0.22474715944562368,
"grad_norm": 0.035400390625,
"learning_rate": 4.767863962884156e-06,
"logits/chosen": 0.19665592908859253,
"logits/rejected": 1.0053622722625732,
"logps/chosen": -12.768911361694336,
"logps/rejected": -1196.0384521484375,
"loss": 0.217,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.30236831307411194,
"rewards/margins": 11.53145694732666,
"rewards/rejected": -11.229089736938477,
"step": 450
},
{
"epoch": 0.22974154076663753,
"grad_norm": 0.06982421875,
"learning_rate": 4.74916846316719e-06,
"logits/chosen": 0.2026137411594391,
"logits/rejected": 0.9967263340950012,
"logps/chosen": -14.13359546661377,
"logps/rejected": -1080.839111328125,
"loss": 0.2196,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.29357805848121643,
"rewards/margins": 10.319085121154785,
"rewards/rejected": -10.025506973266602,
"step": 460
},
{
"epoch": 0.2347359220876514,
"grad_norm": 0.047607421875,
"learning_rate": 4.7297886055377525e-06,
"logits/chosen": 0.22456324100494385,
"logits/rejected": 0.9802171587944031,
"logps/chosen": -13.012743949890137,
"logps/rejected": -1017.5540161132812,
"loss": 0.2208,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.29690033197402954,
"rewards/margins": 9.80536937713623,
"rewards/rejected": -9.508468627929688,
"step": 470
},
{
"epoch": 0.23973030340866525,
"grad_norm": 0.02734375,
"learning_rate": 4.709730286734631e-06,
"logits/chosen": 0.2183937281370163,
"logits/rejected": 1.0708736181259155,
"logps/chosen": -12.663009643554688,
"logps/rejected": -1249.983642578125,
"loss": 0.2169,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.3058857023715973,
"rewards/margins": 12.078222274780273,
"rewards/rejected": -11.772336959838867,
"step": 480
},
{
"epoch": 0.24472468472967912,
"grad_norm": 0.055908203125,
"learning_rate": 4.688999609933023e-06,
"logits/chosen": 0.22988371551036835,
"logits/rejected": 1.0844902992248535,
"logps/chosen": -12.956899642944336,
"logps/rejected": -1156.6395263671875,
"loss": 0.2176,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.30138763785362244,
"rewards/margins": 11.153468132019043,
"rewards/rejected": -10.852079391479492,
"step": 490
},
{
"epoch": 0.24971906605069297,
"grad_norm": 0.049072265625,
"learning_rate": 4.6676028828875195e-06,
"logits/chosen": 0.19053277373313904,
"logits/rejected": 1.1232895851135254,
"logps/chosen": -13.526689529418945,
"logps/rejected": -1328.9798583984375,
"loss": 0.2173,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.2967537045478821,
"rewards/margins": 12.862408638000488,
"rewards/rejected": -12.565653800964355,
"step": 500
},
{
"epoch": 0.2547134473717068,
"grad_norm": 0.00811767578125,
"learning_rate": 4.645546616012835e-06,
"logits/chosen": 0.19936171174049377,
"logits/rejected": 1.1598930358886719,
"logps/chosen": -13.963285446166992,
"logps/rejected": -1301.6494140625,
"loss": 0.217,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.29725441336631775,
"rewards/margins": 12.456524848937988,
"rewards/rejected": -12.159271240234375,
"step": 510
},
{
"epoch": 0.2597078286927207,
"grad_norm": 0.022216796875,
"learning_rate": 4.622837520402869e-06,
"logits/chosen": 0.2132669985294342,
"logits/rejected": 1.1716349124908447,
"logps/chosen": -13.427679061889648,
"logps/rejected": -1347.979248046875,
"loss": 0.216,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.30607202649116516,
"rewards/margins": 13.056879043579102,
"rewards/rejected": -12.75080680847168,
"step": 520
},
{
"epoch": 0.26470221001373456,
"grad_norm": 0.0654296875,
"learning_rate": 4.599482505788715e-06,
"logits/chosen": 0.1745399534702301,
"logits/rejected": 1.1154874563217163,
"logps/chosen": -13.649249076843262,
"logps/rejected": -1377.6204833984375,
"loss": 0.2163,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.2969611883163452,
"rewards/margins": 13.256256103515625,
"rewards/rejected": -12.959295272827148,
"step": 530
},
{
"epoch": 0.2696965913347484,
"grad_norm": 0.040771484375,
"learning_rate": 4.575488678436228e-06,
"logits/chosen": 0.20975852012634277,
"logits/rejected": 1.2858575582504272,
"logps/chosen": -13.548286437988281,
"logps/rejected": -1526.4365234375,
"loss": 0.2161,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.30410271883010864,
"rewards/margins": 14.82691764831543,
"rewards/rejected": -14.522814750671387,
"step": 540
},
{
"epoch": 0.27469097265576226,
"grad_norm": 0.0235595703125,
"learning_rate": 4.550863338983784e-06,
"logits/chosen": 0.23238572478294373,
"logits/rejected": 1.2929937839508057,
"logps/chosen": -12.818222045898438,
"logps/rejected": -1398.010009765625,
"loss": 0.2146,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.3006662130355835,
"rewards/margins": 13.579294204711914,
"rewards/rejected": -13.2786283493042,
"step": 550
},
{
"epoch": 0.27968535397677613,
"grad_norm": 0.045654296875,
"learning_rate": 4.525613980220909e-06,
"logits/chosen": 0.21401552855968475,
"logits/rejected": 1.2280786037445068,
"logps/chosen": -13.640890121459961,
"logps/rejected": -1425.2213134765625,
"loss": 0.2165,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.30365556478500366,
"rewards/margins": 13.8203706741333,
"rewards/rejected": -13.516714096069336,
"step": 560
},
{
"epoch": 0.28467973529779,
"grad_norm": 0.031494140625,
"learning_rate": 4.499748284808433e-06,
"logits/chosen": 0.2350139617919922,
"logits/rejected": 1.237275242805481,
"logps/chosen": -13.027705192565918,
"logps/rejected": -1249.4954833984375,
"loss": 0.2147,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.3124990463256836,
"rewards/margins": 11.985966682434082,
"rewards/rejected": -11.673466682434082,
"step": 570
},
{
"epoch": 0.2896741166188038,
"grad_norm": 0.037109375,
"learning_rate": 4.473274122940879e-06,
"logits/chosen": 0.26041245460510254,
"logits/rejected": 1.2588506937026978,
"logps/chosen": -12.92595100402832,
"logps/rejected": -1345.543212890625,
"loss": 0.2145,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.3006027638912201,
"rewards/margins": 13.026535034179688,
"rewards/rejected": -12.725933074951172,
"step": 580
},
{
"epoch": 0.2946684979398177,
"grad_norm": 0.023193359375,
"learning_rate": 4.446199549951782e-06,
"logits/chosen": 0.2726953327655792,
"logits/rejected": 1.2741527557373047,
"logps/chosen": -13.607648849487305,
"logps/rejected": -1350.020263671875,
"loss": 0.2164,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.2966735064983368,
"rewards/margins": 13.099435806274414,
"rewards/rejected": -12.802760124206543,
"step": 590
},
{
"epoch": 0.29966287926083157,
"grad_norm": 0.060302734375,
"learning_rate": 4.418532803862684e-06,
"logits/chosen": 0.24927139282226562,
"logits/rejected": 1.2164738178253174,
"logps/chosen": -13.667009353637695,
"logps/rejected": -1226.8724365234375,
"loss": 0.2161,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.3081030249595642,
"rewards/margins": 11.710563659667969,
"rewards/rejected": -11.402461051940918,
"step": 600
},
{
"epoch": 0.30465726058184545,
"grad_norm": 0.01220703125,
"learning_rate": 4.39028230287654e-06,
"logits/chosen": 0.24915924668312073,
"logits/rejected": 1.2275068759918213,
"logps/chosen": -14.152711868286133,
"logps/rejected": -1383.346923828125,
"loss": 0.2153,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.301180899143219,
"rewards/margins": 13.357465744018555,
"rewards/rejected": -13.05628490447998,
"step": 610
},
{
"epoch": 0.30965164190285926,
"grad_norm": 0.056884765625,
"learning_rate": 4.361456642816292e-06,
"logits/chosen": 0.18370430171489716,
"logits/rejected": 1.187785267829895,
"logps/chosen": -14.063751220703125,
"logps/rejected": -1465.3687744140625,
"loss": 0.2156,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.3011600375175476,
"rewards/margins": 14.13781452178955,
"rewards/rejected": -13.836652755737305,
"step": 620
},
{
"epoch": 0.31464602322387314,
"grad_norm": 0.036376953125,
"learning_rate": 4.332064594509413e-06,
"logits/chosen": 0.19446897506713867,
"logits/rejected": 1.427197813987732,
"logps/chosen": -14.258028030395508,
"logps/rejected": -1825.0166015625,
"loss": 0.2145,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.2957887053489685,
"rewards/margins": 17.80067253112793,
"rewards/rejected": -17.504884719848633,
"step": 630
},
{
"epoch": 0.319640404544887,
"grad_norm": 0.025634765625,
"learning_rate": 4.302115101119186e-06,
"logits/chosen": 0.19377607107162476,
"logits/rejected": 1.1977471113204956,
"logps/chosen": -13.28663158416748,
"logps/rejected": -1551.3272705078125,
"loss": 0.2146,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.2949761748313904,
"rewards/margins": 14.840034484863281,
"rewards/rejected": -14.545059204101562,
"step": 640
},
{
"epoch": 0.3246347858659009,
"grad_norm": 0.028076171875,
"learning_rate": 4.271617275423564e-06,
"logits/chosen": 0.18471740186214447,
"logits/rejected": 1.2049682140350342,
"logps/chosen": -14.22096061706543,
"logps/rejected": -1509.560791015625,
"loss": 0.214,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.2953301966190338,
"rewards/margins": 14.56297779083252,
"rewards/rejected": -14.267648696899414,
"step": 650
},
{
"epoch": 0.3296291671869147,
"grad_norm": 0.031494140625,
"learning_rate": 4.2405803970423995e-06,
"logits/chosen": 0.21741405129432678,
"logits/rejected": 1.3314052820205688,
"logps/chosen": -13.835968017578125,
"logps/rejected": -1617.0235595703125,
"loss": 0.2147,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.29379233717918396,
"rewards/margins": 15.649670600891113,
"rewards/rejected": -15.355878829956055,
"step": 660
},
{
"epoch": 0.3346235485079286,
"grad_norm": 0.11572265625,
"learning_rate": 4.2090139096139306e-06,
"logits/chosen": 0.16212065517902374,
"logits/rejected": 1.2855770587921143,
"logps/chosen": -14.017046928405762,
"logps/rejected": -1740.139404296875,
"loss": 0.2134,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.3004634976387024,
"rewards/margins": 16.887771606445312,
"rewards/rejected": -16.58730697631836,
"step": 670
},
{
"epoch": 0.33961792982894246,
"grad_norm": 0.043701171875,
"learning_rate": 4.176927417921343e-06,
"logits/chosen": 0.326777845621109,
"logits/rejected": 1.3592358827590942,
"logps/chosen": -13.120327949523926,
"logps/rejected": -1251.674560546875,
"loss": 0.2143,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.30190396308898926,
"rewards/margins": 12.16067123413086,
"rewards/rejected": -11.858766555786133,
"step": 680
},
{
"epoch": 0.3446123111499563,
"grad_norm": 0.03173828125,
"learning_rate": 4.144330684970314e-06,
"logits/chosen": 0.22485598921775818,
"logits/rejected": 1.238599181175232,
"logps/chosen": -14.03515625,
"logps/rejected": -1432.62060546875,
"loss": 0.214,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.30531880259513855,
"rewards/margins": 13.757417678833008,
"rewards/rejected": -13.45209789276123,
"step": 690
},
{
"epoch": 0.34960669247097015,
"grad_norm": 0.064453125,
"learning_rate": 4.111233629018404e-06,
"logits/chosen": 0.2409452497959137,
"logits/rejected": 1.3199043273925781,
"logps/chosen": -13.525833129882812,
"logps/rejected": -1432.090087890625,
"loss": 0.2146,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.29940056800842285,
"rewards/margins": 13.923855781555176,
"rewards/rejected": -13.624455451965332,
"step": 700
},
{
"epoch": 0.354601073791984,
"grad_norm": 0.044677734375,
"learning_rate": 4.077646320557215e-06,
"logits/chosen": 0.25844550132751465,
"logits/rejected": 1.4347895383834839,
"logps/chosen": -13.414407730102539,
"logps/rejected": -1587.720947265625,
"loss": 0.214,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.29702991247177124,
"rewards/margins": 15.390164375305176,
"rewards/rejected": -15.093134880065918,
"step": 710
},
{
"epoch": 0.3595954551129979,
"grad_norm": 0.017822265625,
"learning_rate": 4.043578979248228e-06,
"logits/chosen": 0.24548295140266418,
"logits/rejected": 1.3877404928207397,
"logps/chosen": -12.786032676696777,
"logps/rejected": -1540.37939453125,
"loss": 0.2147,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.3044242262840271,
"rewards/margins": 14.986343383789062,
"rewards/rejected": -14.681918144226074,
"step": 720
},
{
"epoch": 0.3645898364340117,
"grad_norm": 0.0203857421875,
"learning_rate": 4.009041970813247e-06,
"logits/chosen": 0.2618701457977295,
"logits/rejected": 1.432408332824707,
"logps/chosen": -12.812631607055664,
"logps/rejected": -1661.7718505859375,
"loss": 0.2146,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.29122787714004517,
"rewards/margins": 16.198129653930664,
"rewards/rejected": -15.906901359558105,
"step": 730
},
{
"epoch": 0.3695842177550256,
"grad_norm": 0.030029296875,
"learning_rate": 3.9740458038804075e-06,
"logits/chosen": 0.25733712315559387,
"logits/rejected": 1.3133214712142944,
"logps/chosen": -14.166203498840332,
"logps/rejected": -1473.091552734375,
"loss": 0.2133,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.3017811179161072,
"rewards/margins": 14.06616497039795,
"rewards/rejected": -13.764383316040039,
"step": 740
},
{
"epoch": 0.37457859907603946,
"grad_norm": 0.02880859375,
"learning_rate": 3.938601126786702e-06,
"logits/chosen": 0.28963789343833923,
"logits/rejected": 1.4084501266479492,
"logps/chosen": -12.976341247558594,
"logps/rejected": -1537.215087890625,
"loss": 0.2137,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.2965225577354431,
"rewards/margins": 14.963714599609375,
"rewards/rejected": -14.667192459106445,
"step": 750
},
{
"epoch": 0.37957298039705334,
"grad_norm": 0.0306396484375,
"learning_rate": 3.902718724337993e-06,
"logits/chosen": 0.22370409965515137,
"logits/rejected": 1.3502318859100342,
"logps/chosen": -13.021102905273438,
"logps/rejected": -1571.427978515625,
"loss": 0.2141,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.3001527190208435,
"rewards/margins": 15.25025463104248,
"rewards/rejected": -14.950100898742676,
"step": 760
},
{
"epoch": 0.38456736171806716,
"grad_norm": 0.014404296875,
"learning_rate": 3.8664095145274995e-06,
"logits/chosen": 0.26876306533813477,
"logits/rejected": 1.432448387145996,
"logps/chosen": -13.371549606323242,
"logps/rejected": -1552.1263427734375,
"loss": 0.2136,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.30356094241142273,
"rewards/margins": 15.13947582244873,
"rewards/rejected": -14.835916519165039,
"step": 770
},
{
"epoch": 0.38956174303908103,
"grad_norm": 0.034912109375,
"learning_rate": 3.829684545213768e-06,
"logits/chosen": 0.23094145953655243,
"logits/rejected": 1.379480242729187,
"logps/chosen": -13.367365837097168,
"logps/rejected": -1540.384765625,
"loss": 0.2137,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.30820637941360474,
"rewards/margins": 14.93891716003418,
"rewards/rejected": -14.630711555480957,
"step": 780
},
{
"epoch": 0.3945561243600949,
"grad_norm": 0.03857421875,
"learning_rate": 3.7925549907591252e-06,
"logits/chosen": 0.17974331974983215,
"logits/rejected": 1.3995566368103027,
"logps/chosen": -13.248870849609375,
"logps/rejected": -1851.462646484375,
"loss": 0.2139,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.3005455434322357,
"rewards/margins": 18.06133460998535,
"rewards/rejected": -17.760787963867188,
"step": 790
},
{
"epoch": 0.3995505056811087,
"grad_norm": 0.0272216796875,
"learning_rate": 3.7550321486296303e-06,
"logits/chosen": 0.1997009515762329,
"logits/rejected": 1.2776936292648315,
"logps/chosen": -13.081275939941406,
"logps/rejected": -1519.75537109375,
"loss": 0.2139,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.29953616857528687,
"rewards/margins": 14.774045944213867,
"rewards/rejected": -14.474508285522461,
"step": 800
},
{
"epoch": 0.4045448870021226,
"grad_norm": 0.0252685546875,
"learning_rate": 3.717127435957583e-06,
"logits/chosen": 0.22182372212409973,
"logits/rejected": 1.330664873123169,
"logps/chosen": -13.001907348632812,
"logps/rejected": -1577.052978515625,
"loss": 0.214,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.30547088384628296,
"rewards/margins": 15.365854263305664,
"rewards/rejected": -15.060381889343262,
"step": 810
},
{
"epoch": 0.4095392683231365,
"grad_norm": 0.011474609375,
"learning_rate": 3.6788523860676156e-06,
"logits/chosen": 0.23856505751609802,
"logits/rejected": 1.3909879922866821,
"logps/chosen": -13.222851753234863,
"logps/rejected": -1567.79931640625,
"loss": 0.2142,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.3013755977153778,
"rewards/margins": 15.270452499389648,
"rewards/rejected": -14.969076156616211,
"step": 820
},
{
"epoch": 0.41453364964415035,
"grad_norm": 0.0267333984375,
"learning_rate": 3.640218644967429e-06,
"logits/chosen": 0.2593843638896942,
"logits/rejected": 1.4300034046173096,
"logps/chosen": -12.888224601745605,
"logps/rejected": -1659.7015380859375,
"loss": 0.2143,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.295782208442688,
"rewards/margins": 16.122600555419922,
"rewards/rejected": -15.826817512512207,
"step": 830
},
{
"epoch": 0.41952803096516417,
"grad_norm": 0.0194091796875,
"learning_rate": 3.601237967804245e-06,
"logits/chosen": 0.264489084482193,
"logits/rejected": 1.4015864133834839,
"logps/chosen": -12.973742485046387,
"logps/rejected": -1561.237548828125,
"loss": 0.2142,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.2937301695346832,
"rewards/margins": 15.21537971496582,
"rewards/rejected": -14.921648979187012,
"step": 840
},
{
"epoch": 0.42452241228617804,
"grad_norm": 0.034912109375,
"learning_rate": 3.5619222152880488e-06,
"logits/chosen": 0.26485809683799744,
"logits/rejected": 1.4641757011413574,
"logps/chosen": -12.745534896850586,
"logps/rejected": -1704.2015380859375,
"loss": 0.2143,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.29673147201538086,
"rewards/margins": 16.65966033935547,
"rewards/rejected": -16.362926483154297,
"step": 850
},
{
"epoch": 0.4295167936071919,
"grad_norm": 0.03662109375,
"learning_rate": 3.522283350082713e-06,
"logits/chosen": 0.27674156427383423,
"logits/rejected": 1.3279974460601807,
"logps/chosen": -13.220677375793457,
"logps/rejected": -1268.2398681640625,
"loss": 0.2143,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.30168816447257996,
"rewards/margins": 12.249058723449707,
"rewards/rejected": -11.947370529174805,
"step": 860
},
{
"epoch": 0.4345111749282058,
"grad_norm": 0.0303955078125,
"learning_rate": 3.482333433166101e-06,
"logits/chosen": 0.2209288775920868,
"logits/rejected": 1.239816427230835,
"logps/chosen": -13.64061450958252,
"logps/rejected": -1329.226806640625,
"loss": 0.2133,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.32054653763771057,
"rewards/margins": 12.724926948547363,
"rewards/rejected": -12.404378890991211,
"step": 870
},
{
"epoch": 0.4395055562492196,
"grad_norm": 0.017578125,
"learning_rate": 3.442084620160255e-06,
"logits/chosen": 0.2859000563621521,
"logits/rejected": 1.3617407083511353,
"logps/chosen": -13.033666610717773,
"logps/rejected": -1480.4652099609375,
"loss": 0.2148,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.3004864454269409,
"rewards/margins": 14.408884048461914,
"rewards/rejected": -14.1083984375,
"step": 880
},
{
"epoch": 0.4444999375702335,
"grad_norm": 0.033203125,
"learning_rate": 3.4015491576327813e-06,
"logits/chosen": 0.2019362449645996,
"logits/rejected": 1.4212459325790405,
"logps/chosen": -13.03289794921875,
"logps/rejected": -1749.9088134765625,
"loss": 0.2134,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.2995554804801941,
"rewards/margins": 17.093223571777344,
"rewards/rejected": -16.793670654296875,
"step": 890
},
{
"epoch": 0.44949431889124736,
"grad_norm": 0.043701171875,
"learning_rate": 3.3607393793705774e-06,
"logits/chosen": 0.18301896750926971,
"logits/rejected": 1.3750990629196167,
"logps/chosen": -13.328268051147461,
"logps/rejected": -1911.580322265625,
"loss": 0.2131,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.3058582544326782,
"rewards/margins": 18.632659912109375,
"rewards/rejected": -18.32680320739746,
"step": 900
},
{
"epoch": 0.45448870021226123,
"grad_norm": 0.026611328125,
"learning_rate": 3.319667702627004e-06,
"logits/chosen": 0.251764714717865,
"logits/rejected": 1.379320502281189,
"logps/chosen": -13.333532333374023,
"logps/rejected": -1562.4708251953125,
"loss": 0.2145,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.29535120725631714,
"rewards/margins": 15.228157043457031,
"rewards/rejected": -14.932805061340332,
"step": 910
},
{
"epoch": 0.45948308153327505,
"grad_norm": 0.0301513671875,
"learning_rate": 3.2783466243436728e-06,
"logits/chosen": 0.2565325200557709,
"logits/rejected": 1.3139139413833618,
"logps/chosen": -12.679740905761719,
"logps/rejected": -1520.677490234375,
"loss": 0.2148,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.29702773690223694,
"rewards/margins": 14.843530654907227,
"rewards/rejected": -14.546501159667969,
"step": 920
},
{
"epoch": 0.4644774628542889,
"grad_norm": 0.032958984375,
"learning_rate": 3.23678871734798e-06,
"logits/chosen": 0.25534436106681824,
"logits/rejected": 1.4121118783950806,
"logps/chosen": -13.0289945602417,
"logps/rejected": -1636.283447265625,
"loss": 0.2134,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.2977290153503418,
"rewards/margins": 15.95555591583252,
"rewards/rejected": -15.657827377319336,
"step": 930
},
{
"epoch": 0.4694718441753028,
"grad_norm": 0.017578125,
"learning_rate": 3.1950066265275563e-06,
"logits/chosen": 0.22841012477874756,
"logits/rejected": 1.4126774072647095,
"logps/chosen": -13.12025260925293,
"logps/rejected": -1663.779541015625,
"loss": 0.2137,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.2916621267795563,
"rewards/margins": 16.221864700317383,
"rewards/rejected": -15.930200576782227,
"step": 940
},
{
"epoch": 0.4744662254963166,
"grad_norm": 0.0255126953125,
"learning_rate": 3.1530130649827866e-06,
"logits/chosen": 0.22560763359069824,
"logits/rejected": 1.3270902633666992,
"logps/chosen": -12.889676094055176,
"logps/rejected": -1550.6907958984375,
"loss": 0.214,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.2999538779258728,
"rewards/margins": 15.000715255737305,
"rewards/rejected": -14.700759887695312,
"step": 950
},
{
"epoch": 0.4794606068173305,
"grad_norm": 0.033203125,
"learning_rate": 3.1108208101585737e-06,
"logits/chosen": 0.2439723014831543,
"logits/rejected": 1.362210988998413,
"logps/chosen": -13.089398384094238,
"logps/rejected": -1693.647216796875,
"loss": 0.2136,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.29442083835601807,
"rewards/margins": 16.52579116821289,
"rewards/rejected": -16.231369018554688,
"step": 960
},
{
"epoch": 0.48445498813834437,
"grad_norm": 0.062255859375,
"learning_rate": 3.068442699956526e-06,
"logits/chosen": 0.2077961266040802,
"logits/rejected": 1.3753139972686768,
"logps/chosen": -14.569076538085938,
"logps/rejected": -1670.797119140625,
"loss": 0.2127,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.30582067370414734,
"rewards/margins": 16.229455947875977,
"rewards/rejected": -15.92363452911377,
"step": 970
},
{
"epoch": 0.48944936945935824,
"grad_norm": 0.05224609375,
"learning_rate": 3.025891628828754e-06,
"logits/chosen": 0.1842622458934784,
"logits/rejected": 1.2995259761810303,
"logps/chosen": -14.32885456085205,
"logps/rejected": -1653.052001953125,
"loss": 0.2126,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.3057419955730438,
"rewards/margins": 16.055688858032227,
"rewards/rejected": -15.749944686889648,
"step": 980
},
{
"epoch": 0.49444375078037206,
"grad_norm": 0.0269775390625,
"learning_rate": 2.983180543854449e-06,
"logits/chosen": 0.19390757381916046,
"logits/rejected": 1.3017512559890747,
"logps/chosen": -13.34800910949707,
"logps/rejected": -1712.0159912109375,
"loss": 0.2135,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.30465009808540344,
"rewards/margins": 16.635692596435547,
"rewards/rejected": -16.331043243408203,
"step": 990
},
{
"epoch": 0.49943813210138593,
"grad_norm": 0.0223388671875,
"learning_rate": 2.9403224408004607e-06,
"logits/chosen": 0.23906031250953674,
"logits/rejected": 1.400268316268921,
"logps/chosen": -13.12585735321045,
"logps/rejected": -1704.7308349609375,
"loss": 0.2129,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.3049536347389221,
"rewards/margins": 16.624217987060547,
"rewards/rejected": -16.319265365600586,
"step": 1000
},
{
"epoch": 0.49943813210138593,
"eval_logits/chosen": 0.20361633598804474,
"eval_logits/rejected": 1.1080797910690308,
"eval_logps/chosen": -12.131524085998535,
"eval_logps/rejected": -998.1762084960938,
"eval_loss": 0.21242494881153107,
"eval_rewards/accuracies": 1.0,
"eval_rewards/chosen": 0.32524025440216064,
"eval_rewards/margins": 9.676619529724121,
"eval_rewards/rejected": -9.351378440856934,
"eval_runtime": 0.4258,
"eval_samples_per_second": 11.742,
"eval_steps_per_second": 7.045,
"step": 1000
},
{
"epoch": 0.5044325134223998,
"grad_norm": 0.0205078125,
"learning_rate": 2.8973303601670537e-06,
"logits/chosen": 0.23553326725959778,
"logits/rejected": 1.356740117073059,
"logps/chosen": -13.091280937194824,
"logps/rejected": -1667.8570556640625,
"loss": 0.2138,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.3002287745475769,
"rewards/margins": 16.279354095458984,
"rewards/rejected": -15.979124069213867,
"step": 1010
},
{
"epoch": 0.5094268947434136,
"grad_norm": 0.04833984375,
"learning_rate": 2.8542173832200547e-06,
"logits/chosen": 0.1925448775291443,
"logits/rejected": 1.3025437593460083,
"logps/chosen": -14.501489639282227,
"logps/rejected": -1580.2572021484375,
"loss": 0.2144,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.2990257143974304,
"rewards/margins": 15.303033828735352,
"rewards/rejected": -15.004008293151855,
"step": 1020
},
{
"epoch": 0.5144212760644276,
"grad_norm": 0.02587890625,
"learning_rate": 2.810996628010594e-06,
"logits/chosen": 0.2747485637664795,
"logits/rejected": 1.341671109199524,
"logps/chosen": -13.159135818481445,
"logps/rejected": -1436.7039794921875,
"loss": 0.2139,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.30046582221984863,
"rewards/margins": 14.00048542022705,
"rewards/rejected": -13.700021743774414,
"step": 1030
},
{
"epoch": 0.5194156573854414,
"grad_norm": 0.0380859375,
"learning_rate": 2.7676812453836617e-06,
"logits/chosen": 0.2172623872756958,
"logits/rejected": 1.389795184135437,
"logps/chosen": -13.409383773803711,
"logps/rejected": -1729.2418212890625,
"loss": 0.2143,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.29702064394950867,
"rewards/margins": 16.85504722595215,
"rewards/rejected": -16.558027267456055,
"step": 1040
},
{
"epoch": 0.5244100387064552,
"grad_norm": 0.059814453125,
"learning_rate": 2.724284414976672e-06,
"logits/chosen": 0.194356769323349,
"logits/rejected": 1.3346575498580933,
"logps/chosen": -13.100080490112305,
"logps/rejected": -1775.0599365234375,
"loss": 0.2141,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.3071358799934387,
"rewards/margins": 17.33095359802246,
"rewards/rejected": -17.023818969726562,
"step": 1050
},
{
"epoch": 0.5294044200274691,
"grad_norm": 0.0322265625,
"learning_rate": 2.6808193412092823e-06,
"logits/chosen": 0.27043357491493225,
"logits/rejected": 1.2958372831344604,
"logps/chosen": -13.240577697753906,
"logps/rejected": -1308.380615234375,
"loss": 0.212,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.3128414452075958,
"rewards/margins": 12.567632675170898,
"rewards/rejected": -12.254792213439941,
"step": 1060
},
{
"epoch": 0.5343988013484829,
"grad_norm": 0.018310546875,
"learning_rate": 2.637299249265659e-06,
"logits/chosen": 0.24779090285301208,
"logits/rejected": 1.3237859010696411,
"logps/chosen": -13.30639934539795,
"logps/rejected": -1560.58740234375,
"loss": 0.2129,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.3085622787475586,
"rewards/margins": 15.089482307434082,
"rewards/rejected": -14.780920028686523,
"step": 1070
},
{
"epoch": 0.5393931826694968,
"grad_norm": 0.049560546875,
"learning_rate": 2.5937373810704352e-06,
"logits/chosen": 0.20865114033222198,
"logits/rejected": 1.3283964395523071,
"logps/chosen": -12.944803237915039,
"logps/rejected": -1628.0531005859375,
"loss": 0.213,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.3075375556945801,
"rewards/margins": 15.755208969116211,
"rewards/rejected": -15.447671890258789,
"step": 1080
},
{
"epoch": 0.5443875639905107,
"grad_norm": 0.0247802734375,
"learning_rate": 2.550146991259565e-06,
"logits/chosen": 0.2642405331134796,
"logits/rejected": 1.330570936203003,
"logps/chosen": -12.811630249023438,
"logps/rejected": -1518.474609375,
"loss": 0.213,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.309398889541626,
"rewards/margins": 14.794235229492188,
"rewards/rejected": -14.484835624694824,
"step": 1090
},
{
"epoch": 0.5493819453115245,
"grad_norm": 0.0252685546875,
"learning_rate": 2.5065413431473196e-06,
"logits/chosen": 0.22386522591114044,
"logits/rejected": 1.3922302722930908,
"logps/chosen": -13.141294479370117,
"logps/rejected": -1650.009765625,
"loss": 0.2129,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.30252230167388916,
"rewards/margins": 16.12454605102539,
"rewards/rejected": -15.822023391723633,
"step": 1100
},
{
"epoch": 0.5543763266325384,
"grad_norm": 0.017822265625,
"learning_rate": 2.462933704690635e-06,
"logits/chosen": 0.23435406386852264,
"logits/rejected": 1.2522070407867432,
"logps/chosen": -13.335357666015625,
"logps/rejected": -1444.448974609375,
"loss": 0.2132,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.3048686385154724,
"rewards/margins": 13.933290481567383,
"rewards/rejected": -13.62842082977295,
"step": 1110
},
{
"epoch": 0.5593707079535523,
"grad_norm": 0.0159912109375,
"learning_rate": 2.4193373444520558e-06,
"logits/chosen": 0.23952054977416992,
"logits/rejected": 1.466230869293213,
"logps/chosen": -13.09483528137207,
"logps/rejected": -1737.9251708984375,
"loss": 0.213,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.3013126254081726,
"rewards/margins": 16.979856491088867,
"rewards/rejected": -16.67854118347168,
"step": 1120
},
{
"epoch": 0.5643650892745661,
"grad_norm": 0.0277099609375,
"learning_rate": 2.3757655275624826e-06,
"logits/chosen": 0.20145916938781738,
"logits/rejected": 1.4154694080352783,
"logps/chosen": -12.834500312805176,
"logps/rejected": -1683.4241943359375,
"loss": 0.2142,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.29736918210983276,
"rewards/margins": 16.437068939208984,
"rewards/rejected": -16.13970184326172,
"step": 1130
},
{
"epoch": 0.56935947059558,
"grad_norm": 0.0235595703125,
"learning_rate": 2.3322315116849747e-06,
"logits/chosen": 0.18402531743049622,
"logits/rejected": 1.3126466274261475,
"logps/chosen": -13.314038276672363,
"logps/rejected": -1694.0882568359375,
"loss": 0.2131,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.3090851306915283,
"rewards/margins": 16.457656860351562,
"rewards/rejected": -16.14857292175293,
"step": 1140
},
{
"epoch": 0.5743538519165938,
"grad_norm": 0.0208740234375,
"learning_rate": 2.2887485429808213e-06,
"logits/chosen": 0.24247586727142334,
"logits/rejected": 1.3415312767028809,
"logps/chosen": -13.364187240600586,
"logps/rejected": -1531.287353515625,
"loss": 0.2132,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.31205669045448303,
"rewards/margins": 14.862896919250488,
"rewards/rejected": -14.5508394241333,
"step": 1150
},
{
"epoch": 0.5793482332376076,
"grad_norm": 0.0257568359375,
"learning_rate": 2.245329852079109e-06,
"logits/chosen": 0.2564612329006195,
"logits/rejected": 1.3275741338729858,
"logps/chosen": -12.749259948730469,
"logps/rejected": -1410.6912841796875,
"loss": 0.2134,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.3032976984977722,
"rewards/margins": 13.754470825195312,
"rewards/rejected": -13.451173782348633,
"step": 1160
},
{
"epoch": 0.5843426145586216,
"grad_norm": 0.02099609375,
"learning_rate": 2.2019886500510197e-06,
"logits/chosen": 0.234290212392807,
"logits/rejected": 1.3947325944900513,
"logps/chosen": -12.72685432434082,
"logps/rejected": -1736.8687744140625,
"loss": 0.2138,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.2989721894264221,
"rewards/margins": 16.979671478271484,
"rewards/rejected": -16.68069839477539,
"step": 1170
},
{
"epoch": 0.5893369958796354,
"grad_norm": 0.037353515625,
"learning_rate": 2.1587381243900777e-06,
"logits/chosen": 0.26597389578819275,
"logits/rejected": 1.3139584064483643,
"logps/chosen": -14.154109001159668,
"logps/rejected": -1483.2686767578125,
"loss": 0.2125,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.3113531470298767,
"rewards/margins": 14.383076667785645,
"rewards/rejected": -14.071722030639648,
"step": 1180
},
{
"epoch": 0.5943313772006493,
"grad_norm": 0.0233154296875,
"learning_rate": 2.115591434999573e-06,
"logits/chosen": 0.277686208486557,
"logits/rejected": 1.401039719581604,
"logps/chosen": -12.76582145690918,
"logps/rejected": -1506.6636962890625,
"loss": 0.2132,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.30175262689590454,
"rewards/margins": 14.693206787109375,
"rewards/rejected": -14.391454696655273,
"step": 1190
},
{
"epoch": 0.5993257585216631,
"grad_norm": 0.0294189453125,
"learning_rate": 2.0725617101883726e-06,
"logits/chosen": 0.25775861740112305,
"logits/rejected": 1.345365285873413,
"logps/chosen": -12.69609260559082,
"logps/rejected": -1601.963623046875,
"loss": 0.2147,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.2992851138114929,
"rewards/margins": 15.617083549499512,
"rewards/rejected": -15.317797660827637,
"step": 1200
},
{
"epoch": 0.604320139842677,
"grad_norm": 0.09033203125,
"learning_rate": 2.0296620426763545e-06,
"logits/chosen": 0.14509257674217224,
"logits/rejected": 1.3586305379867554,
"logps/chosen": -13.326802253723145,
"logps/rejected": -1893.167724609375,
"loss": 0.2128,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.31738612055778503,
"rewards/margins": 18.294166564941406,
"rewards/rejected": -17.97677993774414,
"step": 1210
},
{
"epoch": 0.6093145211636909,
"grad_norm": 0.0206298828125,
"learning_rate": 1.9869054856106628e-06,
"logits/chosen": 0.2093639373779297,
"logits/rejected": 1.3707678318023682,
"logps/chosen": -13.017126083374023,
"logps/rejected": -1714.967529296875,
"loss": 0.213,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.305789053440094,
"rewards/margins": 16.755239486694336,
"rewards/rejected": -16.449451446533203,
"step": 1220
},
{
"epoch": 0.6143089024847047,
"grad_norm": 0.027587890625,
"learning_rate": 1.9443050485940118e-06,
"logits/chosen": 0.29796817898750305,
"logits/rejected": 1.4095687866210938,
"logps/chosen": -13.055410385131836,
"logps/rejected": -1501.1917724609375,
"loss": 0.2132,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.30659452080726624,
"rewards/margins": 14.630342483520508,
"rewards/rejected": -14.323748588562012,
"step": 1230
},
{
"epoch": 0.6193032838057185,
"grad_norm": 0.0205078125,
"learning_rate": 1.9018736937262271e-06,
"logits/chosen": 0.20551720261573792,
"logits/rejected": 1.3722885847091675,
"logps/chosen": -13.263589859008789,
"logps/rejected": -1644.015380859375,
"loss": 0.213,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.3152340054512024,
"rewards/margins": 15.940661430358887,
"rewards/rejected": -15.62542724609375,
"step": 1240
},
{
"epoch": 0.6242976651267325,
"grad_norm": 0.025146484375,
"learning_rate": 1.859624331660253e-06,
"logits/chosen": 0.19998934864997864,
"logits/rejected": 1.380974531173706,
"logps/chosen": -12.843725204467773,
"logps/rejected": -1903.5989990234375,
"loss": 0.2149,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.293190598487854,
"rewards/margins": 18.591859817504883,
"rewards/rejected": -18.298667907714844,
"step": 1250
},
{
"epoch": 0.6292920464477463,
"grad_norm": 0.0184326171875,
"learning_rate": 1.817569817673806e-06,
"logits/chosen": 0.20200982689857483,
"logits/rejected": 1.3313050270080566,
"logps/chosen": -13.815347671508789,
"logps/rejected": -1682.6148681640625,
"loss": 0.2137,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.3007175624370575,
"rewards/margins": 16.28915786743164,
"rewards/rejected": -15.988439559936523,
"step": 1260
},
{
"epoch": 0.6342864277687601,
"grad_norm": 0.0264892578125,
"learning_rate": 1.7757229477578824e-06,
"logits/chosen": 0.2238602340221405,
"logits/rejected": 1.3182499408721924,
"logps/chosen": -13.298685073852539,
"logps/rejected": -1689.486328125,
"loss": 0.2124,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.31393638253211975,
"rewards/margins": 16.413013458251953,
"rewards/rejected": -16.099077224731445,
"step": 1270
},
{
"epoch": 0.639280809089774,
"grad_norm": 0.0120849609375,
"learning_rate": 1.7340964547232993e-06,
"logits/chosen": 0.23566928505897522,
"logits/rejected": 1.3794082403182983,
"logps/chosen": -12.66304874420166,
"logps/rejected": -1540.5882568359375,
"loss": 0.2122,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.30794182419776917,
"rewards/margins": 15.042073249816895,
"rewards/rejected": -14.734130859375,
"step": 1280
},
{
"epoch": 0.6442751904107878,
"grad_norm": 0.0233154296875,
"learning_rate": 1.6927030043264656e-06,
"logits/chosen": 0.29966339468955994,
"logits/rejected": 1.3575140237808228,
"logps/chosen": -12.536134719848633,
"logps/rejected": -1450.3671875,
"loss": 0.2131,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.3003261685371399,
"rewards/margins": 14.137969970703125,
"rewards/rejected": -13.83764362335205,
"step": 1290
},
{
"epoch": 0.6492695717318018,
"grad_norm": 0.01904296875,
"learning_rate": 1.6515551914155522e-06,
"logits/chosen": 0.21864613890647888,
"logits/rejected": 1.2960518598556519,
"logps/chosen": -13.931121826171875,
"logps/rejected": -1731.0433349609375,
"loss": 0.2135,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.29954832792282104,
"rewards/margins": 16.782882690429688,
"rewards/rejected": -16.483333587646484,
"step": 1300
},
{
"epoch": 0.6542639530528156,
"grad_norm": 0.027587890625,
"learning_rate": 1.6106655360982376e-06,
"logits/chosen": 0.11391136795282364,
"logits/rejected": 1.1829859018325806,
"logps/chosen": -13.162788391113281,
"logps/rejected": -1799.5191650390625,
"loss": 0.2121,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.3114808201789856,
"rewards/margins": 17.426467895507812,
"rewards/rejected": -17.114986419677734,
"step": 1310
},
{
"epoch": 0.6592583343738294,
"grad_norm": 0.126953125,
"learning_rate": 1.570046479932196e-06,
"logits/chosen": 0.25235381722450256,
"logits/rejected": 1.2799979448318481,
"logps/chosen": -13.60442066192627,
"logps/rejected": -1395.318115234375,
"loss": 0.213,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.3034665286540985,
"rewards/margins": 13.48466968536377,
"rewards/rejected": -13.18120288848877,
"step": 1320
},
{
"epoch": 0.6642527156948433,
"grad_norm": 0.0306396484375,
"learning_rate": 1.5297103821394876e-06,
"logits/chosen": 0.2604614198207855,
"logits/rejected": 1.4426438808441162,
"logps/chosen": -12.831866264343262,
"logps/rejected": -1725.9541015625,
"loss": 0.2136,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.30009177327156067,
"rewards/margins": 16.85196876525879,
"rewards/rejected": -16.551877975463867,
"step": 1330
},
{
"epoch": 0.6692470970158572,
"grad_norm": 0.0291748046875,
"learning_rate": 1.489669515845995e-06,
"logits/chosen": 0.19801196455955505,
"logits/rejected": 1.246242642402649,
"logps/chosen": -12.573989868164062,
"logps/rejected": -1484.0341796875,
"loss": 0.2136,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.3015730679035187,
"rewards/margins": 14.43646240234375,
"rewards/rejected": -14.134889602661133,
"step": 1340
},
{
"epoch": 0.674241478336871,
"grad_norm": 0.020751953125,
"learning_rate": 1.449936064347065e-06,
"logits/chosen": 0.24275144934654236,
"logits/rejected": 1.3506710529327393,
"logps/chosen": -12.805200576782227,
"logps/rejected": -1675.633544921875,
"loss": 0.2131,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.3000423312187195,
"rewards/margins": 16.352081298828125,
"rewards/rejected": -16.052040100097656,
"step": 1350
},
{
"epoch": 0.6792358596578849,
"grad_norm": 0.01483154296875,
"learning_rate": 1.4105221174004771e-06,
"logits/chosen": 0.18348607420921326,
"logits/rejected": 1.3507370948791504,
"logps/chosen": -13.73768424987793,
"logps/rejected": -1898.157470703125,
"loss": 0.2124,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.309729665517807,
"rewards/margins": 18.553356170654297,
"rewards/rejected": -18.24362564086914,
"step": 1360
},
{
"epoch": 0.6842302409788987,
"grad_norm": 0.03515625,
"learning_rate": 1.3714396675478714e-06,
"logits/chosen": 0.29044079780578613,
"logits/rejected": 1.34650719165802,
"logps/chosen": -12.877673149108887,
"logps/rejected": -1523.441162109375,
"loss": 0.2132,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.2951946556568146,
"rewards/margins": 14.83436107635498,
"rewards/rejected": -14.539166450500488,
"step": 1370
},
{
"epoch": 0.6892246222999125,
"grad_norm": 0.0115966796875,
"learning_rate": 1.332700606465766e-06,
"logits/chosen": 0.18918542563915253,
"logits/rejected": 1.3702523708343506,
"logps/chosen": -13.452165603637695,
"logps/rejected": -1564.6370849609375,
"loss": 0.2125,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.3073347210884094,
"rewards/margins": 15.239773750305176,
"rewards/rejected": -14.9324369430542,
"step": 1380
},
{
"epoch": 0.6942190036209265,
"grad_norm": 0.034423828125,
"learning_rate": 1.294316721347254e-06,
"logits/chosen": 0.23732297122478485,
"logits/rejected": 1.3234728574752808,
"logps/chosen": -13.713842391967773,
"logps/rejected": -1537.1632080078125,
"loss": 0.2131,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.30026909708976746,
"rewards/margins": 14.980550765991211,
"rewards/rejected": -14.680282592773438,
"step": 1390
},
{
"epoch": 0.6992133849419403,
"grad_norm": 0.07080078125,
"learning_rate": 1.2562996913154952e-06,
"logits/chosen": 0.150472030043602,
"logits/rejected": 1.4298745393753052,
"logps/chosen": -12.643513679504395,
"logps/rejected": -2120.697509765625,
"loss": 0.2133,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.30036434531211853,
"rewards/margins": 20.737525939941406,
"rewards/rejected": -20.43716049194336,
"step": 1400
},
{
"epoch": 0.7042077662629542,
"grad_norm": 0.034423828125,
"learning_rate": 1.2186610838700958e-06,
"logits/chosen": 0.30068179965019226,
"logits/rejected": 1.335126280784607,
"logps/chosen": -12.845601081848145,
"logps/rejected": -1345.099365234375,
"loss": 0.2143,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.2946023643016815,
"rewards/margins": 13.083051681518555,
"rewards/rejected": -12.788450241088867,
"step": 1410
},
{
"epoch": 0.709202147583968,
"grad_norm": 0.033447265625,
"learning_rate": 1.1814123513674465e-06,
"logits/chosen": 0.18157488107681274,
"logits/rejected": 1.380326509475708,
"logps/chosen": -13.374841690063477,
"logps/rejected": -1741.3206787109375,
"loss": 0.2124,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.30905359983444214,
"rewards/margins": 16.993389129638672,
"rewards/rejected": -16.684337615966797,
"step": 1420
},
{
"epoch": 0.7141965289049819,
"grad_norm": 0.036865234375,
"learning_rate": 1.1445648275360925e-06,
"logits/chosen": 0.19126050174236298,
"logits/rejected": 1.4904680252075195,
"logps/chosen": -13.341898918151855,
"logps/rejected": -1982.956298828125,
"loss": 0.2139,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.30447131395339966,
"rewards/margins": 19.409332275390625,
"rewards/rejected": -19.104862213134766,
"step": 1430
},
{
"epoch": 0.7191909102259958,
"grad_norm": 0.03955078125,
"learning_rate": 1.1081297240282077e-06,
"logits/chosen": 0.2438248097896576,
"logits/rejected": 1.3988474607467651,
"logps/chosen": -13.187724113464355,
"logps/rejected": -1571.8028564453125,
"loss": 0.2139,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.29910221695899963,
"rewards/margins": 15.328478813171387,
"rewards/rejected": -15.029376029968262,
"step": 1440
},
{
"epoch": 0.7241852915470096,
"grad_norm": 0.0235595703125,
"learning_rate": 1.0721181270082061e-06,
"logits/chosen": 0.20241305232048035,
"logits/rejected": 1.308318018913269,
"logps/chosen": -12.698432922363281,
"logps/rejected": -1740.4573974609375,
"loss": 0.2125,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.313620924949646,
"rewards/margins": 16.896198272705078,
"rewards/rejected": -16.582576751708984,
"step": 1450
},
{
"epoch": 0.7291796728680234,
"grad_norm": 0.0164794921875,
"learning_rate": 1.0365409937795385e-06,
"logits/chosen": 0.20683518052101135,
"logits/rejected": 1.2756832838058472,
"logps/chosen": -13.767419815063477,
"logps/rejected": -1547.2147216796875,
"loss": 0.2129,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.3099841773509979,
"rewards/margins": 14.946782112121582,
"rewards/rejected": -14.636796951293945,
"step": 1460
},
{
"epoch": 0.7341740541890374,
"grad_norm": 0.0263671875,
"learning_rate": 1.0014091494506962e-06,
"logits/chosen": 0.17677463591098785,
"logits/rejected": 1.3411346673965454,
"logps/chosen": -13.297311782836914,
"logps/rejected": -1913.2984619140625,
"loss": 0.2128,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.3021236062049866,
"rewards/margins": 18.699064254760742,
"rewards/rejected": -18.396942138671875,
"step": 1470
},
{
"epoch": 0.7391684355100512,
"grad_norm": 0.033203125,
"learning_rate": 9.667332836414368e-07,
"logits/chosen": 0.15931569039821625,
"logits/rejected": 1.2274577617645264,
"logps/chosen": -13.343734741210938,
"logps/rejected": -1609.459228515625,
"loss": 0.2133,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.3135462999343872,
"rewards/margins": 15.410505294799805,
"rewards/rejected": -15.096961975097656,
"step": 1480
},
{
"epoch": 0.744162816831065,
"grad_norm": 0.013427734375,
"learning_rate": 9.325239472302422e-07,
"logits/chosen": 0.25666847825050354,
"logits/rejected": 1.4118614196777344,
"logps/chosen": -13.085573196411133,
"logps/rejected": -1693.067626953125,
"loss": 0.2127,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.30241408944129944,
"rewards/margins": 16.529926300048828,
"rewards/rejected": -16.227514266967773,
"step": 1490
},
{
"epoch": 0.7491571981520789,
"grad_norm": 0.032958984375,
"learning_rate": 8.987915491439844e-07,
"logits/chosen": 0.2501397132873535,
"logits/rejected": 1.383455514907837,
"logps/chosen": -12.903341293334961,
"logps/rejected": -1736.0384521484375,
"loss": 0.2143,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.3011986017227173,
"rewards/margins": 16.931396484375,
"rewards/rejected": -16.630199432373047,
"step": 1500
},
{
"epoch": 0.7541515794730927,
"grad_norm": 0.021240234375,
"learning_rate": 8.655463531907823e-07,
"logits/chosen": 0.2224547117948532,
"logits/rejected": 1.3173682689666748,
"logps/chosen": -13.032022476196289,
"logps/rejected": -1771.166259765625,
"loss": 0.2132,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.29998722672462463,
"rewards/margins": 17.284591674804688,
"rewards/rejected": -16.984607696533203,
"step": 1510
},
{
"epoch": 0.7591459607941067,
"grad_norm": 0.014404296875,
"learning_rate": 8.327984749370227e-07,
"logits/chosen": 0.2447456419467926,
"logits/rejected": 1.344362497329712,
"logps/chosen": -12.957304000854492,
"logps/rejected": -1557.052978515625,
"loss": 0.2142,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.29349425435066223,
"rewards/margins": 15.1736478805542,
"rewards/rejected": -14.880154609680176,
"step": 1520
},
{
"epoch": 0.7641403421151205,
"grad_norm": 0.0218505859375,
"learning_rate": 8.005578786294782e-07,
"logits/chosen": 0.1744759976863861,
"logits/rejected": 1.3996227979660034,
"logps/chosen": -13.042366027832031,
"logps/rejected": -1864.2431640625,
"loss": 0.2122,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.30568575859069824,
"rewards/margins": 18.201250076293945,
"rewards/rejected": -17.89556312561035,
"step": 1530
},
{
"epoch": 0.7691347234361343,
"grad_norm": 0.012939453125,
"learning_rate": 7.688343741634702e-07,
"logits/chosen": 0.22156497836112976,
"logits/rejected": 1.3012608289718628,
"logps/chosen": -12.66821575164795,
"logps/rejected": -1628.963134765625,
"loss": 0.2137,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.2943740785121918,
"rewards/margins": 15.8633394241333,
"rewards/rejected": -15.568964958190918,
"step": 1540
},
{
"epoch": 0.7741291047571482,
"grad_norm": 0.03662109375,
"learning_rate": 7.376376140980001e-07,
"logits/chosen": 0.1970866173505783,
"logits/rejected": 1.2925320863723755,
"logps/chosen": -12.667881965637207,
"logps/rejected": -1566.008056640625,
"loss": 0.2128,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.3171369433403015,
"rewards/margins": 15.188334465026855,
"rewards/rejected": -14.871198654174805,
"step": 1550
},
{
"epoch": 0.7791234860781621,
"grad_norm": 0.0625,
"learning_rate": 7.069770907187465e-07,
"logits/chosen": 0.20419040322303772,
"logits/rejected": 1.306980013847351,
"logps/chosen": -13.543539047241211,
"logps/rejected": -1506.096923828125,
"loss": 0.2137,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.3025529384613037,
"rewards/margins": 14.58825397491455,
"rewards/rejected": -14.285697937011719,
"step": 1560
},
{
"epoch": 0.7841178673991759,
"grad_norm": 0.01483154296875,
"learning_rate": 6.768621331498371e-07,
"logits/chosen": 0.22659845650196075,
"logits/rejected": 1.3488702774047852,
"logps/chosen": -12.837379455566406,
"logps/rejected": -1573.134033203125,
"loss": 0.2131,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.29855865240097046,
"rewards/margins": 15.335705757141113,
"rewards/rejected": -15.037145614624023,
"step": 1570
},
{
"epoch": 0.7891122487201898,
"grad_norm": 0.049560546875,
"learning_rate": 6.473019045152593e-07,
"logits/chosen": 0.22067594528198242,
"logits/rejected": 1.39100980758667,
"logps/chosen": -12.964533805847168,
"logps/rejected": -1819.2757568359375,
"loss": 0.2124,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.3059755563735962,
"rewards/margins": 17.78072738647461,
"rewards/rejected": -17.474750518798828,
"step": 1580
},
{
"epoch": 0.7941066300412036,
"grad_norm": 0.0159912109375,
"learning_rate": 6.183053991507818e-07,
"logits/chosen": 0.18515101075172424,
"logits/rejected": 1.2657784223556519,
"logps/chosen": -13.701101303100586,
"logps/rejected": -1632.0228271484375,
"loss": 0.2135,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.30029112100601196,
"rewards/margins": 15.897903442382812,
"rewards/rejected": -15.597612380981445,
"step": 1590
},
{
"epoch": 0.7991010113622175,
"grad_norm": 0.035400390625,
"learning_rate": 5.898814398672376e-07,
"logits/chosen": 0.2673841416835785,
"logits/rejected": 1.404524326324463,
"logps/chosen": -12.89787769317627,
"logps/rejected": -1488.13818359375,
"loss": 0.2137,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.2999451160430908,
"rewards/margins": 14.503445625305176,
"rewards/rejected": -14.203500747680664,
"step": 1600
},
{
"epoch": 0.8040953926832314,
"grad_norm": 0.028564453125,
"learning_rate": 5.620386752659912e-07,
"logits/chosen": 0.20275497436523438,
"logits/rejected": 1.2962656021118164,
"logps/chosen": -14.060361862182617,
"logps/rejected": -1563.873046875,
"loss": 0.2123,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.3067697286605835,
"rewards/margins": 15.22687816619873,
"rewards/rejected": -14.920109748840332,
"step": 1610
},
{
"epoch": 0.8090897740042452,
"grad_norm": 0.031494140625,
"learning_rate": 5.347855771074157e-07,
"logits/chosen": 0.22789278626441956,
"logits/rejected": 1.4333293437957764,
"logps/chosen": -12.939372062683105,
"logps/rejected": -1751.370361328125,
"loss": 0.2136,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.2999444007873535,
"rewards/margins": 17.115280151367188,
"rewards/rejected": -16.81533432006836,
"step": 1620
},
{
"epoch": 0.8140841553252591,
"grad_norm": 0.031005859375,
"learning_rate": 5.081304377331786e-07,
"logits/chosen": 0.27506810426712036,
"logits/rejected": 1.2999309301376343,
"logps/chosen": -13.067828178405762,
"logps/rejected": -1445.669677734375,
"loss": 0.2139,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.30137357115745544,
"rewards/margins": 14.080915451049805,
"rewards/rejected": -13.779541015625,
"step": 1630
},
{
"epoch": 0.819078536646273,
"grad_norm": 0.03369140625,
"learning_rate": 4.820813675431186e-07,
"logits/chosen": 0.15463611483573914,
"logits/rejected": 1.3889650106430054,
"logps/chosen": -13.349421501159668,
"logps/rejected": -1812.4769287109375,
"loss": 0.2136,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.3092433512210846,
"rewards/margins": 17.648366928100586,
"rewards/rejected": -17.33912467956543,
"step": 1640
},
{
"epoch": 0.8240729179672868,
"grad_norm": 0.0242919921875,
"learning_rate": 4.5664629252747865e-07,
"logits/chosen": 0.21123354136943817,
"logits/rejected": 1.3822557926177979,
"logps/chosen": -12.89905834197998,
"logps/rejected": -1771.5474853515625,
"loss": 0.213,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.29551658034324646,
"rewards/margins": 17.291709899902344,
"rewards/rejected": -16.99619483947754,
"step": 1650
},
{
"epoch": 0.8290672992883007,
"grad_norm": 0.021484375,
"learning_rate": 4.3183295185525746e-07,
"logits/chosen": 0.17760224640369415,
"logits/rejected": 1.320516586303711,
"logps/chosen": -12.863115310668945,
"logps/rejected": -1791.6168212890625,
"loss": 0.2129,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.3116030693054199,
"rewards/margins": 17.404626846313477,
"rewards/rejected": -17.09302520751953,
"step": 1660
},
{
"epoch": 0.8340616806093145,
"grad_norm": 0.0274658203125,
"learning_rate": 4.0764889551939773e-07,
"logits/chosen": 0.19689543545246124,
"logits/rejected": 1.3633372783660889,
"logps/chosen": -12.999624252319336,
"logps/rejected": -1783.937255859375,
"loss": 0.2131,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.30732038617134094,
"rewards/margins": 17.336694717407227,
"rewards/rejected": -17.029375076293945,
"step": 1670
},
{
"epoch": 0.8390560619303283,
"grad_norm": 0.02392578125,
"learning_rate": 3.8410148203953916e-07,
"logits/chosen": 0.20565366744995117,
"logits/rejected": 1.2732315063476562,
"logps/chosen": -13.205337524414062,
"logps/rejected": -1707.686279296875,
"loss": 0.2141,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.29469722509384155,
"rewards/margins": 16.513946533203125,
"rewards/rejected": -16.219249725341797,
"step": 1680
},
{
"epoch": 0.8440504432513423,
"grad_norm": 0.01904296875,
"learning_rate": 3.611978762230306e-07,
"logits/chosen": 0.2300119698047638,
"logits/rejected": 1.3706471920013428,
"logps/chosen": -12.709399223327637,
"logps/rejected": -1650.5601806640625,
"loss": 0.2126,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.30553561449050903,
"rewards/margins": 16.097576141357422,
"rewards/rejected": -15.79203987121582,
"step": 1690
},
{
"epoch": 0.8490448245723561,
"grad_norm": 0.031982421875,
"learning_rate": 3.389450469848821e-07,
"logits/chosen": 0.26923322677612305,
"logits/rejected": 1.3886299133300781,
"logps/chosen": -12.681255340576172,
"logps/rejected": -1633.20361328125,
"loss": 0.2138,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.3012009263038635,
"rewards/margins": 15.944944381713867,
"rewards/rejected": -15.643745422363281,
"step": 1700
},
{
"epoch": 0.85403920589337,
"grad_norm": 0.015869140625,
"learning_rate": 3.173497652273241e-07,
"logits/chosen": 0.22611021995544434,
"logits/rejected": 1.4598513841629028,
"logps/chosen": -13.163076400756836,
"logps/rejected": -1702.979736328125,
"loss": 0.2126,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.3090182840824127,
"rewards/margins": 16.64162826538086,
"rewards/rejected": -16.332609176635742,
"step": 1710
},
{
"epoch": 0.8590335872143838,
"grad_norm": 0.0257568359375,
"learning_rate": 2.964186017796153e-07,
"logits/chosen": 0.23432429134845734,
"logits/rejected": 1.3954203128814697,
"logps/chosen": -12.975980758666992,
"logps/rejected": -1669.3160400390625,
"loss": 0.2133,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.29741281270980835,
"rewards/margins": 16.287899017333984,
"rewards/rejected": -15.990484237670898,
"step": 1720
},
{
"epoch": 0.8640279685353977,
"grad_norm": 0.020263671875,
"learning_rate": 2.761579253987226e-07,
"logits/chosen": 0.24720034003257751,
"logits/rejected": 1.284588098526001,
"logps/chosen": -13.461567878723145,
"logps/rejected": -1476.7037353515625,
"loss": 0.2131,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.3023452162742615,
"rewards/margins": 14.363668441772461,
"rewards/rejected": -14.061323165893555,
"step": 1730
},
{
"epoch": 0.8690223498564116,
"grad_norm": 0.04541015625,
"learning_rate": 2.565739008314944e-07,
"logits/chosen": 0.25941091775894165,
"logits/rejected": 1.3293951749801636,
"logps/chosen": -12.768040657043457,
"logps/rejected": -1509.4613037109375,
"loss": 0.2144,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.2932388186454773,
"rewards/margins": 14.711868286132812,
"rewards/rejected": -14.418627738952637,
"step": 1740
},
{
"epoch": 0.8740167311774254,
"grad_norm": 0.0240478515625,
"learning_rate": 2.3767248693890106e-07,
"logits/chosen": 0.2348676472902298,
"logits/rejected": 1.3176116943359375,
"logps/chosen": -13.70958137512207,
"logps/rejected": -1628.524658203125,
"loss": 0.2137,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.3072062134742737,
"rewards/margins": 15.790201187133789,
"rewards/rejected": -15.482992172241211,
"step": 1750
},
{
"epoch": 0.8790111124984392,
"grad_norm": 0.0849609375,
"learning_rate": 2.1945943488292265e-07,
"logits/chosen": 0.14879265427589417,
"logits/rejected": 1.2918832302093506,
"logps/chosen": -14.206727981567383,
"logps/rejected": -1858.029052734375,
"loss": 0.2125,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.3139779269695282,
"rewards/margins": 18.043270111083984,
"rewards/rejected": -17.729291915893555,
"step": 1760
},
{
"epoch": 0.8840054938194531,
"grad_norm": 0.019775390625,
"learning_rate": 2.0194028637663733e-07,
"logits/chosen": 0.2688780426979065,
"logits/rejected": 1.2982268333435059,
"logps/chosen": -13.01366901397705,
"logps/rejected": -1392.9710693359375,
"loss": 0.213,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.3066830039024353,
"rewards/margins": 13.560384750366211,
"rewards/rejected": -13.253702163696289,
"step": 1770
},
{
"epoch": 0.888999875140467,
"grad_norm": 0.0380859375,
"learning_rate": 1.851203719980324e-07,
"logits/chosen": 0.10393796861171722,
"logits/rejected": 1.3042396306991577,
"logps/chosen": -13.014623641967773,
"logps/rejected": -1894.3724365234375,
"loss": 0.2117,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.3259292542934418,
"rewards/margins": 18.341753005981445,
"rewards/rejected": -18.015825271606445,
"step": 1780
},
{
"epoch": 0.8939942564614808,
"grad_norm": 0.0250244140625,
"learning_rate": 1.6900480956806214e-07,
"logits/chosen": 0.14930710196495056,
"logits/rejected": 1.24697744846344,
"logps/chosen": -13.011159896850586,
"logps/rejected": -1752.00390625,
"loss": 0.2127,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.30487823486328125,
"rewards/margins": 17.018848419189453,
"rewards/rejected": -16.713970184326172,
"step": 1790
},
{
"epoch": 0.8989886377824947,
"grad_norm": 0.0159912109375,
"learning_rate": 1.5359850259344223e-07,
"logits/chosen": 0.19253353774547577,
"logits/rejected": 1.2840015888214111,
"logps/chosen": -13.470372200012207,
"logps/rejected": -1626.4345703125,
"loss": 0.2143,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.3020302951335907,
"rewards/margins": 15.777534484863281,
"rewards/rejected": -15.475504875183105,
"step": 1800
},
{
"epoch": 0.9039830191035085,
"grad_norm": 0.0238037109375,
"learning_rate": 1.3890613877465127e-07,
"logits/chosen": 0.236587792634964,
"logits/rejected": 1.3434031009674072,
"logps/chosen": -13.017538070678711,
"logps/rejected": -1619.8177490234375,
"loss": 0.2148,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.29720932245254517,
"rewards/margins": 15.795916557312012,
"rewards/rejected": -15.498708724975586,
"step": 1810
},
{
"epoch": 0.9089774004245225,
"grad_norm": 0.064453125,
"learning_rate": 1.249321885795954e-07,
"logits/chosen": 0.23312029242515564,
"logits/rejected": 1.218126893043518,
"logps/chosen": -13.364558219909668,
"logps/rejected": -1396.6041259765625,
"loss": 0.214,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.30306655168533325,
"rewards/margins": 13.558195114135742,
"rewards/rejected": -13.255128860473633,
"step": 1820
},
{
"epoch": 0.9139717817455363,
"grad_norm": 0.02001953125,
"learning_rate": 1.1168090388337577e-07,
"logits/chosen": 0.289134681224823,
"logits/rejected": 1.3270288705825806,
"logps/chosen": -12.729695320129395,
"logps/rejected": -1422.671142578125,
"loss": 0.2137,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.3039693832397461,
"rewards/margins": 13.834306716918945,
"rewards/rejected": -13.5303373336792,
"step": 1830
},
{
"epoch": 0.9189661630665501,
"grad_norm": 0.0257568359375,
"learning_rate": 9.915631667455989e-08,
"logits/chosen": 0.23302344977855682,
"logits/rejected": 1.3944863080978394,
"logps/chosen": -13.118586540222168,
"logps/rejected": -1715.78515625,
"loss": 0.2139,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.29517239332199097,
"rewards/margins": 16.75180435180664,
"rewards/rejected": -16.45663070678711,
"step": 1840
},
{
"epoch": 0.923960544387564,
"grad_norm": 0.025634765625,
"learning_rate": 8.736223782836589e-08,
"logits/chosen": 0.1992538869380951,
"logits/rejected": 1.3024797439575195,
"logps/chosen": -12.67003059387207,
"logps/rejected": -1692.8782958984375,
"loss": 0.214,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.3054552972316742,
"rewards/margins": 16.512258529663086,
"rewards/rejected": -16.206802368164062,
"step": 1850
},
{
"epoch": 0.9289549257085778,
"grad_norm": 0.01226806640625,
"learning_rate": 7.63022559471202e-08,
"logits/chosen": 0.23122599720954895,
"logits/rejected": 1.37257981300354,
"logps/chosen": -12.650789260864258,
"logps/rejected": -1592.7677001953125,
"loss": 0.2138,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.2996312975883484,
"rewards/margins": 15.537762641906738,
"rewards/rejected": -15.238128662109375,
"step": 1860
},
{
"epoch": 0.9339493070295917,
"grad_norm": 0.0159912109375,
"learning_rate": 6.597973626834759e-08,
"logits/chosen": 0.21128106117248535,
"logits/rejected": 1.4774492979049683,
"logps/chosen": -13.167986869812012,
"logps/rejected": -1897.197998046875,
"loss": 0.2133,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.30018025636672974,
"rewards/margins": 18.570720672607422,
"rewards/rejected": -18.27054214477539,
"step": 1870
},
{
"epoch": 0.9389436883506056,
"grad_norm": 0.0264892578125,
"learning_rate": 5.639781964082547e-08,
"logits/chosen": 0.27233806252479553,
"logits/rejected": 1.4588285684585571,
"logps/chosen": -13.405789375305176,
"logps/rejected": -1711.1881103515625,
"loss": 0.214,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.29316192865371704,
"rewards/margins": 16.696842193603516,
"rewards/rejected": -16.40367889404297,
"step": 1880
},
{
"epoch": 0.9439380696716194,
"grad_norm": 0.033203125,
"learning_rate": 4.755942156891458e-08,
"logits/chosen": 0.23750165104866028,
"logits/rejected": 1.4071403741836548,
"logps/chosen": -12.829435348510742,
"logps/rejected": -1591.2479248046875,
"loss": 0.2133,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.3001479208469391,
"rewards/margins": 15.529951095581055,
"rewards/rejected": -15.229803085327148,
"step": 1890
},
{
"epoch": 0.9489324509926332,
"grad_norm": 0.0203857421875,
"learning_rate": 3.946723132545155e-08,
"logits/chosen": 0.18308812379837036,
"logits/rejected": 1.3526315689086914,
"logps/chosen": -13.234288215637207,
"logps/rejected": -1674.6246337890625,
"loss": 0.2124,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.3094561696052551,
"rewards/margins": 16.279918670654297,
"rewards/rejected": -15.970464706420898,
"step": 1900
},
{
"epoch": 0.9539268323136472,
"grad_norm": 0.02197265625,
"learning_rate": 3.212371113348156e-08,
"logits/chosen": 0.2626166343688965,
"logits/rejected": 1.3081789016723633,
"logps/chosen": -12.9055757522583,
"logps/rejected": -1487.3057861328125,
"loss": 0.2143,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.29553088545799255,
"rewards/margins": 14.46813678741455,
"rewards/rejected": -14.172607421875,
"step": 1910
},
{
"epoch": 0.958921213634661,
"grad_norm": 0.033935546875,
"learning_rate": 2.5531095417073437e-08,
"logits/chosen": 0.2499731481075287,
"logits/rejected": 1.3821344375610352,
"logps/chosen": -13.182432174682617,
"logps/rejected": -1497.8023681640625,
"loss": 0.214,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.29157382249832153,
"rewards/margins": 14.601869583129883,
"rewards/rejected": -14.310295104980469,
"step": 1920
},
{
"epoch": 0.9639155949556749,
"grad_norm": 0.0218505859375,
"learning_rate": 1.969139012144822e-08,
"logits/chosen": 0.27068477869033813,
"logits/rejected": 1.3521279096603394,
"logps/chosen": -13.608545303344727,
"logps/rejected": -1491.3599853515625,
"loss": 0.214,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.3026901185512543,
"rewards/margins": 14.520078659057617,
"rewards/rejected": -14.217389106750488,
"step": 1930
},
{
"epoch": 0.9689099762766887,
"grad_norm": 0.049072265625,
"learning_rate": 1.4606372102626277e-08,
"logits/chosen": 0.19163861870765686,
"logits/rejected": 1.3109387159347534,
"logps/chosen": -13.46105670928955,
"logps/rejected": -1731.31640625,
"loss": 0.2127,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.3083009421825409,
"rewards/margins": 16.883403778076172,
"rewards/rejected": -16.575105667114258,
"step": 1940
},
{
"epoch": 0.9739043575977026,
"grad_norm": 0.0322265625,
"learning_rate": 1.0277588586781463e-08,
"logits/chosen": 0.19613580405712128,
"logits/rejected": 1.228542685508728,
"logps/chosen": -13.699869155883789,
"logps/rejected": -1520.361572265625,
"loss": 0.2129,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.3030748963356018,
"rewards/margins": 14.732551574707031,
"rewards/rejected": -14.429475784301758,
"step": 1950
},
{
"epoch": 0.9788987389187165,
"grad_norm": 0.025146484375,
"learning_rate": 6.7063566994651775e-09,
"logits/chosen": 0.21389658749103546,
"logits/rejected": 1.3907114267349243,
"logps/chosen": -13.105180740356445,
"logps/rejected": -1654.1494140625,
"loss": 0.2134,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.30368101596832275,
"rewards/margins": 16.11886978149414,
"rewards/rejected": -15.81518840789795,
"step": 1960
},
{
"epoch": 0.9838931202397303,
"grad_norm": 0.02490234375,
"learning_rate": 3.893763064840295e-09,
"logits/chosen": 0.18713845312595367,
"logits/rejected": 1.294327974319458,
"logps/chosen": -12.946528434753418,
"logps/rejected": -1592.420654296875,
"loss": 0.2118,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.31735336780548096,
"rewards/margins": 15.468029975891113,
"rewards/rejected": -15.150675773620605,
"step": 1970
},
{
"epoch": 0.9888875015607441,
"grad_norm": 0.0172119140625,
"learning_rate": 1.840663475053961e-09,
"logits/chosen": 0.23091156780719757,
"logits/rejected": 1.4303152561187744,
"logps/chosen": -13.524391174316406,
"logps/rejected": -1685.2825927734375,
"loss": 0.2139,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.30492401123046875,
"rewards/margins": 16.39162826538086,
"rewards/rejected": -16.08670425415039,
"step": 1980
},
{
"epoch": 0.993881882881758,
"grad_norm": 0.01446533203125,
"learning_rate": 5.476826298439486e-10,
"logits/chosen": 0.18041366338729858,
"logits/rejected": 1.3766809701919556,
"logps/chosen": -12.73656940460205,
"logps/rejected": -1975.449462890625,
"loss": 0.2138,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.2982867360115051,
"rewards/margins": 19.314851760864258,
"rewards/rejected": -19.016565322875977,
"step": 1990
},
{
"epoch": 0.9988762642027719,
"grad_norm": 0.029052734375,
"learning_rate": 1.521394646070151e-11,
"logits/chosen": 0.212058424949646,
"logits/rejected": 1.3283421993255615,
"logps/chosen": -12.978428840637207,
"logps/rejected": -1706.1982421875,
"loss": 0.2118,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.3162993788719177,
"rewards/margins": 16.5406551361084,
"rewards/rejected": -16.224355697631836,
"step": 2000
},
{
"epoch": 0.9988762642027719,
"eval_logits/chosen": 0.19315297901630402,
"eval_logits/rejected": 1.0988490581512451,
"eval_logps/chosen": -12.085772514343262,
"eval_logps/rejected": -1018.5144653320312,
"eval_loss": 0.2121797353029251,
"eval_rewards/accuracies": 1.0,
"eval_rewards/chosen": 0.32569777965545654,
"eval_rewards/margins": 9.880459785461426,
"eval_rewards/rejected": -9.554760932922363,
"eval_runtime": 0.4236,
"eval_samples_per_second": 11.805,
"eval_steps_per_second": 7.083,
"step": 2000
},
{
"epoch": 0.9998751404669747,
"step": 2002,
"total_flos": 0.0,
"train_loss": 0.2449444185841929,
"train_runtime": 3711.3289,
"train_samples_per_second": 4.316,
"train_steps_per_second": 0.539
}
],
"logging_steps": 10,
"max_steps": 2002,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 1000,
"total_flos": 0.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}