MetaMathOctopus-MAPO-DPO-7B / trainer_state.json
VincentVioletLx
commit from VincentLx
2889038
{
"best_metric": 0.6428677439689636,
"best_model_checkpoint": "/mnt/data/shesj/Trained/RL4CoT/DPO/llama2fullcontinue_largebeta_initialData_iter1Self_resetScoreExp_iter1_2_lowerLr16.json/checkpoint-1000",
"epoch": 0.5,
"eval_steps": 100,
"global_step": 1000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0,
"learning_rate": 5e-09,
"logits/chosen": -1.5196828842163086,
"logits/rejected": -1.53714919090271,
"logps/chosen": -15.696101188659668,
"logps/rejected": -19.717897415161133,
"loss": 0.693,
"rewards/accuracies": 0.3062500059604645,
"rewards/chosen": -4.1580526158213615e-05,
"rewards/margins": 0.0009152223356068134,
"rewards/rejected": -0.000956802919972688,
"step": 5
},
{
"epoch": 0.01,
"learning_rate": 1e-08,
"logits/chosen": -1.4667831659317017,
"logits/rejected": -1.474163293838501,
"logps/chosen": -13.819122314453125,
"logps/rejected": -15.621783256530762,
"loss": 0.6933,
"rewards/accuracies": 0.46875,
"rewards/chosen": -0.0003026240156032145,
"rewards/margins": 0.0002748106198851019,
"rewards/rejected": -0.0005774348974227905,
"step": 10
},
{
"epoch": 0.01,
"learning_rate": 1.5e-08,
"logits/chosen": -1.4452192783355713,
"logits/rejected": -1.4530740976333618,
"logps/chosen": -16.253719329833984,
"logps/rejected": -19.002716064453125,
"loss": 0.693,
"rewards/accuracies": 0.518750011920929,
"rewards/chosen": 0.000418459705542773,
"rewards/margins": 0.0002198911242885515,
"rewards/rejected": 0.00019856853759847581,
"step": 15
},
{
"epoch": 0.01,
"learning_rate": 2e-08,
"logits/chosen": -1.4347673654556274,
"logits/rejected": -1.457056999206543,
"logps/chosen": -16.931005477905273,
"logps/rejected": -17.90321159362793,
"loss": 0.6938,
"rewards/accuracies": 0.45625001192092896,
"rewards/chosen": -0.0022916034795343876,
"rewards/margins": -0.003587479470297694,
"rewards/rejected": 0.0012958759907633066,
"step": 20
},
{
"epoch": 0.01,
"learning_rate": 2.5e-08,
"logits/chosen": -1.5136417150497437,
"logits/rejected": -1.5317498445510864,
"logps/chosen": -13.484731674194336,
"logps/rejected": -17.72646141052246,
"loss": 0.693,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": 0.0013993926113471389,
"rewards/margins": 0.003486391855403781,
"rewards/rejected": -0.002086999360471964,
"step": 25
},
{
"epoch": 0.01,
"learning_rate": 3e-08,
"logits/chosen": -1.4325120449066162,
"logits/rejected": -1.423964262008667,
"logps/chosen": -16.99893569946289,
"logps/rejected": -21.196269989013672,
"loss": 0.6929,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.0006942325853742659,
"rewards/margins": -0.00022374764375854284,
"rewards/rejected": -0.0004704846942331642,
"step": 30
},
{
"epoch": 0.02,
"learning_rate": 3.4999999999999996e-08,
"logits/chosen": -1.503702163696289,
"logits/rejected": -1.5359680652618408,
"logps/chosen": -15.859466552734375,
"logps/rejected": -18.37940788269043,
"loss": 0.6925,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": 0.002690280321985483,
"rewards/margins": 0.0054735760204494,
"rewards/rejected": -0.002783295465633273,
"step": 35
},
{
"epoch": 0.02,
"learning_rate": 4e-08,
"logits/chosen": -1.474712610244751,
"logits/rejected": -1.4741761684417725,
"logps/chosen": -13.360185623168945,
"logps/rejected": -16.91265296936035,
"loss": 0.6935,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.0006134170107543468,
"rewards/margins": 0.0004067299305461347,
"rewards/rejected": -0.0010201467666774988,
"step": 40
},
{
"epoch": 0.02,
"learning_rate": 4.5e-08,
"logits/chosen": -1.4315367937088013,
"logits/rejected": -1.4630348682403564,
"logps/chosen": -14.167867660522461,
"logps/rejected": -18.600988388061523,
"loss": 0.6931,
"rewards/accuracies": 0.518750011920929,
"rewards/chosen": 0.0011325415689498186,
"rewards/margins": -7.549161182396347e-06,
"rewards/rejected": 0.0011400904040783644,
"step": 45
},
{
"epoch": 0.03,
"learning_rate": 5e-08,
"logits/chosen": -1.466339349746704,
"logits/rejected": -1.4680755138397217,
"logps/chosen": -13.571925163269043,
"logps/rejected": -16.360881805419922,
"loss": 0.6933,
"rewards/accuracies": 0.4749999940395355,
"rewards/chosen": -0.0013324546162039042,
"rewards/margins": -5.83843320782762e-05,
"rewards/rejected": -0.0012740703532472253,
"step": 50
},
{
"epoch": 0.03,
"learning_rate": 5.5e-08,
"logits/chosen": -1.4321575164794922,
"logits/rejected": -1.4344770908355713,
"logps/chosen": -14.864067077636719,
"logps/rejected": -20.08001708984375,
"loss": 0.6929,
"rewards/accuracies": 0.5,
"rewards/chosen": 0.000479007518151775,
"rewards/margins": 0.001067839446477592,
"rewards/rejected": -0.0005888319574296474,
"step": 55
},
{
"epoch": 0.03,
"learning_rate": 6e-08,
"logits/chosen": -1.5018717050552368,
"logits/rejected": -1.5294861793518066,
"logps/chosen": -13.418993949890137,
"logps/rejected": -17.377300262451172,
"loss": 0.6929,
"rewards/accuracies": 0.46875,
"rewards/chosen": -0.0013912434224039316,
"rewards/margins": -0.0009791270131245255,
"rewards/rejected": -0.0004121163801755756,
"step": 60
},
{
"epoch": 0.03,
"learning_rate": 6.5e-08,
"logits/chosen": -1.40516197681427,
"logits/rejected": -1.4193370342254639,
"logps/chosen": -13.224818229675293,
"logps/rejected": -15.901044845581055,
"loss": 0.6926,
"rewards/accuracies": 0.48750001192092896,
"rewards/chosen": -0.0004983447724953294,
"rewards/margins": 0.0009676685440354049,
"rewards/rejected": -0.0014660133747383952,
"step": 65
},
{
"epoch": 0.04,
"learning_rate": 6.999999999999999e-08,
"logits/chosen": -1.5142648220062256,
"logits/rejected": -1.5050678253173828,
"logps/chosen": -15.645828247070312,
"logps/rejected": -20.22635269165039,
"loss": 0.6928,
"rewards/accuracies": 0.543749988079071,
"rewards/chosen": -0.0008540893904864788,
"rewards/margins": 0.0006531132385134697,
"rewards/rejected": -0.0015072030946612358,
"step": 70
},
{
"epoch": 0.04,
"learning_rate": 7.5e-08,
"logits/chosen": -1.4419431686401367,
"logits/rejected": -1.4324930906295776,
"logps/chosen": -14.863853454589844,
"logps/rejected": -16.58720588684082,
"loss": 0.6924,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": 0.0013656382216140628,
"rewards/margins": 0.0006723797414451838,
"rewards/rejected": 0.0006932583637535572,
"step": 75
},
{
"epoch": 0.04,
"learning_rate": 8e-08,
"logits/chosen": -1.532387375831604,
"logits/rejected": -1.5147764682769775,
"logps/chosen": -15.975011825561523,
"logps/rejected": -19.93300437927246,
"loss": 0.6925,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.0007912625442259014,
"rewards/margins": 0.00153960264287889,
"rewards/rejected": -0.0023308652453124523,
"step": 80
},
{
"epoch": 0.04,
"learning_rate": 8.5e-08,
"logits/chosen": -1.4505486488342285,
"logits/rejected": -1.4808809757232666,
"logps/chosen": -17.687313079833984,
"logps/rejected": -24.51900863647461,
"loss": 0.6921,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": -0.00017726181249599904,
"rewards/margins": 0.0016221065307036042,
"rewards/rejected": -0.001799368066713214,
"step": 85
},
{
"epoch": 0.04,
"learning_rate": 9e-08,
"logits/chosen": -1.4988192319869995,
"logits/rejected": -1.513806939125061,
"logps/chosen": -14.488324165344238,
"logps/rejected": -19.464611053466797,
"loss": 0.6912,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": 0.0011351148132234812,
"rewards/margins": 0.004005365073680878,
"rewards/rejected": -0.00287025049328804,
"step": 90
},
{
"epoch": 0.05,
"learning_rate": 9.499999999999999e-08,
"logits/chosen": -1.403247594833374,
"logits/rejected": -1.4147038459777832,
"logps/chosen": -16.571857452392578,
"logps/rejected": -16.457307815551758,
"loss": 0.6907,
"rewards/accuracies": 0.59375,
"rewards/chosen": 0.00036610427196137607,
"rewards/margins": 0.004560294561088085,
"rewards/rejected": -0.004194191191345453,
"step": 95
},
{
"epoch": 0.05,
"learning_rate": 1e-07,
"logits/chosen": -1.486196756362915,
"logits/rejected": -1.504416823387146,
"logps/chosen": -14.919549942016602,
"logps/rejected": -19.339908599853516,
"loss": 0.691,
"rewards/accuracies": 0.518750011920929,
"rewards/chosen": -0.003806027816608548,
"rewards/margins": 0.004251133184880018,
"rewards/rejected": -0.008057162165641785,
"step": 100
},
{
"epoch": 0.05,
"eval_logits/chosen": -1.7919771671295166,
"eval_logits/rejected": -1.8722617626190186,
"eval_logps/chosen": -15.013982772827148,
"eval_logps/rejected": -18.596895217895508,
"eval_loss": 0.6906684637069702,
"eval_rewards/accuracies": 0.5674920082092285,
"eval_rewards/chosen": -8.375057223020121e-05,
"eval_rewards/margins": 0.005779640283435583,
"eval_rewards/rejected": -0.005863390862941742,
"eval_runtime": 306.7,
"eval_samples_per_second": 65.21,
"eval_steps_per_second": 1.021,
"step": 100
},
{
"epoch": 0.05,
"learning_rate": 9.999238475781956e-08,
"logits/chosen": -1.415111780166626,
"logits/rejected": -1.409263253211975,
"logps/chosen": -15.464300155639648,
"logps/rejected": -18.61844253540039,
"loss": 0.6899,
"rewards/accuracies": 0.59375,
"rewards/chosen": -0.0006141990306787193,
"rewards/margins": 0.004516326356679201,
"rewards/rejected": -0.00513052474707365,
"step": 105
},
{
"epoch": 0.06,
"learning_rate": 9.996954135095479e-08,
"logits/chosen": -1.4188635349273682,
"logits/rejected": -1.432936668395996,
"logps/chosen": -15.272178649902344,
"logps/rejected": -19.53142738342285,
"loss": 0.689,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -0.00477779284119606,
"rewards/margins": 0.0066839540377259254,
"rewards/rejected": -0.011461746878921986,
"step": 110
},
{
"epoch": 0.06,
"learning_rate": 9.993147673772868e-08,
"logits/chosen": -1.4598616361618042,
"logits/rejected": -1.4905925989151,
"logps/chosen": -15.878756523132324,
"logps/rejected": -18.827163696289062,
"loss": 0.688,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": 0.002627016045153141,
"rewards/margins": 0.01106287818402052,
"rewards/rejected": -0.008435862138867378,
"step": 115
},
{
"epoch": 0.06,
"learning_rate": 9.98782025129912e-08,
"logits/chosen": -1.4647516012191772,
"logits/rejected": -1.4928150177001953,
"logps/chosen": -13.536163330078125,
"logps/rejected": -18.504684448242188,
"loss": 0.6892,
"rewards/accuracies": 0.518750011920929,
"rewards/chosen": -0.006495442241430283,
"rewards/margins": 0.0007650878978893161,
"rewards/rejected": -0.007260529790073633,
"step": 120
},
{
"epoch": 0.06,
"learning_rate": 9.980973490458727e-08,
"logits/chosen": -1.4349157810211182,
"logits/rejected": -1.4921131134033203,
"logps/chosen": -16.818748474121094,
"logps/rejected": -23.732484817504883,
"loss": 0.6884,
"rewards/accuracies": 0.48124998807907104,
"rewards/chosen": -0.014983393251895905,
"rewards/margins": 0.003936333581805229,
"rewards/rejected": -0.018919726833701134,
"step": 125
},
{
"epoch": 0.07,
"learning_rate": 9.972609476841366e-08,
"logits/chosen": -1.4056415557861328,
"logits/rejected": -1.4126735925674438,
"logps/chosen": -14.3095121383667,
"logps/rejected": -18.44950294494629,
"loss": 0.687,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.005277615040540695,
"rewards/margins": 0.011254631914198399,
"rewards/rejected": -0.01653224602341652,
"step": 130
},
{
"epoch": 0.07,
"learning_rate": 9.96273075820661e-08,
"logits/chosen": -1.5334028005599976,
"logits/rejected": -1.5822241306304932,
"logps/chosen": -15.960824966430664,
"logps/rejected": -21.098979949951172,
"loss": 0.6875,
"rewards/accuracies": 0.518750011920929,
"rewards/chosen": -0.016458828002214432,
"rewards/margins": 0.006202386226505041,
"rewards/rejected": -0.02266121283173561,
"step": 135
},
{
"epoch": 0.07,
"learning_rate": 9.951340343707851e-08,
"logits/chosen": -1.5316803455352783,
"logits/rejected": -1.5605201721191406,
"logps/chosen": -13.550819396972656,
"logps/rejected": -17.886049270629883,
"loss": 0.6851,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.012303833849728107,
"rewards/margins": 0.02252637967467308,
"rewards/rejected": -0.03483021631836891,
"step": 140
},
{
"epoch": 0.07,
"learning_rate": 9.938441702975688e-08,
"logits/chosen": -1.4327102899551392,
"logits/rejected": -1.446903944015503,
"logps/chosen": -13.809109687805176,
"logps/rejected": -17.256572723388672,
"loss": 0.685,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.016683299094438553,
"rewards/margins": 0.029044728726148605,
"rewards/rejected": -0.04572802782058716,
"step": 145
},
{
"epoch": 0.07,
"learning_rate": 9.92403876506104e-08,
"logits/chosen": -1.4938914775848389,
"logits/rejected": -1.5160696506500244,
"logps/chosen": -13.700773239135742,
"logps/rejected": -19.660675048828125,
"loss": 0.6815,
"rewards/accuracies": 0.4937500059604645,
"rewards/chosen": -0.03682475537061691,
"rewards/margins": 0.010936172679066658,
"rewards/rejected": -0.04776093363761902,
"step": 150
},
{
"epoch": 0.08,
"learning_rate": 9.90813591723832e-08,
"logits/chosen": -1.4303719997406006,
"logits/rejected": -1.4469249248504639,
"logps/chosen": -14.483743667602539,
"logps/rejected": -20.158580780029297,
"loss": 0.6835,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.03273214027285576,
"rewards/margins": 0.030618244782090187,
"rewards/rejected": -0.06335039436817169,
"step": 155
},
{
"epoch": 0.08,
"learning_rate": 9.890738003669028e-08,
"logits/chosen": -1.4257906675338745,
"logits/rejected": -1.440292477607727,
"logps/chosen": -13.087536811828613,
"logps/rejected": -18.774667739868164,
"loss": 0.6813,
"rewards/accuracies": 0.543749988079071,
"rewards/chosen": -0.042621154338121414,
"rewards/margins": 0.019855033606290817,
"rewards/rejected": -0.06247618794441223,
"step": 160
},
{
"epoch": 0.08,
"learning_rate": 9.871850323926176e-08,
"logits/chosen": -1.506829023361206,
"logits/rejected": -1.4980268478393555,
"logps/chosen": -15.794275283813477,
"logps/rejected": -22.559085845947266,
"loss": 0.6832,
"rewards/accuracies": 0.543749988079071,
"rewards/chosen": -0.04176632687449455,
"rewards/margins": 0.014226436614990234,
"rewards/rejected": -0.05599276348948479,
"step": 165
},
{
"epoch": 0.09,
"learning_rate": 9.851478631379982e-08,
"logits/chosen": -1.5072795152664185,
"logits/rejected": -1.5367839336395264,
"logps/chosen": -16.79397964477539,
"logps/rejected": -18.064043045043945,
"loss": 0.6843,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": -0.037377405911684036,
"rewards/margins": 0.04563738405704498,
"rewards/rejected": -0.08301478624343872,
"step": 170
},
{
"epoch": 0.09,
"learning_rate": 9.82962913144534e-08,
"logits/chosen": -1.507598876953125,
"logits/rejected": -1.5444573163986206,
"logps/chosen": -17.070449829101562,
"logps/rejected": -20.720073699951172,
"loss": 0.6831,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.06209941580891609,
"rewards/margins": 0.03557855635881424,
"rewards/rejected": -0.09767796844244003,
"step": 175
},
{
"epoch": 0.09,
"learning_rate": 9.806308479691594e-08,
"logits/chosen": -1.4471648931503296,
"logits/rejected": -1.474890947341919,
"logps/chosen": -16.156829833984375,
"logps/rejected": -22.047147750854492,
"loss": 0.68,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": -0.0555298812687397,
"rewards/margins": 0.0073929824866354465,
"rewards/rejected": -0.06292285770177841,
"step": 180
},
{
"epoch": 0.09,
"learning_rate": 9.781523779815177e-08,
"logits/chosen": -1.5135043859481812,
"logits/rejected": -1.5078579187393188,
"logps/chosen": -14.832931518554688,
"logps/rejected": -17.844093322753906,
"loss": 0.6827,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -0.06777982413768768,
"rewards/margins": 0.03908789902925491,
"rewards/rejected": -0.106867715716362,
"step": 185
},
{
"epoch": 0.1,
"learning_rate": 9.755282581475768e-08,
"logits/chosen": -1.4608676433563232,
"logits/rejected": -1.4990969896316528,
"logps/chosen": -14.89118766784668,
"logps/rejected": -20.47022247314453,
"loss": 0.6749,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -0.05767616629600525,
"rewards/margins": 0.0543893501162529,
"rewards/rejected": -0.11206551641225815,
"step": 190
},
{
"epoch": 0.1,
"learning_rate": 9.727592877996584e-08,
"logits/chosen": -1.4986900091171265,
"logits/rejected": -1.5200297832489014,
"logps/chosen": -14.921911239624023,
"logps/rejected": -19.319089889526367,
"loss": 0.6782,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": -0.07736112922430038,
"rewards/margins": 0.037412129342556,
"rewards/rejected": -0.11477325856685638,
"step": 195
},
{
"epoch": 0.1,
"learning_rate": 9.698463103929542e-08,
"logits/chosen": -1.4672738313674927,
"logits/rejected": -1.4699958562850952,
"logps/chosen": -16.190366744995117,
"logps/rejected": -19.112018585205078,
"loss": 0.6787,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.06623920053243637,
"rewards/margins": 0.05223413184285164,
"rewards/rejected": -0.11847333610057831,
"step": 200
},
{
"epoch": 0.1,
"eval_logits/chosen": -1.7878340482711792,
"eval_logits/rejected": -1.868047833442688,
"eval_logps/chosen": -15.84277057647705,
"eval_logps/rejected": -19.780603408813477,
"eval_loss": 0.679453432559967,
"eval_rewards/accuracies": 0.5942491888999939,
"eval_rewards/chosen": -0.08296255767345428,
"eval_rewards/margins": 0.04127146303653717,
"eval_rewards/rejected": -0.12423399835824966,
"eval_runtime": 306.7549,
"eval_samples_per_second": 65.199,
"eval_steps_per_second": 1.02,
"step": 200
},
{
"epoch": 0.1,
"learning_rate": 9.667902132486008e-08,
"logits/chosen": -1.4577996730804443,
"logits/rejected": -1.458738088607788,
"logps/chosen": -14.40186882019043,
"logps/rejected": -19.614154815673828,
"loss": 0.6824,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -0.1032874584197998,
"rewards/margins": 0.023597324267029762,
"rewards/rejected": -0.12688478827476501,
"step": 205
},
{
"epoch": 0.1,
"learning_rate": 9.635919272833936e-08,
"logits/chosen": -1.446189284324646,
"logits/rejected": -1.4532312154769897,
"logps/chosen": -13.800382614135742,
"logps/rejected": -16.633275985717773,
"loss": 0.6743,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": -0.08655096590518951,
"rewards/margins": 0.04802204668521881,
"rewards/rejected": -0.13457301259040833,
"step": 210
},
{
"epoch": 0.11,
"learning_rate": 9.602524267262201e-08,
"logits/chosen": -1.5442548990249634,
"logits/rejected": -1.5252314805984497,
"logps/chosen": -16.61358070373535,
"logps/rejected": -19.388107299804688,
"loss": 0.6745,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.08506567776203156,
"rewards/margins": 0.061091698706150055,
"rewards/rejected": -0.14615735411643982,
"step": 215
},
{
"epoch": 0.11,
"learning_rate": 9.567727288213003e-08,
"logits/chosen": -1.5201714038848877,
"logits/rejected": -1.5108391046524048,
"logps/chosen": -16.2724609375,
"logps/rejected": -19.838550567626953,
"loss": 0.6755,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.1071196049451828,
"rewards/margins": 0.05034583806991577,
"rewards/rejected": -0.15746544301509857,
"step": 220
},
{
"epoch": 0.11,
"learning_rate": 9.53153893518325e-08,
"logits/chosen": -1.4277980327606201,
"logits/rejected": -1.4468252658843994,
"logps/chosen": -15.238229751586914,
"logps/rejected": -21.562265396118164,
"loss": 0.6815,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": -0.11674360930919647,
"rewards/margins": 0.013696810230612755,
"rewards/rejected": -0.13044041395187378,
"step": 225
},
{
"epoch": 0.12,
"learning_rate": 9.493970231495834e-08,
"logits/chosen": -1.449592113494873,
"logits/rejected": -1.4903148412704468,
"logps/chosen": -16.092803955078125,
"logps/rejected": -21.503910064697266,
"loss": 0.6787,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": -0.08694227039813995,
"rewards/margins": 0.06734081357717514,
"rewards/rejected": -0.1542830765247345,
"step": 230
},
{
"epoch": 0.12,
"learning_rate": 9.455032620941839e-08,
"logits/chosen": -1.4714128971099854,
"logits/rejected": -1.5002821683883667,
"logps/chosen": -18.0159969329834,
"logps/rejected": -20.794544219970703,
"loss": 0.6774,
"rewards/accuracies": 0.59375,
"rewards/chosen": -0.09392134845256805,
"rewards/margins": 0.06613980978727341,
"rewards/rejected": -0.16006115078926086,
"step": 235
},
{
"epoch": 0.12,
"learning_rate": 9.414737964294634e-08,
"logits/chosen": -1.4862656593322754,
"logits/rejected": -1.5194942951202393,
"logps/chosen": -18.190040588378906,
"logps/rejected": -20.4471492767334,
"loss": 0.6773,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.12713944911956787,
"rewards/margins": 0.034943290054798126,
"rewards/rejected": -0.1620827466249466,
"step": 240
},
{
"epoch": 0.12,
"learning_rate": 9.373098535696979e-08,
"logits/chosen": -1.4240232706069946,
"logits/rejected": -1.439587116241455,
"logps/chosen": -16.851634979248047,
"logps/rejected": -21.63901138305664,
"loss": 0.6721,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.11880864202976227,
"rewards/margins": 0.0625835433602333,
"rewards/rejected": -0.18139217793941498,
"step": 245
},
{
"epoch": 0.12,
"learning_rate": 9.330127018922194e-08,
"logits/chosen": -1.490593433380127,
"logits/rejected": -1.519042730331421,
"logps/chosen": -15.26930046081543,
"logps/rejected": -20.554950714111328,
"loss": 0.6788,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.12308211624622345,
"rewards/margins": 0.049174632877111435,
"rewards/rejected": -0.17225676774978638,
"step": 250
},
{
"epoch": 0.13,
"learning_rate": 9.285836503510561e-08,
"logits/chosen": -1.481011152267456,
"logits/rejected": -1.482369065284729,
"logps/chosen": -14.86046028137207,
"logps/rejected": -20.206274032592773,
"loss": 0.6683,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.09218485653400421,
"rewards/margins": 0.0633331835269928,
"rewards/rejected": -0.155518040060997,
"step": 255
},
{
"epoch": 0.13,
"learning_rate": 9.240240480782128e-08,
"logits/chosen": -1.4365761280059814,
"logits/rejected": -1.4659501314163208,
"logps/chosen": -16.5053768157959,
"logps/rejected": -20.457700729370117,
"loss": 0.6781,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": -0.1196146234869957,
"rewards/margins": 0.0831097811460495,
"rewards/rejected": -0.2027243822813034,
"step": 260
},
{
"epoch": 0.13,
"learning_rate": 9.19335283972712e-08,
"logits/chosen": -1.4037281274795532,
"logits/rejected": -1.4219300746917725,
"logps/chosen": -17.773784637451172,
"logps/rejected": -21.13812828063965,
"loss": 0.6672,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.11854933202266693,
"rewards/margins": 0.08150772750377655,
"rewards/rejected": -0.20005705952644348,
"step": 265
},
{
"epoch": 0.14,
"learning_rate": 9.145187862775208e-08,
"logits/chosen": -1.5054607391357422,
"logits/rejected": -1.5367683172225952,
"logps/chosen": -16.548221588134766,
"logps/rejected": -20.993818283081055,
"loss": 0.6649,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -0.1312890648841858,
"rewards/margins": 0.06594224274158478,
"rewards/rejected": -0.19723130762577057,
"step": 270
},
{
"epoch": 0.14,
"learning_rate": 9.095760221444959e-08,
"logits/chosen": -1.5523430109024048,
"logits/rejected": -1.5738317966461182,
"logps/chosen": -16.80923080444336,
"logps/rejected": -20.842090606689453,
"loss": 0.6712,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": -0.1634162962436676,
"rewards/margins": 0.06340184062719345,
"rewards/rejected": -0.22681812942028046,
"step": 275
},
{
"epoch": 0.14,
"learning_rate": 9.045084971874737e-08,
"logits/chosen": -1.4899578094482422,
"logits/rejected": -1.5115963220596313,
"logps/chosen": -15.990486145019531,
"logps/rejected": -21.041019439697266,
"loss": 0.6678,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": -0.1481240689754486,
"rewards/margins": 0.04743753746151924,
"rewards/rejected": -0.19556160271167755,
"step": 280
},
{
"epoch": 0.14,
"learning_rate": 8.993177550236463e-08,
"logits/chosen": -1.484438180923462,
"logits/rejected": -1.499731183052063,
"logps/chosen": -16.02753448486328,
"logps/rejected": -20.631275177001953,
"loss": 0.6734,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": -0.15427681803703308,
"rewards/margins": 0.016259009018540382,
"rewards/rejected": -0.1705358326435089,
"step": 285
},
{
"epoch": 0.14,
"learning_rate": 8.940053768033609e-08,
"logits/chosen": -1.4610778093338013,
"logits/rejected": -1.4643038511276245,
"logps/chosen": -18.099334716796875,
"logps/rejected": -21.200000762939453,
"loss": 0.6758,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -0.15338566899299622,
"rewards/margins": 0.03785661607980728,
"rewards/rejected": -0.1912422627210617,
"step": 290
},
{
"epoch": 0.15,
"learning_rate": 8.885729807284853e-08,
"logits/chosen": -1.5055420398712158,
"logits/rejected": -1.5390150547027588,
"logps/chosen": -14.061132431030273,
"logps/rejected": -19.544492721557617,
"loss": 0.6667,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -0.189667746424675,
"rewards/margins": 0.06658226996660233,
"rewards/rejected": -0.2562499940395355,
"step": 295
},
{
"epoch": 0.15,
"learning_rate": 8.83022221559489e-08,
"logits/chosen": -1.3812211751937866,
"logits/rejected": -1.394118070602417,
"logps/chosen": -17.90454864501953,
"logps/rejected": -21.622303009033203,
"loss": 0.6701,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": -0.177689790725708,
"rewards/margins": 0.05468187481164932,
"rewards/rejected": -0.23237165808677673,
"step": 300
},
{
"epoch": 0.15,
"eval_logits/chosen": -1.7818100452423096,
"eval_logits/rejected": -1.8616912364959717,
"eval_logps/chosen": -16.538070678710938,
"eval_logps/rejected": -20.776979446411133,
"eval_loss": 0.6713529229164124,
"eval_rewards/accuracies": 0.6006389856338501,
"eval_rewards/chosen": -0.15249261260032654,
"eval_rewards/margins": 0.07137925922870636,
"eval_rewards/rejected": -0.2238718867301941,
"eval_runtime": 306.8553,
"eval_samples_per_second": 65.177,
"eval_steps_per_second": 1.02,
"step": 300
},
{
"epoch": 0.15,
"learning_rate": 8.77354790111386e-08,
"logits/chosen": -1.4351907968521118,
"logits/rejected": -1.4078080654144287,
"logps/chosen": -15.99042797088623,
"logps/rejected": -19.805822372436523,
"loss": 0.6643,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": -0.14661024510860443,
"rewards/margins": 0.06514577567577362,
"rewards/rejected": -0.21175602078437805,
"step": 305
},
{
"epoch": 0.15,
"learning_rate": 8.715724127386971e-08,
"logits/chosen": -1.4537417888641357,
"logits/rejected": -1.4652783870697021,
"logps/chosen": -18.730056762695312,
"logps/rejected": -22.277685165405273,
"loss": 0.6727,
"rewards/accuracies": 0.543749988079071,
"rewards/chosen": -0.18776853382587433,
"rewards/margins": 0.04285965487360954,
"rewards/rejected": -0.23062816262245178,
"step": 310
},
{
"epoch": 0.16,
"learning_rate": 8.656768508095852e-08,
"logits/chosen": -1.400272250175476,
"logits/rejected": -1.3987435102462769,
"logps/chosen": -16.988048553466797,
"logps/rejected": -19.94150161743164,
"loss": 0.6662,
"rewards/accuracies": 0.53125,
"rewards/chosen": -0.1827572137117386,
"rewards/margins": 0.03802407532930374,
"rewards/rejected": -0.22078128159046173,
"step": 315
},
{
"epoch": 0.16,
"learning_rate": 8.596699001693255e-08,
"logits/chosen": -1.4178965091705322,
"logits/rejected": -1.4355350732803345,
"logps/chosen": -15.241655349731445,
"logps/rejected": -23.634593963623047,
"loss": 0.6629,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": -0.16266849637031555,
"rewards/margins": 0.08267641067504883,
"rewards/rejected": -0.24534490704536438,
"step": 320
},
{
"epoch": 0.16,
"learning_rate": 8.535533905932736e-08,
"logits/chosen": -1.432398796081543,
"logits/rejected": -1.4598662853240967,
"logps/chosen": -16.167316436767578,
"logps/rejected": -20.810152053833008,
"loss": 0.6686,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.18219885230064392,
"rewards/margins": 0.06463146209716797,
"rewards/rejected": -0.2468303143978119,
"step": 325
},
{
"epoch": 0.17,
"learning_rate": 8.473291852294986e-08,
"logits/chosen": -1.4149553775787354,
"logits/rejected": -1.4176180362701416,
"logps/chosen": -15.214263916015625,
"logps/rejected": -20.01323890686035,
"loss": 0.6633,
"rewards/accuracies": 0.65625,
"rewards/chosen": -0.1562959998846054,
"rewards/margins": 0.10322503745555878,
"rewards/rejected": -0.2595210373401642,
"step": 330
},
{
"epoch": 0.17,
"learning_rate": 8.409991800312493e-08,
"logits/chosen": -1.5221331119537354,
"logits/rejected": -1.5471950769424438,
"logps/chosen": -15.114236831665039,
"logps/rejected": -18.62519645690918,
"loss": 0.6609,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -0.12045025825500488,
"rewards/margins": 0.14489233493804932,
"rewards/rejected": -0.2653425931930542,
"step": 335
},
{
"epoch": 0.17,
"learning_rate": 8.34565303179429e-08,
"logits/chosen": -1.4450016021728516,
"logits/rejected": -1.4777119159698486,
"logps/chosen": -15.960481643676758,
"logps/rejected": -21.401214599609375,
"loss": 0.6669,
"rewards/accuracies": 0.53125,
"rewards/chosen": -0.20441024005413055,
"rewards/margins": 0.05542609095573425,
"rewards/rejected": -0.259836345911026,
"step": 340
},
{
"epoch": 0.17,
"learning_rate": 8.280295144952536e-08,
"logits/chosen": -1.3684567213058472,
"logits/rejected": -1.356687307357788,
"logps/chosen": -15.962387084960938,
"logps/rejected": -20.549938201904297,
"loss": 0.6613,
"rewards/accuracies": 0.59375,
"rewards/chosen": -0.15356414020061493,
"rewards/margins": 0.054951734840869904,
"rewards/rejected": -0.20851588249206543,
"step": 345
},
{
"epoch": 0.17,
"learning_rate": 8.213938048432696e-08,
"logits/chosen": -1.4533028602600098,
"logits/rejected": -1.47086763381958,
"logps/chosen": -15.168508529663086,
"logps/rejected": -20.278743743896484,
"loss": 0.6692,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": -0.1997944861650467,
"rewards/margins": 0.06951111555099487,
"rewards/rejected": -0.26930561661720276,
"step": 350
},
{
"epoch": 0.18,
"learning_rate": 8.146601955249188e-08,
"logits/chosen": -1.4886163473129272,
"logits/rejected": -1.5108848810195923,
"logps/chosen": -18.865304946899414,
"logps/rejected": -24.56046485900879,
"loss": 0.6586,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": -0.1855696439743042,
"rewards/margins": 0.1037302240729332,
"rewards/rejected": -0.289299875497818,
"step": 355
},
{
"epoch": 0.18,
"learning_rate": 8.07830737662829e-08,
"logits/chosen": -1.4619147777557373,
"logits/rejected": -1.4794903993606567,
"logps/chosen": -13.954874992370605,
"logps/rejected": -17.310880661010742,
"loss": 0.6687,
"rewards/accuracies": 0.65625,
"rewards/chosen": -0.18103952705860138,
"rewards/margins": 0.09420069307088852,
"rewards/rejected": -0.2752402424812317,
"step": 360
},
{
"epoch": 0.18,
"learning_rate": 8.009075115760243e-08,
"logits/chosen": -1.3863359689712524,
"logits/rejected": -1.3789931535720825,
"logps/chosen": -17.23479652404785,
"logps/rejected": -19.347734451293945,
"loss": 0.664,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": -0.17440780997276306,
"rewards/margins": 0.10439540445804596,
"rewards/rejected": -0.2788031995296478,
"step": 365
},
{
"epoch": 0.18,
"learning_rate": 7.938926261462366e-08,
"logits/chosen": -1.355503797531128,
"logits/rejected": -1.3629181385040283,
"logps/chosen": -18.74729347229004,
"logps/rejected": -22.029096603393555,
"loss": 0.6606,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -0.21672053635120392,
"rewards/margins": 0.04532719776034355,
"rewards/rejected": -0.2620477080345154,
"step": 370
},
{
"epoch": 0.19,
"learning_rate": 7.86788218175523e-08,
"logits/chosen": -1.429302453994751,
"logits/rejected": -1.457215666770935,
"logps/chosen": -16.596994400024414,
"logps/rejected": -21.360082626342773,
"loss": 0.6667,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.18682262301445007,
"rewards/margins": 0.1307360827922821,
"rewards/rejected": -0.3175587058067322,
"step": 375
},
{
"epoch": 0.19,
"learning_rate": 7.795964517353733e-08,
"logits/chosen": -1.4372557401657104,
"logits/rejected": -1.4457218647003174,
"logps/chosen": -17.684959411621094,
"logps/rejected": -21.197628021240234,
"loss": 0.6617,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -0.1949060708284378,
"rewards/margins": 0.0896020457148552,
"rewards/rejected": -0.2845081090927124,
"step": 380
},
{
"epoch": 0.19,
"learning_rate": 7.723195175075135e-08,
"logits/chosen": -1.4864705801010132,
"logits/rejected": -1.4808794260025024,
"logps/chosen": -16.80634880065918,
"logps/rejected": -20.31828498840332,
"loss": 0.6581,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": -0.1962084323167801,
"rewards/margins": 0.1290890872478485,
"rewards/rejected": -0.3252975046634674,
"step": 385
},
{
"epoch": 0.2,
"learning_rate": 7.649596321166024e-08,
"logits/chosen": -1.3273518085479736,
"logits/rejected": -1.3579399585723877,
"logps/chosen": -16.424484252929688,
"logps/rejected": -19.864755630493164,
"loss": 0.6686,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -0.17965315282344818,
"rewards/margins": 0.09440762549638748,
"rewards/rejected": -0.27406078577041626,
"step": 390
},
{
"epoch": 0.2,
"learning_rate": 7.575190374550272e-08,
"logits/chosen": -1.4504528045654297,
"logits/rejected": -1.4469316005706787,
"logps/chosen": -17.03780174255371,
"logps/rejected": -20.324657440185547,
"loss": 0.6587,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.22678837180137634,
"rewards/margins": 0.05626438185572624,
"rewards/rejected": -0.2830527722835541,
"step": 395
},
{
"epoch": 0.2,
"learning_rate": 7.5e-08,
"logits/chosen": -1.4595016241073608,
"logits/rejected": -1.4589979648590088,
"logps/chosen": -16.276988983154297,
"logps/rejected": -22.162479400634766,
"loss": 0.6676,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.2052340805530548,
"rewards/margins": 0.09203705936670303,
"rewards/rejected": -0.29727110266685486,
"step": 400
},
{
"epoch": 0.2,
"eval_logits/chosen": -1.7746305465698242,
"eval_logits/rejected": -1.8544888496398926,
"eval_logps/chosen": -16.942358016967773,
"eval_logps/rejected": -21.468765258789062,
"eval_loss": 0.6636533737182617,
"eval_rewards/accuracies": 0.6174121499061584,
"eval_rewards/chosen": -0.19292119145393372,
"eval_rewards/margins": 0.10012920200824738,
"eval_rewards/rejected": -0.2930504083633423,
"eval_runtime": 306.9122,
"eval_samples_per_second": 65.165,
"eval_steps_per_second": 1.02,
"step": 400
},
{
"epoch": 0.2,
"learning_rate": 7.424048101231686e-08,
"logits/chosen": -1.3691225051879883,
"logits/rejected": -1.3902453184127808,
"logps/chosen": -18.4908504486084,
"logps/rejected": -25.587356567382812,
"loss": 0.6637,
"rewards/accuracies": 0.59375,
"rewards/chosen": -0.2073945701122284,
"rewards/margins": 0.09110216796398163,
"rewards/rejected": -0.2984967529773712,
"step": 405
},
{
"epoch": 0.2,
"learning_rate": 7.347357813929453e-08,
"logits/chosen": -1.4559385776519775,
"logits/rejected": -1.4722373485565186,
"logps/chosen": -14.663101196289062,
"logps/rejected": -18.937580108642578,
"loss": 0.658,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": -0.19805128872394562,
"rewards/margins": 0.08183437585830688,
"rewards/rejected": -0.2798856794834137,
"step": 410
},
{
"epoch": 0.21,
"learning_rate": 7.269952498697734e-08,
"logits/chosen": -1.4937471151351929,
"logits/rejected": -1.508141279220581,
"logps/chosen": -16.866802215576172,
"logps/rejected": -19.958166122436523,
"loss": 0.6675,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -0.16971763968467712,
"rewards/margins": 0.0776999443769455,
"rewards/rejected": -0.2474175989627838,
"step": 415
},
{
"epoch": 0.21,
"learning_rate": 7.191855733945387e-08,
"logits/chosen": -1.4956650733947754,
"logits/rejected": -1.4953656196594238,
"logps/chosen": -15.598005294799805,
"logps/rejected": -20.977344512939453,
"loss": 0.6545,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -0.20860418677330017,
"rewards/margins": 0.113286092877388,
"rewards/rejected": -0.321890264749527,
"step": 420
},
{
"epoch": 0.21,
"learning_rate": 7.113091308703497e-08,
"logits/chosen": -1.4557757377624512,
"logits/rejected": -1.4881502389907837,
"logps/chosen": -15.815752029418945,
"logps/rejected": -23.03244400024414,
"loss": 0.6648,
"rewards/accuracies": 0.59375,
"rewards/chosen": -0.18280960619449615,
"rewards/margins": 0.10344807803630829,
"rewards/rejected": -0.28625768423080444,
"step": 425
},
{
"epoch": 0.21,
"learning_rate": 7.033683215379002e-08,
"logits/chosen": -1.5411797761917114,
"logits/rejected": -1.52112877368927,
"logps/chosen": -19.202138900756836,
"logps/rejected": -21.153202056884766,
"loss": 0.6586,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": -0.21284401416778564,
"rewards/margins": 0.06404918432235718,
"rewards/rejected": -0.2768932282924652,
"step": 430
},
{
"epoch": 0.22,
"learning_rate": 6.953655642446368e-08,
"logits/chosen": -1.3810409307479858,
"logits/rejected": -1.3840851783752441,
"logps/chosen": -19.21212387084961,
"logps/rejected": -23.80167579650879,
"loss": 0.6577,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.18992505967617035,
"rewards/margins": 0.10662909597158432,
"rewards/rejected": -0.2965541481971741,
"step": 435
},
{
"epoch": 0.22,
"learning_rate": 6.87303296707956e-08,
"logits/chosen": -1.3490447998046875,
"logits/rejected": -1.374977707862854,
"logps/chosen": -15.331153869628906,
"logps/rejected": -22.786605834960938,
"loss": 0.6525,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.2073613405227661,
"rewards/margins": 0.13075020909309387,
"rewards/rejected": -0.3381115794181824,
"step": 440
},
{
"epoch": 0.22,
"learning_rate": 6.7918397477265e-08,
"logits/chosen": -1.4382171630859375,
"logits/rejected": -1.4352459907531738,
"logps/chosen": -17.559337615966797,
"logps/rejected": -21.59587860107422,
"loss": 0.6659,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.20677068829536438,
"rewards/margins": 0.11824776977300644,
"rewards/rejected": -0.3250184655189514,
"step": 445
},
{
"epoch": 0.23,
"learning_rate": 6.710100716628345e-08,
"logits/chosen": -1.5328034162521362,
"logits/rejected": -1.5469181537628174,
"logps/chosen": -17.923799514770508,
"logps/rejected": -21.902999877929688,
"loss": 0.6573,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -0.21206100285053253,
"rewards/margins": 0.102953240275383,
"rewards/rejected": -0.3150142729282379,
"step": 450
},
{
"epoch": 0.23,
"learning_rate": 6.627840772285784e-08,
"logits/chosen": -1.4326767921447754,
"logits/rejected": -1.460436224937439,
"logps/chosen": -16.84982681274414,
"logps/rejected": -21.357173919677734,
"loss": 0.6489,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": -0.18491019308567047,
"rewards/margins": 0.1146242767572403,
"rewards/rejected": -0.29953449964523315,
"step": 455
},
{
"epoch": 0.23,
"learning_rate": 6.545084971874738e-08,
"logits/chosen": -1.4505399465560913,
"logits/rejected": -1.4668846130371094,
"logps/chosen": -17.121883392333984,
"logps/rejected": -21.807418823242188,
"loss": 0.6563,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": -0.22878995537757874,
"rewards/margins": 0.12952670454978943,
"rewards/rejected": -0.3583166301250458,
"step": 460
},
{
"epoch": 0.23,
"learning_rate": 6.461858523613683e-08,
"logits/chosen": -1.4589532613754272,
"logits/rejected": -1.4687752723693848,
"logps/chosen": -16.5905818939209,
"logps/rejected": -22.4600830078125,
"loss": 0.6516,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": -0.23870249092578888,
"rewards/margins": 0.09481547772884369,
"rewards/rejected": -0.3335179388523102,
"step": 465
},
{
"epoch": 0.23,
"learning_rate": 6.378186779084996e-08,
"logits/chosen": -1.4866201877593994,
"logits/rejected": -1.4896290302276611,
"logps/chosen": -17.042253494262695,
"logps/rejected": -23.417699813842773,
"loss": 0.6498,
"rewards/accuracies": 0.65625,
"rewards/chosen": -0.22306545078754425,
"rewards/margins": 0.13384439051151276,
"rewards/rejected": -0.3569098114967346,
"step": 470
},
{
"epoch": 0.24,
"learning_rate": 6.294095225512604e-08,
"logits/chosen": -1.4022520780563354,
"logits/rejected": -1.3814033269882202,
"logps/chosen": -16.337562561035156,
"logps/rejected": -21.31450653076172,
"loss": 0.6569,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.2246430367231369,
"rewards/margins": 0.15105651319026947,
"rewards/rejected": -0.37569957971572876,
"step": 475
},
{
"epoch": 0.24,
"learning_rate": 6.209609477998338e-08,
"logits/chosen": -1.479479193687439,
"logits/rejected": -1.5034449100494385,
"logps/chosen": -17.957107543945312,
"logps/rejected": -23.098215103149414,
"loss": 0.6629,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": -0.24643921852111816,
"rewards/margins": 0.1044594869017601,
"rewards/rejected": -0.3508986830711365,
"step": 480
},
{
"epoch": 0.24,
"learning_rate": 6.124755271719325e-08,
"logits/chosen": -1.4407473802566528,
"logits/rejected": -1.49507737159729,
"logps/chosen": -14.73084545135498,
"logps/rejected": -20.570337295532227,
"loss": 0.6481,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.19731993973255157,
"rewards/margins": 0.14940151572227478,
"rewards/rejected": -0.34672147035598755,
"step": 485
},
{
"epoch": 0.24,
"learning_rate": 6.039558454088796e-08,
"logits/chosen": -1.5169572830200195,
"logits/rejected": -1.5479459762573242,
"logps/chosen": -13.957262992858887,
"logps/rejected": -22.48187255859375,
"loss": 0.6496,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -0.20421214401721954,
"rewards/margins": 0.16293281316757202,
"rewards/rejected": -0.36714497208595276,
"step": 490
},
{
"epoch": 0.25,
"learning_rate": 5.954044976882724e-08,
"logits/chosen": -1.4714252948760986,
"logits/rejected": -1.4994815587997437,
"logps/chosen": -17.487274169921875,
"logps/rejected": -23.63813591003418,
"loss": 0.6526,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": -0.2372887134552002,
"rewards/margins": 0.12310683727264404,
"rewards/rejected": -0.36039552092552185,
"step": 495
},
{
"epoch": 0.25,
"learning_rate": 5.868240888334653e-08,
"logits/chosen": -1.4606778621673584,
"logits/rejected": -1.4551252126693726,
"logps/chosen": -16.36603355407715,
"logps/rejected": -21.119382858276367,
"loss": 0.6562,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": -0.2265624701976776,
"rewards/margins": 0.16614754498004913,
"rewards/rejected": -0.39271003007888794,
"step": 500
},
{
"epoch": 0.25,
"eval_logits/chosen": -1.7672057151794434,
"eval_logits/rejected": -1.8467962741851807,
"eval_logps/chosen": -17.221099853515625,
"eval_logps/rejected": -22.007465362548828,
"eval_loss": 0.6560497283935547,
"eval_rewards/accuracies": 0.634984016418457,
"eval_rewards/chosen": -0.22079555690288544,
"eval_rewards/margins": 0.12612493336200714,
"eval_rewards/rejected": -0.3469204902648926,
"eval_runtime": 306.6923,
"eval_samples_per_second": 65.212,
"eval_steps_per_second": 1.021,
"step": 500
},
{
"epoch": 0.25,
"learning_rate": 5.7821723252011546e-08,
"logits/chosen": -1.4445126056671143,
"logits/rejected": -1.4752463102340698,
"logps/chosen": -18.014240264892578,
"logps/rejected": -25.679813385009766,
"loss": 0.6619,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.2500578463077545,
"rewards/margins": 0.07094712555408478,
"rewards/rejected": -0.3210049867630005,
"step": 505
},
{
"epoch": 0.26,
"learning_rate": 5.695865504800327e-08,
"logits/chosen": -1.4805431365966797,
"logits/rejected": -1.4579927921295166,
"logps/chosen": -16.328933715820312,
"logps/rejected": -23.8695068359375,
"loss": 0.6543,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.2136952131986618,
"rewards/margins": 0.15080443024635315,
"rewards/rejected": -0.36449965834617615,
"step": 510
},
{
"epoch": 0.26,
"learning_rate": 5.6093467170257366e-08,
"logits/chosen": -1.38763427734375,
"logits/rejected": -1.429253339767456,
"logps/chosen": -17.211883544921875,
"logps/rejected": -21.488162994384766,
"loss": 0.6454,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.23104102909564972,
"rewards/margins": 0.11019430309534073,
"rewards/rejected": -0.34123533964157104,
"step": 515
},
{
"epoch": 0.26,
"learning_rate": 5.5226423163382677e-08,
"logits/chosen": -1.3689727783203125,
"logits/rejected": -1.3758208751678467,
"logps/chosen": -18.692577362060547,
"logps/rejected": -21.925251007080078,
"loss": 0.6466,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.21853986382484436,
"rewards/margins": 0.09194198995828629,
"rewards/rejected": -0.31048184633255005,
"step": 520
},
{
"epoch": 0.26,
"learning_rate": 5.435778713738292e-08,
"logits/chosen": -1.3423216342926025,
"logits/rejected": -1.355452299118042,
"logps/chosen": -16.941513061523438,
"logps/rejected": -20.827672958374023,
"loss": 0.652,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": -0.2414543330669403,
"rewards/margins": 0.10109053552150726,
"rewards/rejected": -0.3425448536872864,
"step": 525
},
{
"epoch": 0.27,
"learning_rate": 5.3487823687206256e-08,
"logits/chosen": -1.4701192378997803,
"logits/rejected": -1.4796873331069946,
"logps/chosen": -19.578105926513672,
"logps/rejected": -25.23065757751465,
"loss": 0.6484,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": -0.251248836517334,
"rewards/margins": 0.13180874288082123,
"rewards/rejected": -0.3830576241016388,
"step": 530
},
{
"epoch": 0.27,
"learning_rate": 5.261679781214719e-08,
"logits/chosen": -1.435497522354126,
"logits/rejected": -1.459380865097046,
"logps/chosen": -15.287922859191895,
"logps/rejected": -20.136037826538086,
"loss": 0.6582,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": -0.23688891530036926,
"rewards/margins": 0.1384437382221222,
"rewards/rejected": -0.37533265352249146,
"step": 535
},
{
"epoch": 0.27,
"learning_rate": 5.1744974835125056e-08,
"logits/chosen": -1.4165819883346558,
"logits/rejected": -1.3940761089324951,
"logps/chosen": -17.587196350097656,
"logps/rejected": -21.539077758789062,
"loss": 0.6443,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.2061920464038849,
"rewards/margins": 0.133754700422287,
"rewards/rejected": -0.3399467468261719,
"step": 540
},
{
"epoch": 0.27,
"learning_rate": 5.087262032186418e-08,
"logits/chosen": -1.4642976522445679,
"logits/rejected": -1.469395637512207,
"logps/chosen": -19.382497787475586,
"logps/rejected": -25.097797393798828,
"loss": 0.6469,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.22812576591968536,
"rewards/margins": 0.1407867670059204,
"rewards/rejected": -0.3689125180244446,
"step": 545
},
{
"epoch": 0.28,
"learning_rate": 5e-08,
"logits/chosen": -1.4709419012069702,
"logits/rejected": -1.4961137771606445,
"logps/chosen": -18.24771499633789,
"logps/rejected": -22.102489471435547,
"loss": 0.6616,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.23151636123657227,
"rewards/margins": 0.10363288968801498,
"rewards/rejected": -0.33514922857284546,
"step": 550
},
{
"epoch": 0.28,
"learning_rate": 4.912737967813582e-08,
"logits/chosen": -1.425290584564209,
"logits/rejected": -1.4705363512039185,
"logps/chosen": -20.757022857666016,
"logps/rejected": -25.79976463317871,
"loss": 0.6412,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -0.2047382891178131,
"rewards/margins": 0.147218257188797,
"rewards/rejected": -0.3519565165042877,
"step": 555
},
{
"epoch": 0.28,
"learning_rate": 4.8255025164874966e-08,
"logits/chosen": -1.4153515100479126,
"logits/rejected": -1.4406566619873047,
"logps/chosen": -15.806404113769531,
"logps/rejected": -23.537967681884766,
"loss": 0.6501,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": -0.26336127519607544,
"rewards/margins": 0.0594928041100502,
"rewards/rejected": -0.32285410165786743,
"step": 560
},
{
"epoch": 0.28,
"learning_rate": 4.73832021878528e-08,
"logits/chosen": -1.4600375890731812,
"logits/rejected": -1.465736746788025,
"logps/chosen": -14.703231811523438,
"logps/rejected": -22.058746337890625,
"loss": 0.6494,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": -0.2418874204158783,
"rewards/margins": 0.14205661416053772,
"rewards/rejected": -0.383944034576416,
"step": 565
},
{
"epoch": 0.28,
"learning_rate": 4.651217631279373e-08,
"logits/chosen": -1.488512396812439,
"logits/rejected": -1.5095858573913574,
"logps/chosen": -16.324678421020508,
"logps/rejected": -24.393007278442383,
"loss": 0.6459,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -0.22573427855968475,
"rewards/margins": 0.1321270912885666,
"rewards/rejected": -0.35786136984825134,
"step": 570
},
{
"epoch": 0.29,
"learning_rate": 4.5642212862617084e-08,
"logits/chosen": -1.4317299127578735,
"logits/rejected": -1.455801010131836,
"logps/chosen": -15.989962577819824,
"logps/rejected": -18.849998474121094,
"loss": 0.6477,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": -0.2763644754886627,
"rewards/margins": 0.11443523317575455,
"rewards/rejected": -0.39079970121383667,
"step": 575
},
{
"epoch": 0.29,
"learning_rate": 4.477357683661733e-08,
"logits/chosen": -1.4575328826904297,
"logits/rejected": -1.470690131187439,
"logps/chosen": -18.188373565673828,
"logps/rejected": -22.835786819458008,
"loss": 0.6512,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": -0.22157052159309387,
"rewards/margins": 0.10469740629196167,
"rewards/rejected": -0.32626792788505554,
"step": 580
},
{
"epoch": 0.29,
"learning_rate": 4.390653282974263e-08,
"logits/chosen": -1.503989815711975,
"logits/rejected": -1.519313931465149,
"logps/chosen": -16.674240112304688,
"logps/rejected": -21.952165603637695,
"loss": 0.6365,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.2247585952281952,
"rewards/margins": 0.203842431306839,
"rewards/rejected": -0.4286009669303894,
"step": 585
},
{
"epoch": 0.29,
"learning_rate": 4.304134495199674e-08,
"logits/chosen": -1.4609280824661255,
"logits/rejected": -1.4766910076141357,
"logps/chosen": -19.406192779541016,
"logps/rejected": -22.686025619506836,
"loss": 0.6364,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": -0.2958422303199768,
"rewards/margins": 0.11995653063058853,
"rewards/rejected": -0.4157988131046295,
"step": 590
},
{
"epoch": 0.3,
"learning_rate": 4.217827674798844e-08,
"logits/chosen": -1.4871938228607178,
"logits/rejected": -1.5188095569610596,
"logps/chosen": -17.268047332763672,
"logps/rejected": -23.33865737915039,
"loss": 0.6532,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": -0.31558969616889954,
"rewards/margins": 0.11393972486257553,
"rewards/rejected": -0.42952942848205566,
"step": 595
},
{
"epoch": 0.3,
"learning_rate": 4.131759111665348e-08,
"logits/chosen": -1.4645878076553345,
"logits/rejected": -1.4607641696929932,
"logps/chosen": -17.968856811523438,
"logps/rejected": -24.378482818603516,
"loss": 0.6483,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -0.20308199524879456,
"rewards/margins": 0.19312706589698792,
"rewards/rejected": -0.3962090015411377,
"step": 600
},
{
"epoch": 0.3,
"eval_logits/chosen": -1.762748122215271,
"eval_logits/rejected": -1.8424293994903564,
"eval_logps/chosen": -17.52169418334961,
"eval_logps/rejected": -22.536460876464844,
"eval_loss": 0.65033358335495,
"eval_rewards/accuracies": 0.6413738131523132,
"eval_rewards/chosen": -0.2508549690246582,
"eval_rewards/margins": 0.14896489679813385,
"eval_rewards/rejected": -0.39981985092163086,
"eval_runtime": 306.9145,
"eval_samples_per_second": 65.165,
"eval_steps_per_second": 1.02,
"step": 600
},
{
"epoch": 0.3,
"learning_rate": 4.0459550231172757e-08,
"logits/chosen": -1.3895206451416016,
"logits/rejected": -1.4285638332366943,
"logps/chosen": -17.535720825195312,
"logps/rejected": -24.525096893310547,
"loss": 0.6521,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": -0.2898118793964386,
"rewards/margins": 0.08567583560943604,
"rewards/rejected": -0.37548771500587463,
"step": 605
},
{
"epoch": 0.3,
"learning_rate": 3.960441545911204e-08,
"logits/chosen": -1.4046005010604858,
"logits/rejected": -1.4093401432037354,
"logps/chosen": -19.01970863342285,
"logps/rejected": -22.846820831298828,
"loss": 0.6486,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.28651127219200134,
"rewards/margins": 0.07938197255134583,
"rewards/rejected": -0.36589327454566956,
"step": 610
},
{
"epoch": 0.31,
"learning_rate": 3.8752447282806754e-08,
"logits/chosen": -1.4743293523788452,
"logits/rejected": -1.481278419494629,
"logps/chosen": -15.964106559753418,
"logps/rejected": -22.171709060668945,
"loss": 0.6402,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": -0.21257051825523376,
"rewards/margins": 0.2042216807603836,
"rewards/rejected": -0.41679221391677856,
"step": 615
},
{
"epoch": 0.31,
"learning_rate": 3.7903905220016615e-08,
"logits/chosen": -1.3876988887786865,
"logits/rejected": -1.410398244857788,
"logps/chosen": -17.697256088256836,
"logps/rejected": -23.332763671875,
"loss": 0.6429,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.23554810881614685,
"rewards/margins": 0.13481906056404114,
"rewards/rejected": -0.3703671991825104,
"step": 620
},
{
"epoch": 0.31,
"learning_rate": 3.705904774487396e-08,
"logits/chosen": -1.383541464805603,
"logits/rejected": -1.3796765804290771,
"logps/chosen": -18.24690818786621,
"logps/rejected": -24.86931610107422,
"loss": 0.6398,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -0.23973222076892853,
"rewards/margins": 0.16422876715660095,
"rewards/rejected": -0.4039610028266907,
"step": 625
},
{
"epoch": 0.32,
"learning_rate": 3.621813220915004e-08,
"logits/chosen": -1.450282335281372,
"logits/rejected": -1.4380896091461182,
"logps/chosen": -16.290973663330078,
"logps/rejected": -21.838808059692383,
"loss": 0.6481,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.2356175184249878,
"rewards/margins": 0.14112675189971924,
"rewards/rejected": -0.37674424052238464,
"step": 630
},
{
"epoch": 0.32,
"learning_rate": 3.538141476386316e-08,
"logits/chosen": -1.4591354131698608,
"logits/rejected": -1.456343412399292,
"logps/chosen": -17.22078514099121,
"logps/rejected": -21.944005966186523,
"loss": 0.6491,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.2835259735584259,
"rewards/margins": 0.10515954345464706,
"rewards/rejected": -0.38868552446365356,
"step": 635
},
{
"epoch": 0.32,
"learning_rate": 3.4549150281252633e-08,
"logits/chosen": -1.428468108177185,
"logits/rejected": -1.4328949451446533,
"logps/chosen": -16.396862030029297,
"logps/rejected": -20.86203384399414,
"loss": 0.6369,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": -0.2767188251018524,
"rewards/margins": 0.12265461683273315,
"rewards/rejected": -0.39937347173690796,
"step": 640
},
{
"epoch": 0.32,
"learning_rate": 3.372159227714218e-08,
"logits/chosen": -1.3802716732025146,
"logits/rejected": -1.4310414791107178,
"logps/chosen": -18.36510467529297,
"logps/rejected": -20.808429718017578,
"loss": 0.6429,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": -0.2337242066860199,
"rewards/margins": 0.1597466617822647,
"rewards/rejected": -0.3934708833694458,
"step": 645
},
{
"epoch": 0.33,
"learning_rate": 3.2898992833716563e-08,
"logits/chosen": -1.482743263244629,
"logits/rejected": -1.4772045612335205,
"logps/chosen": -15.889050483703613,
"logps/rejected": -19.02865982055664,
"loss": 0.6478,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": -0.2292957603931427,
"rewards/margins": 0.16532939672470093,
"rewards/rejected": -0.39462512731552124,
"step": 650
},
{
"epoch": 0.33,
"learning_rate": 3.208160252273498e-08,
"logits/chosen": -1.4797611236572266,
"logits/rejected": -1.4938347339630127,
"logps/chosen": -18.688114166259766,
"logps/rejected": -22.962867736816406,
"loss": 0.644,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": -0.2989271283149719,
"rewards/margins": 0.12799039483070374,
"rewards/rejected": -0.4269174635410309,
"step": 655
},
{
"epoch": 0.33,
"learning_rate": 3.126967032920439e-08,
"logits/chosen": -1.4451311826705933,
"logits/rejected": -1.4879848957061768,
"logps/chosen": -17.379915237426758,
"logps/rejected": -23.540796279907227,
"loss": 0.6363,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -0.23357510566711426,
"rewards/margins": 0.11232779920101166,
"rewards/rejected": -0.3459029197692871,
"step": 660
},
{
"epoch": 0.33,
"learning_rate": 3.046344357553632e-08,
"logits/chosen": -1.5276529788970947,
"logits/rejected": -1.5440245866775513,
"logps/chosen": -17.0556640625,
"logps/rejected": -22.744121551513672,
"loss": 0.6308,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -0.23212318122386932,
"rewards/margins": 0.16936565935611725,
"rewards/rejected": -0.4014888405799866,
"step": 665
},
{
"epoch": 0.34,
"learning_rate": 2.9663167846209998e-08,
"logits/chosen": -1.4147757291793823,
"logits/rejected": -1.418082356452942,
"logps/chosen": -19.272014617919922,
"logps/rejected": -27.263992309570312,
"loss": 0.6512,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": -0.2706632614135742,
"rewards/margins": 0.18588735163211823,
"rewards/rejected": -0.45655059814453125,
"step": 670
},
{
"epoch": 0.34,
"learning_rate": 2.8869086912965035e-08,
"logits/chosen": -1.4479643106460571,
"logits/rejected": -1.4662295579910278,
"logps/chosen": -17.57897186279297,
"logps/rejected": -25.184152603149414,
"loss": 0.6423,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.2373163402080536,
"rewards/margins": 0.2066657841205597,
"rewards/rejected": -0.4439820647239685,
"step": 675
},
{
"epoch": 0.34,
"learning_rate": 2.8081442660546124e-08,
"logits/chosen": -1.36322820186615,
"logits/rejected": -1.3949863910675049,
"logps/chosen": -18.550283432006836,
"logps/rejected": -23.298870086669922,
"loss": 0.6416,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.25548475980758667,
"rewards/margins": 0.18017227947711945,
"rewards/rejected": -0.4356570839881897,
"step": 680
},
{
"epoch": 0.34,
"learning_rate": 2.730047501302266e-08,
"logits/chosen": -1.4136111736297607,
"logits/rejected": -1.4232200384140015,
"logps/chosen": -17.45450210571289,
"logps/rejected": -20.295621871948242,
"loss": 0.6415,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": -0.2979032099246979,
"rewards/margins": 0.12163282930850983,
"rewards/rejected": -0.4195360243320465,
"step": 685
},
{
"epoch": 0.34,
"learning_rate": 2.6526421860705472e-08,
"logits/chosen": -1.384798526763916,
"logits/rejected": -1.4583854675292969,
"logps/chosen": -16.522525787353516,
"logps/rejected": -23.442546844482422,
"loss": 0.6288,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.23055878281593323,
"rewards/margins": 0.23925617337226868,
"rewards/rejected": -0.4698149561882019,
"step": 690
},
{
"epoch": 0.35,
"learning_rate": 2.5759518987683148e-08,
"logits/chosen": -1.4321386814117432,
"logits/rejected": -1.4662306308746338,
"logps/chosen": -17.545879364013672,
"logps/rejected": -20.307212829589844,
"loss": 0.6431,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.31093353033065796,
"rewards/margins": 0.1719907969236374,
"rewards/rejected": -0.48292431235313416,
"step": 695
},
{
"epoch": 0.35,
"learning_rate": 2.500000000000001e-08,
"logits/chosen": -1.4065908193588257,
"logits/rejected": -1.4342257976531982,
"logps/chosen": -17.379558563232422,
"logps/rejected": -22.622777938842773,
"loss": 0.6413,
"rewards/accuracies": 0.65625,
"rewards/chosen": -0.27703213691711426,
"rewards/margins": 0.15564481914043427,
"rewards/rejected": -0.4326769709587097,
"step": 700
},
{
"epoch": 0.35,
"eval_logits/chosen": -1.7605791091918945,
"eval_logits/rejected": -1.8403115272521973,
"eval_logps/chosen": -17.779977798461914,
"eval_logps/rejected": -22.98041343688965,
"eval_loss": 0.6457803845405579,
"eval_rewards/accuracies": 0.6469648480415344,
"eval_rewards/chosen": -0.27668341994285583,
"eval_rewards/margins": 0.16753174364566803,
"eval_rewards/rejected": -0.44421514868736267,
"eval_runtime": 307.0164,
"eval_samples_per_second": 65.143,
"eval_steps_per_second": 1.019,
"step": 700
},
{
"epoch": 0.35,
"learning_rate": 2.4248096254497287e-08,
"logits/chosen": -1.4401991367340088,
"logits/rejected": -1.4596917629241943,
"logps/chosen": -17.921367645263672,
"logps/rejected": -23.322214126586914,
"loss": 0.6378,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": -0.29132723808288574,
"rewards/margins": 0.19050189852714539,
"rewards/rejected": -0.4818291664123535,
"step": 705
},
{
"epoch": 0.35,
"learning_rate": 2.350403678833976e-08,
"logits/chosen": -1.4126722812652588,
"logits/rejected": -1.420345425605774,
"logps/chosen": -19.046810150146484,
"logps/rejected": -25.77749252319336,
"loss": 0.6416,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.30587273836135864,
"rewards/margins": 0.18231990933418274,
"rewards/rejected": -0.4881926476955414,
"step": 710
},
{
"epoch": 0.36,
"learning_rate": 2.2768048249248644e-08,
"logits/chosen": -1.3649728298187256,
"logits/rejected": -1.3823496103286743,
"logps/chosen": -17.098003387451172,
"logps/rejected": -21.837902069091797,
"loss": 0.6356,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.33593738079071045,
"rewards/margins": 0.1847352534532547,
"rewards/rejected": -0.520672619342804,
"step": 715
},
{
"epoch": 0.36,
"learning_rate": 2.2040354826462664e-08,
"logits/chosen": -1.3861441612243652,
"logits/rejected": -1.3953332901000977,
"logps/chosen": -17.742359161376953,
"logps/rejected": -24.968896865844727,
"loss": 0.6353,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -0.32852208614349365,
"rewards/margins": 0.14815881848335266,
"rewards/rejected": -0.4766809046268463,
"step": 720
},
{
"epoch": 0.36,
"learning_rate": 2.1321178182447707e-08,
"logits/chosen": -1.3506596088409424,
"logits/rejected": -1.3595778942108154,
"logps/chosen": -19.100223541259766,
"logps/rejected": -24.44327163696289,
"loss": 0.6449,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": -0.31565234065055847,
"rewards/margins": 0.11223573982715607,
"rewards/rejected": -0.42788806557655334,
"step": 725
},
{
"epoch": 0.36,
"learning_rate": 2.0610737385376347e-08,
"logits/chosen": -1.4714252948760986,
"logits/rejected": -1.4704372882843018,
"logps/chosen": -20.749143600463867,
"logps/rejected": -23.971107482910156,
"loss": 0.6357,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": -0.31912142038345337,
"rewards/margins": 0.1428820788860321,
"rewards/rejected": -0.4620034694671631,
"step": 730
},
{
"epoch": 0.37,
"learning_rate": 1.990924884239758e-08,
"logits/chosen": -1.4702327251434326,
"logits/rejected": -1.4725419282913208,
"logps/chosen": -19.519115447998047,
"logps/rejected": -21.988479614257812,
"loss": 0.6404,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.30419957637786865,
"rewards/margins": 0.16355463862419128,
"rewards/rejected": -0.46775418519973755,
"step": 735
},
{
"epoch": 0.37,
"learning_rate": 1.9216926233717085e-08,
"logits/chosen": -1.4942646026611328,
"logits/rejected": -1.5009443759918213,
"logps/chosen": -16.522647857666016,
"logps/rejected": -24.716516494750977,
"loss": 0.6315,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.2690926194190979,
"rewards/margins": 0.16352799534797668,
"rewards/rejected": -0.4326205849647522,
"step": 740
},
{
"epoch": 0.37,
"learning_rate": 1.8533980447508135e-08,
"logits/chosen": -1.4108445644378662,
"logits/rejected": -1.4322090148925781,
"logps/chosen": -18.862674713134766,
"logps/rejected": -23.953310012817383,
"loss": 0.636,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": -0.2986528277397156,
"rewards/margins": 0.14546221494674683,
"rewards/rejected": -0.4441150724887848,
"step": 745
},
{
"epoch": 0.38,
"learning_rate": 1.786061951567303e-08,
"logits/chosen": -1.4277067184448242,
"logits/rejected": -1.438820481300354,
"logps/chosen": -19.40964126586914,
"logps/rejected": -26.506017684936523,
"loss": 0.6373,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.29312849044799805,
"rewards/margins": 0.17660866677761078,
"rewards/rejected": -0.46973714232444763,
"step": 750
},
{
"epoch": 0.38,
"learning_rate": 1.719704855047464e-08,
"logits/chosen": -1.408469319343567,
"logits/rejected": -1.4118237495422363,
"logps/chosen": -17.609798431396484,
"logps/rejected": -22.488494873046875,
"loss": 0.636,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": -0.29817792773246765,
"rewards/margins": 0.1390009969472885,
"rewards/rejected": -0.43717893958091736,
"step": 755
},
{
"epoch": 0.38,
"learning_rate": 1.6543469682057103e-08,
"logits/chosen": -1.3890222311019897,
"logits/rejected": -1.4223724603652954,
"logps/chosen": -16.866146087646484,
"logps/rejected": -21.382225036621094,
"loss": 0.6432,
"rewards/accuracies": 0.65625,
"rewards/chosen": -0.23102489113807678,
"rewards/margins": 0.185149148106575,
"rewards/rejected": -0.4161740243434906,
"step": 760
},
{
"epoch": 0.38,
"learning_rate": 1.590008199687508e-08,
"logits/chosen": -1.4136755466461182,
"logits/rejected": -1.4180434942245483,
"logps/chosen": -17.45306968688965,
"logps/rejected": -20.812774658203125,
"loss": 0.6425,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -0.29186105728149414,
"rewards/margins": 0.115916408598423,
"rewards/rejected": -0.40777745842933655,
"step": 765
},
{
"epoch": 0.39,
"learning_rate": 1.526708147705013e-08,
"logits/chosen": -1.4240939617156982,
"logits/rejected": -1.4586817026138306,
"logps/chosen": -17.540027618408203,
"logps/rejected": -22.493268966674805,
"loss": 0.6404,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.2557734549045563,
"rewards/margins": 0.17367486655712128,
"rewards/rejected": -0.42944836616516113,
"step": 770
},
{
"epoch": 0.39,
"learning_rate": 1.4644660940672625e-08,
"logits/chosen": -1.3837788105010986,
"logits/rejected": -1.4116965532302856,
"logps/chosen": -17.056177139282227,
"logps/rejected": -25.01016616821289,
"loss": 0.6338,
"rewards/accuracies": 0.65625,
"rewards/chosen": -0.28285446763038635,
"rewards/margins": 0.2001866102218628,
"rewards/rejected": -0.48304110765457153,
"step": 775
},
{
"epoch": 0.39,
"learning_rate": 1.4033009983067451e-08,
"logits/chosen": -1.4337577819824219,
"logits/rejected": -1.446617603302002,
"logps/chosen": -16.082469940185547,
"logps/rejected": -22.56252670288086,
"loss": 0.6439,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.2657018005847931,
"rewards/margins": 0.24111859500408173,
"rewards/rejected": -0.5068204402923584,
"step": 780
},
{
"epoch": 0.39,
"learning_rate": 1.3432314919041476e-08,
"logits/chosen": -1.47090744972229,
"logits/rejected": -1.508124589920044,
"logps/chosen": -14.742795944213867,
"logps/rejected": -19.045040130615234,
"loss": 0.6327,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -0.2083434760570526,
"rewards/margins": 0.19431748986244202,
"rewards/rejected": -0.402660995721817,
"step": 785
},
{
"epoch": 0.4,
"learning_rate": 1.2842758726130282e-08,
"logits/chosen": -1.404345989227295,
"logits/rejected": -1.4321860074996948,
"logps/chosen": -18.46529769897461,
"logps/rejected": -22.419506072998047,
"loss": 0.6581,
"rewards/accuracies": 0.59375,
"rewards/chosen": -0.3104281723499298,
"rewards/margins": 0.12459827959537506,
"rewards/rejected": -0.43502646684646606,
"step": 790
},
{
"epoch": 0.4,
"learning_rate": 1.2264520988861398e-08,
"logits/chosen": -1.528306007385254,
"logits/rejected": -1.548837661743164,
"logps/chosen": -18.500003814697266,
"logps/rejected": -22.167816162109375,
"loss": 0.6485,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -0.29536038637161255,
"rewards/margins": 0.1245318204164505,
"rewards/rejected": -0.41989216208457947,
"step": 795
},
{
"epoch": 0.4,
"learning_rate": 1.1697777844051105e-08,
"logits/chosen": -1.4373068809509277,
"logits/rejected": -1.4536762237548828,
"logps/chosen": -19.13837242126465,
"logps/rejected": -21.966625213623047,
"loss": 0.6364,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": -0.26146507263183594,
"rewards/margins": 0.19086112082004547,
"rewards/rejected": -0.4523262083530426,
"step": 800
},
{
"epoch": 0.4,
"eval_logits/chosen": -1.7574115991592407,
"eval_logits/rejected": -1.8370665311813354,
"eval_logps/chosen": -17.89972496032715,
"eval_logps/rejected": -23.20018768310547,
"eval_loss": 0.6436580419540405,
"eval_rewards/accuracies": 0.6449680328369141,
"eval_rewards/chosen": -0.28865811228752136,
"eval_rewards/margins": 0.17753452062606812,
"eval_rewards/rejected": -0.4661926031112671,
"eval_runtime": 306.9231,
"eval_samples_per_second": 65.163,
"eval_steps_per_second": 1.02,
"step": 800
},
{
"epoch": 0.4,
"learning_rate": 1.1142701927151454e-08,
"logits/chosen": -1.4606168270111084,
"logits/rejected": -1.4665342569351196,
"logps/chosen": -15.579089164733887,
"logps/rejected": -22.30219268798828,
"loss": 0.6483,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.2538829445838928,
"rewards/margins": 0.22965078055858612,
"rewards/rejected": -0.4835337698459625,
"step": 805
},
{
"epoch": 0.41,
"learning_rate": 1.0599462319663904e-08,
"logits/chosen": -1.3860244750976562,
"logits/rejected": -1.3826844692230225,
"logps/chosen": -17.648862838745117,
"logps/rejected": -25.444860458374023,
"loss": 0.6498,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": -0.2895384132862091,
"rewards/margins": 0.12358560413122177,
"rewards/rejected": -0.4131239950656891,
"step": 810
},
{
"epoch": 0.41,
"learning_rate": 1.0068224497635369e-08,
"logits/chosen": -1.4392060041427612,
"logits/rejected": -1.460053563117981,
"logps/chosen": -17.04248809814453,
"logps/rejected": -25.65012550354004,
"loss": 0.6428,
"rewards/accuracies": 0.59375,
"rewards/chosen": -0.26920509338378906,
"rewards/margins": 0.1518903225660324,
"rewards/rejected": -0.42109543085098267,
"step": 815
},
{
"epoch": 0.41,
"learning_rate": 9.549150281252633e-09,
"logits/chosen": -1.380948781967163,
"logits/rejected": -1.38584566116333,
"logps/chosen": -19.414995193481445,
"logps/rejected": -24.585783004760742,
"loss": 0.6346,
"rewards/accuracies": 0.65625,
"rewards/chosen": -0.28162115812301636,
"rewards/margins": 0.15368224680423737,
"rewards/rejected": -0.43530339002609253,
"step": 820
},
{
"epoch": 0.41,
"learning_rate": 9.042397785550404e-09,
"logits/chosen": -1.3312206268310547,
"logits/rejected": -1.3463561534881592,
"logps/chosen": -15.558464050292969,
"logps/rejected": -20.260160446166992,
"loss": 0.6395,
"rewards/accuracies": 0.65625,
"rewards/chosen": -0.2858559489250183,
"rewards/margins": 0.15887019038200378,
"rewards/rejected": -0.4447261691093445,
"step": 825
},
{
"epoch": 0.41,
"learning_rate": 8.548121372247919e-09,
"logits/chosen": -1.403304100036621,
"logits/rejected": -1.3955562114715576,
"logps/chosen": -17.097698211669922,
"logps/rejected": -23.62860679626465,
"loss": 0.6436,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": -0.3152267336845398,
"rewards/margins": 0.09969434142112732,
"rewards/rejected": -0.4149211049079895,
"step": 830
},
{
"epoch": 0.42,
"learning_rate": 8.066471602728803e-09,
"logits/chosen": -1.4208085536956787,
"logits/rejected": -1.4421093463897705,
"logps/chosen": -16.878705978393555,
"logps/rejected": -18.381912231445312,
"loss": 0.6342,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -0.2497948855161667,
"rewards/margins": 0.19130581617355347,
"rewards/rejected": -0.44110068678855896,
"step": 835
},
{
"epoch": 0.42,
"learning_rate": 7.597595192178703e-09,
"logits/chosen": -1.4911779165267944,
"logits/rejected": -1.4802215099334717,
"logps/chosen": -16.187580108642578,
"logps/rejected": -20.237377166748047,
"loss": 0.6369,
"rewards/accuracies": 0.65625,
"rewards/chosen": -0.30556631088256836,
"rewards/margins": 0.13740003108978271,
"rewards/rejected": -0.4429663121700287,
"step": 840
},
{
"epoch": 0.42,
"learning_rate": 7.1416349648943884e-09,
"logits/chosen": -1.4348905086517334,
"logits/rejected": -1.4343273639678955,
"logps/chosen": -18.559795379638672,
"logps/rejected": -23.7816219329834,
"loss": 0.6424,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": -0.3174014091491699,
"rewards/margins": 0.12262705713510513,
"rewards/rejected": -0.44002848863601685,
"step": 845
},
{
"epoch": 0.42,
"learning_rate": 6.698729810778064e-09,
"logits/chosen": -1.4852615594863892,
"logits/rejected": -1.4959180355072021,
"logps/chosen": -18.27218246459961,
"logps/rejected": -25.438560485839844,
"loss": 0.651,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -0.3046795427799225,
"rewards/margins": 0.14905641973018646,
"rewards/rejected": -0.45373591780662537,
"step": 850
},
{
"epoch": 0.43,
"learning_rate": 6.269014643030213e-09,
"logits/chosen": -1.4191768169403076,
"logits/rejected": -1.4377117156982422,
"logps/chosen": -17.66164207458496,
"logps/rejected": -24.158462524414062,
"loss": 0.6427,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.29421794414520264,
"rewards/margins": 0.1277884691953659,
"rewards/rejected": -0.4220064580440521,
"step": 855
},
{
"epoch": 0.43,
"learning_rate": 5.8526203570536504e-09,
"logits/chosen": -1.4127318859100342,
"logits/rejected": -1.4357506036758423,
"logps/chosen": -17.93946075439453,
"logps/rejected": -25.222787857055664,
"loss": 0.6391,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.3292141258716583,
"rewards/margins": 0.20317533612251282,
"rewards/rejected": -0.5323894619941711,
"step": 860
},
{
"epoch": 0.43,
"learning_rate": 5.44967379058161e-09,
"logits/chosen": -1.3914811611175537,
"logits/rejected": -1.4102437496185303,
"logps/chosen": -16.657636642456055,
"logps/rejected": -23.440505981445312,
"loss": 0.6564,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.3101702630519867,
"rewards/margins": 0.18297256529331207,
"rewards/rejected": -0.49314290285110474,
"step": 865
},
{
"epoch": 0.43,
"learning_rate": 5.060297685041659e-09,
"logits/chosen": -1.3386437892913818,
"logits/rejected": -1.3571398258209229,
"logps/chosen": -19.986854553222656,
"logps/rejected": -26.74784278869629,
"loss": 0.6279,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": -0.284962922334671,
"rewards/margins": 0.19053895771503448,
"rewards/rejected": -0.4755018651485443,
"step": 870
},
{
"epoch": 0.44,
"learning_rate": 4.684610648167503e-09,
"logits/chosen": -1.3065433502197266,
"logits/rejected": -1.3413054943084717,
"logps/chosen": -17.01144027709961,
"logps/rejected": -24.851526260375977,
"loss": 0.6387,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.34229403734207153,
"rewards/margins": 0.10157414525747299,
"rewards/rejected": -0.44386816024780273,
"step": 875
},
{
"epoch": 0.44,
"learning_rate": 4.322727117869951e-09,
"logits/chosen": -1.3795998096466064,
"logits/rejected": -1.4208852052688599,
"logps/chosen": -18.107046127319336,
"logps/rejected": -24.59166145324707,
"loss": 0.6355,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.282823383808136,
"rewards/margins": 0.2188301533460617,
"rewards/rejected": -0.5016534924507141,
"step": 880
},
{
"epoch": 0.44,
"learning_rate": 3.974757327377981e-09,
"logits/chosen": -1.436051368713379,
"logits/rejected": -1.4475667476654053,
"logps/chosen": -18.486120223999023,
"logps/rejected": -22.97800064086914,
"loss": 0.6469,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -0.3353201746940613,
"rewards/margins": 0.08744947612285614,
"rewards/rejected": -0.4227696359157562,
"step": 885
},
{
"epoch": 0.45,
"learning_rate": 3.640807271660634e-09,
"logits/chosen": -1.464347004890442,
"logits/rejected": -1.4781545400619507,
"logps/chosen": -18.07439613342285,
"logps/rejected": -23.41990089416504,
"loss": 0.6422,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.259718656539917,
"rewards/margins": 0.25415921211242676,
"rewards/rejected": -0.5138779282569885,
"step": 890
},
{
"epoch": 0.45,
"learning_rate": 3.3209786751399183e-09,
"logits/chosen": -1.3919214010238647,
"logits/rejected": -1.4111039638519287,
"logps/chosen": -19.07809066772461,
"logps/rejected": -24.558177947998047,
"loss": 0.6417,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -0.3080528676509857,
"rewards/margins": 0.1815052181482315,
"rewards/rejected": -0.4895581305027008,
"step": 895
},
{
"epoch": 0.45,
"learning_rate": 3.015368960704584e-09,
"logits/chosen": -1.348677158355713,
"logits/rejected": -1.379531979560852,
"logps/chosen": -17.733901977539062,
"logps/rejected": -21.527729034423828,
"loss": 0.6456,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.34989431500434875,
"rewards/margins": 0.13681560754776,
"rewards/rejected": -0.48670992255210876,
"step": 900
},
{
"epoch": 0.45,
"eval_logits/chosen": -1.7564817667007446,
"eval_logits/rejected": -1.8361190557479858,
"eval_logps/chosen": -17.921520233154297,
"eval_logps/rejected": -23.25269317626953,
"eval_loss": 0.6429938077926636,
"eval_rewards/accuracies": 0.6417731642723083,
"eval_rewards/chosen": -0.2908374071121216,
"eval_rewards/margins": 0.1806056946516037,
"eval_rewards/rejected": -0.4714431166648865,
"eval_runtime": 307.0371,
"eval_samples_per_second": 65.139,
"eval_steps_per_second": 1.019,
"step": 900
},
{
"epoch": 0.45,
"learning_rate": 2.7240712200341577e-09,
"logits/chosen": -1.511375069618225,
"logits/rejected": -1.5339971780776978,
"logps/chosen": -17.652019500732422,
"logps/rejected": -21.41324234008789,
"loss": 0.6269,
"rewards/accuracies": 0.65625,
"rewards/chosen": -0.26256105303764343,
"rewards/margins": 0.18523547053337097,
"rewards/rejected": -0.4477965235710144,
"step": 905
},
{
"epoch": 0.46,
"learning_rate": 2.4471741852423233e-09,
"logits/chosen": -1.4349921941757202,
"logits/rejected": -1.4263644218444824,
"logps/chosen": -18.641862869262695,
"logps/rejected": -23.057044982910156,
"loss": 0.636,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.29802319407463074,
"rewards/margins": 0.17096181213855743,
"rewards/rejected": -0.468984991312027,
"step": 910
},
{
"epoch": 0.46,
"learning_rate": 2.184762201848228e-09,
"logits/chosen": -1.370429277420044,
"logits/rejected": -1.4054522514343262,
"logps/chosen": -17.3892765045166,
"logps/rejected": -24.669536590576172,
"loss": 0.6347,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.2652004361152649,
"rewards/margins": 0.2178630828857422,
"rewards/rejected": -0.4830635190010071,
"step": 915
},
{
"epoch": 0.46,
"learning_rate": 1.9369152030840553e-09,
"logits/chosen": -1.4306275844573975,
"logits/rejected": -1.4435356855392456,
"logps/chosen": -16.403339385986328,
"logps/rejected": -23.840957641601562,
"loss": 0.6289,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.23369142413139343,
"rewards/margins": 0.23300664126873016,
"rewards/rejected": -0.4666980803012848,
"step": 920
},
{
"epoch": 0.46,
"learning_rate": 1.70370868554659e-09,
"logits/chosen": -1.3685413599014282,
"logits/rejected": -1.3498773574829102,
"logps/chosen": -17.239362716674805,
"logps/rejected": -23.87692642211914,
"loss": 0.6242,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.25025489926338196,
"rewards/margins": 0.25468313694000244,
"rewards/rejected": -0.504938006401062,
"step": 925
},
{
"epoch": 0.47,
"learning_rate": 1.4852136862001763e-09,
"logits/chosen": -1.386918306350708,
"logits/rejected": -1.3831664323806763,
"logps/chosen": -19.643451690673828,
"logps/rejected": -24.284130096435547,
"loss": 0.6319,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": -0.31439071893692017,
"rewards/margins": 0.20531371235847473,
"rewards/rejected": -0.5197044610977173,
"step": 930
},
{
"epoch": 0.47,
"learning_rate": 1.2814967607382432e-09,
"logits/chosen": -1.454646348953247,
"logits/rejected": -1.463849663734436,
"logps/chosen": -17.558238983154297,
"logps/rejected": -19.34381103515625,
"loss": 0.642,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.29885101318359375,
"rewards/margins": 0.1699037104845047,
"rewards/rejected": -0.46875467896461487,
"step": 935
},
{
"epoch": 0.47,
"learning_rate": 1.0926199633097156e-09,
"logits/chosen": -1.3675868511199951,
"logits/rejected": -1.4033797979354858,
"logps/chosen": -17.723363876342773,
"logps/rejected": -21.526758193969727,
"loss": 0.6345,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": -0.29813557863235474,
"rewards/margins": 0.24015481770038605,
"rewards/rejected": -0.5382903814315796,
"step": 940
},
{
"epoch": 0.47,
"learning_rate": 9.186408276168012e-10,
"logits/chosen": -1.4319554567337036,
"logits/rejected": -1.4493201971054077,
"logps/chosen": -17.649295806884766,
"logps/rejected": -25.110567092895508,
"loss": 0.6347,
"rewards/accuracies": 0.65625,
"rewards/chosen": -0.25603246688842773,
"rewards/margins": 0.21903729438781738,
"rewards/rejected": -0.4750697612762451,
"step": 945
},
{
"epoch": 0.47,
"learning_rate": 7.59612349389599e-10,
"logits/chosen": -1.4553452730178833,
"logits/rejected": -1.4630682468414307,
"logps/chosen": -18.7999267578125,
"logps/rejected": -22.81121253967285,
"loss": 0.6351,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": -0.32581713795661926,
"rewards/margins": 0.13567964732646942,
"rewards/rejected": -0.4614967703819275,
"step": 950
},
{
"epoch": 0.48,
"learning_rate": 6.15582970243117e-10,
"logits/chosen": -1.4470014572143555,
"logits/rejected": -1.4603230953216553,
"logps/chosen": -18.624347686767578,
"logps/rejected": -21.459732055664062,
"loss": 0.643,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": -0.3545226454734802,
"rewards/margins": 0.13958732783794403,
"rewards/rejected": -0.49411001801490784,
"step": 955
},
{
"epoch": 0.48,
"learning_rate": 4.865965629214819e-10,
"logits/chosen": -1.4853070974349976,
"logits/rejected": -1.5119067430496216,
"logps/chosen": -18.299137115478516,
"logps/rejected": -22.493701934814453,
"loss": 0.6401,
"rewards/accuracies": 0.543749988079071,
"rewards/chosen": -0.2782926559448242,
"rewards/margins": 0.11381447315216064,
"rewards/rejected": -0.39210715889930725,
"step": 960
},
{
"epoch": 0.48,
"learning_rate": 3.7269241793390084e-10,
"logits/chosen": -1.416803002357483,
"logits/rejected": -1.4471747875213623,
"logps/chosen": -18.96782875061035,
"logps/rejected": -25.271512985229492,
"loss": 0.6433,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.30018851161003113,
"rewards/margins": 0.2068859338760376,
"rewards/rejected": -0.5070745348930359,
"step": 965
},
{
"epoch": 0.48,
"learning_rate": 2.739052315863355e-10,
"logits/chosen": -1.475373387336731,
"logits/rejected": -1.4798928499221802,
"logps/chosen": -18.24342918395996,
"logps/rejected": -26.389270782470703,
"loss": 0.64,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.2784801125526428,
"rewards/margins": 0.23683655261993408,
"rewards/rejected": -0.5153166651725769,
"step": 970
},
{
"epoch": 0.49,
"learning_rate": 1.9026509541272272e-10,
"logits/chosen": -1.4730727672576904,
"logits/rejected": -1.482080101966858,
"logps/chosen": -17.36160659790039,
"logps/rejected": -26.266277313232422,
"loss": 0.635,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": -0.3075779676437378,
"rewards/margins": 0.23826391994953156,
"rewards/rejected": -0.5458418726921082,
"step": 975
},
{
"epoch": 0.49,
"learning_rate": 1.2179748700879012e-10,
"logits/chosen": -1.476564645767212,
"logits/rejected": -1.5101211071014404,
"logps/chosen": -17.65967559814453,
"logps/rejected": -22.493526458740234,
"loss": 0.6488,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": -0.29659706354141235,
"rewards/margins": 0.1625634878873825,
"rewards/rejected": -0.45916056632995605,
"step": 980
},
{
"epoch": 0.49,
"learning_rate": 6.852326227130833e-11,
"logits/chosen": -1.3950709104537964,
"logits/rejected": -1.396350622177124,
"logps/chosen": -17.96800994873047,
"logps/rejected": -23.398609161376953,
"loss": 0.6321,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": -0.26527708768844604,
"rewards/margins": 0.17985311150550842,
"rewards/rejected": -0.44513019919395447,
"step": 985
},
{
"epoch": 0.49,
"learning_rate": 3.0458649045211895e-11,
"logits/chosen": -1.3564696311950684,
"logits/rejected": -1.343924880027771,
"logps/chosen": -18.672687530517578,
"logps/rejected": -25.386516571044922,
"loss": 0.633,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.2513435184955597,
"rewards/margins": 0.25813814997673035,
"rewards/rejected": -0.50948166847229,
"step": 990
},
{
"epoch": 0.5,
"learning_rate": 7.615242180436521e-12,
"logits/chosen": -1.428175926208496,
"logits/rejected": -1.4705220460891724,
"logps/chosen": -18.455589294433594,
"logps/rejected": -23.739002227783203,
"loss": 0.6388,
"rewards/accuracies": 0.65625,
"rewards/chosen": -0.34945839643478394,
"rewards/margins": 0.16214387118816376,
"rewards/rejected": -0.5116022825241089,
"step": 995
},
{
"epoch": 0.5,
"learning_rate": 0.0,
"logits/chosen": -1.3395625352859497,
"logits/rejected": -1.3556944131851196,
"logps/chosen": -17.86135482788086,
"logps/rejected": -21.11099624633789,
"loss": 0.6448,
"rewards/accuracies": 0.65625,
"rewards/chosen": -0.3157894015312195,
"rewards/margins": 0.17568814754486084,
"rewards/rejected": -0.4914775788784027,
"step": 1000
},
{
"epoch": 0.5,
"eval_logits/chosen": -1.7563655376434326,
"eval_logits/rejected": -1.8358914852142334,
"eval_logps/chosen": -17.932043075561523,
"eval_logps/rejected": -23.265531539916992,
"eval_loss": 0.6428677439689636,
"eval_rewards/accuracies": 0.6453673839569092,
"eval_rewards/chosen": -0.2918897271156311,
"eval_rewards/margins": 0.18083742260932922,
"eval_rewards/rejected": -0.47272711992263794,
"eval_runtime": 307.0181,
"eval_samples_per_second": 65.143,
"eval_steps_per_second": 1.019,
"step": 1000
}
],
"logging_steps": 5,
"max_steps": 1000,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 100,
"total_flos": 0.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}