Llama-3-Instruct-8B-SimPOW-1 / trainer_state.json
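The JSON below is the Hugging Face Trainer state for this run; each entry in "log_history" records one logging step (loss, grad norm, learning rate, SimPO reward statistics, and the per-batch weight). As a minimal sketch of how one might read it, assuming the file has been downloaded locally as trainer_state.json, the following Python prints the training loss and reward margin at each logged step:

import json

# Load the Trainer state and walk the logged training steps.
with open("trainer_state.json") as f:
    state = json.load(f)

for entry in state["log_history"]:
    # Evaluation entries use "eval_"-prefixed keys and carry no "loss" key; skip them.
    if "loss" in entry:
        print(f'step {entry["step"]:>4}  '
              f'loss {entry["loss"]:.4f}  '
              f'reward margin {entry["rewards/margins"]:.4f}')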
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9981298423724285,
"eval_steps": 400,
"global_step": 467,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"abs_diff": 0.043448589742183685,
"all_logps_1": -124.6441650390625,
"all_logps_1_values": -124.64417266845703,
"all_logps_2": 459.15625,
"all_logps_2_values": 459.15625,
"epoch": 0.0021373230029388193,
"grad_norm": 16.66867807446414,
"learning_rate": 2.127659574468085e-08,
"logits/chosen": -1.1381689310073853,
"logits/rejected": -0.9913416504859924,
"logps/chosen": -0.2839311361312866,
"logps/rejected": -0.29555341601371765,
"loss": 1.5077,
"original_losses": 1.5989841222763062,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.7098277807235718,
"rewards/margins": 0.029055725783109665,
"rewards/rejected": -0.7388835549354553,
"step": 1,
"weight": 0.9598712921142578
},
{
"abs_diff": 0.050563473254442215,
"all_logps_1": -113.89578247070312,
"all_logps_1_values": -113.89578247070312,
"all_logps_2": 426.234375,
"all_logps_2_values": 426.234375,
"epoch": 0.010686615014694095,
"grad_norm": 12.434660441186981,
"learning_rate": 1.0638297872340425e-07,
"logits/chosen": -0.9904537796974182,
"logits/rejected": -0.9189692735671997,
"logps/chosen": -0.2694719731807709,
"logps/rejected": -0.2684631943702698,
"loss": 1.5251,
"original_losses": 1.6255850791931152,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.6736798286437988,
"rewards/margins": -0.0025218012742698193,
"rewards/rejected": -0.6711580753326416,
"step": 5,
"weight": 0.9548923373222351
},
{
"abs_diff": 0.06418919563293457,
"all_logps_1": -118.16609191894531,
"all_logps_1_values": -118.16609191894531,
"all_logps_2": 443.21875,
"all_logps_2_values": 443.21875,
"epoch": 0.02137323002938819,
"grad_norm": 11.724962863400911,
"learning_rate": 2.127659574468085e-07,
"logits/chosen": -0.9794756174087524,
"logits/rejected": -0.9353710412979126,
"logps/chosen": -0.2719997763633728,
"logps/rejected": -0.2735568881034851,
"loss": 1.5172,
"original_losses": 1.620931625366211,
"rewards/accuracies": 0.48750001192092896,
"rewards/chosen": -0.6799993515014648,
"rewards/margins": 0.0038928240537643433,
"rewards/rejected": -0.6838923096656799,
"step": 10,
"weight": 0.9420804977416992
},
{
"abs_diff": 0.06552017480134964,
"all_logps_1": -101.9596939086914,
"all_logps_1_values": -101.95967864990234,
"all_logps_2": 370.20001220703125,
"all_logps_2_values": 370.20001220703125,
"epoch": 0.03205984504408229,
"grad_norm": 9.773542967175878,
"learning_rate": 3.1914893617021275e-07,
"logits/chosen": -0.9607246518135071,
"logits/rejected": -0.9163097143173218,
"logps/chosen": -0.29539960622787476,
"logps/rejected": -0.2832711338996887,
"loss": 1.5128,
"original_losses": 1.6492595672607422,
"rewards/accuracies": 0.48750001192092896,
"rewards/chosen": -0.7384990453720093,
"rewards/margins": -0.030321191996335983,
"rewards/rejected": -0.708177924156189,
"step": 15,
"weight": 0.9420396089553833
},
{
"abs_diff": 0.082237109541893,
"all_logps_1": -95.52127075195312,
"all_logps_1_values": -95.52125549316406,
"all_logps_2": 368.6625061035156,
"all_logps_2_values": 368.6625061035156,
"epoch": 0.04274646005877638,
"grad_norm": 14.386337719633973,
"learning_rate": 4.25531914893617e-07,
"logits/chosen": -0.9820459485054016,
"logits/rejected": -0.9820452928543091,
"logps/chosen": -0.26204216480255127,
"logps/rejected": -0.26956799626350403,
"loss": 1.5149,
"original_losses": 1.6124236583709717,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": -0.6551053524017334,
"rewards/margins": 0.018814602866768837,
"rewards/rejected": -0.6739200353622437,
"step": 20,
"weight": 0.9291993379592896
},
{
"abs_diff": 0.07468467205762863,
"all_logps_1": -101.43566131591797,
"all_logps_1_values": -101.43565368652344,
"all_logps_2": 359.6499938964844,
"all_logps_2_values": 359.6499938964844,
"epoch": 0.053433075073470476,
"grad_norm": 12.506683302853757,
"learning_rate": 5.319148936170212e-07,
"logits/chosen": -1.0295155048370361,
"logits/rejected": -1.0065571069717407,
"logps/chosen": -0.28278106451034546,
"logps/rejected": -0.2869016230106354,
"loss": 1.5005,
"original_losses": 1.6180095672607422,
"rewards/accuracies": 0.4625000059604645,
"rewards/chosen": -0.706952691078186,
"rewards/margins": 0.010301386937499046,
"rewards/rejected": -0.7172540426254272,
"step": 25,
"weight": 0.9346221089363098
},
{
"abs_diff": 0.07145524024963379,
"all_logps_1": -96.14094543457031,
"all_logps_1_values": -96.14093780517578,
"all_logps_2": 358.6937561035156,
"all_logps_2_values": 358.6937561035156,
"epoch": 0.06411969008816458,
"grad_norm": 17.486598946846197,
"learning_rate": 6.382978723404255e-07,
"logits/chosen": -1.0747442245483398,
"logits/rejected": -0.9867307543754578,
"logps/chosen": -0.27444857358932495,
"logps/rejected": -0.27685946226119995,
"loss": 1.5207,
"original_losses": 1.6215848922729492,
"rewards/accuracies": 0.4625000059604645,
"rewards/chosen": -0.6861215233802795,
"rewards/margins": 0.006027159281075001,
"rewards/rejected": -0.6921486258506775,
"step": 30,
"weight": 0.9376131296157837
},
{
"abs_diff": 0.08128118515014648,
"all_logps_1": -110.31912994384766,
"all_logps_1_values": -110.3191146850586,
"all_logps_2": 396.7250061035156,
"all_logps_2_values": 396.7250061035156,
"epoch": 0.07480630510285867,
"grad_norm": 10.190092324128308,
"learning_rate": 7.446808510638297e-07,
"logits/chosen": -1.0031483173370361,
"logits/rejected": -0.9225772023200989,
"logps/chosen": -0.2776695191860199,
"logps/rejected": -0.3029964566230774,
"loss": 1.5058,
"original_losses": 1.5780258178710938,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.6941738128662109,
"rewards/margins": 0.06331733614206314,
"rewards/rejected": -0.7574911713600159,
"step": 35,
"weight": 0.9304083585739136
},
{
"abs_diff": 0.06388907134532928,
"all_logps_1": -94.03665924072266,
"all_logps_1_values": -94.03666687011719,
"all_logps_2": 347.20001220703125,
"all_logps_2_values": 347.20001220703125,
"epoch": 0.08549292011755276,
"grad_norm": 12.383837039803712,
"learning_rate": 8.51063829787234e-07,
"logits/chosen": -0.9180997014045715,
"logits/rejected": -0.9071486592292786,
"logps/chosen": -0.28308817744255066,
"logps/rejected": -0.29446059465408325,
"loss": 1.5141,
"original_losses": 1.6014320850372314,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -0.7077205181121826,
"rewards/margins": 0.028431018814444542,
"rewards/rejected": -0.7361515760421753,
"step": 40,
"weight": 0.9425530433654785
},
{
"abs_diff": 0.09521429240703583,
"all_logps_1": -106.0528793334961,
"all_logps_1_values": -106.0528793334961,
"all_logps_2": 362.95623779296875,
"all_logps_2_values": 362.95623779296875,
"epoch": 0.09617953513224686,
"grad_norm": 9.970613374779385,
"learning_rate": 9.574468085106384e-07,
"logits/chosen": -0.9140686988830566,
"logits/rejected": -0.8324721455574036,
"logps/chosen": -0.33634239435195923,
"logps/rejected": -0.34527257084846497,
"loss": 1.4915,
"original_losses": 1.614324927330017,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": -0.8408559560775757,
"rewards/margins": 0.02232544682919979,
"rewards/rejected": -0.8631814122200012,
"step": 45,
"weight": 0.9211470484733582
},
{
"abs_diff": 0.12202360481023788,
"all_logps_1": -105.84830474853516,
"all_logps_1_values": -105.84830474853516,
"all_logps_2": 377.7437438964844,
"all_logps_2_values": 377.7437438964844,
"epoch": 0.10686615014694095,
"grad_norm": 10.765426712830973,
"learning_rate": 9.998741174712533e-07,
"logits/chosen": -0.8902776837348938,
"logits/rejected": -0.8994420766830444,
"logps/chosen": -0.31167787313461304,
"logps/rejected": -0.3589983582496643,
"loss": 1.466,
"original_losses": 1.5521076917648315,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -0.7791945934295654,
"rewards/margins": 0.11830125004053116,
"rewards/rejected": -0.8974958658218384,
"step": 50,
"weight": 0.9070577621459961
},
{
"abs_diff": 0.11367271095514297,
"all_logps_1": -112.1168441772461,
"all_logps_1_values": -112.1168441772461,
"all_logps_2": 420.46875,
"all_logps_2_values": 420.46875,
"epoch": 0.11755276516163506,
"grad_norm": 10.584693183679102,
"learning_rate": 9.991050648838675e-07,
"logits/chosen": -0.8847481608390808,
"logits/rejected": -0.8255330920219421,
"logps/chosen": -0.28891468048095703,
"logps/rejected": -0.3513794541358948,
"loss": 1.465,
"original_losses": 1.557521939277649,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.7222867012023926,
"rewards/margins": 0.15616199374198914,
"rewards/rejected": -0.8784486651420593,
"step": 55,
"weight": 0.9259511828422546
},
{
"abs_diff": 0.08213352411985397,
"all_logps_1": -120.3653564453125,
"all_logps_1_values": -120.36537170410156,
"all_logps_2": 451.7250061035156,
"all_logps_2_values": 451.7250061035156,
"epoch": 0.12823938017632916,
"grad_norm": 20.487281254270606,
"learning_rate": 9.97637968732563e-07,
"logits/chosen": -0.9171462059020996,
"logits/rejected": -0.8949100375175476,
"logps/chosen": -0.2980085015296936,
"logps/rejected": -0.32817280292510986,
"loss": 1.4606,
"original_losses": 1.5710750818252563,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.7450211644172668,
"rewards/margins": 0.07541082799434662,
"rewards/rejected": -0.8204320073127747,
"step": 60,
"weight": 0.9325092434883118
},
{
"abs_diff": 0.08584319800138474,
"all_logps_1": -115.28419494628906,
"all_logps_1_values": -115.28419494628906,
"all_logps_2": 410.28125,
"all_logps_2_values": 410.28125,
"epoch": 0.13892599519102325,
"grad_norm": 13.268818877197086,
"learning_rate": 9.954748808839674e-07,
"logits/chosen": -0.9003847241401672,
"logits/rejected": -0.9516555666923523,
"logps/chosen": -0.31763237714767456,
"logps/rejected": -0.3270418345928192,
"loss": 1.4586,
"original_losses": 1.614269495010376,
"rewards/accuracies": 0.4625000059604645,
"rewards/chosen": -0.794080913066864,
"rewards/margins": 0.023523610085248947,
"rewards/rejected": -0.817604660987854,
"step": 65,
"weight": 0.9301543235778809
},
{
"abs_diff": 0.23710966110229492,
"all_logps_1": -129.6254119873047,
"all_logps_1_values": -129.6254425048828,
"all_logps_2": 391.6187438964844,
"all_logps_2_values": 391.6187438964844,
"epoch": 0.14961261020571734,
"grad_norm": 19.008527618804656,
"learning_rate": 9.926188266120295e-07,
"logits/chosen": -0.9297588467597961,
"logits/rejected": -0.8964225053787231,
"logps/chosen": -0.4621095657348633,
"logps/rejected": -0.565943717956543,
"loss": 1.4309,
"original_losses": 1.5991542339324951,
"rewards/accuracies": 0.4625000059604645,
"rewards/chosen": -1.1552739143371582,
"rewards/margins": 0.25958532094955444,
"rewards/rejected": -1.414859414100647,
"step": 70,
"weight": 0.8780097961425781
},
{
"abs_diff": 0.22396209836006165,
"all_logps_1": -126.3341064453125,
"all_logps_1_values": -126.33412170410156,
"all_logps_2": 375.15625,
"all_logps_2_values": 375.15625,
"epoch": 0.16029922522041143,
"grad_norm": 14.741661325228266,
"learning_rate": 9.890738003669027e-07,
"logits/chosen": -0.88294517993927,
"logits/rejected": -0.8696261644363403,
"logps/chosen": -0.6373583078384399,
"logps/rejected": -0.7649468779563904,
"loss": 1.371,
"original_losses": 1.5059027671813965,
"rewards/accuracies": 0.5625,
"rewards/chosen": -1.593395709991455,
"rewards/margins": 0.3189714848995209,
"rewards/rejected": -1.9123672246932983,
"step": 75,
"weight": 0.874294102191925
},
{
"abs_diff": 0.4753897786140442,
"all_logps_1": -154.002197265625,
"all_logps_1_values": -154.002197265625,
"all_logps_2": 385.40625,
"all_logps_2_values": 385.40625,
"epoch": 0.17098584023510552,
"grad_norm": 10.653088582817368,
"learning_rate": 9.848447601883433e-07,
"logits/chosen": -0.9209216833114624,
"logits/rejected": -0.905800461769104,
"logps/chosen": -0.9318068623542786,
"logps/rejected": -1.1782509088516235,
"loss": 1.3728,
"original_losses": 1.6557430028915405,
"rewards/accuracies": 0.48750001192092896,
"rewards/chosen": -2.329517126083374,
"rewards/margins": 0.61611008644104,
"rewards/rejected": -2.945627212524414,
"step": 80,
"weight": 0.8384539484977722
},
{
"abs_diff": 0.4482264518737793,
"all_logps_1": -181.6018829345703,
"all_logps_1_values": -181.6018829345703,
"all_logps_2": 381.91876220703125,
"all_logps_2_values": 381.91876220703125,
"epoch": 0.18167245524979964,
"grad_norm": 8.388730168314039,
"learning_rate": 9.799376207714444e-07,
"logits/chosen": -0.8116687536239624,
"logits/rejected": -0.7630541324615479,
"logps/chosen": -1.007387638092041,
"logps/rejected": -1.0764662027359009,
"loss": 1.3965,
"original_losses": 1.8705193996429443,
"rewards/accuracies": 0.4749999940395355,
"rewards/chosen": -2.5184690952301025,
"rewards/margins": 0.17269621789455414,
"rewards/rejected": -2.6911654472351074,
"step": 85,
"weight": 0.8248960375785828
},
{
"abs_diff": 0.638414204120636,
"all_logps_1": -197.71530151367188,
"all_logps_1_values": -197.7152862548828,
"all_logps_2": 368.6000061035156,
"all_logps_2_values": 368.6000061035156,
"epoch": 0.19235907026449373,
"grad_norm": 12.62143276771947,
"learning_rate": 9.743592451943998e-07,
"logits/chosen": -0.7098425626754761,
"logits/rejected": -0.6454850435256958,
"logps/chosen": -1.299263596534729,
"logps/rejected": -1.3454030752182007,
"loss": 1.3792,
"original_losses": 2.042982578277588,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -3.2481586933135986,
"rewards/margins": 0.11534923315048218,
"rewards/rejected": -3.3635077476501465,
"step": 90,
"weight": 0.788603663444519
},
{
"abs_diff": 0.3771124482154846,
"all_logps_1": -198.22885131835938,
"all_logps_1_values": -198.22885131835938,
"all_logps_2": 307.64373779296875,
"all_logps_2_values": 307.64373779296875,
"epoch": 0.20304568527918782,
"grad_norm": 9.223783777700444,
"learning_rate": 9.681174353198686e-07,
"logits/chosen": -0.7450689077377319,
"logits/rejected": -0.7714122533798218,
"logps/chosen": -1.5162893533706665,
"logps/rejected": -1.538206696510315,
"loss": 1.3537,
"original_losses": 1.7573131322860718,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -3.7907233238220215,
"rewards/margins": 0.054793525487184525,
"rewards/rejected": -3.8455166816711426,
"step": 95,
"weight": 0.7987316846847534
},
{
"abs_diff": 0.531648576259613,
"all_logps_1": -257.82080078125,
"all_logps_1_values": -257.82080078125,
"all_logps_2": 405.08123779296875,
"all_logps_2_values": 405.08123779296875,
"epoch": 0.2137323002938819,
"grad_norm": 13.130824511623645,
"learning_rate": 9.612209208833646e-07,
"logits/chosen": -0.7543559074401855,
"logits/rejected": -0.6947053074836731,
"logps/chosen": -1.3733211755752563,
"logps/rejected": -1.4744349718093872,
"loss": 1.3472,
"original_losses": 1.8884124755859375,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": -3.433303117752075,
"rewards/margins": 0.2527844309806824,
"rewards/rejected": -3.6860873699188232,
"step": 100,
"weight": 0.8195359110832214
},
{
"abs_diff": 0.4814772605895996,
"all_logps_1": -285.88824462890625,
"all_logps_1_values": -285.88824462890625,
"all_logps_2": 447.76251220703125,
"all_logps_2_values": 447.76251220703125,
"epoch": 0.224418915308576,
"grad_norm": 15.741233324493118,
"learning_rate": 9.536793472839324e-07,
"logits/chosen": -0.5685318112373352,
"logits/rejected": -0.5175650119781494,
"logps/chosen": -1.1041462421417236,
"logps/rejected": -1.3609198331832886,
"loss": 1.347,
"original_losses": 1.60434091091156,
"rewards/accuracies": 0.5625,
"rewards/chosen": -2.7603654861450195,
"rewards/margins": 0.6419342756271362,
"rewards/rejected": -3.4022998809814453,
"step": 105,
"weight": 0.8199658393859863
},
{
"abs_diff": 0.5063992738723755,
"all_logps_1": -312.87860107421875,
"all_logps_1_values": -312.8785705566406,
"all_logps_2": 410.79998779296875,
"all_logps_2_values": 410.79998779296875,
"epoch": 0.2351055303232701,
"grad_norm": 14.779833008390499,
"learning_rate": 9.455032620941839e-07,
"logits/chosen": -0.3194349706172943,
"logits/rejected": -0.27131232619285583,
"logps/chosen": -1.436680793762207,
"logps/rejected": -1.3837544918060303,
"loss": 1.3485,
"original_losses": 2.0654890537261963,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -3.5917022228240967,
"rewards/margins": -0.13231578469276428,
"rewards/rejected": -3.459386110305786,
"step": 110,
"weight": 0.8112524151802063
},
{
"abs_diff": 0.79926997423172,
"all_logps_1": -352.8046875,
"all_logps_1_values": -352.8046875,
"all_logps_2": 401.26873779296875,
"all_logps_2_values": 401.26873779296875,
"epoch": 0.2457921453379642,
"grad_norm": 17.098670325278757,
"learning_rate": 9.367041003085648e-07,
"logits/chosen": -0.27068907022476196,
"logits/rejected": -0.25977402925491333,
"logps/chosen": -1.8351905345916748,
"logps/rejected": -2.079685688018799,
"loss": 1.2568,
"original_losses": 1.9370386600494385,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -4.587975978851318,
"rewards/margins": 0.6112388968467712,
"rewards/rejected": -5.199214458465576,
"step": 115,
"weight": 0.7355886101722717
},
{
"abs_diff": 0.4315846860408783,
"all_logps_1": -371.93505859375,
"all_logps_1_values": -371.93505859375,
"all_logps_2": 397.9624938964844,
"all_logps_2_values": 397.9624938964844,
"epoch": 0.2564787603526583,
"grad_norm": 17.135021647585766,
"learning_rate": 9.272941683504808e-07,
"logits/chosen": -0.18766793608665466,
"logits/rejected": -0.1377825289964676,
"logps/chosen": -1.6060386896133423,
"logps/rejected": -1.7283703088760376,
"loss": 1.2524,
"original_losses": 1.669327974319458,
"rewards/accuracies": 0.4375,
"rewards/chosen": -4.015096187591553,
"rewards/margins": 0.30582934617996216,
"rewards/rejected": -4.320925712585449,
"step": 120,
"weight": 0.7726086378097534
},
{
"abs_diff": 0.8556106686592102,
"all_logps_1": -424.2312927246094,
"all_logps_1_values": -424.2313537597656,
"all_logps_2": 358.1312561035156,
"all_logps_2_values": 358.1312561035156,
"epoch": 0.2671653753673524,
"grad_norm": 18.949047249790798,
"learning_rate": 9.172866268606513e-07,
"logits/chosen": -0.0937797874212265,
"logits/rejected": -0.08780622482299805,
"logps/chosen": -2.3565449714660645,
"logps/rejected": -2.821481227874756,
"loss": 1.2455,
"original_losses": 1.5799314975738525,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -5.891362190246582,
"rewards/margins": 1.1623404026031494,
"rewards/rejected": -7.053703308105469,
"step": 125,
"weight": 0.6997275352478027
},
{
"abs_diff": 1.122897982597351,
"all_logps_1": -483.11285400390625,
"all_logps_1_values": -483.11279296875,
"all_logps_2": 356.2250061035156,
"all_logps_2_values": 356.2250061035156,
"epoch": 0.2778519903820465,
"grad_norm": 16.067627857167523,
"learning_rate": 9.066954722907638e-07,
"logits/chosen": 0.18425658345222473,
"logits/rejected": 0.12208795547485352,
"logps/chosen": -2.2584593296051025,
"logps/rejected": -2.747421979904175,
"loss": 1.2378,
"original_losses": 1.9530925750732422,
"rewards/accuracies": 0.5,
"rewards/chosen": -5.646147727966309,
"rewards/margins": 1.2224081754684448,
"rewards/rejected": -6.868556022644043,
"step": 130,
"weight": 0.6967185139656067
},
{
"abs_diff": 0.5274697542190552,
"all_logps_1": -584.13671875,
"all_logps_1_values": -584.13671875,
"all_logps_2": 443.01873779296875,
"all_logps_2_values": 443.01873779296875,
"epoch": 0.2885386053967406,
"grad_norm": 29.366033343143968,
"learning_rate": 8.955355173281707e-07,
"logits/chosen": 0.3088318705558777,
"logits/rejected": 0.3932690918445587,
"logps/chosen": -2.3267366886138916,
"logps/rejected": -2.385960102081299,
"loss": 1.1916,
"original_losses": 1.8465898036956787,
"rewards/accuracies": 0.5,
"rewards/chosen": -5.816841125488281,
"rewards/margins": 0.1480589658021927,
"rewards/rejected": -5.964900016784668,
"step": 135,
"weight": 0.7594529390335083
},
{
"abs_diff": 0.9901386499404907,
"all_logps_1": -715.9130859375,
"all_logps_1_values": -715.9131469726562,
"all_logps_2": 402.9312438964844,
"all_logps_2_values": 402.9312438964844,
"epoch": 0.2992252204114347,
"grad_norm": 27.69156284264097,
"learning_rate": 8.838223701790055e-07,
"logits/chosen": 0.5694825649261475,
"logits/rejected": 0.5738533139228821,
"logps/chosen": -3.3967947959899902,
"logps/rejected": -3.4784629344940186,
"loss": 1.1521,
"original_losses": 2.2902231216430664,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -8.491987228393555,
"rewards/margins": 0.20417042076587677,
"rewards/rejected": -8.696157455444336,
"step": 140,
"weight": 0.6874681115150452
},
{
"abs_diff": 0.9199058413505554,
"all_logps_1": -995.3132934570312,
"all_logps_1_values": -995.3132934570312,
"all_logps_2": 409.5249938964844,
"all_logps_2_values": 409.5249938964844,
"epoch": 0.30991183542612877,
"grad_norm": 28.11539806786062,
"learning_rate": 8.71572412738697e-07,
"logits/chosen": 0.8747909665107727,
"logits/rejected": 0.9098325967788696,
"logps/chosen": -3.898921251296997,
"logps/rejected": -3.9907355308532715,
"loss": 1.1592,
"original_losses": 2.074253797531128,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -9.74730110168457,
"rewards/margins": 0.22953681647777557,
"rewards/rejected": -9.976838111877441,
"step": 145,
"weight": 0.6336122751235962
},
{
"abs_diff": 1.7418813705444336,
"all_logps_1": -1663.3861083984375,
"all_logps_1_values": -1663.3861083984375,
"all_logps_2": 383.75,
"all_logps_2_values": 383.75,
"epoch": 0.32059845044082286,
"grad_norm": 43.30888911111554,
"learning_rate": 8.588027776804058e-07,
"logits/chosen": 1.2933635711669922,
"logits/rejected": 1.2684452533721924,
"logps/chosen": -6.538305759429932,
"logps/rejected": -7.486212253570557,
"loss": 1.0994,
"original_losses": 1.926180124282837,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": -16.34576416015625,
"rewards/margins": 2.3697667121887207,
"rewards/rejected": -18.715530395507812,
"step": 150,
"weight": 0.5583394765853882
},
{
"abs_diff": 1.5373389720916748,
"all_logps_1": -2462.133056640625,
"all_logps_1_values": -2462.13330078125,
"all_logps_2": 434.73748779296875,
"all_logps_2_values": 434.73748779296875,
"epoch": 0.33128506545551695,
"grad_norm": 47.421884036345716,
"learning_rate": 8.455313244934324e-07,
"logits/chosen": 1.8083369731903076,
"logits/rejected": 1.890794038772583,
"logps/chosen": -8.33267879486084,
"logps/rejected": -9.018165588378906,
"loss": 1.0741,
"original_losses": 2.0032851696014404,
"rewards/accuracies": 0.5625,
"rewards/chosen": -20.83169937133789,
"rewards/margins": 1.713716745376587,
"rewards/rejected": -22.5454158782959,
"step": 155,
"weight": 0.5593416094779968
},
{
"abs_diff": 1.8985588550567627,
"all_logps_1": -2538.660400390625,
"all_logps_1_values": -2538.66064453125,
"all_logps_2": 403.66876220703125,
"all_logps_2_values": 403.66876220703125,
"epoch": 0.34197168047021104,
"grad_norm": 58.88642904599502,
"learning_rate": 8.317766145051057e-07,
"logits/chosen": 2.1515212059020996,
"logits/rejected": 2.141986846923828,
"logps/chosen": -8.633856773376465,
"logps/rejected": -9.374483108520508,
"loss": 1.0769,
"original_losses": 2.3099827766418457,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -21.584644317626953,
"rewards/margins": 1.8515657186508179,
"rewards/rejected": -23.436208724975586,
"step": 160,
"weight": 0.5209288001060486
},
{
"abs_diff": 2.2082934379577637,
"all_logps_1": -3570.93603515625,
"all_logps_1_values": -3570.936279296875,
"all_logps_2": 442.4437561035156,
"all_logps_2_values": 442.4437561035156,
"epoch": 0.3526582954849052,
"grad_norm": 32.977138977170775,
"learning_rate": 8.175578849210894e-07,
"logits/chosen": 2.5748469829559326,
"logits/rejected": 2.677804470062256,
"logps/chosen": -9.694478988647461,
"logps/rejected": -10.093037605285645,
"loss": 1.0398,
"original_losses": 3.134640693664551,
"rewards/accuracies": 0.5625,
"rewards/chosen": -24.236202239990234,
"rewards/margins": 0.9963935017585754,
"rewards/rejected": -25.232593536376953,
"step": 165,
"weight": 0.49304407835006714
},
{
"abs_diff": 2.007434129714966,
"all_logps_1": -3220.789794921875,
"all_logps_1_values": -3220.789794921875,
"all_logps_2": 357.3062438964844,
"all_logps_2_values": 357.3062438964844,
"epoch": 0.36334491049959927,
"grad_norm": 44.745926058943496,
"learning_rate": 8.028950219204099e-07,
"logits/chosen": 2.934321641921997,
"logits/rejected": 2.8931219577789307,
"logps/chosen": -11.122208595275879,
"logps/rejected": -11.998506546020508,
"loss": 0.9596,
"original_losses": 2.2297332286834717,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -27.80552101135254,
"rewards/margins": 2.1907458305358887,
"rewards/rejected": -29.996265411376953,
"step": 170,
"weight": 0.49520620703697205
},
{
"abs_diff": 2.5391037464141846,
"all_logps_1": -3010.77099609375,
"all_logps_1_values": -3010.77099609375,
"all_logps_2": 336.26251220703125,
"all_logps_2_values": 336.26251220703125,
"epoch": 0.37403152551429336,
"grad_norm": 50.44282847929724,
"learning_rate": 7.878085328428368e-07,
"logits/chosen": 2.6517717838287354,
"logits/rejected": 2.698502779006958,
"logps/chosen": -11.271635055541992,
"logps/rejected": -12.422686576843262,
"loss": 0.953,
"original_losses": 2.4483256340026855,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -28.1790828704834,
"rewards/margins": 2.8776297569274902,
"rewards/rejected": -31.056713104248047,
"step": 175,
"weight": 0.45478373765945435
},
{
"abs_diff": 2.311084270477295,
"all_logps_1": -3630.26123046875,
"all_logps_1_values": -3630.26123046875,
"all_logps_2": 367.6937561035156,
"all_logps_2_values": 367.6937561035156,
"epoch": 0.38471814052898745,
"grad_norm": 54.556403188950036,
"learning_rate": 7.723195175075135e-07,
"logits/chosen": 2.640475273132324,
"logits/rejected": 2.6134068965911865,
"logps/chosen": -12.537522315979004,
"logps/rejected": -13.568713188171387,
"loss": 0.9044,
"original_losses": 2.333768844604492,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -31.343807220458984,
"rewards/margins": 2.577979564666748,
"rewards/rejected": -33.921791076660156,
"step": 180,
"weight": 0.46685990691185
},
{
"abs_diff": 2.934654951095581,
"all_logps_1": -5179.39404296875,
"all_logps_1_values": -5179.39404296875,
"all_logps_2": 370.9624938964844,
"all_logps_2_values": 370.9624938964844,
"epoch": 0.39540475554368154,
"grad_norm": 57.260252425269734,
"learning_rate": 7.564496387029531e-07,
"logits/chosen": 2.326862096786499,
"logits/rejected": 2.4421494007110596,
"logps/chosen": -15.849513053894043,
"logps/rejected": -17.323734283447266,
"loss": 0.9407,
"original_losses": 2.5988547801971436,
"rewards/accuracies": 0.5625,
"rewards/chosen": -39.623779296875,
"rewards/margins": 3.6855552196502686,
"rewards/rejected": -43.30933380126953,
"step": 185,
"weight": 0.40877920389175415
},
{
"abs_diff": 2.9652016162872314,
"all_logps_1": -5177.00244140625,
"all_logps_1_values": -5177.00244140625,
"all_logps_2": 374.4312438964844,
"all_logps_2_values": 374.4312438964844,
"epoch": 0.40609137055837563,
"grad_norm": 83.2255328888069,
"learning_rate": 7.402210918896689e-07,
"logits/chosen": 2.44303297996521,
"logits/rejected": 2.4873244762420654,
"logps/chosen": -15.580667495727539,
"logps/rejected": -17.045442581176758,
"loss": 0.9238,
"original_losses": 2.6292238235473633,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -38.95166778564453,
"rewards/margins": 3.661935329437256,
"rewards/rejected": -42.61360168457031,
"step": 190,
"weight": 0.40674179792404175
},
{
"abs_diff": 2.7273154258728027,
"all_logps_1": -4500.06005859375,
"all_logps_1_values": -4500.06005859375,
"all_logps_2": 380.1312561035156,
"all_logps_2_values": 380.1312561035156,
"epoch": 0.4167779855730697,
"grad_norm": 84.18074257984793,
"learning_rate": 7.236565741578162e-07,
"logits/chosen": 2.6910769939422607,
"logits/rejected": 2.7326107025146484,
"logps/chosen": -13.98046875,
"logps/rejected": -15.500396728515625,
"loss": 0.9048,
"original_losses": 2.1395676136016846,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -34.95117950439453,
"rewards/margins": 3.7998204231262207,
"rewards/rejected": -38.75099182128906,
"step": 195,
"weight": 0.3992369771003723
},
{
"abs_diff": 2.3771374225616455,
"all_logps_1": -4996.7001953125,
"all_logps_1_values": -4996.7001953125,
"all_logps_2": 438.8500061035156,
"all_logps_2_values": 438.8500061035156,
"epoch": 0.4274646005877638,
"grad_norm": 51.852682835194706,
"learning_rate": 7.067792524832603e-07,
"logits/chosen": 2.5128085613250732,
"logits/rejected": 2.454047679901123,
"logps/chosen": -13.007303237915039,
"logps/rejected": -13.782841682434082,
"loss": 0.9745,
"original_losses": 2.816681385040283,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -32.51825714111328,
"rewards/margins": 1.9388458728790283,
"rewards/rejected": -34.45710372924805,
"step": 200,
"weight": 0.4336828589439392
},
{
"abs_diff": 2.789199113845825,
"all_logps_1": -5606.87744140625,
"all_logps_1_values": -5606.87744140625,
"all_logps_2": 413.7875061035156,
"all_logps_2_values": 413.7875061035156,
"epoch": 0.4381512156024579,
"grad_norm": 82.65919240097834,
"learning_rate": 6.896127313264642e-07,
"logits/chosen": 2.4827866554260254,
"logits/rejected": 2.610020399093628,
"logps/chosen": -15.495327949523926,
"logps/rejected": -16.71689224243164,
"loss": 0.8079,
"original_losses": 2.6531503200531006,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -38.738319396972656,
"rewards/margins": 3.053907871246338,
"rewards/rejected": -41.7922248840332,
"step": 205,
"weight": 0.40062981843948364
},
{
"abs_diff": 3.1174449920654297,
"all_logps_1": -6078.1650390625,
"all_logps_1_values": -6078.1650390625,
"all_logps_2": 408.83123779296875,
"all_logps_2_values": 408.83123779296875,
"epoch": 0.448837830617152,
"grad_norm": 66.91462129577006,
"learning_rate": 6.721810196195174e-07,
"logits/chosen": 2.3251194953918457,
"logits/rejected": 2.481720209121704,
"logps/chosen": -15.918850898742676,
"logps/rejected": -17.23949432373047,
"loss": 0.8447,
"original_losses": 2.9006853103637695,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -39.79712677001953,
"rewards/margins": 3.3016059398651123,
"rewards/rejected": -43.09873580932617,
"step": 210,
"weight": 0.37287402153015137
},
{
"abs_diff": 3.3388848304748535,
"all_logps_1": -6523.8935546875,
"all_logps_1_values": -6523.8935546875,
"all_logps_2": 405.98748779296875,
"all_logps_2_values": 405.98748779296875,
"epoch": 0.45952444563184613,
"grad_norm": 95.8421548369589,
"learning_rate": 6.545084971874736e-07,
"logits/chosen": 2.866258382797241,
"logits/rejected": 2.9341139793395996,
"logps/chosen": -16.77628517150879,
"logps/rejected": -18.90264320373535,
"loss": 0.8426,
"original_losses": 2.032466411590576,
"rewards/accuracies": 0.75,
"rewards/chosen": -41.940711975097656,
"rewards/margins": 5.31589412689209,
"rewards/rejected": -47.25660705566406,
"step": 215,
"weight": 0.35115545988082886
},
{
"abs_diff": 2.8094236850738525,
"all_logps_1": -4738.73046875,
"all_logps_1_values": -4738.73046875,
"all_logps_2": 363.98126220703125,
"all_logps_2_values": 363.98126220703125,
"epoch": 0.4702110606465402,
"grad_norm": 112.65545034373879,
"learning_rate": 6.3661988065096e-07,
"logits/chosen": 2.7162396907806396,
"logits/rejected": 2.835710048675537,
"logps/chosen": -15.200531005859375,
"logps/rejected": -15.732034683227539,
"loss": 0.7804,
"original_losses": 3.5092949867248535,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -38.0013313293457,
"rewards/margins": 1.32875394821167,
"rewards/rejected": -39.33008575439453,
"step": 220,
"weight": 0.4126754403114319
},
{
"abs_diff": 3.209429979324341,
"all_logps_1": -5642.91943359375,
"all_logps_1_values": -5642.92041015625,
"all_logps_2": 383.92498779296875,
"all_logps_2_values": 383.92498779296875,
"epoch": 0.4808976756612343,
"grad_norm": 37.46832492030243,
"learning_rate": 6.185401888577487e-07,
"logits/chosen": 2.5830130577087402,
"logits/rejected": 2.689384937286377,
"logps/chosen": -15.603918075561523,
"logps/rejected": -16.610340118408203,
"loss": 0.9122,
"original_losses": 3.4112372398376465,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -39.00979995727539,
"rewards/margins": 2.5160529613494873,
"rewards/rejected": -41.525856018066406,
"step": 225,
"weight": 0.3604838252067566
},
{
"abs_diff": 3.7125911712646484,
"all_logps_1": -5569.94384765625,
"all_logps_1_values": -5569.94384765625,
"all_logps_2": 361.3125,
"all_logps_2_values": 361.3125,
"epoch": 0.4915842906759284,
"grad_norm": 66.55702343700871,
"learning_rate": 6.002947078916364e-07,
"logits/chosen": 2.3465304374694824,
"logits/rejected": 2.66461181640625,
"logps/chosen": -17.106571197509766,
"logps/rejected": -19.080835342407227,
"loss": 0.8076,
"original_losses": 2.705897808074951,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -42.76642608642578,
"rewards/margins": 4.935657501220703,
"rewards/rejected": -47.70208740234375,
"step": 230,
"weight": 0.3060615658760071
},
{
"abs_diff": 2.281184434890747,
"all_logps_1": -3926.673828125,
"all_logps_1_values": -3926.67333984375,
"all_logps_2": 311.42498779296875,
"all_logps_2_values": 311.42498779296875,
"epoch": 0.5022709056906225,
"grad_norm": 52.12193473626352,
"learning_rate": 5.819089557075688e-07,
"logits/chosen": 2.5927655696868896,
"logits/rejected": 2.721041679382324,
"logps/chosen": -14.924860000610352,
"logps/rejected": -15.577176094055176,
"loss": 0.8527,
"original_losses": 2.7761876583099365,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -37.3121452331543,
"rewards/margins": 1.6307960748672485,
"rewards/rejected": -38.94294357299805,
"step": 235,
"weight": 0.42170318961143494
},
{
"abs_diff": 2.7382442951202393,
"all_logps_1": -5511.8671875,
"all_logps_1_values": -5511.8671875,
"all_logps_2": 424.04376220703125,
"all_logps_2_values": 424.04376220703125,
"epoch": 0.5129575207053166,
"grad_norm": 59.31175783914156,
"learning_rate": 5.634086464424742e-07,
"logits/chosen": 2.750415086746216,
"logits/rejected": 2.8377902507781982,
"logps/chosen": -15.228363037109375,
"logps/rejected": -16.618165969848633,
"loss": 0.8222,
"original_losses": 2.4813647270202637,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -38.07091522216797,
"rewards/margins": 3.474503993988037,
"rewards/rejected": -41.545413970947266,
"step": 240,
"weight": 0.43811964988708496
},
{
"abs_diff": 3.254149913787842,
"all_logps_1": -5742.85595703125,
"all_logps_1_values": -5742.85546875,
"all_logps_2": 412.4624938964844,
"all_logps_2_values": 412.4624938964844,
"epoch": 0.5236441357200107,
"grad_norm": 54.9226284927014,
"learning_rate": 5.448196544517167e-07,
"logits/chosen": 2.565314531326294,
"logits/rejected": 2.691755533218384,
"logps/chosen": -15.381324768066406,
"logps/rejected": -17.23483657836914,
"loss": 0.7997,
"original_losses": 2.286261558532715,
"rewards/accuracies": 0.75,
"rewards/chosen": -38.45330810546875,
"rewards/margins": 4.633780479431152,
"rewards/rejected": -43.08708953857422,
"step": 245,
"weight": 0.3790872097015381
},
{
"abs_diff": 3.364607334136963,
"all_logps_1": -5477.4482421875,
"all_logps_1_values": -5477.4482421875,
"all_logps_2": 341.70623779296875,
"all_logps_2_values": 341.70623779296875,
"epoch": 0.5343307507347048,
"grad_norm": 75.70050581279018,
"learning_rate": 5.26167978121472e-07,
"logits/chosen": 2.6822657585144043,
"logits/rejected": 2.7521121501922607,
"logps/chosen": -16.76608657836914,
"logps/rejected": -19.173168182373047,
"loss": 0.8369,
"original_losses": 1.7328109741210938,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -41.91521453857422,
"rewards/margins": 6.017703056335449,
"rewards/rejected": -47.932918548583984,
"step": 250,
"weight": 0.3458004593849182
},
{
"abs_diff": 3.0323586463928223,
"all_logps_1": -6443.626953125,
"all_logps_1_values": -6443.62646484375,
"all_logps_2": 363.6000061035156,
"all_logps_2_values": 363.6000061035156,
"epoch": 0.5450173657493989,
"grad_norm": 39.687795704366174,
"learning_rate": 5.074797035076318e-07,
"logits/chosen": 2.954530954360962,
"logits/rejected": 2.9405295848846436,
"logps/chosen": -18.485279083251953,
"logps/rejected": -19.909687042236328,
"loss": 0.7436,
"original_losses": 2.6259872913360596,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -46.21319580078125,
"rewards/margins": 3.561020612716675,
"rewards/rejected": -49.77421569824219,
"step": 255,
"weight": 0.37864193320274353
},
{
"abs_diff": 3.276740312576294,
"all_logps_1": -8267.275390625,
"all_logps_1_values": -8267.275390625,
"all_logps_2": 393.79376220703125,
"all_logps_2_values": 393.79376220703125,
"epoch": 0.555703980764093,
"grad_norm": 82.01295751328553,
"learning_rate": 4.887809678520975e-07,
"logits/chosen": 3.072216749191284,
"logits/rejected": 3.1636574268341064,
"logps/chosen": -20.29796600341797,
"logps/rejected": -22.58323860168457,
"loss": 0.767,
"original_losses": 1.7455909252166748,
"rewards/accuracies": 0.6875,
"rewards/chosen": -50.74491882324219,
"rewards/margins": 5.713181972503662,
"rewards/rejected": -56.458106994628906,
"step": 260,
"weight": 0.35711461305618286
},
{
"abs_diff": 3.182936429977417,
"all_logps_1": -9216.587890625,
"all_logps_1_values": -9216.5869140625,
"all_logps_2": 407.7562561035156,
"all_logps_2_values": 407.7562561035156,
"epoch": 0.566390595778787,
"grad_norm": 52.240919124363245,
"learning_rate": 4.700979230274829e-07,
"logits/chosen": 3.0337119102478027,
"logits/rejected": 3.0206868648529053,
"logps/chosen": -23.038707733154297,
"logps/rejected": -23.99751091003418,
"loss": 0.807,
"original_losses": 3.4282360076904297,
"rewards/accuracies": 0.625,
"rewards/chosen": -57.596778869628906,
"rewards/margins": 2.3969998359680176,
"rewards/rejected": -59.9937744140625,
"step": 265,
"weight": 0.34545254707336426
},
{
"abs_diff": 3.006873607635498,
"all_logps_1": -10153.31640625,
"all_logps_1_values": -10153.3154296875,
"all_logps_2": 477.38751220703125,
"all_logps_2_values": 477.38751220703125,
"epoch": 0.5770772107934812,
"grad_norm": 59.52695646189972,
"learning_rate": 4.514566989613559e-07,
"logits/chosen": 2.972503185272217,
"logits/rejected": 2.9690961837768555,
"logps/chosen": -22.136503219604492,
"logps/rejected": -23.38858413696289,
"loss": 0.8091,
"original_losses": 2.8519082069396973,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -55.34125900268555,
"rewards/margins": 3.1301934719085693,
"rewards/rejected": -58.47145462036133,
"step": 270,
"weight": 0.36154988408088684
},
{
"abs_diff": 2.563995599746704,
"all_logps_1": -7391.75,
"all_logps_1_values": -7391.75,
"all_logps_2": 375.40625,
"all_logps_2_values": 375.40625,
"epoch": 0.5877638258081752,
"grad_norm": 59.32000543668621,
"learning_rate": 4.328833670911724e-07,
"logits/chosen": 3.481792449951172,
"logits/rejected": 3.5533995628356934,
"logps/chosen": -21.077594757080078,
"logps/rejected": -22.37049674987793,
"loss": 0.7438,
"original_losses": 2.241507053375244,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -52.6939811706543,
"rewards/margins": 3.2322616577148438,
"rewards/rejected": -55.926246643066406,
"step": 275,
"weight": 0.41661015152931213
},
{
"abs_diff": 3.315411329269409,
"all_logps_1": -7719.34619140625,
"all_logps_1_values": -7719.34521484375,
"all_logps_2": 439.35626220703125,
"all_logps_2_values": 439.35626220703125,
"epoch": 0.5984504408228694,
"grad_norm": 53.61685628912313,
"learning_rate": 4.144039039010124e-07,
"logits/chosen": 2.6844732761383057,
"logits/rejected": 2.87386417388916,
"logps/chosen": -17.859844207763672,
"logps/rejected": -19.173076629638672,
"loss": 0.7914,
"original_losses": 3.27254056930542,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": -44.64960861206055,
"rewards/margins": 3.283079147338867,
"rewards/rejected": -47.93268966674805,
"step": 280,
"weight": 0.38690507411956787
},
{
"abs_diff": 2.917543649673462,
"all_logps_1": -6426.8310546875,
"all_logps_1_values": -6426.8310546875,
"all_logps_2": 355.16876220703125,
"all_logps_2_values": 355.16876220703125,
"epoch": 0.6091370558375635,
"grad_norm": 55.70128923603701,
"learning_rate": 3.960441545911204e-07,
"logits/chosen": 3.0214133262634277,
"logits/rejected": 3.1276047229766846,
"logps/chosen": -20.0152530670166,
"logps/rejected": -20.51242446899414,
"loss": 0.8001,
"original_losses": 3.778569459915161,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -50.03813552856445,
"rewards/margins": 1.2429269552230835,
"rewards/rejected": -51.28105926513672,
"step": 285,
"weight": 0.4056159555912018
},
{
"abs_diff": 3.5806915760040283,
"all_logps_1": -6845.4326171875,
"all_logps_1_values": -6845.4326171875,
"all_logps_2": 341.95001220703125,
"all_logps_2_values": 341.95001220703125,
"epoch": 0.6198236708522575,
"grad_norm": 67.43658438729601,
"learning_rate": 3.778297969310529e-07,
"logits/chosen": 2.87160325050354,
"logits/rejected": 2.953885555267334,
"logps/chosen": -19.99938201904297,
"logps/rejected": -22.214576721191406,
"loss": 0.8043,
"original_losses": 2.356289863586426,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -49.99845504760742,
"rewards/margins": 5.537986755371094,
"rewards/rejected": -55.53644561767578,
"step": 290,
"weight": 0.3736080527305603
},
{
"abs_diff": 3.2340712547302246,
"all_logps_1": -7549.24755859375,
"all_logps_1_values": -7549.24755859375,
"all_logps_2": 351.07501220703125,
"all_logps_2_values": 351.07501220703125,
"epoch": 0.6305102858669517,
"grad_norm": 48.7199759637811,
"learning_rate": 3.5978630534699865e-07,
"logits/chosen": 2.5181379318237305,
"logits/rejected": 2.6238226890563965,
"logps/chosen": -21.65777587890625,
"logps/rejected": -23.368385314941406,
"loss": 0.8187,
"original_losses": 2.4767355918884277,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -54.144432067871094,
"rewards/margins": 4.276528835296631,
"rewards/rejected": -58.42096710205078,
"step": 295,
"weight": 0.35286107659339905
},
{
"abs_diff": 3.32385516166687,
"all_logps_1": -8850.7470703125,
"all_logps_1_values": -8850.748046875,
"all_logps_2": 415.8999938964844,
"all_logps_2_values": 415.8999938964844,
"epoch": 0.6411969008816457,
"grad_norm": 40.177069639353974,
"learning_rate": 3.4193891529348795e-07,
"logits/chosen": 2.7022032737731934,
"logits/rejected": 2.7918949127197266,
"logps/chosen": -22.0867862701416,
"logps/rejected": -24.649303436279297,
"loss": 0.7237,
"original_losses": 1.4938082695007324,
"rewards/accuracies": 0.75,
"rewards/chosen": -55.21696090698242,
"rewards/margins": 6.4062957763671875,
"rewards/rejected": -61.623252868652344,
"step": 300,
"weight": 0.38917768001556396
},
{
"abs_diff": 2.9255619049072266,
"all_logps_1": -7401.28662109375,
"all_logps_1_values": -7401.2861328125,
"all_logps_2": 407.0562438964844,
"all_logps_2_values": 407.0562438964844,
"epoch": 0.6518835158963399,
"grad_norm": 48.44087105424344,
"learning_rate": 3.243125879593286e-07,
"logits/chosen": 2.6635046005249023,
"logits/rejected": 2.777791976928711,
"logps/chosen": -18.8937931060791,
"logps/rejected": -20.48404312133789,
"loss": 0.8144,
"original_losses": 2.4159512519836426,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -47.23448181152344,
"rewards/margins": 3.975621461868286,
"rewards/rejected": -51.21010208129883,
"step": 305,
"weight": 0.4257276952266693
},
{
"abs_diff": 3.013671875,
"all_logps_1": -7221.7607421875,
"all_logps_1_values": -7221.76171875,
"all_logps_2": 377.16876220703125,
"all_logps_2_values": 377.16876220703125,
"epoch": 0.6625701309110339,
"grad_norm": 59.965292421288716,
"learning_rate": 3.069319753571269e-07,
"logits/chosen": 2.7733490467071533,
"logits/rejected": 2.600106954574585,
"logps/chosen": -19.796558380126953,
"logps/rejected": -20.72552490234375,
"loss": 0.8117,
"original_losses": 3.373765230178833,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": -49.491390228271484,
"rewards/margins": 2.3224196434020996,
"rewards/rejected": -51.813812255859375,
"step": 310,
"weight": 0.3705739974975586
},
{
"abs_diff": 3.0523111820220947,
"all_logps_1": -7815.6552734375,
"all_logps_1_values": -7815.65478515625,
"all_logps_2": 449.16876220703125,
"all_logps_2_values": 449.16876220703125,
"epoch": 0.673256745925728,
"grad_norm": 52.38266043751792,
"learning_rate": 2.898213858452173e-07,
"logits/chosen": 2.1578516960144043,
"logits/rejected": 2.247980833053589,
"logps/chosen": -17.26466941833496,
"logps/rejected": -18.508235931396484,
"loss": 0.7937,
"original_losses": 2.871872901916504,
"rewards/accuracies": 0.6875,
"rewards/chosen": -43.16167449951172,
"rewards/margins": 3.1089208126068115,
"rewards/rejected": -46.270591735839844,
"step": 315,
"weight": 0.3665739893913269
},
{
"abs_diff": 3.318554639816284,
"all_logps_1": -6473.89013671875,
"all_logps_1_values": -6473.890625,
"all_logps_2": 359.54376220703125,
"all_logps_2_values": 359.54376220703125,
"epoch": 0.6839433609404221,
"grad_norm": 87.91813204561389,
"learning_rate": 2.730047501302266e-07,
"logits/chosen": 2.3339014053344727,
"logits/rejected": 2.4213125705718994,
"logps/chosen": -17.509052276611328,
"logps/rejected": -19.367351531982422,
"loss": 0.7705,
"original_losses": 2.49141263961792,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -43.77263259887695,
"rewards/margins": 4.645747184753418,
"rewards/rejected": -48.41838455200195,
"step": 320,
"weight": 0.37379634380340576
},
{
"abs_diff": 4.029627799987793,
"all_logps_1": -8260.576171875,
"all_logps_1_values": -8260.576171875,
"all_logps_2": 420.7562561035156,
"all_logps_2_values": 420.7562561035156,
"epoch": 0.6946299759551162,
"grad_norm": 56.877689091030156,
"learning_rate": 2.5650558779781635e-07,
"logits/chosen": 2.8901479244232178,
"logits/rejected": 2.8577167987823486,
"logps/chosen": -18.898571014404297,
"logps/rejected": -21.253376007080078,
"loss": 0.7293,
"original_losses": 2.648833751678467,
"rewards/accuracies": 0.625,
"rewards/chosen": -47.24642562866211,
"rewards/margins": 5.887020111083984,
"rewards/rejected": -53.133445739746094,
"step": 325,
"weight": 0.299586683511734
},
{
"abs_diff": 3.5890209674835205,
"all_logps_1": -8075.91650390625,
"all_logps_1_values": -8075.91552734375,
"all_logps_2": 370.53125,
"all_logps_2_values": 370.53125,
"epoch": 0.7053165909698104,
"grad_norm": 51.745088875170836,
"learning_rate": 2.403469744184154e-07,
"logits/chosen": 2.560868978500366,
"logits/rejected": 2.73579740524292,
"logps/chosen": -20.837478637695312,
"logps/rejected": -23.191274642944336,
"loss": 0.8048,
"original_losses": 2.0950331687927246,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -52.09369659423828,
"rewards/margins": 5.884491443634033,
"rewards/rejected": -57.978187561035156,
"step": 330,
"weight": 0.35120078921318054
},
{
"abs_diff": 3.3162055015563965,
"all_logps_1": -8510.986328125,
"all_logps_1_values": -8510.9873046875,
"all_logps_2": 404.1875,
"all_logps_2_values": 404.1875,
"epoch": 0.7160032059845044,
"grad_norm": 69.38405615517823,
"learning_rate": 2.2455150927394878e-07,
"logits/chosen": 2.6478374004364014,
"logits/rejected": 2.565058946609497,
"logps/chosen": -20.254060745239258,
"logps/rejected": -22.03819465637207,
"loss": 0.7845,
"original_losses": 2.5941619873046875,
"rewards/accuracies": 0.5625,
"rewards/chosen": -50.63515090942383,
"rewards/margins": 4.460334300994873,
"rewards/rejected": -55.095489501953125,
"step": 335,
"weight": 0.3666679263114929
},
{
"abs_diff": 3.677370548248291,
"all_logps_1": -8691.5263671875,
"all_logps_1_values": -8691.5263671875,
"all_logps_2": 381.01873779296875,
"all_logps_2_values": 381.01873779296875,
"epoch": 0.7266898209991985,
"grad_norm": 84.25483121877998,
"learning_rate": 2.0914128375069722e-07,
"logits/chosen": 2.709319829940796,
"logits/rejected": 2.7781405448913574,
"logps/chosen": -21.70474624633789,
"logps/rejected": -23.93856430053711,
"loss": 0.7836,
"original_losses": 2.420710802078247,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -54.261871337890625,
"rewards/margins": 5.5845465660095215,
"rewards/rejected": -59.84641647338867,
"step": 340,
"weight": 0.35216349363327026
},
{
"abs_diff": 2.6053452491760254,
"all_logps_1": -8825.68359375,
"all_logps_1_values": -8825.68359375,
"all_logps_2": 365.8812561035156,
"all_logps_2_values": 365.8812561035156,
"epoch": 0.7373764360138926,
"grad_norm": 70.75060453919657,
"learning_rate": 1.9413785044249676e-07,
"logits/chosen": 2.845489501953125,
"logits/rejected": 2.95839262008667,
"logps/chosen": -24.01942253112793,
"logps/rejected": -25.074626922607422,
"loss": 0.7906,
"original_losses": 2.7006657123565674,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -60.048553466796875,
"rewards/margins": 2.63801908493042,
"rewards/rejected": -62.68656539916992,
"step": 345,
"weight": 0.4165709912776947
},
{
"abs_diff": 3.481846570968628,
"all_logps_1": -9110.2353515625,
"all_logps_1_values": -9110.2353515625,
"all_logps_2": 392.26251220703125,
"all_logps_2_values": 392.26251220703125,
"epoch": 0.7480630510285867,
"grad_norm": 50.64757547547787,
"learning_rate": 1.7956219300748792e-07,
"logits/chosen": 2.579031467437744,
"logits/rejected": 2.5901365280151367,
"logps/chosen": -21.98320198059082,
"logps/rejected": -24.78140640258789,
"loss": 0.7388,
"original_losses": 1.231533408164978,
"rewards/accuracies": 0.8125,
"rewards/chosen": -54.9580078125,
"rewards/margins": 6.995513916015625,
"rewards/rejected": -61.953514099121094,
"step": 350,
"weight": 0.3367912769317627
},
{
"abs_diff": 3.284003496170044,
"all_logps_1": -9058.169921875,
"all_logps_1_values": -9058.169921875,
"all_logps_2": 396.1812438964844,
"all_logps_2_values": 396.1812438964844,
"epoch": 0.7587496660432808,
"grad_norm": 74.67147548055407,
"learning_rate": 1.6543469682057104e-07,
"logits/chosen": 2.3805794715881348,
"logits/rejected": 2.5762991905212402,
"logps/chosen": -21.627700805664062,
"logps/rejected": -23.67769432067871,
"loss": 0.7775,
"original_losses": 2.081150531768799,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -54.06926345825195,
"rewards/margins": 5.124981880187988,
"rewards/rejected": -59.194244384765625,
"step": 355,
"weight": 0.333683043718338
},
{
"abs_diff": 3.9802608489990234,
"all_logps_1": -8140.62646484375,
"all_logps_1_values": -8140.625,
"all_logps_2": 368.1812438964844,
"all_logps_2_values": 368.1812438964844,
"epoch": 0.7694362810579749,
"grad_norm": 58.567962370545146,
"learning_rate": 1.5177512046261666e-07,
"logits/chosen": 2.5346484184265137,
"logits/rejected": 2.3816428184509277,
"logps/chosen": -22.101619720458984,
"logps/rejected": -24.49993896484375,
"loss": 0.6993,
"original_losses": 2.540489435195923,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -55.254051208496094,
"rewards/margins": 5.995795249938965,
"rewards/rejected": -61.249847412109375,
"step": 360,
"weight": 0.34169501066207886
},
{
"abs_diff": 3.0081470012664795,
"all_logps_1": -7452.68115234375,
"all_logps_1_values": -7452.68115234375,
"all_logps_2": 344.38751220703125,
"all_logps_2_values": 344.38751220703125,
"epoch": 0.7801228960726689,
"grad_norm": 83.23124267198439,
"learning_rate": 1.3860256808630427e-07,
"logits/chosen": 2.4369776248931885,
"logits/rejected": 2.584667682647705,
"logps/chosen": -21.06991195678711,
"logps/rejected": -22.77521324157715,
"loss": 0.7695,
"original_losses": 2.1701793670654297,
"rewards/accuracies": 0.6875,
"rewards/chosen": -52.674774169921875,
"rewards/margins": 4.263253211975098,
"rewards/rejected": -56.93803024291992,
"step": 365,
"weight": 0.345781534910202
},
{
"abs_diff": 3.4343185424804688,
"all_logps_1": -9116.8271484375,
"all_logps_1_values": -9116.826171875,
"all_logps_2": 410.375,
"all_logps_2_values": 410.375,
"epoch": 0.7908095110873631,
"grad_norm": 70.00940117238335,
"learning_rate": 1.2593546269723647e-07,
"logits/chosen": 2.4547030925750732,
"logits/rejected": 2.5984954833984375,
"logps/chosen": -21.283931732177734,
"logps/rejected": -23.039413452148438,
"loss": 0.7116,
"original_losses": 2.7036542892456055,
"rewards/accuracies": 0.625,
"rewards/chosen": -53.20983123779297,
"rewards/margins": 4.388695240020752,
"rewards/rejected": -57.59852981567383,
"step": 370,
"weight": 0.3490845561027527
},
{
"abs_diff": 3.5221400260925293,
"all_logps_1": -8307.474609375,
"all_logps_1_values": -8307.4755859375,
"all_logps_2": 382.3999938964844,
"all_logps_2_values": 382.3999938964844,
"epoch": 0.8014961261020572,
"grad_norm": 46.47990793449235,
"learning_rate": 1.1379152038770029e-07,
"logits/chosen": 2.5157063007354736,
"logits/rejected": 2.4793992042541504,
"logps/chosen": -20.3429012298584,
"logps/rejected": -21.467952728271484,
"loss": 0.836,
"original_losses": 3.6247520446777344,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -50.85725784301758,
"rewards/margins": 2.8126296997070312,
"rewards/rejected": -53.669883728027344,
"step": 375,
"weight": 0.3106473684310913
},
{
"abs_diff": 3.1884102821350098,
"all_logps_1": -7604.51953125,
"all_logps_1_values": -7604.5185546875,
"all_logps_2": 386.5625,
"all_logps_2_values": 386.5625,
"epoch": 0.8121827411167513,
"grad_norm": 53.33016210631404,
"learning_rate": 1.0218772555910954e-07,
"logits/chosen": 2.299121141433716,
"logits/rejected": 2.4894156455993652,
"logps/chosen": -18.67618179321289,
"logps/rejected": -20.802087783813477,
"loss": 0.7253,
"original_losses": 1.8635917901992798,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -46.690452575683594,
"rewards/margins": 5.314764499664307,
"rewards/rejected": -52.005226135253906,
"step": 380,
"weight": 0.36687955260276794
},
{
"abs_diff": 3.728355884552002,
"all_logps_1": -6403.2841796875,
"all_logps_1_values": -6403.2841796875,
"all_logps_2": 352.4937438964844,
"all_logps_2_values": 352.4937438964844,
"epoch": 0.8228693561314454,
"grad_norm": 51.015747481657996,
"learning_rate": 9.114030716778432e-08,
"logits/chosen": 2.5289080142974854,
"logits/rejected": 2.568324565887451,
"logps/chosen": -17.892498016357422,
"logps/rejected": -20.8332462310791,
"loss": 0.6978,
"original_losses": 1.4315834045410156,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -44.73124694824219,
"rewards/margins": 7.351869106292725,
"rewards/rejected": -52.0831184387207,
"step": 385,
"weight": 0.3369835317134857
},
{
"abs_diff": 3.4457297325134277,
"all_logps_1": -7086.70947265625,
"all_logps_1_values": -7086.70849609375,
"all_logps_2": 400.7562561035156,
"all_logps_2_values": 400.7562561035156,
"epoch": 0.8335559711461394,
"grad_norm": 68.15400742768429,
"learning_rate": 8.066471602728803e-08,
"logits/chosen": 2.300518751144409,
"logits/rejected": 2.432492256164551,
"logps/chosen": -17.285266876220703,
"logps/rejected": -19.280744552612305,
"loss": 0.6869,
"original_losses": 2.3130502700805664,
"rewards/accuracies": 0.75,
"rewards/chosen": -43.213172912597656,
"rewards/margins": 4.9886932373046875,
"rewards/rejected": -48.201866149902344,
"step": 390,
"weight": 0.337992399930954
},
{
"abs_diff": 2.9501354694366455,
"all_logps_1": -7602.40478515625,
"all_logps_1_values": -7602.40380859375,
"all_logps_2": 396.3125,
"all_logps_2_values": 396.3125,
"epoch": 0.8442425861608336,
"grad_norm": 72.89829906287879,
"learning_rate": 7.077560319906694e-08,
"logits/chosen": 2.815917491912842,
"logits/rejected": 3.0646049976348877,
"logps/chosen": -17.960046768188477,
"logps/rejected": -19.63981056213379,
"loss": 0.7686,
"original_losses": 2.1916909217834473,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -44.900108337402344,
"rewards/margins": 4.1994123458862305,
"rewards/rejected": -49.099525451660156,
"step": 395,
"weight": 0.36895015835762024
},
{
"abs_diff": 3.306037187576294,
"all_logps_1": -6128.6064453125,
"all_logps_1_values": -6128.6064453125,
"all_logps_2": 348.07501220703125,
"all_logps_2_values": 348.07501220703125,
"epoch": 0.8549292011755276,
"grad_norm": 49.93516351014214,
"learning_rate": 6.148679950161672e-08,
"logits/chosen": 2.5622057914733887,
"logits/rejected": 2.715359926223755,
"logps/chosen": -18.067874908447266,
"logps/rejected": -20.04085922241211,
"loss": 0.7506,
"original_losses": 2.2437596321105957,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -45.169681549072266,
"rewards/margins": 4.932468891143799,
"rewards/rejected": -50.10215377807617,
"step": 400,
"weight": 0.34681177139282227
},
{
"epoch": 0.8549292011755276,
"eval_abs_diff": 3.175931930541992,
"eval_all_logps_1": -7614.6904296875,
"eval_all_logps_1_values": -7614.69091796875,
"eval_all_logps_2": 414.86090087890625,
"eval_all_logps_2_values": 414.86090087890625,
"eval_logits/chosen": 1.7177369594573975,
"eval_logits/rejected": 1.830857753753662,
"eval_logps/chosen": -18.158353805541992,
"eval_logps/rejected": -20.146547317504883,
"eval_loss": 0.752778172492981,
"eval_original_losses": 2.049124002456665,
"eval_rewards/accuracies": 0.6975806355476379,
"eval_rewards/chosen": -45.3958854675293,
"eval_rewards/margins": 4.970486640930176,
"eval_rewards/rejected": -50.36636734008789,
"eval_runtime": 70.2236,
"eval_samples_per_second": 27.925,
"eval_steps_per_second": 0.883,
"eval_weight": 0.37132638692855835,
"step": 400
},
{
"abs_diff": 3.7374179363250732,
"all_logps_1": -6704.875,
"all_logps_1_values": -6704.875,
"all_logps_2": 385.4375,
"all_logps_2_values": 385.4375,
"epoch": 0.8656158161902218,
"grad_norm": 69.66297582257639,
"learning_rate": 5.2811296166831666e-08,
"logits/chosen": 2.536898612976074,
"logits/rejected": 2.8442349433898926,
"logps/chosen": -17.0179443359375,
"logps/rejected": -19.512527465820312,
"loss": 0.6907,
"original_losses": 1.9623138904571533,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -42.544864654541016,
"rewards/margins": 6.236458778381348,
"rewards/rejected": -48.78131866455078,
"step": 405,
"weight": 0.320218563079834
},
{
"abs_diff": 3.427241802215576,
"all_logps_1": -6360.1455078125,
"all_logps_1_values": -6360.1455078125,
"all_logps_2": 339.8062438964844,
"all_logps_2_values": 339.8062438964844,
"epoch": 0.8763024312049158,
"grad_norm": 61.75715741585555,
"learning_rate": 4.4761226670592066e-08,
"logits/chosen": 2.682762861251831,
"logits/rejected": 2.7268879413604736,
"logps/chosen": -18.33367347717285,
"logps/rejected": -20.431079864501953,
"loss": 0.7588,
"original_losses": 2.1594674587249756,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -45.83418273925781,
"rewards/margins": 5.243517875671387,
"rewards/rejected": -51.07769775390625,
"step": 410,
"weight": 0.32417041063308716
},
{
"abs_diff": 3.5729141235351562,
"all_logps_1": -8468.05078125,
"all_logps_1_values": -8468.05078125,
"all_logps_2": 414.2124938964844,
"all_logps_2_values": 414.2124938964844,
"epoch": 0.88698904621961,
"grad_norm": 40.72517812799497,
"learning_rate": 3.734784976300165e-08,
"logits/chosen": 2.8361315727233887,
"logits/rejected": 2.8616833686828613,
"logps/chosen": -19.978229522705078,
"logps/rejected": -22.11844825744629,
"loss": 0.7242,
"original_losses": 2.3812079429626465,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -49.94557189941406,
"rewards/margins": 5.3505539894104,
"rewards/rejected": -55.29612350463867,
"step": 415,
"weight": 0.3600180447101593
},
{
"abs_diff": 2.8872458934783936,
"all_logps_1": -8678.85546875,
"all_logps_1_values": -8678.85546875,
"all_logps_2": 427.64373779296875,
"all_logps_2_values": 427.64373779296875,
"epoch": 0.897675661234304,
"grad_norm": 40.225954100303696,
"learning_rate": 3.058153372200695e-08,
"logits/chosen": 2.452263355255127,
"logits/rejected": 2.515206813812256,
"logps/chosen": -20.152559280395508,
"logps/rejected": -21.298845291137695,
"loss": 0.7959,
"original_losses": 2.8674798011779785,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -50.38140106201172,
"rewards/margins": 2.865709066390991,
"rewards/rejected": -53.247108459472656,
"step": 420,
"weight": 0.39061683416366577
},
{
"abs_diff": 4.009498119354248,
"all_logps_1": -7007.01708984375,
"all_logps_1_values": -7007.01708984375,
"all_logps_2": 359.6187438964844,
"all_logps_2_values": 359.6187438964844,
"epoch": 0.9083622762489981,
"grad_norm": 54.351457754994804,
"learning_rate": 2.4471741852423233e-08,
"logits/chosen": 2.617743968963623,
"logits/rejected": 2.7704989910125732,
"logps/chosen": -19.43728256225586,
"logps/rejected": -21.93575668334961,
"loss": 0.7422,
"original_losses": 2.389147996902466,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -48.59320831298828,
"rewards/margins": 6.246188163757324,
"rewards/rejected": -54.83939743041992,
"step": 425,
"weight": 0.31762319803237915
},
{
"abs_diff": 2.755589723587036,
"all_logps_1": -8104.44921875,
"all_logps_1_values": -8104.44921875,
"all_logps_2": 428.79376220703125,
"all_logps_2_values": 428.79376220703125,
"epoch": 0.9190488912636923,
"grad_norm": 45.34333002111428,
"learning_rate": 1.9027019250647036e-08,
"logits/chosen": 2.6327333450317383,
"logits/rejected": 2.7319021224975586,
"logps/chosen": -18.6940975189209,
"logps/rejected": -20.32192039489746,
"loss": 0.6933,
"original_losses": 2.010368824005127,
"rewards/accuracies": 0.6875,
"rewards/chosen": -46.735252380371094,
"rewards/margins": 4.069557189941406,
"rewards/rejected": -50.80480194091797,
"step": 430,
"weight": 0.38695794343948364
},
{
"abs_diff": 3.834909439086914,
"all_logps_1": -7406.4482421875,
"all_logps_1_values": -7406.44775390625,
"all_logps_2": 382.15625,
"all_logps_2_values": 382.15625,
"epoch": 0.9297355062783863,
"grad_norm": 103.89987589364694,
"learning_rate": 1.4254980853566246e-08,
"logits/chosen": 2.688000440597534,
"logits/rejected": 2.763110399246216,
"logps/chosen": -19.011985778808594,
"logps/rejected": -21.563823699951172,
"loss": 0.7401,
"original_losses": 2.1428942680358887,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -47.52996063232422,
"rewards/margins": 6.379598617553711,
"rewards/rejected": -53.90956497192383,
"step": 435,
"weight": 0.35418570041656494
},
{
"abs_diff": 3.49601411819458,
"all_logps_1": -7640.515625,
"all_logps_1_values": -7640.515625,
"all_logps_2": 394.25,
"all_logps_2_values": 394.25,
"epoch": 0.9404221212930804,
"grad_norm": 66.9604311531267,
"learning_rate": 1.016230078838226e-08,
"logits/chosen": 2.6405506134033203,
"logits/rejected": 2.7150299549102783,
"logps/chosen": -18.938282012939453,
"logps/rejected": -21.01675796508789,
"loss": 0.7279,
"original_losses": 2.3662502765655518,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -47.345703125,
"rewards/margins": 5.196188449859619,
"rewards/rejected": -52.541893005371094,
"step": 440,
"weight": 0.35034170746803284
},
{
"abs_diff": 3.1276192665100098,
"all_logps_1": -9211.677734375,
"all_logps_1_values": -9211.6787109375,
"all_logps_2": 462.4624938964844,
"all_logps_2_values": 462.4624938964844,
"epoch": 0.9511087363077745,
"grad_norm": 62.83164635980714,
"learning_rate": 6.754703038239329e-09,
"logits/chosen": 2.502159357070923,
"logits/rejected": 2.6519925594329834,
"logps/chosen": -18.46548080444336,
"logps/rejected": -20.194454193115234,
"loss": 0.6978,
"original_losses": 2.2807629108428955,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -46.16370391845703,
"rewards/margins": 4.322434902191162,
"rewards/rejected": -50.48613739013672,
"step": 445,
"weight": 0.35711297392845154
},
{
"abs_diff": 3.5259463787078857,
"all_logps_1": -7040.4140625,
"all_logps_1_values": -7040.4140625,
"all_logps_2": 358.57501220703125,
"all_logps_2_values": 358.57501220703125,
"epoch": 0.9617953513224686,
"grad_norm": 58.22216553617623,
"learning_rate": 4.036953436716895e-09,
"logits/chosen": 2.841308832168579,
"logits/rejected": 2.788696050643921,
"logps/chosen": -19.402172088623047,
"logps/rejected": -21.435121536254883,
"loss": 0.6648,
"original_losses": 2.3494513034820557,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -48.505435943603516,
"rewards/margins": 5.082365989685059,
"rewards/rejected": -53.587799072265625,
"step": 450,
"weight": 0.32446950674057007
},
{
"abs_diff": 2.527660369873047,
"all_logps_1": -7083.1259765625,
"all_logps_1_values": -7083.1259765625,
"all_logps_2": 354.76873779296875,
"all_logps_2_values": 354.76873779296875,
"epoch": 0.9724819663371627,
"grad_norm": 63.64964025419041,
"learning_rate": 2.0128530023804656e-09,
"logits/chosen": 2.5407052040100098,
"logits/rejected": 2.6334285736083984,
"logps/chosen": -19.97518539428711,
"logps/rejected": -21.144289016723633,
"loss": 0.773,
"original_losses": 2.366673469543457,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -49.937965393066406,
"rewards/margins": 2.922760486602783,
"rewards/rejected": -52.86072540283203,
"step": 455,
"weight": 0.4015112519264221
},
{
"abs_diff": 3.792357921600342,
"all_logps_1": -6872.0908203125,
"all_logps_1_values": -6872.0908203125,
"all_logps_2": 352.35626220703125,
"all_logps_2_values": 352.35626220703125,
"epoch": 0.9831685813518568,
"grad_norm": 70.18502240580426,
"learning_rate": 6.852326227130833e-10,
"logits/chosen": 2.659250020980835,
"logits/rejected": 2.507812976837158,
"logps/chosen": -20.08974266052246,
"logps/rejected": -22.00864028930664,
"loss": 0.7596,
"original_losses": 3.0322279930114746,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -50.22435760498047,
"rewards/margins": 4.797248840332031,
"rewards/rejected": -55.0216064453125,
"step": 460,
"weight": 0.3797384202480316
},
{
"abs_diff": 3.1223583221435547,
"all_logps_1": -7477.0185546875,
"all_logps_1_values": -7477.0185546875,
"all_logps_2": 386.9937438964844,
"all_logps_2_values": 386.9937438964844,
"epoch": 0.9938551963665508,
"grad_norm": 70.11026953873642,
"learning_rate": 5.594909486328348e-11,
"logits/chosen": 2.367159366607666,
"logits/rejected": 2.6166296005249023,
"logps/chosen": -18.468345642089844,
"logps/rejected": -20.6806697845459,
"loss": 0.6765,
"original_losses": 1.560880422592163,
"rewards/accuracies": 0.8125,
"rewards/chosen": -46.17086410522461,
"rewards/margins": 5.5308074951171875,
"rewards/rejected": -51.7016716003418,
"step": 465,
"weight": 0.36222249269485474
},
{
"epoch": 0.9981298423724285,
"step": 467,
"total_flos": 0.0,
"train_loss": 0.9884350126254227,
"train_runtime": 7236.0008,
"train_samples_per_second": 8.275,
"train_steps_per_second": 0.065
}
],
"logging_steps": 5,
"max_steps": 467,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 1000000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": false,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}
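
The `log_history` above is plain JSON, so it can be inspected directly once the file is downloaded. Below is a minimal sketch, assuming the file has been saved locally as `trainer_state.json` (that path is an assumption for illustration, not part of the file): it loads the state, separates per-step training records from the step-400 eval record and the final run summary, and prints the logged loss, reward margin, and accuracy at each logging step.

```python
import json

# Assumed local path to the file shown above; adjust as needed.
STATE_PATH = "trainer_state.json"

with open(STATE_PATH) as f:
    state = json.load(f)

history = state["log_history"]

# Per-step training records carry a "loss" key; the eval record uses
# "eval_loss"; the last record holds run-level fields such as "train_loss".
train_records = [r for r in history if "loss" in r]
eval_records = [r for r in history if "eval_loss" in r]

for r in train_records:
    print(f"step {r['step']:>3}  loss {r['loss']:.4f}  "
          f"margin {r['rewards/margins']:.3f}  acc {r['rewards/accuracies']:.3f}")

for r in eval_records:
    print(f"eval @ step {r['step']}: loss {r['eval_loss']:.4f}, "
          f"accuracy {r['eval_rewards/accuracies']:.3f}")

summary = history[-1]
print("train_loss:", summary.get("train_loss"),
      "| runtime (s):", summary.get("train_runtime"))
```

Run against this file, the sketch prints one line per logged step (every 5 steps, per `logging_steps`), one line for the evaluation at step 400, and the final training summary recorded at step 467.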