{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.9978142076502732,
"eval_steps": 400,
"global_step": 914,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.01092896174863388,
"grad_norm": 49.12429877298342,
"learning_rate": 5.434782608695652e-08,
"logits/chosen": -1.016326904296875,
"logits/rejected": -1.0107576847076416,
"logps/chosen": -0.28068429231643677,
"logps/rejected": -0.28573077917099,
"loss": 3.0052,
"rewards/accuracies": 0.53125,
"rewards/chosen": -2.806842803955078,
"rewards/margins": 0.05046519637107849,
"rewards/rejected": -2.8573079109191895,
"step": 5
},
{
"epoch": 0.02185792349726776,
"grad_norm": 63.846051899029185,
"learning_rate": 1.0869565217391303e-07,
"logits/chosen": -1.0520384311676025,
"logits/rejected": -1.0011743307113647,
"logps/chosen": -0.2570471167564392,
"logps/rejected": -0.27129054069519043,
"loss": 2.9747,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": -2.5704712867736816,
"rewards/margins": 0.14243429899215698,
"rewards/rejected": -2.7129054069519043,
"step": 10
},
{
"epoch": 0.03278688524590164,
"grad_norm": 53.76693099062572,
"learning_rate": 1.6304347826086955e-07,
"logits/chosen": -1.0114173889160156,
"logits/rejected": -0.9646312594413757,
"logps/chosen": -0.2674410939216614,
"logps/rejected": -0.2732146680355072,
"loss": 2.9654,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": -2.6744110584259033,
"rewards/margins": 0.05773543566465378,
"rewards/rejected": -2.7321462631225586,
"step": 15
},
{
"epoch": 0.04371584699453552,
"grad_norm": 70.16722328630453,
"learning_rate": 2.1739130434782607e-07,
"logits/chosen": -0.9457764625549316,
"logits/rejected": -0.8962175250053406,
"logps/chosen": -0.2723461091518402,
"logps/rejected": -0.2841888964176178,
"loss": 3.0079,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -2.723461151123047,
"rewards/margins": 0.11842777580022812,
"rewards/rejected": -2.841888904571533,
"step": 20
},
{
"epoch": 0.0546448087431694,
"grad_norm": 34.617399530191626,
"learning_rate": 2.717391304347826e-07,
"logits/chosen": -0.9401823282241821,
"logits/rejected": -0.8662004470825195,
"logps/chosen": -0.275082528591156,
"logps/rejected": -0.29355964064598083,
"loss": 2.9781,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -2.7508254051208496,
"rewards/margins": 0.18477120995521545,
"rewards/rejected": -2.935596466064453,
"step": 25
},
{
"epoch": 0.06557377049180328,
"grad_norm": 57.4910388890607,
"learning_rate": 3.260869565217391e-07,
"logits/chosen": -1.0440914630889893,
"logits/rejected": -0.9793885350227356,
"logps/chosen": -0.26507988572120667,
"logps/rejected": -0.2826555371284485,
"loss": 2.9819,
"rewards/accuracies": 0.518750011920929,
"rewards/chosen": -2.650798797607422,
"rewards/margins": 0.17575649917125702,
"rewards/rejected": -2.8265554904937744,
"step": 30
},
{
"epoch": 0.07650273224043716,
"grad_norm": 54.66121032212012,
"learning_rate": 3.8043478260869567e-07,
"logits/chosen": -1.001513123512268,
"logits/rejected": -0.9358412027359009,
"logps/chosen": -0.2545512318611145,
"logps/rejected": -0.27621084451675415,
"loss": 2.9523,
"rewards/accuracies": 0.5062500238418579,
"rewards/chosen": -2.5455121994018555,
"rewards/margins": 0.21659617125988007,
"rewards/rejected": -2.762108325958252,
"step": 35
},
{
"epoch": 0.08743169398907104,
"grad_norm": 61.608388907145425,
"learning_rate": 4.3478260869565214e-07,
"logits/chosen": -0.9582087397575378,
"logits/rejected": -0.8961701393127441,
"logps/chosen": -0.28110748529434204,
"logps/rejected": -0.29402947425842285,
"loss": 3.0188,
"rewards/accuracies": 0.518750011920929,
"rewards/chosen": -2.811074733734131,
"rewards/margins": 0.12921971082687378,
"rewards/rejected": -2.9402947425842285,
"step": 40
},
{
"epoch": 0.09836065573770492,
"grad_norm": 29.990450343406938,
"learning_rate": 4.891304347826087e-07,
"logits/chosen": -1.0058226585388184,
"logits/rejected": -0.923748791217804,
"logps/chosen": -0.2822369635105133,
"logps/rejected": -0.3045244514942169,
"loss": 2.9589,
"rewards/accuracies": 0.5062500238418579,
"rewards/chosen": -2.82236909866333,
"rewards/margins": 0.22287502884864807,
"rewards/rejected": -3.0452442169189453,
"step": 45
},
{
"epoch": 0.1092896174863388,
"grad_norm": 61.73061513054691,
"learning_rate": 5.434782608695652e-07,
"logits/chosen": -0.950430691242218,
"logits/rejected": -0.8683624267578125,
"logps/chosen": -0.27781787514686584,
"logps/rejected": -0.2830268144607544,
"loss": 2.9812,
"rewards/accuracies": 0.48750001192092896,
"rewards/chosen": -2.7781786918640137,
"rewards/margins": 0.05208945274353027,
"rewards/rejected": -2.830268144607544,
"step": 50
},
{
"epoch": 0.12021857923497267,
"grad_norm": 33.948585932841986,
"learning_rate": 5.978260869565217e-07,
"logits/chosen": -0.9692623019218445,
"logits/rejected": -0.8552471399307251,
"logps/chosen": -0.2707405686378479,
"logps/rejected": -0.299907922744751,
"loss": 2.9042,
"rewards/accuracies": 0.543749988079071,
"rewards/chosen": -2.7074055671691895,
"rewards/margins": 0.2916738986968994,
"rewards/rejected": -2.9990792274475098,
"step": 55
},
{
"epoch": 0.13114754098360656,
"grad_norm": 33.34905420886387,
"learning_rate": 6.521739130434782e-07,
"logits/chosen": -1.0130410194396973,
"logits/rejected": -0.9668112993240356,
"logps/chosen": -0.2595779299736023,
"logps/rejected": -0.3016397953033447,
"loss": 2.8565,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -2.5957789421081543,
"rewards/margins": 0.4206187129020691,
"rewards/rejected": -3.0163979530334473,
"step": 60
},
{
"epoch": 0.14207650273224043,
"grad_norm": 47.22484337976121,
"learning_rate": 7.065217391304348e-07,
"logits/chosen": -0.9941331148147583,
"logits/rejected": -0.9260708093643188,
"logps/chosen": -0.2983860373497009,
"logps/rejected": -0.3194151818752289,
"loss": 2.9452,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -2.983860492706299,
"rewards/margins": 0.2102913111448288,
"rewards/rejected": -3.1941516399383545,
"step": 65
},
{
"epoch": 0.15300546448087432,
"grad_norm": 80.82534906777506,
"learning_rate": 7.608695652173913e-07,
"logits/chosen": -0.958402156829834,
"logits/rejected": -0.9377009272575378,
"logps/chosen": -0.2862434983253479,
"logps/rejected": -0.30820196866989136,
"loss": 2.8832,
"rewards/accuracies": 0.543749988079071,
"rewards/chosen": -2.8624351024627686,
"rewards/margins": 0.21958431601524353,
"rewards/rejected": -3.082019329071045,
"step": 70
},
{
"epoch": 0.16393442622950818,
"grad_norm": 36.06009692365191,
"learning_rate": 8.152173913043478e-07,
"logits/chosen": -0.9559575319290161,
"logits/rejected": -0.9389899969100952,
"logps/chosen": -0.29445773363113403,
"logps/rejected": -0.3360756039619446,
"loss": 2.916,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -2.944577217102051,
"rewards/margins": 0.4161788523197174,
"rewards/rejected": -3.3607559204101562,
"step": 75
},
{
"epoch": 0.17486338797814208,
"grad_norm": 49.3694605129278,
"learning_rate": 8.695652173913043e-07,
"logits/chosen": -0.9711343050003052,
"logits/rejected": -0.9096651077270508,
"logps/chosen": -0.29308563470840454,
"logps/rejected": -0.31533023715019226,
"loss": 2.8703,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -2.930856227874756,
"rewards/margins": 0.22244596481323242,
"rewards/rejected": -3.1533024311065674,
"step": 80
},
{
"epoch": 0.18579234972677597,
"grad_norm": 54.64036329213502,
"learning_rate": 9.239130434782608e-07,
"logits/chosen": -0.9719535112380981,
"logits/rejected": -0.9226737022399902,
"logps/chosen": -0.29737505316734314,
"logps/rejected": -0.34351009130477905,
"loss": 2.8675,
"rewards/accuracies": 0.5625,
"rewards/chosen": -2.973750352859497,
"rewards/margins": 0.46135035157203674,
"rewards/rejected": -3.435100555419922,
"step": 85
},
{
"epoch": 0.19672131147540983,
"grad_norm": 38.39312594421068,
"learning_rate": 9.782608695652173e-07,
"logits/chosen": -1.0393884181976318,
"logits/rejected": -0.9582293629646301,
"logps/chosen": -0.3267248272895813,
"logps/rejected": -0.3579455614089966,
"loss": 2.887,
"rewards/accuracies": 0.518750011920929,
"rewards/chosen": -3.2672481536865234,
"rewards/margins": 0.3122076094150543,
"rewards/rejected": -3.579455852508545,
"step": 90
},
{
"epoch": 0.20765027322404372,
"grad_norm": 50.13238024700346,
"learning_rate": 9.999671349822886e-07,
"logits/chosen": -0.9711877107620239,
"logits/rejected": -0.9710756540298462,
"logps/chosen": -0.32033300399780273,
"logps/rejected": -0.34920942783355713,
"loss": 2.7879,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -3.2033302783966064,
"rewards/margins": 0.28876420855522156,
"rewards/rejected": -3.492094039916992,
"step": 95
},
{
"epoch": 0.2185792349726776,
"grad_norm": 68.48662325262401,
"learning_rate": 9.997663088532014e-07,
"logits/chosen": -0.9745977520942688,
"logits/rejected": -0.9276177287101746,
"logps/chosen": -0.36889714002609253,
"logps/rejected": -0.4320752024650574,
"loss": 2.8019,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -3.6889712810516357,
"rewards/margins": 0.6317806839942932,
"rewards/rejected": -4.320752143859863,
"step": 100
},
{
"epoch": 0.22950819672131148,
"grad_norm": 43.27710085441567,
"learning_rate": 9.9938298818292e-07,
"logits/chosen": -1.0145201683044434,
"logits/rejected": -0.9839836955070496,
"logps/chosen": -0.3395516276359558,
"logps/rejected": -0.39562201499938965,
"loss": 2.7773,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -3.3955161571502686,
"rewards/margins": 0.5607036352157593,
"rewards/rejected": -3.9562199115753174,
"step": 105
},
{
"epoch": 0.24043715846994534,
"grad_norm": 51.230908709126574,
"learning_rate": 9.98817312944725e-07,
"logits/chosen": -1.012632131576538,
"logits/rejected": -0.9917828440666199,
"logps/chosen": -0.3605939447879791,
"logps/rejected": -0.4645133912563324,
"loss": 2.7758,
"rewards/accuracies": 0.625,
"rewards/chosen": -3.6059391498565674,
"rewards/margins": 1.0391945838928223,
"rewards/rejected": -4.645133972167969,
"step": 110
},
{
"epoch": 0.25136612021857924,
"grad_norm": 51.557404433953074,
"learning_rate": 9.98069489700446e-07,
"logits/chosen": -1.0225282907485962,
"logits/rejected": -0.9822538495063782,
"logps/chosen": -0.36402350664138794,
"logps/rejected": -0.49069976806640625,
"loss": 2.7184,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": -3.640235424041748,
"rewards/margins": 1.2667627334594727,
"rewards/rejected": -4.906998634338379,
"step": 115
},
{
"epoch": 0.26229508196721313,
"grad_norm": 39.49660551037683,
"learning_rate": 9.971397915250336e-07,
"logits/chosen": -1.046118140220642,
"logits/rejected": -0.9747602343559265,
"logps/chosen": -0.3749980330467224,
"logps/rejected": -0.4443044662475586,
"loss": 2.6921,
"rewards/accuracies": 0.6875,
"rewards/chosen": -3.7499804496765137,
"rewards/margins": 0.6930642127990723,
"rewards/rejected": -4.443044662475586,
"step": 120
},
{
"epoch": 0.273224043715847,
"grad_norm": 126.01262679098515,
"learning_rate": 9.960285579068417e-07,
"logits/chosen": -0.9501935839653015,
"logits/rejected": -0.9143295288085938,
"logps/chosen": -0.4171057641506195,
"logps/rejected": -0.5157137513160706,
"loss": 2.6822,
"rewards/accuracies": 0.6875,
"rewards/chosen": -4.17105770111084,
"rewards/margins": 0.9860798120498657,
"rewards/rejected": -5.157137393951416,
"step": 125
},
{
"epoch": 0.28415300546448086,
"grad_norm": 47.14315030112102,
"learning_rate": 9.94736194623663e-07,
"logits/chosen": -0.9490100741386414,
"logits/rejected": -0.9435502290725708,
"logps/chosen": -0.4328651428222656,
"logps/rejected": -0.5982568860054016,
"loss": 2.6973,
"rewards/accuracies": 0.6875,
"rewards/chosen": -4.328651428222656,
"rewards/margins": 1.6539175510406494,
"rewards/rejected": -5.98256778717041,
"step": 130
},
{
"epoch": 0.29508196721311475,
"grad_norm": 39.88678414865988,
"learning_rate": 9.932631735945526e-07,
"logits/chosen": -0.9944769144058228,
"logits/rejected": -0.9127768278121948,
"logps/chosen": -0.43029850721359253,
"logps/rejected": -0.5863696932792664,
"loss": 2.6024,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -4.302985191345215,
"rewards/margins": 1.5607118606567383,
"rewards/rejected": -5.863697528839111,
"step": 135
},
{
"epoch": 0.30601092896174864,
"grad_norm": 45.7273688529439,
"learning_rate": 9.916100327075037e-07,
"logits/chosen": -0.9914215207099915,
"logits/rejected": -0.9394119381904602,
"logps/chosen": -0.47816991806030273,
"logps/rejected": -0.682498574256897,
"loss": 2.3785,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -4.781699180603027,
"rewards/margins": 2.0432868003845215,
"rewards/rejected": -6.824985504150391,
"step": 140
},
{
"epoch": 0.31693989071038253,
"grad_norm": 59.43998889151297,
"learning_rate": 9.89777375623032e-07,
"logits/chosen": -0.9755287170410156,
"logits/rejected": -0.9593987464904785,
"logps/chosen": -0.4856489300727844,
"logps/rejected": -0.5857899188995361,
"loss": 2.5053,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -4.856489181518555,
"rewards/margins": 1.0014104843139648,
"rewards/rejected": -5.8578996658325195,
"step": 145
},
{
"epoch": 0.32786885245901637,
"grad_norm": 47.63474073428632,
"learning_rate": 9.877658715537428e-07,
"logits/chosen": -1.0314117670059204,
"logits/rejected": -1.0008752346038818,
"logps/chosen": -0.5753797292709351,
"logps/rejected": -0.8458330035209656,
"loss": 2.3952,
"rewards/accuracies": 0.71875,
"rewards/chosen": -5.753798007965088,
"rewards/margins": 2.7045321464538574,
"rewards/rejected": -8.458330154418945,
"step": 150
},
{
"epoch": 0.33879781420765026,
"grad_norm": 65.52771946076592,
"learning_rate": 9.85576255019963e-07,
"logits/chosen": -0.9931026697158813,
"logits/rejected": -0.9415470361709595,
"logps/chosen": -0.5901178121566772,
"logps/rejected": -0.7857885360717773,
"loss": 2.3737,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -5.901178359985352,
"rewards/margins": 1.9567070007324219,
"rewards/rejected": -7.857884883880615,
"step": 155
},
{
"epoch": 0.34972677595628415,
"grad_norm": 62.974125126106905,
"learning_rate": 9.832093255815216e-07,
"logits/chosen": -1.0444536209106445,
"logits/rejected": -0.9929295778274536,
"logps/chosen": -0.7247758507728577,
"logps/rejected": -0.9017370939254761,
"loss": 2.3973,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -7.247758388519287,
"rewards/margins": 1.7696129083633423,
"rewards/rejected": -9.01737117767334,
"step": 160
},
{
"epoch": 0.36065573770491804,
"grad_norm": 63.72395656615203,
"learning_rate": 9.806659475457849e-07,
"logits/chosen": -1.0410820245742798,
"logits/rejected": -0.9887404441833496,
"logps/chosen": -0.7456644773483276,
"logps/rejected": -0.9195586442947388,
"loss": 2.3301,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -7.4566450119018555,
"rewards/margins": 1.7389415502548218,
"rewards/rejected": -9.195585250854492,
"step": 165
},
{
"epoch": 0.37158469945355194,
"grad_norm": 66.75690610820409,
"learning_rate": 9.779470496520441e-07,
"logits/chosen": -1.0688705444335938,
"logits/rejected": -1.0115132331848145,
"logps/chosen": -0.7449184656143188,
"logps/rejected": -0.9542142152786255,
"loss": 2.3577,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -7.449184417724609,
"rewards/margins": 2.0929577350616455,
"rewards/rejected": -9.542141914367676,
"step": 170
},
{
"epoch": 0.3825136612021858,
"grad_norm": 49.727137600047485,
"learning_rate": 9.750536247323789e-07,
"logits/chosen": -1.141492486000061,
"logits/rejected": -1.1157532930374146,
"logps/chosen": -0.8256582021713257,
"logps/rejected": -0.9864746332168579,
"loss": 2.2519,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -8.256583213806152,
"rewards/margins": 1.6081632375717163,
"rewards/rejected": -9.86474609375,
"step": 175
},
{
"epoch": 0.39344262295081966,
"grad_norm": 52.45035344166609,
"learning_rate": 9.719867293491144e-07,
"logits/chosen": -1.1214529275894165,
"logits/rejected": -1.1165130138397217,
"logps/chosen": -0.7945824861526489,
"logps/rejected": -1.1331901550292969,
"loss": 2.1564,
"rewards/accuracies": 0.78125,
"rewards/chosen": -7.94582462310791,
"rewards/margins": 3.3860764503479004,
"rewards/rejected": -11.331900596618652,
"step": 180
},
{
"epoch": 0.40437158469945356,
"grad_norm": 47.993586496085484,
"learning_rate": 9.687474834090067e-07,
"logits/chosen": -1.1440832614898682,
"logits/rejected": -1.1622083187103271,
"logps/chosen": -0.8209633827209473,
"logps/rejected": -1.1528552770614624,
"loss": 2.1291,
"rewards/accuracies": 0.8125,
"rewards/chosen": -8.209634780883789,
"rewards/margins": 3.3189189434051514,
"rewards/rejected": -11.52855396270752,
"step": 185
},
{
"epoch": 0.41530054644808745,
"grad_norm": 68.1701064534262,
"learning_rate": 9.653370697542987e-07,
"logits/chosen": -1.146360158920288,
"logits/rejected": -1.1054737567901611,
"logps/chosen": -0.8023772239685059,
"logps/rejected": -1.1262474060058594,
"loss": 2.1611,
"rewards/accuracies": 0.8187500238418579,
"rewards/chosen": -8.023773193359375,
"rewards/margins": 3.238701581954956,
"rewards/rejected": -11.262474060058594,
"step": 190
},
{
"epoch": 0.4262295081967213,
"grad_norm": 53.904696843657476,
"learning_rate": 9.617567337307935e-07,
"logits/chosen": -1.1726776361465454,
"logits/rejected": -1.1539947986602783,
"logps/chosen": -0.9123600125312805,
"logps/rejected": -1.2916643619537354,
"loss": 2.1905,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -9.123600006103516,
"rewards/margins": 3.793043613433838,
"rewards/rejected": -12.916644096374512,
"step": 195
},
{
"epoch": 0.4371584699453552,
"grad_norm": 54.28658232674653,
"learning_rate": 9.580077827331037e-07,
"logits/chosen": -1.1374239921569824,
"logits/rejected": -1.0560877323150635,
"logps/chosen": -0.8440427780151367,
"logps/rejected": -1.1447341442108154,
"loss": 2.1572,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -8.440427780151367,
"rewards/margins": 3.0069146156311035,
"rewards/rejected": -11.447342872619629,
"step": 200
},
{
"epoch": 0.44808743169398907,
"grad_norm": 47.58008461187375,
"learning_rate": 9.540915857272445e-07,
"logits/chosen": -1.113061785697937,
"logits/rejected": -1.1297590732574463,
"logps/chosen": -0.7553713917732239,
"logps/rejected": -1.0475962162017822,
"loss": 2.0812,
"rewards/accuracies": 0.8062499761581421,
"rewards/chosen": -7.553713798522949,
"rewards/margins": 2.9222495555877686,
"rewards/rejected": -10.47596263885498,
"step": 205
},
{
"epoch": 0.45901639344262296,
"grad_norm": 72.2337993655073,
"learning_rate": 9.500095727507419e-07,
"logits/chosen": -1.1518081426620483,
"logits/rejected": -1.1548099517822266,
"logps/chosen": -0.811726450920105,
"logps/rejected": -1.2030248641967773,
"loss": 1.9857,
"rewards/accuracies": 0.8187500238418579,
"rewards/chosen": -8.117263793945312,
"rewards/margins": 3.912985324859619,
"rewards/rejected": -12.030248641967773,
"step": 210
},
{
"epoch": 0.46994535519125685,
"grad_norm": 75.8271820255055,
"learning_rate": 9.457632343904402e-07,
"logits/chosen": -1.1457799673080444,
"logits/rejected": -1.0930476188659668,
"logps/chosen": -0.8380640745162964,
"logps/rejected": -1.2331631183624268,
"loss": 2.068,
"rewards/accuracies": 0.831250011920929,
"rewards/chosen": -8.380640983581543,
"rewards/margins": 3.950991153717041,
"rewards/rejected": -12.331632614135742,
"step": 215
},
{
"epoch": 0.4808743169398907,
"grad_norm": 47.21678860467139,
"learning_rate": 9.413541212382004e-07,
"logits/chosen": -1.1901623010635376,
"logits/rejected": -1.1682528257369995,
"logps/chosen": -0.8619640469551086,
"logps/rejected": -1.1715562343597412,
"loss": 2.0188,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -8.619640350341797,
"rewards/margins": 3.0959222316741943,
"rewards/rejected": -11.71556282043457,
"step": 220
},
{
"epoch": 0.4918032786885246,
"grad_norm": 51.414680735085476,
"learning_rate": 9.367838433246857e-07,
"logits/chosen": -1.2050046920776367,
"logits/rejected": -1.165052056312561,
"logps/chosen": -0.8373457789421082,
"logps/rejected": -1.2013850212097168,
"loss": 1.9761,
"rewards/accuracies": 0.831250011920929,
"rewards/chosen": -8.373457908630371,
"rewards/margins": 3.640392303466797,
"rewards/rejected": -12.013849258422852,
"step": 225
},
{
"epoch": 0.5027322404371585,
"grad_norm": 52.749993213229466,
"learning_rate": 9.320540695314438e-07,
"logits/chosen": -1.1119884252548218,
"logits/rejected": -1.1168959140777588,
"logps/chosen": -0.835217297077179,
"logps/rejected": -1.2099316120147705,
"loss": 2.0374,
"rewards/accuracies": 0.8125,
"rewards/chosen": -8.352171897888184,
"rewards/margins": 3.747142791748047,
"rewards/rejected": -12.099315643310547,
"step": 230
},
{
"epoch": 0.5136612021857924,
"grad_norm": 57.97242786280693,
"learning_rate": 9.271665269814983e-07,
"logits/chosen": -1.1533162593841553,
"logits/rejected": -1.1169049739837646,
"logps/chosen": -0.8319026231765747,
"logps/rejected": -1.1632182598114014,
"loss": 1.9568,
"rewards/accuracies": 0.831250011920929,
"rewards/chosen": -8.319025039672852,
"rewards/margins": 3.3131580352783203,
"rewards/rejected": -11.632184982299805,
"step": 235
},
{
"epoch": 0.5245901639344263,
"grad_norm": 55.73546760366859,
"learning_rate": 9.221230004086721e-07,
"logits/chosen": -1.213132619857788,
"logits/rejected": -1.224669098854065,
"logps/chosen": -0.7961581945419312,
"logps/rejected": -1.243761658668518,
"loss": 1.8379,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": -7.961583137512207,
"rewards/margins": 4.476034164428711,
"rewards/rejected": -12.437616348266602,
"step": 240
},
{
"epoch": 0.5355191256830601,
"grad_norm": 47.285753406270835,
"learning_rate": 9.169253315058763e-07,
"logits/chosen": -1.1385068893432617,
"logits/rejected": -1.1000906229019165,
"logps/chosen": -0.858871340751648,
"logps/rejected": -1.3097403049468994,
"loss": 1.9234,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -8.588713645935059,
"rewards/margins": 4.508688926696777,
"rewards/rejected": -13.097402572631836,
"step": 245
},
{
"epoch": 0.546448087431694,
"grad_norm": 46.893831371795955,
"learning_rate": 9.11575418252596e-07,
"logits/chosen": -1.1890825033187866,
"logits/rejected": -1.1559849977493286,
"logps/chosen": -0.8301135301589966,
"logps/rejected": -1.1934387683868408,
"loss": 1.9956,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -8.301135063171387,
"rewards/margins": 3.6332526206970215,
"rewards/rejected": -11.934389114379883,
"step": 250
},
{
"epoch": 0.5573770491803278,
"grad_norm": 55.2826289688807,
"learning_rate": 9.060752142218257e-07,
"logits/chosen": -1.1763793230056763,
"logits/rejected": -1.1470096111297607,
"logps/chosen": -0.8657994270324707,
"logps/rejected": -1.3209015130996704,
"loss": 1.8569,
"rewards/accuracies": 0.831250011920929,
"rewards/chosen": -8.657995223999023,
"rewards/margins": 4.551021099090576,
"rewards/rejected": -13.209016799926758,
"step": 255
},
{
"epoch": 0.5683060109289617,
"grad_norm": 43.2201073763009,
"learning_rate": 9.004267278667031e-07,
"logits/chosen": -1.14817214012146,
"logits/rejected": -1.1386573314666748,
"logps/chosen": -0.814948558807373,
"logps/rejected": -1.2620770931243896,
"loss": 1.8509,
"rewards/accuracies": 0.831250011920929,
"rewards/chosen": -8.14948558807373,
"rewards/margins": 4.471285820007324,
"rewards/rejected": -12.620772361755371,
"step": 260
},
{
"epoch": 0.5792349726775956,
"grad_norm": 46.44813195636061,
"learning_rate": 8.946320217871025e-07,
"logits/chosen": -1.137434720993042,
"logits/rejected": -1.1053473949432373,
"logps/chosen": -0.8041390180587769,
"logps/rejected": -1.2295944690704346,
"loss": 1.8193,
"rewards/accuracies": 0.8125,
"rewards/chosen": -8.041391372680664,
"rewards/margins": 4.2545552253723145,
"rewards/rejected": -12.29594612121582,
"step": 265
},
{
"epoch": 0.5901639344262295,
"grad_norm": 77.85873013123702,
"learning_rate": 8.886932119764565e-07,
"logits/chosen": -1.1461609601974487,
"logits/rejected": -1.1255290508270264,
"logps/chosen": -0.7929703593254089,
"logps/rejected": -1.264559030532837,
"loss": 1.7945,
"rewards/accuracies": 0.8187500238418579,
"rewards/chosen": -7.929703712463379,
"rewards/margins": 4.715887546539307,
"rewards/rejected": -12.645589828491211,
"step": 270
},
{
"epoch": 0.6010928961748634,
"grad_norm": 47.47122746211464,
"learning_rate": 8.826124670490802e-07,
"logits/chosen": -1.1295936107635498,
"logits/rejected": -1.0679261684417725,
"logps/chosen": -0.814121425151825,
"logps/rejected": -1.1613472700119019,
"loss": 1.7983,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -8.141213417053223,
"rewards/margins": 3.472259521484375,
"rewards/rejected": -11.613473892211914,
"step": 275
},
{
"epoch": 0.6120218579234973,
"grad_norm": 48.73236557934882,
"learning_rate": 8.763920074482809e-07,
"logits/chosen": -1.0854823589324951,
"logits/rejected": -1.0907032489776611,
"logps/chosen": -0.8657575845718384,
"logps/rejected": -1.4133459329605103,
"loss": 1.8024,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -8.657574653625488,
"rewards/margins": 5.475884437561035,
"rewards/rejected": -14.133459091186523,
"step": 280
},
{
"epoch": 0.6229508196721312,
"grad_norm": 47.765718733774406,
"learning_rate": 8.700341046355411e-07,
"logits/chosen": -1.2173136472702026,
"logits/rejected": -1.1900323629379272,
"logps/chosen": -0.8186389803886414,
"logps/rejected": -1.356453537940979,
"loss": 1.7205,
"rewards/accuracies": 0.893750011920929,
"rewards/chosen": -8.186389923095703,
"rewards/margins": 5.37814474105835,
"rewards/rejected": -13.564535140991211,
"step": 285
},
{
"epoch": 0.6338797814207651,
"grad_norm": 109.29731033378253,
"learning_rate": 8.635410802610723e-07,
"logits/chosen": -1.1491611003875732,
"logits/rejected": -1.1173255443572998,
"logps/chosen": -0.8337961435317993,
"logps/rejected": -1.3431804180145264,
"loss": 1.7427,
"rewards/accuracies": 0.831250011920929,
"rewards/chosen": -8.337961196899414,
"rewards/margins": 5.093844413757324,
"rewards/rejected": -13.431805610656738,
"step": 290
},
{
"epoch": 0.644808743169399,
"grad_norm": 52.41645016824533,
"learning_rate": 8.569153053160428e-07,
"logits/chosen": -1.1350514888763428,
"logits/rejected": -1.1325361728668213,
"logps/chosen": -0.8541259765625,
"logps/rejected": -1.4412286281585693,
"loss": 1.7092,
"rewards/accuracies": 0.862500011920929,
"rewards/chosen": -8.541260719299316,
"rewards/margins": 5.871027946472168,
"rewards/rejected": -14.412287712097168,
"step": 295
},
{
"epoch": 0.6557377049180327,
"grad_norm": 39.769668748500315,
"learning_rate": 8.501591992667849e-07,
"logits/chosen": -1.1677170991897583,
"logits/rejected": -1.1525037288665771,
"logps/chosen": -0.8974548578262329,
"logps/rejected": -1.4408533573150635,
"loss": 1.7174,
"rewards/accuracies": 0.75,
"rewards/chosen": -8.974547386169434,
"rewards/margins": 5.433985710144043,
"rewards/rejected": -14.408534049987793,
"step": 300
},
{
"epoch": 0.6666666666666666,
"grad_norm": 62.1991466431804,
"learning_rate": 8.432752291713058e-07,
"logits/chosen": -1.1649607419967651,
"logits/rejected": -1.1135450601577759,
"logps/chosen": -0.863819420337677,
"logps/rejected": -1.4759416580200195,
"loss": 1.7163,
"rewards/accuracies": 0.887499988079071,
"rewards/chosen": -8.63819408416748,
"rewards/margins": 6.121220588684082,
"rewards/rejected": -14.759414672851562,
"step": 305
},
{
"epoch": 0.6775956284153005,
"grad_norm": 46.51957742191928,
"learning_rate": 8.362659087784152e-07,
"logits/chosen": -1.0963289737701416,
"logits/rejected": -1.1045761108398438,
"logps/chosen": -0.8705239295959473,
"logps/rejected": -1.4945770502090454,
"loss": 1.7561,
"rewards/accuracies": 0.831250011920929,
"rewards/chosen": -8.705240249633789,
"rewards/margins": 6.240530967712402,
"rewards/rejected": -14.945770263671875,
"step": 310
},
{
"epoch": 0.6885245901639344,
"grad_norm": 48.05453083003605,
"learning_rate": 8.291337976098067e-07,
"logits/chosen": -1.1423165798187256,
"logits/rejected": -1.1360952854156494,
"logps/chosen": -0.931683361530304,
"logps/rejected": -1.385315179824829,
"loss": 1.717,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -9.316834449768066,
"rewards/margins": 4.536317348480225,
"rewards/rejected": -13.853151321411133,
"step": 315
},
{
"epoch": 0.6994535519125683,
"grad_norm": 48.54110161857607,
"learning_rate": 8.218815000254231e-07,
"logits/chosen": -1.1940380334854126,
"logits/rejected": -1.1442067623138428,
"logps/chosen": -0.8465646505355835,
"logps/rejected": -1.3880670070648193,
"loss": 1.6584,
"rewards/accuracies": 0.84375,
"rewards/chosen": -8.465646743774414,
"rewards/margins": 5.415023326873779,
"rewards/rejected": -13.880670547485352,
"step": 320
},
{
"epoch": 0.7103825136612022,
"grad_norm": 65.95615056402572,
"learning_rate": 8.145116642724485e-07,
"logits/chosen": -1.1649667024612427,
"logits/rejected": -1.1449640989303589,
"logps/chosen": -0.8567377328872681,
"logps/rejected": -1.3750416040420532,
"loss": 1.6571,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -8.567378044128418,
"rewards/margins": 5.183037757873535,
"rewards/rejected": -13.750414848327637,
"step": 325
},
{
"epoch": 0.7213114754098361,
"grad_norm": 49.39205034599243,
"learning_rate": 8.07026981518276e-07,
"logits/chosen": -1.0772454738616943,
"logits/rejected": -1.0332270860671997,
"logps/chosen": -0.8580091595649719,
"logps/rejected": -1.7133315801620483,
"loss": 1.5704,
"rewards/accuracies": 0.856249988079071,
"rewards/chosen": -8.580090522766113,
"rewards/margins": 8.55322265625,
"rewards/rejected": -17.13331413269043,
"step": 330
},
{
"epoch": 0.73224043715847,
"grad_norm": 53.718643483464305,
"learning_rate": 7.994301848678004e-07,
"logits/chosen": -1.0714246034622192,
"logits/rejected": -1.0167793035507202,
"logps/chosen": -0.8965083360671997,
"logps/rejected": -1.6172587871551514,
"loss": 1.6011,
"rewards/accuracies": 0.856249988079071,
"rewards/chosen": -8.965082168579102,
"rewards/margins": 7.2075066566467285,
"rewards/rejected": -16.172588348388672,
"step": 335
},
{
"epoch": 0.7431693989071039,
"grad_norm": 54.634488875444745,
"learning_rate": 7.917240483654e-07,
"logits/chosen": -1.0873680114746094,
"logits/rejected": -1.0355134010314941,
"logps/chosen": -0.8684855699539185,
"logps/rejected": -1.5925236940383911,
"loss": 1.6421,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -8.684855461120605,
"rewards/margins": 7.240380764007568,
"rewards/rejected": -15.925236701965332,
"step": 340
},
{
"epoch": 0.7540983606557377,
"grad_norm": 51.28688313391757,
"learning_rate": 7.839113859819656e-07,
"logits/chosen": -1.1350085735321045,
"logits/rejected": -1.1159374713897705,
"logps/chosen": -0.9655283689498901,
"logps/rejected": -1.7449572086334229,
"loss": 1.656,
"rewards/accuracies": 0.84375,
"rewards/chosen": -9.65528392791748,
"rewards/margins": 7.794284820556641,
"rewards/rejected": -17.449567794799805,
"step": 345
},
{
"epoch": 0.7650273224043715,
"grad_norm": 55.37179382269985,
"learning_rate": 7.759950505873521e-07,
"logits/chosen": -1.1313519477844238,
"logits/rejected": -1.1056666374206543,
"logps/chosen": -0.7726918458938599,
"logps/rejected": -1.281256079673767,
"loss": 1.5694,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": -7.726918697357178,
"rewards/margins": 5.0856428146362305,
"rewards/rejected": -12.81256103515625,
"step": 350
},
{
"epoch": 0.7759562841530054,
"grad_norm": 53.34572563403844,
"learning_rate": 7.67977932908626e-07,
"logits/chosen": -1.108883261680603,
"logits/rejected": -1.0669116973876953,
"logps/chosen": -0.7748141288757324,
"logps/rejected": -1.440830945968628,
"loss": 1.587,
"rewards/accuracies": 0.84375,
"rewards/chosen": -7.74813985824585,
"rewards/margins": 6.660167694091797,
"rewards/rejected": -14.408308029174805,
"step": 355
},
{
"epoch": 0.7868852459016393,
"grad_norm": 48.16281114348793,
"learning_rate": 7.598629604744872e-07,
"logits/chosen": -1.0789738893508911,
"logits/rejected": -1.060198187828064,
"logps/chosen": -0.9467193484306335,
"logps/rejected": -1.7663625478744507,
"loss": 1.5215,
"rewards/accuracies": 0.8125,
"rewards/chosen": -9.467192649841309,
"rewards/margins": 8.196432113647461,
"rewards/rejected": -17.663623809814453,
"step": 360
},
{
"epoch": 0.7978142076502732,
"grad_norm": 63.80877540833078,
"learning_rate": 7.516530965462539e-07,
"logits/chosen": -1.163653016090393,
"logits/rejected": -1.1556129455566406,
"logps/chosen": -0.8032275438308716,
"logps/rejected": -1.5929330587387085,
"loss": 1.5335,
"rewards/accuracies": 0.84375,
"rewards/chosen": -8.03227424621582,
"rewards/margins": 7.897056579589844,
"rewards/rejected": -15.929330825805664,
"step": 365
},
{
"epoch": 0.8087431693989071,
"grad_norm": 48.323366094845305,
"learning_rate": 7.433513390357989e-07,
"logits/chosen": -1.1779323816299438,
"logits/rejected": -1.1376478672027588,
"logps/chosen": -0.8401437997817993,
"logps/rejected": -1.704115629196167,
"loss": 1.4945,
"rewards/accuracies": 0.84375,
"rewards/chosen": -8.401437759399414,
"rewards/margins": 8.639719009399414,
"rewards/rejected": -17.041156768798828,
"step": 370
},
{
"epoch": 0.819672131147541,
"grad_norm": 56.42804006279489,
"learning_rate": 7.349607194108322e-07,
"logits/chosen": -1.211778163909912,
"logits/rejected": -1.1346906423568726,
"logps/chosen": -0.8210548162460327,
"logps/rejected": -1.5874106884002686,
"loss": 1.4902,
"rewards/accuracies": 0.856249988079071,
"rewards/chosen": -8.210548400878906,
"rewards/margins": 7.6635589599609375,
"rewards/rejected": -15.874107360839844,
"step": 375
},
{
"epoch": 0.8306010928961749,
"grad_norm": 45.567036459266035,
"learning_rate": 7.264843015879321e-07,
"logits/chosen": -1.0812907218933105,
"logits/rejected": -1.0929306745529175,
"logps/chosen": -0.8382685780525208,
"logps/rejected": -1.5565245151519775,
"loss": 1.3906,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -8.382685661315918,
"rewards/margins": 7.182559967041016,
"rewards/rejected": -15.565244674682617,
"step": 380
},
{
"epoch": 0.8415300546448088,
"grad_norm": 60.144174866919506,
"learning_rate": 7.17925180813725e-07,
"logits/chosen": -1.1758795976638794,
"logits/rejected": -1.1344640254974365,
"logps/chosen": -0.9767251014709473,
"logps/rejected": -2.0028223991394043,
"loss": 1.5811,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -9.767251968383789,
"rewards/margins": 10.260972023010254,
"rewards/rejected": -20.028223037719727,
"step": 385
},
{
"epoch": 0.8524590163934426,
"grad_norm": 65.50587766305415,
"learning_rate": 7.092864825346266e-07,
"logits/chosen": -1.1589624881744385,
"logits/rejected": -1.1095167398452759,
"logps/chosen": -0.8335350155830383,
"logps/rejected": -1.9316027164459229,
"loss": 1.4313,
"rewards/accuracies": 0.8687499761581421,
"rewards/chosen": -8.335351943969727,
"rewards/margins": 10.980676651000977,
"rewards/rejected": -19.316028594970703,
"step": 390
},
{
"epoch": 0.8633879781420765,
"grad_norm": 55.45278300982558,
"learning_rate": 7.005713612555545e-07,
"logits/chosen": -1.1298582553863525,
"logits/rejected": -1.1044299602508545,
"logps/chosen": -0.8622570037841797,
"logps/rejected": -1.6899712085723877,
"loss": 1.4256,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -8.622570037841797,
"rewards/margins": 8.277142524719238,
"rewards/rejected": -16.89971351623535,
"step": 395
},
{
"epoch": 0.8743169398907104,
"grad_norm": 51.76941664963157,
"learning_rate": 6.917829993880302e-07,
"logits/chosen": -1.0573416948318481,
"logits/rejected": -1.0199925899505615,
"logps/chosen": -0.8631765246391296,
"logps/rejected": -1.7595199346542358,
"loss": 1.4299,
"rewards/accuracies": 0.875,
"rewards/chosen": -8.631765365600586,
"rewards/margins": 8.963434219360352,
"rewards/rejected": -17.595197677612305,
"step": 400
},
{
"epoch": 0.8743169398907104,
"eval_logits/chosen": -1.329952597618103,
"eval_logits/rejected": -1.276990532875061,
"eval_logps/chosen": -0.838367760181427,
"eval_logps/rejected": -1.7586109638214111,
"eval_loss": 1.4681804180145264,
"eval_rewards/accuracies": 0.8704819083213806,
"eval_rewards/chosen": -8.38367748260498,
"eval_rewards/margins": 9.202432632446289,
"eval_rewards/rejected": -17.58610725402832,
"eval_runtime": 37.0639,
"eval_samples_per_second": 35.56,
"eval_steps_per_second": 2.239,
"step": 400
},
{
"epoch": 0.8852459016393442,
"grad_norm": 50.42729833432404,
"learning_rate": 6.8292460608809e-07,
"logits/chosen": -1.1105328798294067,
"logits/rejected": -1.030253529548645,
"logps/chosen": -0.8442754745483398,
"logps/rejected": -1.8324899673461914,
"loss": 1.4028,
"rewards/accuracies": 0.887499988079071,
"rewards/chosen": -8.442753791809082,
"rewards/margins": 9.882145881652832,
"rewards/rejected": -18.324899673461914,
"step": 405
},
{
"epoch": 0.8961748633879781,
"grad_norm": 52.61786510417617,
"learning_rate": 6.739994160844309e-07,
"logits/chosen": -1.1255931854248047,
"logits/rejected": -1.1459242105484009,
"logps/chosen": -0.9382074475288391,
"logps/rejected": -2.0845236778259277,
"loss": 1.3773,
"rewards/accuracies": 0.8687499761581421,
"rewards/chosen": -9.382074356079102,
"rewards/margins": 11.463163375854492,
"rewards/rejected": -20.845239639282227,
"step": 410
},
{
"epoch": 0.907103825136612,
"grad_norm": 53.20615382876954,
"learning_rate": 6.650106884972176e-07,
"logits/chosen": -1.174919605255127,
"logits/rejected": -1.161853551864624,
"logps/chosen": -0.812863826751709,
"logps/rejected": -1.9977823495864868,
"loss": 1.4673,
"rewards/accuracies": 0.8812500238418579,
"rewards/chosen": -8.12863826751709,
"rewards/margins": 11.8491849899292,
"rewards/rejected": -19.97782325744629,
"step": 415
},
{
"epoch": 0.9180327868852459,
"grad_norm": 72.30539481567058,
"learning_rate": 6.559617056479827e-07,
"logits/chosen": -1.1823254823684692,
"logits/rejected": -1.1509783267974854,
"logps/chosen": -0.9243672490119934,
"logps/rejected": -2.090036392211914,
"loss": 1.3848,
"rewards/accuracies": 0.8687499761581421,
"rewards/chosen": -9.243673324584961,
"rewards/margins": 11.656692504882812,
"rewards/rejected": -20.90036392211914,
"step": 420
},
{
"epoch": 0.9289617486338798,
"grad_norm": 56.55049259469843,
"learning_rate": 6.468557718610559e-07,
"logits/chosen": -1.1578831672668457,
"logits/rejected": -1.1295568943023682,
"logps/chosen": -0.9935392141342163,
"logps/rejected": -2.230989456176758,
"loss": 1.4147,
"rewards/accuracies": 0.84375,
"rewards/chosen": -9.935392379760742,
"rewards/margins": 12.37450122833252,
"rewards/rejected": -22.309894561767578,
"step": 425
},
{
"epoch": 0.9398907103825137,
"grad_norm": 53.30457991286137,
"learning_rate": 6.376962122569567e-07,
"logits/chosen": -1.085447072982788,
"logits/rejected": -1.0933465957641602,
"logps/chosen": -0.6574488282203674,
"logps/rejected": -1.6747887134552002,
"loss": 1.1961,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -6.574488162994385,
"rewards/margins": 10.17340087890625,
"rewards/rejected": -16.74789047241211,
"step": 430
},
{
"epoch": 0.9508196721311475,
"grad_norm": 64.87387340664675,
"learning_rate": 6.284863715381948e-07,
"logits/chosen": -1.1618143320083618,
"logits/rejected": -1.1672941446304321,
"logps/chosen": -0.826119601726532,
"logps/rejected": -2.0510828495025635,
"loss": 1.3657,
"rewards/accuracies": 0.893750011920929,
"rewards/chosen": -8.26119613647461,
"rewards/margins": 12.249631881713867,
"rewards/rejected": -20.510828018188477,
"step": 435
},
{
"epoch": 0.9617486338797814,
"grad_norm": 75.88914587151349,
"learning_rate": 6.192296127679192e-07,
"logits/chosen": -1.1188112497329712,
"logits/rejected": -1.0653330087661743,
"logps/chosen": -0.856960117816925,
"logps/rejected": -1.874408483505249,
"loss": 1.3724,
"rewards/accuracies": 0.856249988079071,
"rewards/chosen": -8.569601058959961,
"rewards/margins": 10.174482345581055,
"rewards/rejected": -18.744083404541016,
"step": 440
},
{
"epoch": 0.9726775956284153,
"grad_norm": 58.010471678644066,
"learning_rate": 6.099293161418629e-07,
"logits/chosen": -1.1678813695907593,
"logits/rejected": -1.1374809741973877,
"logps/chosen": -0.7309656143188477,
"logps/rejected": -1.9285995960235596,
"loss": 1.3965,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": -7.309657096862793,
"rewards/margins": 11.976339340209961,
"rewards/rejected": -19.285995483398438,
"step": 445
},
{
"epoch": 0.9836065573770492,
"grad_norm": 59.07594471966485,
"learning_rate": 6.005888777540319e-07,
"logits/chosen": -1.0870949029922485,
"logits/rejected": -1.0754623413085938,
"logps/chosen": -0.8063844442367554,
"logps/rejected": -1.78205144405365,
"loss": 1.382,
"rewards/accuracies": 0.875,
"rewards/chosen": -8.063844680786133,
"rewards/margins": 9.756668090820312,
"rewards/rejected": -17.820514678955078,
"step": 450
},
{
"epoch": 0.994535519125683,
"grad_norm": 64.98206511650474,
"learning_rate": 5.912117083565873e-07,
"logits/chosen": -1.110975980758667,
"logits/rejected": -1.0932872295379639,
"logps/chosen": -1.0176341533660889,
"logps/rejected": -2.0513908863067627,
"loss": 1.4187,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": -10.176340103149414,
"rewards/margins": 10.337566375732422,
"rewards/rejected": -20.513906478881836,
"step": 455
},
{
"epoch": 1.005464480874317,
"grad_norm": 46.23007729679256,
"learning_rate": 5.818012321143773e-07,
"logits/chosen": -1.1477210521697998,
"logits/rejected": -1.1113519668579102,
"logps/chosen": -0.8239518404006958,
"logps/rejected": -2.042766809463501,
"loss": 1.2383,
"rewards/accuracies": 0.8687499761581421,
"rewards/chosen": -8.239518165588379,
"rewards/margins": 12.188150405883789,
"rewards/rejected": -20.42766761779785,
"step": 460
},
{
"epoch": 1.0163934426229508,
"grad_norm": 47.735878726897354,
"learning_rate": 5.723608853545684e-07,
"logits/chosen": -1.1907925605773926,
"logits/rejected": -1.1501821279525757,
"logps/chosen": -0.7988258600234985,
"logps/rejected": -2.209643840789795,
"loss": 0.9683,
"rewards/accuracies": 0.9312499761581421,
"rewards/chosen": -7.988258361816406,
"rewards/margins": 14.108179092407227,
"rewards/rejected": -22.096435546875,
"step": 465
},
{
"epoch": 1.0273224043715847,
"grad_norm": 61.73193515780372,
"learning_rate": 5.628941153118388e-07,
"logits/chosen": -1.154846429824829,
"logits/rejected": -1.1379241943359375,
"logps/chosen": -0.8357902765274048,
"logps/rejected": -2.2289681434631348,
"loss": 0.971,
"rewards/accuracies": 0.956250011920929,
"rewards/chosen": -8.357902526855469,
"rewards/margins": 13.931780815124512,
"rewards/rejected": -22.289682388305664,
"step": 470
},
{
"epoch": 1.0382513661202186,
"grad_norm": 40.781569445585546,
"learning_rate": 5.534043788695852e-07,
"logits/chosen": -1.1368563175201416,
"logits/rejected": -1.0872905254364014,
"logps/chosen": -0.7054915428161621,
"logps/rejected": -1.9688949584960938,
"loss": 0.9335,
"rewards/accuracies": 0.9437500238418579,
"rewards/chosen": -7.054915428161621,
"rewards/margins": 12.634035110473633,
"rewards/rejected": -19.688949584960938,
"step": 475
},
{
"epoch": 1.0491803278688525,
"grad_norm": 39.60063818333042,
"learning_rate": 5.438951412976098e-07,
"logits/chosen": -1.2157926559448242,
"logits/rejected": -1.1730263233184814,
"logps/chosen": -0.7333913445472717,
"logps/rejected": -1.9154350757598877,
"loss": 0.9748,
"rewards/accuracies": 0.9312499761581421,
"rewards/chosen": -7.3339128494262695,
"rewards/margins": 11.82043743133545,
"rewards/rejected": -19.15435218811035,
"step": 480
},
{
"epoch": 1.0601092896174864,
"grad_norm": 36.9883641416429,
"learning_rate": 5.34369874986742e-07,
"logits/chosen": -1.1718839406967163,
"logits/rejected": -1.1198641061782837,
"logps/chosen": -0.8223574757575989,
"logps/rejected": -2.161813259124756,
"loss": 0.86,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -8.2235746383667,
"rewards/margins": 13.394556045532227,
"rewards/rejected": -21.61812973022461,
"step": 485
},
{
"epoch": 1.0710382513661203,
"grad_norm": 45.67927644257257,
"learning_rate": 5.248320581808619e-07,
"logits/chosen": -1.1171070337295532,
"logits/rejected": -1.078286051750183,
"logps/chosen": -0.6840003728866577,
"logps/rejected": -2.086259126663208,
"loss": 0.952,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -6.840004920959473,
"rewards/margins": 14.022584915161133,
"rewards/rejected": -20.862590789794922,
"step": 490
},
{
"epoch": 1.0819672131147542,
"grad_norm": 49.80369052011741,
"learning_rate": 5.15285173706785e-07,
"logits/chosen": -1.200192928314209,
"logits/rejected": -1.164684534072876,
"logps/chosen": -0.6610619425773621,
"logps/rejected": -1.9493736028671265,
"loss": 0.9254,
"rewards/accuracies": 0.918749988079071,
"rewards/chosen": -6.61061954498291,
"rewards/margins": 12.88311767578125,
"rewards/rejected": -19.493736267089844,
"step": 495
},
{
"epoch": 1.092896174863388,
"grad_norm": 63.052651567341215,
"learning_rate": 5.057327077024744e-07,
"logits/chosen": -1.235033392906189,
"logits/rejected": -1.1504552364349365,
"logps/chosen": -0.7189664840698242,
"logps/rejected": -1.9392305612564087,
"loss": 0.9191,
"rewards/accuracies": 0.9437500238418579,
"rewards/chosen": -7.1896653175354,
"rewards/margins": 12.202640533447266,
"rewards/rejected": -19.39230728149414,
"step": 500
},
{
"epoch": 1.1038251366120218,
"grad_norm": 41.75397804120868,
"learning_rate": 4.961781483440433e-07,
"logits/chosen": -1.1860939264297485,
"logits/rejected": -1.1053270101547241,
"logps/chosen": -0.6767443418502808,
"logps/rejected": -2.0427422523498535,
"loss": 0.8993,
"rewards/accuracies": 0.9624999761581421,
"rewards/chosen": -6.7674431800842285,
"rewards/margins": 13.6599760055542,
"rewards/rejected": -20.42742156982422,
"step": 505
},
{
"epoch": 1.1147540983606556,
"grad_norm": 49.30319538388188,
"learning_rate": 4.866249845720132e-07,
"logits/chosen": -1.138660192489624,
"logits/rejected": -1.085876226425171,
"logps/chosen": -0.7299310564994812,
"logps/rejected": -1.9430856704711914,
"loss": 1.016,
"rewards/accuracies": 0.9375,
"rewards/chosen": -7.29931116104126,
"rewards/margins": 12.131546020507812,
"rewards/rejected": -19.43085479736328,
"step": 510
},
{
"epoch": 1.1256830601092895,
"grad_norm": 52.12265231755663,
"learning_rate": 4.770767048172948e-07,
"logits/chosen": -1.1561533212661743,
"logits/rejected": -1.1119697093963623,
"logps/chosen": -0.6895591616630554,
"logps/rejected": -2.036057472229004,
"loss": 0.9218,
"rewards/accuracies": 0.981249988079071,
"rewards/chosen": -6.895591735839844,
"rewards/margins": 13.464981079101562,
"rewards/rejected": -20.36057472229004,
"step": 515
},
{
"epoch": 1.1366120218579234,
"grad_norm": 74.15934271900579,
"learning_rate": 4.675367957273505e-07,
"logits/chosen": -1.1586833000183105,
"logits/rejected": -1.1020275354385376,
"logps/chosen": -0.7152846455574036,
"logps/rejected": -2.071946144104004,
"loss": 0.8339,
"rewards/accuracies": 0.9375,
"rewards/chosen": -7.152846336364746,
"rewards/margins": 13.566617012023926,
"rewards/rejected": -20.719463348388672,
"step": 520
},
{
"epoch": 1.1475409836065573,
"grad_norm": 42.56498431363493,
"learning_rate": 4.5800874089301455e-07,
"logits/chosen": -1.1795504093170166,
"logits/rejected": -1.1088457107543945,
"logps/chosen": -0.7248662114143372,
"logps/rejected": -2.170384645462036,
"loss": 0.7993,
"rewards/accuracies": 0.9624999761581421,
"rewards/chosen": -7.248661994934082,
"rewards/margins": 14.455184936523438,
"rewards/rejected": -21.703845977783203,
"step": 525
},
{
"epoch": 1.1584699453551912,
"grad_norm": 62.02807361999941,
"learning_rate": 4.4849601957642285e-07,
"logits/chosen": -1.0977070331573486,
"logits/rejected": -1.0548598766326904,
"logps/chosen": -0.7013900876045227,
"logps/rejected": -2.0269265174865723,
"loss": 0.9229,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": -7.0139007568359375,
"rewards/margins": 13.255361557006836,
"rewards/rejected": -20.269264221191406,
"step": 530
},
{
"epoch": 1.169398907103825,
"grad_norm": 55.06395208463241,
"learning_rate": 4.390021054405286e-07,
"logits/chosen": -1.1502290964126587,
"logits/rejected": -1.1126412153244019,
"logps/chosen": -0.7144995331764221,
"logps/rejected": -2.0490660667419434,
"loss": 0.7918,
"rewards/accuracies": 0.9437500238418579,
"rewards/chosen": -7.144995212554932,
"rewards/margins": 13.345664978027344,
"rewards/rejected": -20.490659713745117,
"step": 535
},
{
"epoch": 1.180327868852459,
"grad_norm": 49.36114026978983,
"learning_rate": 4.295304652806592e-07,
"logits/chosen": -1.121539831161499,
"logits/rejected": -1.0707148313522339,
"logps/chosen": -0.5932679772377014,
"logps/rejected": -1.8668245077133179,
"loss": 0.8286,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -5.932679176330566,
"rewards/margins": 12.735565185546875,
"rewards/rejected": -18.668243408203125,
"step": 540
},
{
"epoch": 1.1912568306010929,
"grad_norm": 43.64434028835421,
"learning_rate": 4.200845577585826e-07,
"logits/chosen": -1.1462314128875732,
"logits/rejected": -1.060794711112976,
"logps/chosen": -0.6785440444946289,
"logps/rejected": -1.9171119928359985,
"loss": 0.8796,
"rewards/accuracies": 0.9437500238418579,
"rewards/chosen": -6.785439968109131,
"rewards/margins": 12.385680198669434,
"rewards/rejected": -19.171123504638672,
"step": 545
},
{
"epoch": 1.2021857923497268,
"grad_norm": 53.05809969738917,
"learning_rate": 4.106678321395433e-07,
"logits/chosen": -1.10367751121521,
"logits/rejected": -1.0523195266723633,
"logps/chosen": -0.6541143655776978,
"logps/rejected": -2.2258107662200928,
"loss": 0.7416,
"rewards/accuracies": 0.981249988079071,
"rewards/chosen": -6.541143894195557,
"rewards/margins": 15.71696662902832,
"rewards/rejected": -22.25811004638672,
"step": 550
},
{
"epoch": 1.2131147540983607,
"grad_norm": 43.47249256936393,
"learning_rate": 4.012837270327288e-07,
"logits/chosen": -1.06898832321167,
"logits/rejected": -1.0365402698516846,
"logps/chosen": -0.6559886932373047,
"logps/rejected": -1.8688675165176392,
"loss": 0.8523,
"rewards/accuracies": 0.9312499761581421,
"rewards/chosen": -6.559887886047363,
"rewards/margins": 12.128788948059082,
"rewards/rejected": -18.688674926757812,
"step": 555
},
{
"epoch": 1.2240437158469946,
"grad_norm": 46.63956894655285,
"learning_rate": 3.9193566913562915e-07,
"logits/chosen": -1.1477298736572266,
"logits/rejected": -1.0738533735275269,
"logps/chosen": -0.7739165425300598,
"logps/rejected": -2.0762743949890137,
"loss": 0.8547,
"rewards/accuracies": 0.90625,
"rewards/chosen": -7.739165306091309,
"rewards/margins": 13.023576736450195,
"rewards/rejected": -20.76274299621582,
"step": 560
},
{
"epoch": 1.2349726775956285,
"grad_norm": 41.86222647859532,
"learning_rate": 3.826270719827435e-07,
"logits/chosen": -1.1392501592636108,
"logits/rejected": -1.0661112070083618,
"logps/chosen": -0.7170458436012268,
"logps/rejected": -2.319844961166382,
"loss": 0.8634,
"rewards/accuracies": 0.9624999761581421,
"rewards/chosen": -7.170458793640137,
"rewards/margins": 16.027990341186523,
"rewards/rejected": -23.19845199584961,
"step": 565
},
{
"epoch": 1.2459016393442623,
"grad_norm": 46.46396612123512,
"learning_rate": 3.7336133469909623e-07,
"logits/chosen": -1.1651411056518555,
"logits/rejected": -1.123337984085083,
"logps/chosen": -0.6901602745056152,
"logps/rejected": -1.889410376548767,
"loss": 0.9312,
"rewards/accuracies": 0.9437500238418579,
"rewards/chosen": -6.901603698730469,
"rewards/margins": 11.992501258850098,
"rewards/rejected": -18.89410400390625,
"step": 570
},
{
"epoch": 1.2568306010928962,
"grad_norm": 44.10455274659941,
"learning_rate": 3.64141840759012e-07,
"logits/chosen": -1.071542739868164,
"logits/rejected": -1.001315712928772,
"logps/chosen": -0.6726012229919434,
"logps/rejected": -2.025498867034912,
"loss": 0.7803,
"rewards/accuracies": 0.9437500238418579,
"rewards/chosen": -6.726011753082275,
"rewards/margins": 13.52897834777832,
"rewards/rejected": -20.254989624023438,
"step": 575
},
{
"epoch": 1.2677595628415301,
"grad_norm": 57.66393146233383,
"learning_rate": 3.549719567506076e-07,
"logits/chosen": -1.0479185581207275,
"logits/rejected": -1.0177090167999268,
"logps/chosen": -0.7158246636390686,
"logps/rejected": -1.8856004476547241,
"loss": 0.834,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": -7.1582465171813965,
"rewards/margins": 11.697754859924316,
"rewards/rejected": -18.856000900268555,
"step": 580
},
{
"epoch": 1.278688524590164,
"grad_norm": 37.27610585053919,
"learning_rate": 3.4585503114644996e-07,
"logits/chosen": -1.192347764968872,
"logits/rejected": -1.096699833869934,
"logps/chosen": -0.7132994532585144,
"logps/rejected": -2.139136552810669,
"loss": 0.8319,
"rewards/accuracies": 0.9624999761581421,
"rewards/chosen": -7.13299560546875,
"rewards/margins": 14.258369445800781,
"rewards/rejected": -21.39136505126953,
"step": 585
},
{
"epoch": 1.289617486338798,
"grad_norm": 38.5581266054859,
"learning_rate": 3.3679439308082774e-07,
"logits/chosen": -1.1517072916030884,
"logits/rejected": -1.1123689413070679,
"logps/chosen": -0.5742620229721069,
"logps/rejected": -1.8333053588867188,
"loss": 0.7982,
"rewards/accuracies": 0.956250011920929,
"rewards/chosen": -5.74261999130249,
"rewards/margins": 12.590433120727539,
"rewards/rejected": -18.333051681518555,
"step": 590
},
{
"epoch": 1.3005464480874318,
"grad_norm": 51.162638469265886,
"learning_rate": 3.2779335113408646e-07,
"logits/chosen": -1.1484695672988892,
"logits/rejected": -1.0942023992538452,
"logps/chosen": -0.7073885202407837,
"logps/rejected": -2.2211270332336426,
"loss": 0.8581,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -7.073885440826416,
"rewards/margins": 15.137385368347168,
"rewards/rejected": -22.21126937866211,
"step": 595
},
{
"epoch": 1.3114754098360657,
"grad_norm": 52.482834865002914,
"learning_rate": 3.1885519212446716e-07,
"logits/chosen": -1.2034729719161987,
"logits/rejected": -1.1127815246582031,
"logps/chosen": -0.6728402376174927,
"logps/rejected": -2.121699571609497,
"loss": 0.7897,
"rewards/accuracies": 0.96875,
"rewards/chosen": -6.728402137756348,
"rewards/margins": 14.488592147827148,
"rewards/rejected": -21.216995239257812,
"step": 600
},
{
"epoch": 1.3224043715846996,
"grad_norm": 58.20643669987731,
"learning_rate": 3.0998317990789376e-07,
"logits/chosen": -1.1773592233657837,
"logits/rejected": -1.0987378358840942,
"logps/chosen": -0.6642512083053589,
"logps/rejected": -1.81928288936615,
"loss": 0.8446,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": -6.642512321472168,
"rewards/margins": 11.550317764282227,
"rewards/rejected": -18.192829132080078,
"step": 605
},
{
"epoch": 1.3333333333333333,
"grad_norm": 39.062955594441966,
"learning_rate": 3.0118055418614295e-07,
"logits/chosen": -1.2293663024902344,
"logits/rejected": -1.1513426303863525,
"logps/chosen": -0.7614090442657471,
"logps/rejected": -2.2579236030578613,
"loss": 0.8045,
"rewards/accuracies": 0.956250011920929,
"rewards/chosen": -7.6140899658203125,
"rewards/margins": 14.96514892578125,
"rewards/rejected": -22.579238891601562,
"step": 610
},
{
"epoch": 1.3442622950819672,
"grad_norm": 63.44233426139689,
"learning_rate": 2.9245052932383707e-07,
"logits/chosen": -1.1708543300628662,
"logits/rejected": -1.0607430934906006,
"logps/chosen": -0.7385894656181335,
"logps/rejected": -2.120573043823242,
"loss": 0.8963,
"rewards/accuracies": 0.90625,
"rewards/chosen": -7.385894775390625,
"rewards/margins": 13.819836616516113,
"rewards/rejected": -21.205730438232422,
"step": 615
},
{
"epoch": 1.355191256830601,
"grad_norm": 44.7493718163325,
"learning_rate": 2.83796293174686e-07,
"logits/chosen": -1.0966622829437256,
"logits/rejected": -1.0379483699798584,
"logps/chosen": -0.7269963026046753,
"logps/rejected": -2.2072062492370605,
"loss": 0.8585,
"rewards/accuracies": 0.9312499761581421,
"rewards/chosen": -7.269963264465332,
"rewards/margins": 14.802099227905273,
"rewards/rejected": -22.072063446044922,
"step": 620
},
{
"epoch": 1.366120218579235,
"grad_norm": 46.49208772158338,
"learning_rate": 2.7522100591741217e-07,
"logits/chosen": -1.1784470081329346,
"logits/rejected": -1.117315649986267,
"logps/chosen": -0.6299835443496704,
"logps/rejected": -2.1423325538635254,
"loss": 0.8395,
"rewards/accuracies": 0.9437500238418579,
"rewards/chosen": -6.299835681915283,
"rewards/margins": 15.123489379882812,
"rewards/rejected": -21.423324584960938,
"step": 625
},
{
"epoch": 1.3770491803278688,
"grad_norm": 45.8083524660278,
"learning_rate": 2.6672779890178046e-07,
"logits/chosen": -1.1133753061294556,
"logits/rejected": -1.0135900974273682,
"logps/chosen": -0.7410155534744263,
"logps/rejected": -2.0132102966308594,
"loss": 0.8282,
"rewards/accuracies": 0.9312499761581421,
"rewards/chosen": -7.410154819488525,
"rewards/margins": 12.721948623657227,
"rewards/rejected": -20.132104873657227,
"step": 630
},
{
"epoch": 1.3879781420765027,
"grad_norm": 64.21787454795688,
"learning_rate": 2.5831977350515454e-07,
"logits/chosen": -1.0689822435379028,
"logits/rejected": -1.0277864933013916,
"logps/chosen": -0.7177176475524902,
"logps/rejected": -2.1230201721191406,
"loss": 0.8661,
"rewards/accuracies": 0.956250011920929,
"rewards/chosen": -7.177175998687744,
"rewards/margins": 14.053024291992188,
"rewards/rejected": -21.230199813842773,
"step": 635
},
{
"epoch": 1.3989071038251366,
"grad_norm": 45.50472870594569,
"learning_rate": 2.500000000000001e-07,
"logits/chosen": -1.1423081159591675,
"logits/rejected": -1.1058489084243774,
"logps/chosen": -0.6915294528007507,
"logps/rejected": -2.324897289276123,
"loss": 0.7746,
"rewards/accuracies": 0.9437500238418579,
"rewards/chosen": -6.915294647216797,
"rewards/margins": 16.33367919921875,
"rewards/rejected": -23.24897003173828,
"step": 640
},
{
"epoch": 1.4098360655737705,
"grad_norm": 50.195590158524524,
"learning_rate": 2.4177151643274307e-07,
"logits/chosen": -1.1015839576721191,
"logits/rejected": -1.0532914400100708,
"logps/chosen": -0.672285258769989,
"logps/rejected": -2.207167387008667,
"loss": 0.7889,
"rewards/accuracies": 0.9624999761581421,
"rewards/chosen": -6.722853183746338,
"rewards/margins": 15.348821640014648,
"rewards/rejected": -22.071674346923828,
"step": 645
},
{
"epoch": 1.4207650273224044,
"grad_norm": 49.484361007454886,
"learning_rate": 2.3363732751439923e-07,
"logits/chosen": -1.2066833972930908,
"logits/rejected": -1.125514268875122,
"logps/chosen": -0.7560944557189941,
"logps/rejected": -2.1213643550872803,
"loss": 0.8624,
"rewards/accuracies": 0.9125000238418579,
"rewards/chosen": -7.560944557189941,
"rewards/margins": 13.65269947052002,
"rewards/rejected": -21.21364402770996,
"step": 650
},
{
"epoch": 1.4316939890710383,
"grad_norm": 71.24778468818619,
"learning_rate": 2.2560040352337307e-07,
"logits/chosen": -1.1408228874206543,
"logits/rejected": -1.058091402053833,
"logps/chosen": -0.7638077139854431,
"logps/rejected": -2.3721306324005127,
"loss": 0.8676,
"rewards/accuracies": 0.9437500238418579,
"rewards/chosen": -7.6380767822265625,
"rewards/margins": 16.08323097229004,
"rewards/rejected": -23.721309661865234,
"step": 655
},
{
"epoch": 1.4426229508196722,
"grad_norm": 47.51003694942907,
"learning_rate": 2.1766367922083283e-07,
"logits/chosen": -1.1455005407333374,
"logits/rejected": -1.088330864906311,
"logps/chosen": -0.6874858736991882,
"logps/rejected": -2.227865695953369,
"loss": 0.7364,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": -6.874858856201172,
"rewards/margins": 15.40379810333252,
"rewards/rejected": -22.27865982055664,
"step": 660
},
{
"epoch": 1.453551912568306,
"grad_norm": 55.0401674528743,
"learning_rate": 2.0983005277905347e-07,
"logits/chosen": -1.1971169710159302,
"logits/rejected": -1.132368803024292,
"logps/chosen": -0.7266300320625305,
"logps/rejected": -2.1779634952545166,
"loss": 0.8288,
"rewards/accuracies": 0.9624999761581421,
"rewards/chosen": -7.266301155090332,
"rewards/margins": 14.513336181640625,
"rewards/rejected": -21.77963638305664,
"step": 665
},
{
"epoch": 1.46448087431694,
"grad_norm": 45.61973729091029,
"learning_rate": 2.021023847231202e-07,
"logits/chosen": -1.1578538417816162,
"logits/rejected": -1.0902925729751587,
"logps/chosen": -0.7618427872657776,
"logps/rejected": -2.1782405376434326,
"loss": 0.8251,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -7.6184282302856445,
"rewards/margins": 14.163976669311523,
"rewards/rejected": -21.782405853271484,
"step": 670
},
{
"epoch": 1.4754098360655736,
"grad_norm": 60.45804101244339,
"learning_rate": 1.94483496886381e-07,
"logits/chosen": -1.1042829751968384,
"logits/rejected": -1.0539348125457764,
"logps/chosen": -0.6615833640098572,
"logps/rejected": -2.251406192779541,
"loss": 0.7358,
"rewards/accuracies": 0.9375,
"rewards/chosen": -6.6158342361450195,
"rewards/margins": 15.898228645324707,
"rewards/rejected": -22.51406478881836,
"step": 675
},
{
"epoch": 1.4863387978142075,
"grad_norm": 48.14980683798255,
"learning_rate": 1.869761713800254e-07,
"logits/chosen": -1.1603379249572754,
"logits/rejected": -1.0794651508331299,
"logps/chosen": -0.7725010514259338,
"logps/rejected": -2.2856264114379883,
"loss": 0.8279,
"rewards/accuracies": 0.9375,
"rewards/chosen": -7.725010871887207,
"rewards/margins": 15.131253242492676,
"rewards/rejected": -22.85626220703125,
"step": 680
},
{
"epoch": 1.4972677595628414,
"grad_norm": 42.61569176571103,
"learning_rate": 1.7958314957717064e-07,
"logits/chosen": -1.1512095928192139,
"logits/rejected": -1.1126072406768799,
"logps/chosen": -0.6438918709754944,
"logps/rejected": -2.071455717086792,
"loss": 0.8299,
"rewards/accuracies": 0.96875,
"rewards/chosen": -6.4389190673828125,
"rewards/margins": 14.275640487670898,
"rewards/rejected": -20.714557647705078,
"step": 685
},
{
"epoch": 1.5081967213114753,
"grad_norm": 42.953950301305156,
"learning_rate": 1.7230713111182164e-07,
"logits/chosen": -1.2084314823150635,
"logits/rejected": -1.1416782140731812,
"logps/chosen": -0.626567542552948,
"logps/rejected": -2.2436633110046387,
"loss": 0.7812,
"rewards/accuracies": 0.9624999761581421,
"rewards/chosen": -6.265676021575928,
"rewards/margins": 16.170955657958984,
"rewards/rejected": -22.436630249023438,
"step": 690
},
{
"epoch": 1.5191256830601092,
"grad_norm": 40.560641175853355,
"learning_rate": 1.651507728930739e-07,
"logits/chosen": -1.13517165184021,
"logits/rejected": -1.079472303390503,
"logps/chosen": -0.6764928102493286,
"logps/rejected": -1.9952529668807983,
"loss": 0.8179,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": -6.764927864074707,
"rewards/margins": 13.187602043151855,
"rewards/rejected": -19.95252799987793,
"step": 695
},
{
"epoch": 1.530054644808743,
"grad_norm": 43.147397995901486,
"learning_rate": 1.5811668813491696e-07,
"logits/chosen": -1.2506606578826904,
"logits/rejected": -1.1565546989440918,
"logps/chosen": -0.7353881597518921,
"logps/rejected": -2.194483757019043,
"loss": 0.8118,
"rewards/accuracies": 0.918749988079071,
"rewards/chosen": -7.353880882263184,
"rewards/margins": 14.59095573425293,
"rewards/rejected": -21.944839477539062,
"step": 700
},
{
"epoch": 1.540983606557377,
"grad_norm": 62.57740103030882,
"learning_rate": 1.5120744540199343e-07,
"logits/chosen": -1.1544785499572754,
"logits/rejected": -1.0858932733535767,
"logps/chosen": -0.6997456550598145,
"logps/rejected": -2.130126476287842,
"loss": 0.7627,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -6.9974565505981445,
"rewards/margins": 14.303808212280273,
"rewards/rejected": -21.3012638092041,
"step": 705
},
{
"epoch": 1.5519125683060109,
"grad_norm": 52.64313236503675,
"learning_rate": 1.4442556767166369e-07,
"logits/chosen": -1.1379203796386719,
"logits/rejected": -1.0858924388885498,
"logps/chosen": -0.7317473292350769,
"logps/rejected": -2.252659320831299,
"loss": 0.8515,
"rewards/accuracies": 0.90625,
"rewards/chosen": -7.3174729347229,
"rewards/margins": 15.20911693572998,
"rewards/rejected": -22.52659034729004,
"step": 710
},
{
"epoch": 1.5628415300546448,
"grad_norm": 38.895227790104606,
"learning_rate": 1.377735314127148e-07,
"logits/chosen": -1.1292599439620972,
"logits/rejected": -1.0353094339370728,
"logps/chosen": -0.7267423868179321,
"logps/rejected": -2.1561272144317627,
"loss": 0.7422,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": -7.267423152923584,
"rewards/margins": 14.293848991394043,
"rewards/rejected": -21.5612735748291,
"step": 715
},
{
"epoch": 1.5737704918032787,
"grad_norm": 56.98341311224862,
"learning_rate": 1.312537656810549e-07,
"logits/chosen": -1.108595609664917,
"logits/rejected": -1.065575361251831,
"logps/chosen": -0.8170570135116577,
"logps/rejected": -2.3806543350219727,
"loss": 0.8782,
"rewards/accuracies": 0.9437500238418579,
"rewards/chosen": -8.170571327209473,
"rewards/margins": 15.635971069335938,
"rewards/rejected": -23.806543350219727,
"step": 720
},
{
"epoch": 1.5846994535519126,
"grad_norm": 49.11661386514687,
"learning_rate": 1.2486865123271866e-07,
"logits/chosen": -1.170401692390442,
"logits/rejected": -1.084576964378357,
"logps/chosen": -0.7355960607528687,
"logps/rejected": -2.196117401123047,
"loss": 0.8153,
"rewards/accuracies": 0.9624999761581421,
"rewards/chosen": -7.355960845947266,
"rewards/margins": 14.605212211608887,
"rewards/rejected": -21.961172103881836,
"step": 725
},
{
"epoch": 1.5956284153005464,
"grad_norm": 55.74657113607268,
"learning_rate": 1.1862051965451214e-07,
"logits/chosen": -1.1835774183273315,
"logits/rejected": -1.0799705982208252,
"logps/chosen": -0.6676367521286011,
"logps/rejected": -2.179206609725952,
"loss": 0.8178,
"rewards/accuracies": 0.9312499761581421,
"rewards/chosen": -6.67636775970459,
"rewards/margins": 15.115699768066406,
"rewards/rejected": -21.79206657409668,
"step": 730
},
{
"epoch": 1.6065573770491803,
"grad_norm": 51.541880192438846,
"learning_rate": 1.1251165251261047e-07,
"logits/chosen": -1.1200164556503296,
"logits/rejected": -1.0568726062774658,
"logps/chosen": -0.645528256893158,
"logps/rejected": -2.1150240898132324,
"loss": 0.7418,
"rewards/accuracies": 0.981249988079071,
"rewards/chosen": -6.455282688140869,
"rewards/margins": 14.694958686828613,
"rewards/rejected": -21.15024185180664,
"step": 735
},
{
"epoch": 1.6174863387978142,
"grad_norm": 50.10821238721753,
"learning_rate": 1.0654428051942138e-07,
"logits/chosen": -1.1239360570907593,
"logits/rejected": -1.0723472833633423,
"logps/chosen": -0.7925865054130554,
"logps/rejected": -2.1674091815948486,
"loss": 0.8513,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": -7.92586612701416,
"rewards/margins": 13.748224258422852,
"rewards/rejected": -21.674091339111328,
"step": 740
},
{
"epoch": 1.6284153005464481,
"grad_norm": 43.56101653661629,
"learning_rate": 1.0072058271901978e-07,
"logits/chosen": -1.1107392311096191,
"logits/rejected": -1.0339866876602173,
"logps/chosen": -0.7230111956596375,
"logps/rejected": -2.215801954269409,
"loss": 0.8124,
"rewards/accuracies": 0.956250011920929,
"rewards/chosen": -7.230112552642822,
"rewards/margins": 14.92790699005127,
"rewards/rejected": -22.158018112182617,
"step": 745
},
{
"epoch": 1.639344262295082,
"grad_norm": 44.251434071556815,
"learning_rate": 9.504268569144763e-08,
"logits/chosen": -1.1945059299468994,
"logits/rejected": -1.1036922931671143,
"logps/chosen": -0.6170369982719421,
"logps/rejected": -2.2507286071777344,
"loss": 0.7391,
"rewards/accuracies": 0.96875,
"rewards/chosen": -6.170370101928711,
"rewards/margins": 16.336917877197266,
"rewards/rejected": -22.50728988647461,
"step": 750
},
{
"epoch": 1.650273224043716,
"grad_norm": 49.60678271012646,
"learning_rate": 8.951266277617325e-08,
"logits/chosen": -1.110953688621521,
"logits/rejected": -1.0356338024139404,
"logps/chosen": -0.6501866579055786,
"logps/rejected": -2.0613694190979004,
"loss": 0.7668,
"rewards/accuracies": 0.9312499761581421,
"rewards/chosen": -6.501866817474365,
"rewards/margins": 14.111828804016113,
"rewards/rejected": -20.61369514465332,
"step": 755
},
{
"epoch": 1.6612021857923498,
"grad_norm": 56.33934467747551,
"learning_rate": 8.413253331499049e-08,
"logits/chosen": -1.185937762260437,
"logits/rejected": -1.1122770309448242,
"logps/chosen": -0.737058699131012,
"logps/rejected": -2.13063383102417,
"loss": 0.7883,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -7.370587348937988,
"rewards/margins": 13.935750961303711,
"rewards/rejected": -21.306339263916016,
"step": 760
},
{
"epoch": 1.6721311475409837,
"grad_norm": 44.94644265056986,
"learning_rate": 7.8904261914637e-08,
"logits/chosen": -1.1913588047027588,
"logits/rejected": -1.142197847366333,
"logps/chosen": -0.7348116636276245,
"logps/rejected": -2.11796236038208,
"loss": 0.8475,
"rewards/accuracies": 0.9437500238418579,
"rewards/chosen": -7.348116397857666,
"rewards/margins": 13.831507682800293,
"rewards/rejected": -21.179622650146484,
"step": 765
},
{
"epoch": 1.6830601092896176,
"grad_norm": 44.33346730845319,
"learning_rate": 7.382975772939865e-08,
"logits/chosen": -1.1930986642837524,
"logits/rejected": -1.1425340175628662,
"logps/chosen": -0.6687518358230591,
"logps/rejected": -2.18174409866333,
"loss": 0.8317,
"rewards/accuracies": 0.981249988079071,
"rewards/chosen": -6.6875176429748535,
"rewards/margins": 15.129925727844238,
"rewards/rejected": -21.81744384765625,
"step": 770
},
{
"epoch": 1.6939890710382515,
"grad_norm": 62.98906395437277,
"learning_rate": 6.891087376396315e-08,
"logits/chosen": -1.107334852218628,
"logits/rejected": -1.0646486282348633,
"logps/chosen": -0.6770855784416199,
"logps/rejected": -1.9466581344604492,
"loss": 0.8939,
"rewards/accuracies": 0.956250011920929,
"rewards/chosen": -6.770855903625488,
"rewards/margins": 12.695725440979004,
"rewards/rejected": -19.466583251953125,
"step": 775
},
{
"epoch": 1.7049180327868854,
"grad_norm": 58.192179469870524,
"learning_rate": 6.414940619677734e-08,
"logits/chosen": -1.1423081159591675,
"logits/rejected": -1.0822668075561523,
"logps/chosen": -0.7497803568840027,
"logps/rejected": -2.090430736541748,
"loss": 0.895,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": -7.497802734375,
"rewards/margins": 13.406506538391113,
"rewards/rejected": -20.90431022644043,
"step": 780
},
{
"epoch": 1.7158469945355193,
"grad_norm": 47.51392910975651,
"learning_rate": 5.954709372415523e-08,
"logits/chosen": -1.1490824222564697,
"logits/rejected": -1.0730870962142944,
"logps/chosen": -0.7593728303909302,
"logps/rejected": -2.2705507278442383,
"loss": 0.808,
"rewards/accuracies": 0.9312499761581421,
"rewards/chosen": -7.593728065490723,
"rewards/margins": 15.111780166625977,
"rewards/rejected": -22.70550537109375,
"step": 785
},
{
"epoch": 1.7267759562841531,
"grad_norm": 48.831632577538024,
"learning_rate": 5.5105616925376296e-08,
"logits/chosen": -1.2490391731262207,
"logits/rejected": -1.1065878868103027,
"logps/chosen": -0.6748226881027222,
"logps/rejected": -2.066803455352783,
"loss": 0.8208,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -6.748227119445801,
"rewards/margins": 13.919805526733398,
"rewards/rejected": -20.668033599853516,
"step": 790
},
{
"epoch": 1.737704918032787,
"grad_norm": 58.14259346920981,
"learning_rate": 5.082659764900482e-08,
"logits/chosen": -1.1919060945510864,
"logits/rejected": -1.1228830814361572,
"logps/chosen": -0.6102501749992371,
"logps/rejected": -1.8674005270004272,
"loss": 0.851,
"rewards/accuracies": 0.9375,
"rewards/chosen": -6.102501392364502,
"rewards/margins": 12.571504592895508,
"rewards/rejected": -18.674007415771484,
"step": 795
},
{
"epoch": 1.748633879781421,
"grad_norm": 58.75906827764335,
"learning_rate": 4.6711598420656976e-08,
"logits/chosen": -1.165160894393921,
"logits/rejected": -1.0898354053497314,
"logps/chosen": -0.6838083863258362,
"logps/rejected": -2.08567476272583,
"loss": 0.7858,
"rewards/accuracies": 0.9437500238418579,
"rewards/chosen": -6.838083744049072,
"rewards/margins": 14.01866340637207,
"rewards/rejected": -20.856746673583984,
"step": 800
},
{
"epoch": 1.748633879781421,
"eval_logits/chosen": -1.3754932880401611,
"eval_logits/rejected": -1.2976948022842407,
"eval_logps/chosen": -0.7933117747306824,
"eval_logps/rejected": -1.9287370443344116,
"eval_loss": 1.2716172933578491,
"eval_rewards/accuracies": 0.8614457845687866,
"eval_rewards/chosen": -7.933117389678955,
"eval_rewards/margins": 11.35425090789795,
"eval_rewards/rejected": -19.287368774414062,
"eval_runtime": 34.0159,
"eval_samples_per_second": 38.747,
"eval_steps_per_second": 2.44,
"step": 800
},
{
"epoch": 1.7595628415300546,
"grad_norm": 42.21959698595358,
"learning_rate": 4.2762121872428615e-08,
"logits/chosen": -1.1773040294647217,
"logits/rejected": -1.129456639289856,
"logps/chosen": -0.6802318692207336,
"logps/rejected": -1.888240098953247,
"loss": 0.8525,
"rewards/accuracies": 0.9624999761581421,
"rewards/chosen": -6.8023176193237305,
"rewards/margins": 12.080080032348633,
"rewards/rejected": -18.882396697998047,
"step": 805
},
{
"epoch": 1.7704918032786885,
"grad_norm": 58.528554345128654,
"learning_rate": 3.897961019419516e-08,
"logits/chosen": -1.1610840559005737,
"logits/rejected": -1.0498443841934204,
"logps/chosen": -0.6537138819694519,
"logps/rejected": -2.291470766067505,
"loss": 0.8496,
"rewards/accuracies": 0.9375,
"rewards/chosen": -6.537138938903809,
"rewards/margins": 16.377567291259766,
"rewards/rejected": -22.91470718383789,
"step": 810
},
{
"epoch": 1.7814207650273224,
"grad_norm": 54.90972417331509,
"learning_rate": 3.536544460698143e-08,
"logits/chosen": -1.1778395175933838,
"logits/rejected": -1.1455332040786743,
"logps/chosen": -0.7220278978347778,
"logps/rejected": -2.2299325466156006,
"loss": 0.885,
"rewards/accuracies": 0.9375,
"rewards/chosen": -7.220279693603516,
"rewards/margins": 15.079046249389648,
"rewards/rejected": -22.299325942993164,
"step": 815
},
{
"epoch": 1.7923497267759563,
"grad_norm": 56.09383885571275,
"learning_rate": 3.192094485859526e-08,
"logits/chosen": -1.1215145587921143,
"logits/rejected": -1.073870301246643,
"logps/chosen": -0.7387723326683044,
"logps/rejected": -2.0020480155944824,
"loss": 0.7983,
"rewards/accuracies": 0.9312499761581421,
"rewards/chosen": -7.387722969055176,
"rewards/margins": 12.632759094238281,
"rewards/rejected": -20.020483016967773,
"step": 820
},
{
"epoch": 1.8032786885245902,
"grad_norm": 53.60121299218721,
"learning_rate": 2.8647368741709367e-08,
"logits/chosen": -1.2346153259277344,
"logits/rejected": -1.1147099733352661,
"logps/chosen": -0.7686562538146973,
"logps/rejected": -2.255371570587158,
"loss": 0.8238,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": -7.686563014984131,
"rewards/margins": 14.867155075073242,
"rewards/rejected": -22.5537166595459,
"step": 825
},
{
"epoch": 1.814207650273224,
"grad_norm": 70.95861218592107,
"learning_rate": 2.5545911634565265e-08,
"logits/chosen": -1.2187676429748535,
"logits/rejected": -1.10321044921875,
"logps/chosen": -0.7037076950073242,
"logps/rejected": -2.5123300552368164,
"loss": 0.7911,
"rewards/accuracies": 0.956250011920929,
"rewards/chosen": -7.0370774269104,
"rewards/margins": 18.086223602294922,
"rewards/rejected": -25.123300552368164,
"step": 830
},
{
"epoch": 1.825136612021858,
"grad_norm": 52.68881901646796,
"learning_rate": 2.261770606446983e-08,
"logits/chosen": -1.2337759733200073,
"logits/rejected": -1.1664550304412842,
"logps/chosen": -0.6923700571060181,
"logps/rejected": -1.8173946142196655,
"loss": 0.7758,
"rewards/accuracies": 0.9375,
"rewards/chosen": -6.923699855804443,
"rewards/margins": 11.250245094299316,
"rewards/rejected": -18.173946380615234,
"step": 835
},
{
"epoch": 1.8360655737704918,
"grad_norm": 49.635844220613414,
"learning_rate": 1.9863821294241522e-08,
"logits/chosen": -1.140126347541809,
"logits/rejected": -1.0474215745925903,
"logps/chosen": -0.6625608801841736,
"logps/rejected": -2.1753034591674805,
"loss": 0.8089,
"rewards/accuracies": 0.96875,
"rewards/chosen": -6.625608921051025,
"rewards/margins": 15.12742805480957,
"rewards/rejected": -21.75303840637207,
"step": 840
},
{
"epoch": 1.8469945355191257,
"grad_norm": 43.19571532603005,
"learning_rate": 1.7285262931759082e-08,
"logits/chosen": -1.0827378034591675,
"logits/rejected": -1.040972352027893,
"logps/chosen": -0.6810778379440308,
"logps/rejected": -2.233325242996216,
"loss": 0.8335,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -6.810777187347412,
"rewards/margins": 15.52247142791748,
"rewards/rejected": -22.333250045776367,
"step": 845
},
{
"epoch": 1.8579234972677594,
"grad_norm": 44.932248498556426,
"learning_rate": 1.4882972562753615e-08,
"logits/chosen": -1.1539499759674072,
"logits/rejected": -1.0578666925430298,
"logps/chosen": -0.6092909574508667,
"logps/rejected": -2.176682949066162,
"loss": 0.7694,
"rewards/accuracies": 0.9624999761581421,
"rewards/chosen": -6.092909812927246,
"rewards/margins": 15.673917770385742,
"rewards/rejected": -21.766826629638672,
"step": 850
},
{
"epoch": 1.8688524590163933,
"grad_norm": 42.913562748722356,
"learning_rate": 1.2657827406979404e-08,
"logits/chosen": -1.192744493484497,
"logits/rejected": -1.1254050731658936,
"logps/chosen": -0.6767739653587341,
"logps/rejected": -2.08142352104187,
"loss": 0.7891,
"rewards/accuracies": 0.956250011920929,
"rewards/chosen": -6.767739772796631,
"rewards/margins": 14.04649543762207,
"rewards/rejected": -20.81423568725586,
"step": 855
},
{
"epoch": 1.8797814207650272,
"grad_norm": 41.804821801782175,
"learning_rate": 1.0610639997888915e-08,
"logits/chosen": -1.0884257555007935,
"logits/rejected": -1.0512984991073608,
"logps/chosen": -0.6403497457504272,
"logps/rejected": -1.8734140396118164,
"loss": 0.8247,
"rewards/accuracies": 0.9312499761581421,
"rewards/chosen": -6.40349817276001,
"rewards/margins": 12.330641746520996,
"rewards/rejected": -18.734140396118164,
"step": 860
},
{
"epoch": 1.890710382513661,
"grad_norm": 44.35333055569577,
"learning_rate": 8.742157885927804e-09,
"logits/chosen": -1.1908656358718872,
"logits/rejected": -1.1213476657867432,
"logps/chosen": -0.7420295476913452,
"logps/rejected": -2.280778646469116,
"loss": 0.7435,
"rewards/accuracies": 0.96875,
"rewards/chosen": -7.420294761657715,
"rewards/margins": 15.387492179870605,
"rewards/rejected": -22.807788848876953,
"step": 865
},
{
"epoch": 1.901639344262295,
"grad_norm": 47.53352705675185,
"learning_rate": 7.053063365559997e-09,
"logits/chosen": -1.1726518869400024,
"logits/rejected": -1.1312017440795898,
"logps/chosen": -0.624583899974823,
"logps/rejected": -2.1973233222961426,
"loss": 0.711,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": -6.2458391189575195,
"rewards/margins": 15.727396965026855,
"rewards/rejected": -21.973236083984375,
"step": 870
},
{
"epoch": 1.9125683060109289,
"grad_norm": 49.95076655476162,
"learning_rate": 5.543973226120935e-09,
"logits/chosen": -1.1566675901412964,
"logits/rejected": -1.0951449871063232,
"logps/chosen": -0.6883528828620911,
"logps/rejected": -1.9860836267471313,
"loss": 0.7853,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -6.883528709411621,
"rewards/margins": 12.977307319641113,
"rewards/rejected": -19.860836029052734,
"step": 875
},
{
"epoch": 1.9234972677595628,
"grad_norm": 46.10249074996564,
"learning_rate": 4.215438526591064e-09,
"logits/chosen": -1.1910879611968994,
"logits/rejected": -1.1287364959716797,
"logps/chosen": -0.6650462746620178,
"logps/rejected": -2.1194908618927,
"loss": 0.7991,
"rewards/accuracies": 0.9437500238418579,
"rewards/chosen": -6.6504621505737305,
"rewards/margins": 14.54444408416748,
"rewards/rejected": -21.19490623474121,
"step": 880
},
{
"epoch": 1.9344262295081966,
"grad_norm": 62.72289061103836,
"learning_rate": 3.0679443943712467e-09,
"logits/chosen": -1.2184160947799683,
"logits/rejected": -1.1404691934585571,
"logps/chosen": -0.7271493673324585,
"logps/rejected": -2.16991925239563,
"loss": 0.7789,
"rewards/accuracies": 0.9312499761581421,
"rewards/chosen": -7.271493434906006,
"rewards/margins": 14.427698135375977,
"rewards/rejected": -21.69919204711914,
"step": 885
},
{
"epoch": 1.9453551912568305,
"grad_norm": 45.380019039047724,
"learning_rate": 2.1019098481337426e-09,
"logits/chosen": -1.1838113069534302,
"logits/rejected": -1.1087853908538818,
"logps/chosen": -0.6985124349594116,
"logps/rejected": -2.234588861465454,
"loss": 0.8115,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -6.985124111175537,
"rewards/margins": 15.36076545715332,
"rewards/rejected": -22.345890045166016,
"step": 890
},
{
"epoch": 1.9562841530054644,
"grad_norm": 52.561200553522546,
"learning_rate": 1.3176876448135477e-09,
"logits/chosen": -1.238799810409546,
"logits/rejected": -1.1316242218017578,
"logps/chosen": -0.7857435345649719,
"logps/rejected": -2.3238725662231445,
"loss": 0.8526,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": -7.8574347496032715,
"rewards/margins": 15.3812894821167,
"rewards/rejected": -23.238723754882812,
"step": 895
},
{
"epoch": 1.9672131147540983,
"grad_norm": 51.75087517711866,
"learning_rate": 7.155641507955445e-10,
"logits/chosen": -1.138656497001648,
"logits/rejected": -1.0620585680007935,
"logps/chosen": -0.6295305490493774,
"logps/rejected": -1.968534231185913,
"loss": 0.8507,
"rewards/accuracies": 0.918749988079071,
"rewards/chosen": -6.2953057289123535,
"rewards/margins": 13.390034675598145,
"rewards/rejected": -19.685340881347656,
"step": 900
},
{
"epoch": 1.9781420765027322,
"grad_norm": 54.546613938165166,
"learning_rate": 2.957592373452056e-10,
"logits/chosen": -1.141952395439148,
"logits/rejected": -1.078550100326538,
"logps/chosen": -0.695178747177124,
"logps/rejected": -2.1918914318084717,
"loss": 0.8393,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -6.95178747177124,
"rewards/margins": 14.967126846313477,
"rewards/rejected": -21.918914794921875,
"step": 905
},
{
"epoch": 1.989071038251366,
"grad_norm": 50.15211263774824,
"learning_rate": 5.842620032053824e-11,
"logits/chosen": -1.2007973194122314,
"logits/rejected": -1.138047456741333,
"logps/chosen": -0.6629818081855774,
"logps/rejected": -1.9467570781707764,
"loss": 0.7506,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -6.629817962646484,
"rewards/margins": 12.837751388549805,
"rewards/rejected": -19.467571258544922,
"step": 910
},
{
"epoch": 1.9978142076502732,
"step": 914,
"total_flos": 0.0,
"train_loss": 1.4686054226084402,
"train_runtime": 12099.7941,
"train_samples_per_second": 9.679,
"train_steps_per_second": 0.076
}
],
"logging_steps": 5,
"max_steps": 914,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 1000000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}