diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,2804 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.9978142076502732, + "eval_steps": 400, + "global_step": 914, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.01092896174863388, + "grad_norm": 49.12429877298342, + "learning_rate": 5.434782608695652e-08, + "logits/chosen": -1.016326904296875, + "logits/rejected": -1.0107576847076416, + "logps/chosen": -0.28068429231643677, + "logps/rejected": -0.28573077917099, + "loss": 3.0052, + "rewards/accuracies": 0.53125, + "rewards/chosen": -2.806842803955078, + "rewards/margins": 0.05046519637107849, + "rewards/rejected": -2.8573079109191895, + "step": 5 + }, + { + "epoch": 0.02185792349726776, + "grad_norm": 63.846051899029185, + "learning_rate": 1.0869565217391303e-07, + "logits/chosen": -1.0520384311676025, + "logits/rejected": -1.0011743307113647, + "logps/chosen": -0.2570471167564392, + "logps/rejected": -0.27129054069519043, + "loss": 2.9747, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -2.5704712867736816, + "rewards/margins": 0.14243429899215698, + "rewards/rejected": -2.7129054069519043, + "step": 10 + }, + { + "epoch": 0.03278688524590164, + "grad_norm": 53.76693099062572, + "learning_rate": 1.6304347826086955e-07, + "logits/chosen": -1.0114173889160156, + "logits/rejected": -0.9646312594413757, + "logps/chosen": -0.2674410939216614, + "logps/rejected": -0.2732146680355072, + "loss": 2.9654, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -2.6744110584259033, + "rewards/margins": 0.05773543566465378, + "rewards/rejected": -2.7321462631225586, + "step": 15 + }, + { + "epoch": 0.04371584699453552, + "grad_norm": 70.16722328630453, + "learning_rate": 2.1739130434782607e-07, + "logits/chosen": -0.9457764625549316, + "logits/rejected": -0.8962175250053406, + "logps/chosen": -0.2723461091518402, + "logps/rejected": -0.2841888964176178, + "loss": 3.0079, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -2.723461151123047, + "rewards/margins": 0.11842777580022812, + "rewards/rejected": -2.841888904571533, + "step": 20 + }, + { + "epoch": 0.0546448087431694, + "grad_norm": 34.617399530191626, + "learning_rate": 2.717391304347826e-07, + "logits/chosen": -0.9401823282241821, + "logits/rejected": -0.8662004470825195, + "logps/chosen": -0.275082528591156, + "logps/rejected": -0.29355964064598083, + "loss": 2.9781, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -2.7508254051208496, + "rewards/margins": 0.18477120995521545, + "rewards/rejected": -2.935596466064453, + "step": 25 + }, + { + "epoch": 0.06557377049180328, + "grad_norm": 57.4910388890607, + "learning_rate": 3.260869565217391e-07, + "logits/chosen": -1.0440914630889893, + "logits/rejected": -0.9793885350227356, + "logps/chosen": -0.26507988572120667, + "logps/rejected": -0.2826555371284485, + "loss": 2.9819, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -2.650798797607422, + "rewards/margins": 0.17575649917125702, + "rewards/rejected": -2.8265554904937744, + "step": 30 + }, + { + "epoch": 0.07650273224043716, + "grad_norm": 54.66121032212012, + "learning_rate": 3.8043478260869567e-07, + "logits/chosen": -1.001513123512268, + "logits/rejected": -0.9358412027359009, + "logps/chosen": -0.2545512318611145, + "logps/rejected": -0.27621084451675415, + "loss": 2.9523, + "rewards/accuracies": 0.5062500238418579, + "rewards/chosen": -2.5455121994018555, + "rewards/margins": 0.21659617125988007, + "rewards/rejected": -2.762108325958252, + "step": 35 + }, + { + "epoch": 0.08743169398907104, + "grad_norm": 61.608388907145425, + "learning_rate": 4.3478260869565214e-07, + "logits/chosen": -0.9582087397575378, + "logits/rejected": -0.8961701393127441, + "logps/chosen": -0.28110748529434204, + "logps/rejected": -0.29402947425842285, + "loss": 3.0188, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -2.811074733734131, + "rewards/margins": 0.12921971082687378, + "rewards/rejected": -2.9402947425842285, + "step": 40 + }, + { + "epoch": 0.09836065573770492, + "grad_norm": 29.990450343406938, + "learning_rate": 4.891304347826087e-07, + "logits/chosen": -1.0058226585388184, + "logits/rejected": -0.923748791217804, + "logps/chosen": -0.2822369635105133, + "logps/rejected": -0.3045244514942169, + "loss": 2.9589, + "rewards/accuracies": 0.5062500238418579, + "rewards/chosen": -2.82236909866333, + "rewards/margins": 0.22287502884864807, + "rewards/rejected": -3.0452442169189453, + "step": 45 + }, + { + "epoch": 0.1092896174863388, + "grad_norm": 61.73061513054691, + "learning_rate": 5.434782608695652e-07, + "logits/chosen": -0.950430691242218, + "logits/rejected": -0.8683624267578125, + "logps/chosen": -0.27781787514686584, + "logps/rejected": -0.2830268144607544, + "loss": 2.9812, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": -2.7781786918640137, + "rewards/margins": 0.05208945274353027, + "rewards/rejected": -2.830268144607544, + "step": 50 + }, + { + "epoch": 0.12021857923497267, + "grad_norm": 33.948585932841986, + "learning_rate": 5.978260869565217e-07, + "logits/chosen": -0.9692623019218445, + "logits/rejected": -0.8552471399307251, + "logps/chosen": -0.2707405686378479, + "logps/rejected": -0.299907922744751, + "loss": 2.9042, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -2.7074055671691895, + "rewards/margins": 0.2916738986968994, + "rewards/rejected": -2.9990792274475098, + "step": 55 + }, + { + "epoch": 0.13114754098360656, + "grad_norm": 33.34905420886387, + "learning_rate": 6.521739130434782e-07, + "logits/chosen": -1.0130410194396973, + "logits/rejected": -0.9668112993240356, + "logps/chosen": -0.2595779299736023, + "logps/rejected": -0.3016397953033447, + "loss": 2.8565, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -2.5957789421081543, + "rewards/margins": 0.4206187129020691, + "rewards/rejected": -3.0163979530334473, + "step": 60 + }, + { + "epoch": 0.14207650273224043, + "grad_norm": 47.22484337976121, + "learning_rate": 7.065217391304348e-07, + "logits/chosen": -0.9941331148147583, + "logits/rejected": -0.9260708093643188, + "logps/chosen": -0.2983860373497009, + "logps/rejected": -0.3194151818752289, + "loss": 2.9452, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -2.983860492706299, + "rewards/margins": 0.2102913111448288, + "rewards/rejected": -3.1941516399383545, + "step": 65 + }, + { + "epoch": 0.15300546448087432, + "grad_norm": 80.82534906777506, + "learning_rate": 7.608695652173913e-07, + "logits/chosen": -0.958402156829834, + "logits/rejected": -0.9377009272575378, + "logps/chosen": -0.2862434983253479, + "logps/rejected": -0.30820196866989136, + "loss": 2.8832, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -2.8624351024627686, + "rewards/margins": 0.21958431601524353, + "rewards/rejected": -3.082019329071045, + "step": 70 + }, + { + "epoch": 0.16393442622950818, + "grad_norm": 36.06009692365191, + "learning_rate": 8.152173913043478e-07, + "logits/chosen": -0.9559575319290161, + "logits/rejected": -0.9389899969100952, + "logps/chosen": -0.29445773363113403, + "logps/rejected": -0.3360756039619446, + "loss": 2.916, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -2.944577217102051, + "rewards/margins": 0.4161788523197174, + "rewards/rejected": -3.3607559204101562, + "step": 75 + }, + { + "epoch": 0.17486338797814208, + "grad_norm": 49.3694605129278, + "learning_rate": 8.695652173913043e-07, + "logits/chosen": -0.9711343050003052, + "logits/rejected": -0.9096651077270508, + "logps/chosen": -0.29308563470840454, + "logps/rejected": -0.31533023715019226, + "loss": 2.8703, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -2.930856227874756, + "rewards/margins": 0.22244596481323242, + "rewards/rejected": -3.1533024311065674, + "step": 80 + }, + { + "epoch": 0.18579234972677597, + "grad_norm": 54.64036329213502, + "learning_rate": 9.239130434782608e-07, + "logits/chosen": -0.9719535112380981, + "logits/rejected": -0.9226737022399902, + "logps/chosen": -0.29737505316734314, + "logps/rejected": -0.34351009130477905, + "loss": 2.8675, + "rewards/accuracies": 0.5625, + "rewards/chosen": -2.973750352859497, + "rewards/margins": 0.46135035157203674, + "rewards/rejected": -3.435100555419922, + "step": 85 + }, + { + "epoch": 0.19672131147540983, + "grad_norm": 38.39312594421068, + "learning_rate": 9.782608695652173e-07, + "logits/chosen": -1.0393884181976318, + "logits/rejected": -0.9582293629646301, + "logps/chosen": -0.3267248272895813, + "logps/rejected": -0.3579455614089966, + "loss": 2.887, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -3.2672481536865234, + "rewards/margins": 0.3122076094150543, + "rewards/rejected": -3.579455852508545, + "step": 90 + }, + { + "epoch": 0.20765027322404372, + "grad_norm": 50.13238024700346, + "learning_rate": 9.999671349822886e-07, + "logits/chosen": -0.9711877107620239, + "logits/rejected": -0.9710756540298462, + "logps/chosen": -0.32033300399780273, + "logps/rejected": -0.34920942783355713, + "loss": 2.7879, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -3.2033302783966064, + "rewards/margins": 0.28876420855522156, + "rewards/rejected": -3.492094039916992, + "step": 95 + }, + { + "epoch": 0.2185792349726776, + "grad_norm": 68.48662325262401, + "learning_rate": 9.997663088532014e-07, + "logits/chosen": -0.9745977520942688, + "logits/rejected": -0.9276177287101746, + "logps/chosen": -0.36889714002609253, + "logps/rejected": -0.4320752024650574, + "loss": 2.8019, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -3.6889712810516357, + "rewards/margins": 0.6317806839942932, + "rewards/rejected": -4.320752143859863, + "step": 100 + }, + { + "epoch": 0.22950819672131148, + "grad_norm": 43.27710085441567, + "learning_rate": 9.9938298818292e-07, + "logits/chosen": -1.0145201683044434, + "logits/rejected": -0.9839836955070496, + "logps/chosen": -0.3395516276359558, + "logps/rejected": -0.39562201499938965, + "loss": 2.7773, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -3.3955161571502686, + "rewards/margins": 0.5607036352157593, + "rewards/rejected": -3.9562199115753174, + "step": 105 + }, + { + "epoch": 0.24043715846994534, + "grad_norm": 51.230908709126574, + "learning_rate": 9.98817312944725e-07, + "logits/chosen": -1.012632131576538, + "logits/rejected": -0.9917828440666199, + "logps/chosen": -0.3605939447879791, + "logps/rejected": -0.4645133912563324, + "loss": 2.7758, + "rewards/accuracies": 0.625, + "rewards/chosen": -3.6059391498565674, + "rewards/margins": 1.0391945838928223, + "rewards/rejected": -4.645133972167969, + "step": 110 + }, + { + "epoch": 0.25136612021857924, + "grad_norm": 51.557404433953074, + "learning_rate": 9.98069489700446e-07, + "logits/chosen": -1.0225282907485962, + "logits/rejected": -0.9822538495063782, + "logps/chosen": -0.36402350664138794, + "logps/rejected": -0.49069976806640625, + "loss": 2.7184, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -3.640235424041748, + "rewards/margins": 1.2667627334594727, + "rewards/rejected": -4.906998634338379, + "step": 115 + }, + { + "epoch": 0.26229508196721313, + "grad_norm": 39.49660551037683, + "learning_rate": 9.971397915250336e-07, + "logits/chosen": -1.046118140220642, + "logits/rejected": -0.9747602343559265, + "logps/chosen": -0.3749980330467224, + "logps/rejected": -0.4443044662475586, + "loss": 2.6921, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.7499804496765137, + "rewards/margins": 0.6930642127990723, + "rewards/rejected": -4.443044662475586, + "step": 120 + }, + { + "epoch": 0.273224043715847, + "grad_norm": 126.01262679098515, + "learning_rate": 9.960285579068417e-07, + "logits/chosen": -0.9501935839653015, + "logits/rejected": -0.9143295288085938, + "logps/chosen": -0.4171057641506195, + "logps/rejected": -0.5157137513160706, + "loss": 2.6822, + "rewards/accuracies": 0.6875, + "rewards/chosen": -4.17105770111084, + "rewards/margins": 0.9860798120498657, + "rewards/rejected": -5.157137393951416, + "step": 125 + }, + { + "epoch": 0.28415300546448086, + "grad_norm": 47.14315030112102, + "learning_rate": 9.94736194623663e-07, + "logits/chosen": -0.9490100741386414, + "logits/rejected": -0.9435502290725708, + "logps/chosen": -0.4328651428222656, + "logps/rejected": -0.5982568860054016, + "loss": 2.6973, + "rewards/accuracies": 0.6875, + "rewards/chosen": -4.328651428222656, + "rewards/margins": 1.6539175510406494, + "rewards/rejected": -5.98256778717041, + "step": 130 + }, + { + "epoch": 0.29508196721311475, + "grad_norm": 39.88678414865988, + "learning_rate": 9.932631735945526e-07, + "logits/chosen": -0.9944769144058228, + "logits/rejected": -0.9127768278121948, + "logps/chosen": -0.43029850721359253, + "logps/rejected": -0.5863696932792664, + "loss": 2.6024, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -4.302985191345215, + "rewards/margins": 1.5607118606567383, + "rewards/rejected": -5.863697528839111, + "step": 135 + }, + { + "epoch": 0.30601092896174864, + "grad_norm": 45.7273688529439, + "learning_rate": 9.916100327075037e-07, + "logits/chosen": -0.9914215207099915, + "logits/rejected": -0.9394119381904602, + "logps/chosen": -0.47816991806030273, + "logps/rejected": -0.682498574256897, + "loss": 2.3785, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -4.781699180603027, + "rewards/margins": 2.0432868003845215, + "rewards/rejected": -6.824985504150391, + "step": 140 + }, + { + "epoch": 0.31693989071038253, + "grad_norm": 59.43998889151297, + "learning_rate": 9.89777375623032e-07, + "logits/chosen": -0.9755287170410156, + "logits/rejected": -0.9593987464904785, + "logps/chosen": -0.4856489300727844, + "logps/rejected": -0.5857899188995361, + "loss": 2.5053, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -4.856489181518555, + "rewards/margins": 1.0014104843139648, + "rewards/rejected": -5.8578996658325195, + "step": 145 + }, + { + "epoch": 0.32786885245901637, + "grad_norm": 47.63474073428632, + "learning_rate": 9.877658715537428e-07, + "logits/chosen": -1.0314117670059204, + "logits/rejected": -1.0008752346038818, + "logps/chosen": -0.5753797292709351, + "logps/rejected": -0.8458330035209656, + "loss": 2.3952, + "rewards/accuracies": 0.71875, + "rewards/chosen": -5.753798007965088, + "rewards/margins": 2.7045321464538574, + "rewards/rejected": -8.458330154418945, + "step": 150 + }, + { + "epoch": 0.33879781420765026, + "grad_norm": 65.52771946076592, + "learning_rate": 9.85576255019963e-07, + "logits/chosen": -0.9931026697158813, + "logits/rejected": -0.9415470361709595, + "logps/chosen": -0.5901178121566772, + "logps/rejected": -0.7857885360717773, + "loss": 2.3737, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -5.901178359985352, + "rewards/margins": 1.9567070007324219, + "rewards/rejected": -7.857884883880615, + "step": 155 + }, + { + "epoch": 0.34972677595628415, + "grad_norm": 62.974125126106905, + "learning_rate": 9.832093255815216e-07, + "logits/chosen": -1.0444536209106445, + "logits/rejected": -0.9929295778274536, + "logps/chosen": -0.7247758507728577, + "logps/rejected": -0.9017370939254761, + "loss": 2.3973, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -7.247758388519287, + "rewards/margins": 1.7696129083633423, + "rewards/rejected": -9.01737117767334, + "step": 160 + }, + { + "epoch": 0.36065573770491804, + "grad_norm": 63.72395656615203, + "learning_rate": 9.806659475457849e-07, + "logits/chosen": -1.0410820245742798, + "logits/rejected": -0.9887404441833496, + "logps/chosen": -0.7456644773483276, + "logps/rejected": -0.9195586442947388, + "loss": 2.3301, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -7.4566450119018555, + "rewards/margins": 1.7389415502548218, + "rewards/rejected": -9.195585250854492, + "step": 165 + }, + { + "epoch": 0.37158469945355194, + "grad_norm": 66.75690610820409, + "learning_rate": 9.779470496520441e-07, + "logits/chosen": -1.0688705444335938, + "logits/rejected": -1.0115132331848145, + "logps/chosen": -0.7449184656143188, + "logps/rejected": -0.9542142152786255, + "loss": 2.3577, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -7.449184417724609, + "rewards/margins": 2.0929577350616455, + "rewards/rejected": -9.542141914367676, + "step": 170 + }, + { + "epoch": 0.3825136612021858, + "grad_norm": 49.727137600047485, + "learning_rate": 9.750536247323789e-07, + "logits/chosen": -1.141492486000061, + "logits/rejected": -1.1157532930374146, + "logps/chosen": -0.8256582021713257, + "logps/rejected": -0.9864746332168579, + "loss": 2.2519, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -8.256583213806152, + "rewards/margins": 1.6081632375717163, + "rewards/rejected": -9.86474609375, + "step": 175 + }, + { + "epoch": 0.39344262295081966, + "grad_norm": 52.45035344166609, + "learning_rate": 9.719867293491144e-07, + "logits/chosen": -1.1214529275894165, + "logits/rejected": -1.1165130138397217, + "logps/chosen": -0.7945824861526489, + "logps/rejected": -1.1331901550292969, + "loss": 2.1564, + "rewards/accuracies": 0.78125, + "rewards/chosen": -7.94582462310791, + "rewards/margins": 3.3860764503479004, + "rewards/rejected": -11.331900596618652, + "step": 180 + }, + { + "epoch": 0.40437158469945356, + "grad_norm": 47.993586496085484, + "learning_rate": 9.687474834090067e-07, + "logits/chosen": -1.1440832614898682, + "logits/rejected": -1.1622083187103271, + "logps/chosen": -0.8209633827209473, + "logps/rejected": -1.1528552770614624, + "loss": 2.1291, + "rewards/accuracies": 0.8125, + "rewards/chosen": -8.209634780883789, + "rewards/margins": 3.3189189434051514, + "rewards/rejected": -11.52855396270752, + "step": 185 + }, + { + "epoch": 0.41530054644808745, + "grad_norm": 68.1701064534262, + "learning_rate": 9.653370697542987e-07, + "logits/chosen": -1.146360158920288, + "logits/rejected": -1.1054737567901611, + "logps/chosen": -0.8023772239685059, + "logps/rejected": -1.1262474060058594, + "loss": 2.1611, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -8.023773193359375, + "rewards/margins": 3.238701581954956, + "rewards/rejected": -11.262474060058594, + "step": 190 + }, + { + "epoch": 0.4262295081967213, + "grad_norm": 53.904696843657476, + "learning_rate": 9.617567337307935e-07, + "logits/chosen": -1.1726776361465454, + "logits/rejected": -1.1539947986602783, + "logps/chosen": -0.9123600125312805, + "logps/rejected": -1.2916643619537354, + "loss": 2.1905, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -9.123600006103516, + "rewards/margins": 3.793043613433838, + "rewards/rejected": -12.916644096374512, + "step": 195 + }, + { + "epoch": 0.4371584699453552, + "grad_norm": 54.28658232674653, + "learning_rate": 9.580077827331037e-07, + "logits/chosen": -1.1374239921569824, + "logits/rejected": -1.0560877323150635, + "logps/chosen": -0.8440427780151367, + "logps/rejected": -1.1447341442108154, + "loss": 2.1572, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -8.440427780151367, + "rewards/margins": 3.0069146156311035, + "rewards/rejected": -11.447342872619629, + "step": 200 + }, + { + "epoch": 0.44808743169398907, + "grad_norm": 47.58008461187375, + "learning_rate": 9.540915857272445e-07, + "logits/chosen": -1.113061785697937, + "logits/rejected": -1.1297590732574463, + "logps/chosen": -0.7553713917732239, + "logps/rejected": -1.0475962162017822, + "loss": 2.0812, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -7.553713798522949, + "rewards/margins": 2.9222495555877686, + "rewards/rejected": -10.47596263885498, + "step": 205 + }, + { + "epoch": 0.45901639344262296, + "grad_norm": 72.2337993655073, + "learning_rate": 9.500095727507419e-07, + "logits/chosen": -1.1518081426620483, + "logits/rejected": -1.1548099517822266, + "logps/chosen": -0.811726450920105, + "logps/rejected": -1.2030248641967773, + "loss": 1.9857, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -8.117263793945312, + "rewards/margins": 3.912985324859619, + "rewards/rejected": -12.030248641967773, + "step": 210 + }, + { + "epoch": 0.46994535519125685, + "grad_norm": 75.8271820255055, + "learning_rate": 9.457632343904402e-07, + "logits/chosen": -1.1457799673080444, + "logits/rejected": -1.0930476188659668, + "logps/chosen": -0.8380640745162964, + "logps/rejected": -1.2331631183624268, + "loss": 2.068, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -8.380640983581543, + "rewards/margins": 3.950991153717041, + "rewards/rejected": -12.331632614135742, + "step": 215 + }, + { + "epoch": 0.4808743169398907, + "grad_norm": 47.21678860467139, + "learning_rate": 9.413541212382004e-07, + "logits/chosen": -1.1901623010635376, + "logits/rejected": -1.1682528257369995, + "logps/chosen": -0.8619640469551086, + "logps/rejected": -1.1715562343597412, + "loss": 2.0188, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -8.619640350341797, + "rewards/margins": 3.0959222316741943, + "rewards/rejected": -11.71556282043457, + "step": 220 + }, + { + "epoch": 0.4918032786885246, + "grad_norm": 51.414680735085476, + "learning_rate": 9.367838433246857e-07, + "logits/chosen": -1.2050046920776367, + "logits/rejected": -1.165052056312561, + "logps/chosen": -0.8373457789421082, + "logps/rejected": -1.2013850212097168, + "loss": 1.9761, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -8.373457908630371, + "rewards/margins": 3.640392303466797, + "rewards/rejected": -12.013849258422852, + "step": 225 + }, + { + "epoch": 0.5027322404371585, + "grad_norm": 52.749993213229466, + "learning_rate": 9.320540695314438e-07, + "logits/chosen": -1.1119884252548218, + "logits/rejected": -1.1168959140777588, + "logps/chosen": -0.835217297077179, + "logps/rejected": -1.2099316120147705, + "loss": 2.0374, + "rewards/accuracies": 0.8125, + "rewards/chosen": -8.352171897888184, + "rewards/margins": 3.747142791748047, + "rewards/rejected": -12.099315643310547, + "step": 230 + }, + { + "epoch": 0.5136612021857924, + "grad_norm": 57.97242786280693, + "learning_rate": 9.271665269814983e-07, + "logits/chosen": -1.1533162593841553, + "logits/rejected": -1.1169049739837646, + "logps/chosen": -0.8319026231765747, + "logps/rejected": -1.1632182598114014, + "loss": 1.9568, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -8.319025039672852, + "rewards/margins": 3.3131580352783203, + "rewards/rejected": -11.632184982299805, + "step": 235 + }, + { + "epoch": 0.5245901639344263, + "grad_norm": 55.73546760366859, + "learning_rate": 9.221230004086721e-07, + "logits/chosen": -1.213132619857788, + "logits/rejected": -1.224669098854065, + "logps/chosen": -0.7961581945419312, + "logps/rejected": -1.243761658668518, + "loss": 1.8379, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -7.961583137512207, + "rewards/margins": 4.476034164428711, + "rewards/rejected": -12.437616348266602, + "step": 240 + }, + { + "epoch": 0.5355191256830601, + "grad_norm": 47.285753406270835, + "learning_rate": 9.169253315058763e-07, + "logits/chosen": -1.1385068893432617, + "logits/rejected": -1.1000906229019165, + "logps/chosen": -0.858871340751648, + "logps/rejected": -1.3097403049468994, + "loss": 1.9234, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -8.588713645935059, + "rewards/margins": 4.508688926696777, + "rewards/rejected": -13.097402572631836, + "step": 245 + }, + { + "epoch": 0.546448087431694, + "grad_norm": 46.893831371795955, + "learning_rate": 9.11575418252596e-07, + "logits/chosen": -1.1890825033187866, + "logits/rejected": -1.1559849977493286, + "logps/chosen": -0.8301135301589966, + "logps/rejected": -1.1934387683868408, + "loss": 1.9956, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -8.301135063171387, + "rewards/margins": 3.6332526206970215, + "rewards/rejected": -11.934389114379883, + "step": 250 + }, + { + "epoch": 0.5573770491803278, + "grad_norm": 55.2826289688807, + "learning_rate": 9.060752142218257e-07, + "logits/chosen": -1.1763793230056763, + "logits/rejected": -1.1470096111297607, + "logps/chosen": -0.8657994270324707, + "logps/rejected": -1.3209015130996704, + "loss": 1.8569, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -8.657995223999023, + "rewards/margins": 4.551021099090576, + "rewards/rejected": -13.209016799926758, + "step": 255 + }, + { + "epoch": 0.5683060109289617, + "grad_norm": 43.2201073763009, + "learning_rate": 9.004267278667031e-07, + "logits/chosen": -1.14817214012146, + "logits/rejected": -1.1386573314666748, + "logps/chosen": -0.814948558807373, + "logps/rejected": -1.2620770931243896, + "loss": 1.8509, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -8.14948558807373, + "rewards/margins": 4.471285820007324, + "rewards/rejected": -12.620772361755371, + "step": 260 + }, + { + "epoch": 0.5792349726775956, + "grad_norm": 46.44813195636061, + "learning_rate": 8.946320217871025e-07, + "logits/chosen": -1.137434720993042, + "logits/rejected": -1.1053473949432373, + "logps/chosen": -0.8041390180587769, + "logps/rejected": -1.2295944690704346, + "loss": 1.8193, + "rewards/accuracies": 0.8125, + "rewards/chosen": -8.041391372680664, + "rewards/margins": 4.2545552253723145, + "rewards/rejected": -12.29594612121582, + "step": 265 + }, + { + "epoch": 0.5901639344262295, + "grad_norm": 77.85873013123702, + "learning_rate": 8.886932119764565e-07, + "logits/chosen": -1.1461609601974487, + "logits/rejected": -1.1255290508270264, + "logps/chosen": -0.7929703593254089, + "logps/rejected": -1.264559030532837, + "loss": 1.7945, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -7.929703712463379, + "rewards/margins": 4.715887546539307, + "rewards/rejected": -12.645589828491211, + "step": 270 + }, + { + "epoch": 0.6010928961748634, + "grad_norm": 47.47122746211464, + "learning_rate": 8.826124670490802e-07, + "logits/chosen": -1.1295936107635498, + "logits/rejected": -1.0679261684417725, + "logps/chosen": -0.814121425151825, + "logps/rejected": -1.1613472700119019, + "loss": 1.7983, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -8.141213417053223, + "rewards/margins": 3.472259521484375, + "rewards/rejected": -11.613473892211914, + "step": 275 + }, + { + "epoch": 0.6120218579234973, + "grad_norm": 48.73236557934882, + "learning_rate": 8.763920074482809e-07, + "logits/chosen": -1.0854823589324951, + "logits/rejected": -1.0907032489776611, + "logps/chosen": -0.8657575845718384, + "logps/rejected": -1.4133459329605103, + "loss": 1.8024, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -8.657574653625488, + "rewards/margins": 5.475884437561035, + "rewards/rejected": -14.133459091186523, + "step": 280 + }, + { + "epoch": 0.6229508196721312, + "grad_norm": 47.765718733774406, + "learning_rate": 8.700341046355411e-07, + "logits/chosen": -1.2173136472702026, + "logits/rejected": -1.1900323629379272, + "logps/chosen": -0.8186389803886414, + "logps/rejected": -1.356453537940979, + "loss": 1.7205, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -8.186389923095703, + "rewards/margins": 5.37814474105835, + "rewards/rejected": -13.564535140991211, + "step": 285 + }, + { + "epoch": 0.6338797814207651, + "grad_norm": 109.29731033378253, + "learning_rate": 8.635410802610723e-07, + "logits/chosen": -1.1491611003875732, + "logits/rejected": -1.1173255443572998, + "logps/chosen": -0.8337961435317993, + "logps/rejected": -1.3431804180145264, + "loss": 1.7427, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -8.337961196899414, + "rewards/margins": 5.093844413757324, + "rewards/rejected": -13.431805610656738, + "step": 290 + }, + { + "epoch": 0.644808743169399, + "grad_norm": 52.41645016824533, + "learning_rate": 8.569153053160428e-07, + "logits/chosen": -1.1350514888763428, + "logits/rejected": -1.1325361728668213, + "logps/chosen": -0.8541259765625, + "logps/rejected": -1.4412286281585693, + "loss": 1.7092, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -8.541260719299316, + "rewards/margins": 5.871027946472168, + "rewards/rejected": -14.412287712097168, + "step": 295 + }, + { + "epoch": 0.6557377049180327, + "grad_norm": 39.769668748500315, + "learning_rate": 8.501591992667849e-07, + "logits/chosen": -1.1677170991897583, + "logits/rejected": -1.1525037288665771, + "logps/chosen": -0.8974548578262329, + "logps/rejected": -1.4408533573150635, + "loss": 1.7174, + "rewards/accuracies": 0.75, + "rewards/chosen": -8.974547386169434, + "rewards/margins": 5.433985710144043, + "rewards/rejected": -14.408534049987793, + "step": 300 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 62.1991466431804, + "learning_rate": 8.432752291713058e-07, + "logits/chosen": -1.1649607419967651, + "logits/rejected": -1.1135450601577759, + "logps/chosen": -0.863819420337677, + "logps/rejected": -1.4759416580200195, + "loss": 1.7163, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -8.63819408416748, + "rewards/margins": 6.121220588684082, + "rewards/rejected": -14.759414672851562, + "step": 305 + }, + { + "epoch": 0.6775956284153005, + "grad_norm": 46.51957742191928, + "learning_rate": 8.362659087784152e-07, + "logits/chosen": -1.0963289737701416, + "logits/rejected": -1.1045761108398438, + "logps/chosen": -0.8705239295959473, + "logps/rejected": -1.4945770502090454, + "loss": 1.7561, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -8.705240249633789, + "rewards/margins": 6.240530967712402, + "rewards/rejected": -14.945770263671875, + "step": 310 + }, + { + "epoch": 0.6885245901639344, + "grad_norm": 48.05453083003605, + "learning_rate": 8.291337976098067e-07, + "logits/chosen": -1.1423165798187256, + "logits/rejected": -1.1360952854156494, + "logps/chosen": -0.931683361530304, + "logps/rejected": -1.385315179824829, + "loss": 1.717, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -9.316834449768066, + "rewards/margins": 4.536317348480225, + "rewards/rejected": -13.853151321411133, + "step": 315 + }, + { + "epoch": 0.6994535519125683, + "grad_norm": 48.54110161857607, + "learning_rate": 8.218815000254231e-07, + "logits/chosen": -1.1940380334854126, + "logits/rejected": -1.1442067623138428, + "logps/chosen": -0.8465646505355835, + "logps/rejected": -1.3880670070648193, + "loss": 1.6584, + "rewards/accuracies": 0.84375, + "rewards/chosen": -8.465646743774414, + "rewards/margins": 5.415023326873779, + "rewards/rejected": -13.880670547485352, + "step": 320 + }, + { + "epoch": 0.7103825136612022, + "grad_norm": 65.95615056402572, + "learning_rate": 8.145116642724485e-07, + "logits/chosen": -1.1649667024612427, + "logits/rejected": -1.1449640989303589, + "logps/chosen": -0.8567377328872681, + "logps/rejected": -1.3750416040420532, + "loss": 1.6571, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -8.567378044128418, + "rewards/margins": 5.183037757873535, + "rewards/rejected": -13.750414848327637, + "step": 325 + }, + { + "epoch": 0.7213114754098361, + "grad_norm": 49.39205034599243, + "learning_rate": 8.07026981518276e-07, + "logits/chosen": -1.0772454738616943, + "logits/rejected": -1.0332270860671997, + "logps/chosen": -0.8580091595649719, + "logps/rejected": -1.7133315801620483, + "loss": 1.5704, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -8.580090522766113, + "rewards/margins": 8.55322265625, + "rewards/rejected": -17.13331413269043, + "step": 330 + }, + { + "epoch": 0.73224043715847, + "grad_norm": 53.718643483464305, + "learning_rate": 7.994301848678004e-07, + "logits/chosen": -1.0714246034622192, + "logits/rejected": -1.0167793035507202, + "logps/chosen": -0.8965083360671997, + "logps/rejected": -1.6172587871551514, + "loss": 1.6011, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -8.965082168579102, + "rewards/margins": 7.2075066566467285, + "rewards/rejected": -16.172588348388672, + "step": 335 + }, + { + "epoch": 0.7431693989071039, + "grad_norm": 54.634488875444745, + "learning_rate": 7.917240483654e-07, + "logits/chosen": -1.0873680114746094, + "logits/rejected": -1.0355134010314941, + "logps/chosen": -0.8684855699539185, + "logps/rejected": -1.5925236940383911, + "loss": 1.6421, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -8.684855461120605, + "rewards/margins": 7.240380764007568, + "rewards/rejected": -15.925236701965332, + "step": 340 + }, + { + "epoch": 0.7540983606557377, + "grad_norm": 51.28688313391757, + "learning_rate": 7.839113859819656e-07, + "logits/chosen": -1.1350085735321045, + "logits/rejected": -1.1159374713897705, + "logps/chosen": -0.9655283689498901, + "logps/rejected": -1.7449572086334229, + "loss": 1.656, + "rewards/accuracies": 0.84375, + "rewards/chosen": -9.65528392791748, + "rewards/margins": 7.794284820556641, + "rewards/rejected": -17.449567794799805, + "step": 345 + }, + { + "epoch": 0.7650273224043715, + "grad_norm": 55.37179382269985, + "learning_rate": 7.759950505873521e-07, + "logits/chosen": -1.1313519477844238, + "logits/rejected": -1.1056666374206543, + "logps/chosen": -0.7726918458938599, + "logps/rejected": -1.281256079673767, + "loss": 1.5694, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -7.726918697357178, + "rewards/margins": 5.0856428146362305, + "rewards/rejected": -12.81256103515625, + "step": 350 + }, + { + "epoch": 0.7759562841530054, + "grad_norm": 53.34572563403844, + "learning_rate": 7.67977932908626e-07, + "logits/chosen": -1.108883261680603, + "logits/rejected": -1.0669116973876953, + "logps/chosen": -0.7748141288757324, + "logps/rejected": -1.440830945968628, + "loss": 1.587, + "rewards/accuracies": 0.84375, + "rewards/chosen": -7.74813985824585, + "rewards/margins": 6.660167694091797, + "rewards/rejected": -14.408308029174805, + "step": 355 + }, + { + "epoch": 0.7868852459016393, + "grad_norm": 48.16281114348793, + "learning_rate": 7.598629604744872e-07, + "logits/chosen": -1.0789738893508911, + "logits/rejected": -1.060198187828064, + "logps/chosen": -0.9467193484306335, + "logps/rejected": -1.7663625478744507, + "loss": 1.5215, + "rewards/accuracies": 0.8125, + "rewards/chosen": -9.467192649841309, + "rewards/margins": 8.196432113647461, + "rewards/rejected": -17.663623809814453, + "step": 360 + }, + { + "epoch": 0.7978142076502732, + "grad_norm": 63.80877540833078, + "learning_rate": 7.516530965462539e-07, + "logits/chosen": -1.163653016090393, + "logits/rejected": -1.1556129455566406, + "logps/chosen": -0.8032275438308716, + "logps/rejected": -1.5929330587387085, + "loss": 1.5335, + "rewards/accuracies": 0.84375, + "rewards/chosen": -8.03227424621582, + "rewards/margins": 7.897056579589844, + "rewards/rejected": -15.929330825805664, + "step": 365 + }, + { + "epoch": 0.8087431693989071, + "grad_norm": 48.323366094845305, + "learning_rate": 7.433513390357989e-07, + "logits/chosen": -1.1779323816299438, + "logits/rejected": -1.1376478672027588, + "logps/chosen": -0.8401437997817993, + "logps/rejected": -1.704115629196167, + "loss": 1.4945, + "rewards/accuracies": 0.84375, + "rewards/chosen": -8.401437759399414, + "rewards/margins": 8.639719009399414, + "rewards/rejected": -17.041156768798828, + "step": 370 + }, + { + "epoch": 0.819672131147541, + "grad_norm": 56.42804006279489, + "learning_rate": 7.349607194108322e-07, + "logits/chosen": -1.211778163909912, + "logits/rejected": -1.1346906423568726, + "logps/chosen": -0.8210548162460327, + "logps/rejected": -1.5874106884002686, + "loss": 1.4902, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -8.210548400878906, + "rewards/margins": 7.6635589599609375, + "rewards/rejected": -15.874107360839844, + "step": 375 + }, + { + "epoch": 0.8306010928961749, + "grad_norm": 45.567036459266035, + "learning_rate": 7.264843015879321e-07, + "logits/chosen": -1.0812907218933105, + "logits/rejected": -1.0929306745529175, + "logps/chosen": -0.8382685780525208, + "logps/rejected": -1.5565245151519775, + "loss": 1.3906, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -8.382685661315918, + "rewards/margins": 7.182559967041016, + "rewards/rejected": -15.565244674682617, + "step": 380 + }, + { + "epoch": 0.8415300546448088, + "grad_norm": 60.144174866919506, + "learning_rate": 7.17925180813725e-07, + "logits/chosen": -1.1758795976638794, + "logits/rejected": -1.1344640254974365, + "logps/chosen": -0.9767251014709473, + "logps/rejected": -2.0028223991394043, + "loss": 1.5811, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -9.767251968383789, + "rewards/margins": 10.260972023010254, + "rewards/rejected": -20.028223037719727, + "step": 385 + }, + { + "epoch": 0.8524590163934426, + "grad_norm": 65.50587766305415, + "learning_rate": 7.092864825346266e-07, + "logits/chosen": -1.1589624881744385, + "logits/rejected": -1.1095167398452759, + "logps/chosen": -0.8335350155830383, + "logps/rejected": -1.9316027164459229, + "loss": 1.4313, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -8.335351943969727, + "rewards/margins": 10.980676651000977, + "rewards/rejected": -19.316028594970703, + "step": 390 + }, + { + "epoch": 0.8633879781420765, + "grad_norm": 55.45278300982558, + "learning_rate": 7.005713612555545e-07, + "logits/chosen": -1.1298582553863525, + "logits/rejected": -1.1044299602508545, + "logps/chosen": -0.8622570037841797, + "logps/rejected": -1.6899712085723877, + "loss": 1.4256, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -8.622570037841797, + "rewards/margins": 8.277142524719238, + "rewards/rejected": -16.89971351623535, + "step": 395 + }, + { + "epoch": 0.8743169398907104, + "grad_norm": 51.76941664963157, + "learning_rate": 6.917829993880302e-07, + "logits/chosen": -1.0573416948318481, + "logits/rejected": -1.0199925899505615, + "logps/chosen": -0.8631765246391296, + "logps/rejected": -1.7595199346542358, + "loss": 1.4299, + "rewards/accuracies": 0.875, + "rewards/chosen": -8.631765365600586, + "rewards/margins": 8.963434219360352, + "rewards/rejected": -17.595197677612305, + "step": 400 + }, + { + "epoch": 0.8743169398907104, + "eval_logits/chosen": -1.329952597618103, + "eval_logits/rejected": -1.276990532875061, + "eval_logps/chosen": -0.838367760181427, + "eval_logps/rejected": -1.7586109638214111, + "eval_loss": 1.4681804180145264, + "eval_rewards/accuracies": 0.8704819083213806, + "eval_rewards/chosen": -8.38367748260498, + "eval_rewards/margins": 9.202432632446289, + "eval_rewards/rejected": -17.58610725402832, + "eval_runtime": 37.0639, + "eval_samples_per_second": 35.56, + "eval_steps_per_second": 2.239, + "step": 400 + }, + { + "epoch": 0.8852459016393442, + "grad_norm": 50.42729833432404, + "learning_rate": 6.8292460608809e-07, + "logits/chosen": -1.1105328798294067, + "logits/rejected": -1.030253529548645, + "logps/chosen": -0.8442754745483398, + "logps/rejected": -1.8324899673461914, + "loss": 1.4028, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -8.442753791809082, + "rewards/margins": 9.882145881652832, + "rewards/rejected": -18.324899673461914, + "step": 405 + }, + { + "epoch": 0.8961748633879781, + "grad_norm": 52.61786510417617, + "learning_rate": 6.739994160844309e-07, + "logits/chosen": -1.1255931854248047, + "logits/rejected": -1.1459242105484009, + "logps/chosen": -0.9382074475288391, + "logps/rejected": -2.0845236778259277, + "loss": 1.3773, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -9.382074356079102, + "rewards/margins": 11.463163375854492, + "rewards/rejected": -20.845239639282227, + "step": 410 + }, + { + "epoch": 0.907103825136612, + "grad_norm": 53.20615382876954, + "learning_rate": 6.650106884972176e-07, + "logits/chosen": -1.174919605255127, + "logits/rejected": -1.161853551864624, + "logps/chosen": -0.812863826751709, + "logps/rejected": -1.9977823495864868, + "loss": 1.4673, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -8.12863826751709, + "rewards/margins": 11.8491849899292, + "rewards/rejected": -19.97782325744629, + "step": 415 + }, + { + "epoch": 0.9180327868852459, + "grad_norm": 72.30539481567058, + "learning_rate": 6.559617056479827e-07, + "logits/chosen": -1.1823254823684692, + "logits/rejected": -1.1509783267974854, + "logps/chosen": -0.9243672490119934, + "logps/rejected": -2.090036392211914, + "loss": 1.3848, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -9.243673324584961, + "rewards/margins": 11.656692504882812, + "rewards/rejected": -20.90036392211914, + "step": 420 + }, + { + "epoch": 0.9289617486338798, + "grad_norm": 56.55049259469843, + "learning_rate": 6.468557718610559e-07, + "logits/chosen": -1.1578831672668457, + "logits/rejected": -1.1295568943023682, + "logps/chosen": -0.9935392141342163, + "logps/rejected": -2.230989456176758, + "loss": 1.4147, + "rewards/accuracies": 0.84375, + "rewards/chosen": -9.935392379760742, + "rewards/margins": 12.37450122833252, + "rewards/rejected": -22.309894561767578, + "step": 425 + }, + { + "epoch": 0.9398907103825137, + "grad_norm": 53.30457991286137, + "learning_rate": 6.376962122569567e-07, + "logits/chosen": -1.085447072982788, + "logits/rejected": -1.0933465957641602, + "logps/chosen": -0.6574488282203674, + "logps/rejected": -1.6747887134552002, + "loss": 1.1961, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -6.574488162994385, + "rewards/margins": 10.17340087890625, + "rewards/rejected": -16.74789047241211, + "step": 430 + }, + { + "epoch": 0.9508196721311475, + "grad_norm": 64.87387340664675, + "learning_rate": 6.284863715381948e-07, + "logits/chosen": -1.1618143320083618, + "logits/rejected": -1.1672941446304321, + "logps/chosen": -0.826119601726532, + "logps/rejected": -2.0510828495025635, + "loss": 1.3657, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -8.26119613647461, + "rewards/margins": 12.249631881713867, + "rewards/rejected": -20.510828018188477, + "step": 435 + }, + { + "epoch": 0.9617486338797814, + "grad_norm": 75.88914587151349, + "learning_rate": 6.192296127679192e-07, + "logits/chosen": -1.1188112497329712, + "logits/rejected": -1.0653330087661743, + "logps/chosen": -0.856960117816925, + "logps/rejected": -1.874408483505249, + "loss": 1.3724, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -8.569601058959961, + "rewards/margins": 10.174482345581055, + "rewards/rejected": -18.744083404541016, + "step": 440 + }, + { + "epoch": 0.9726775956284153, + "grad_norm": 58.010471678644066, + "learning_rate": 6.099293161418629e-07, + "logits/chosen": -1.1678813695907593, + "logits/rejected": -1.1374809741973877, + "logps/chosen": -0.7309656143188477, + "logps/rejected": -1.9285995960235596, + "loss": 1.3965, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -7.309657096862793, + "rewards/margins": 11.976339340209961, + "rewards/rejected": -19.285995483398438, + "step": 445 + }, + { + "epoch": 0.9836065573770492, + "grad_norm": 59.07594471966485, + "learning_rate": 6.005888777540319e-07, + "logits/chosen": -1.0870949029922485, + "logits/rejected": -1.0754623413085938, + "logps/chosen": -0.8063844442367554, + "logps/rejected": -1.78205144405365, + "loss": 1.382, + "rewards/accuracies": 0.875, + "rewards/chosen": -8.063844680786133, + "rewards/margins": 9.756668090820312, + "rewards/rejected": -17.820514678955078, + "step": 450 + }, + { + "epoch": 0.994535519125683, + "grad_norm": 64.98206511650474, + "learning_rate": 5.912117083565873e-07, + "logits/chosen": -1.110975980758667, + "logits/rejected": -1.0932872295379639, + "logps/chosen": -1.0176341533660889, + "logps/rejected": -2.0513908863067627, + "loss": 1.4187, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -10.176340103149414, + "rewards/margins": 10.337566375732422, + "rewards/rejected": -20.513906478881836, + "step": 455 + }, + { + "epoch": 1.005464480874317, + "grad_norm": 46.23007729679256, + "learning_rate": 5.818012321143773e-07, + "logits/chosen": -1.1477210521697998, + "logits/rejected": -1.1113519668579102, + "logps/chosen": -0.8239518404006958, + "logps/rejected": -2.042766809463501, + "loss": 1.2383, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -8.239518165588379, + "rewards/margins": 12.188150405883789, + "rewards/rejected": -20.42766761779785, + "step": 460 + }, + { + "epoch": 1.0163934426229508, + "grad_norm": 47.735878726897354, + "learning_rate": 5.723608853545684e-07, + "logits/chosen": -1.1907925605773926, + "logits/rejected": -1.1501821279525757, + "logps/chosen": -0.7988258600234985, + "logps/rejected": -2.209643840789795, + "loss": 0.9683, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": -7.988258361816406, + "rewards/margins": 14.108179092407227, + "rewards/rejected": -22.096435546875, + "step": 465 + }, + { + "epoch": 1.0273224043715847, + "grad_norm": 61.73193515780372, + "learning_rate": 5.628941153118388e-07, + "logits/chosen": -1.154846429824829, + "logits/rejected": -1.1379241943359375, + "logps/chosen": -0.8357902765274048, + "logps/rejected": -2.2289681434631348, + "loss": 0.971, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": -8.357902526855469, + "rewards/margins": 13.931780815124512, + "rewards/rejected": -22.289682388305664, + "step": 470 + }, + { + "epoch": 1.0382513661202186, + "grad_norm": 40.781569445585546, + "learning_rate": 5.534043788695852e-07, + "logits/chosen": -1.1368563175201416, + "logits/rejected": -1.0872905254364014, + "logps/chosen": -0.7054915428161621, + "logps/rejected": -1.9688949584960938, + "loss": 0.9335, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": -7.054915428161621, + "rewards/margins": 12.634035110473633, + "rewards/rejected": -19.688949584960938, + "step": 475 + }, + { + "epoch": 1.0491803278688525, + "grad_norm": 39.60063818333042, + "learning_rate": 5.438951412976098e-07, + "logits/chosen": -1.2157926559448242, + "logits/rejected": -1.1730263233184814, + "logps/chosen": -0.7333913445472717, + "logps/rejected": -1.9154350757598877, + "loss": 0.9748, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": -7.3339128494262695, + "rewards/margins": 11.82043743133545, + "rewards/rejected": -19.15435218811035, + "step": 480 + }, + { + "epoch": 1.0601092896174864, + "grad_norm": 36.9883641416429, + "learning_rate": 5.34369874986742e-07, + "logits/chosen": -1.1718839406967163, + "logits/rejected": -1.1198641061782837, + "logps/chosen": -0.8223574757575989, + "logps/rejected": -2.161813259124756, + "loss": 0.86, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -8.2235746383667, + "rewards/margins": 13.394556045532227, + "rewards/rejected": -21.61812973022461, + "step": 485 + }, + { + "epoch": 1.0710382513661203, + "grad_norm": 45.67927644257257, + "learning_rate": 5.248320581808619e-07, + "logits/chosen": -1.1171070337295532, + "logits/rejected": -1.078286051750183, + "logps/chosen": -0.6840003728866577, + "logps/rejected": -2.086259126663208, + "loss": 0.952, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -6.840004920959473, + "rewards/margins": 14.022584915161133, + "rewards/rejected": -20.862590789794922, + "step": 490 + }, + { + "epoch": 1.0819672131147542, + "grad_norm": 49.80369052011741, + "learning_rate": 5.15285173706785e-07, + "logits/chosen": -1.200192928314209, + "logits/rejected": -1.164684534072876, + "logps/chosen": -0.6610619425773621, + "logps/rejected": -1.9493736028671265, + "loss": 0.9254, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -6.61061954498291, + "rewards/margins": 12.88311767578125, + "rewards/rejected": -19.493736267089844, + "step": 495 + }, + { + "epoch": 1.092896174863388, + "grad_norm": 63.052651567341215, + "learning_rate": 5.057327077024744e-07, + "logits/chosen": -1.235033392906189, + "logits/rejected": -1.1504552364349365, + "logps/chosen": -0.7189664840698242, + "logps/rejected": -1.9392305612564087, + "loss": 0.9191, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": -7.1896653175354, + "rewards/margins": 12.202640533447266, + "rewards/rejected": -19.39230728149414, + "step": 500 + }, + { + "epoch": 1.1038251366120218, + "grad_norm": 41.75397804120868, + "learning_rate": 4.961781483440433e-07, + "logits/chosen": -1.1860939264297485, + "logits/rejected": -1.1053270101547241, + "logps/chosen": -0.6767443418502808, + "logps/rejected": -2.0427422523498535, + "loss": 0.8993, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -6.7674431800842285, + "rewards/margins": 13.6599760055542, + "rewards/rejected": -20.42742156982422, + "step": 505 + }, + { + "epoch": 1.1147540983606556, + "grad_norm": 49.30319538388188, + "learning_rate": 4.866249845720132e-07, + "logits/chosen": -1.138660192489624, + "logits/rejected": -1.085876226425171, + "logps/chosen": -0.7299310564994812, + "logps/rejected": -1.9430856704711914, + "loss": 1.016, + "rewards/accuracies": 0.9375, + "rewards/chosen": -7.29931116104126, + "rewards/margins": 12.131546020507812, + "rewards/rejected": -19.43085479736328, + "step": 510 + }, + { + "epoch": 1.1256830601092895, + "grad_norm": 52.12265231755663, + "learning_rate": 4.770767048172948e-07, + "logits/chosen": -1.1561533212661743, + "logits/rejected": -1.1119697093963623, + "logps/chosen": -0.6895591616630554, + "logps/rejected": -2.036057472229004, + "loss": 0.9218, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -6.895591735839844, + "rewards/margins": 13.464981079101562, + "rewards/rejected": -20.36057472229004, + "step": 515 + }, + { + "epoch": 1.1366120218579234, + "grad_norm": 74.15934271900579, + "learning_rate": 4.675367957273505e-07, + "logits/chosen": -1.1586833000183105, + "logits/rejected": -1.1020275354385376, + "logps/chosen": -0.7152846455574036, + "logps/rejected": -2.071946144104004, + "loss": 0.8339, + "rewards/accuracies": 0.9375, + "rewards/chosen": -7.152846336364746, + "rewards/margins": 13.566617012023926, + "rewards/rejected": -20.719463348388672, + "step": 520 + }, + { + "epoch": 1.1475409836065573, + "grad_norm": 42.56498431363493, + "learning_rate": 4.5800874089301455e-07, + "logits/chosen": -1.1795504093170166, + "logits/rejected": -1.1088457107543945, + "logps/chosen": -0.7248662114143372, + "logps/rejected": -2.170384645462036, + "loss": 0.7993, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -7.248661994934082, + "rewards/margins": 14.455184936523438, + "rewards/rejected": -21.703845977783203, + "step": 525 + }, + { + "epoch": 1.1584699453551912, + "grad_norm": 62.02807361999941, + "learning_rate": 4.4849601957642285e-07, + "logits/chosen": -1.0977070331573486, + "logits/rejected": -1.0548598766326904, + "logps/chosen": -0.7013900876045227, + "logps/rejected": -2.0269265174865723, + "loss": 0.9229, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -7.0139007568359375, + "rewards/margins": 13.255361557006836, + "rewards/rejected": -20.269264221191406, + "step": 530 + }, + { + "epoch": 1.169398907103825, + "grad_norm": 55.06395208463241, + "learning_rate": 4.390021054405286e-07, + "logits/chosen": -1.1502290964126587, + "logits/rejected": -1.1126412153244019, + "logps/chosen": -0.7144995331764221, + "logps/rejected": -2.0490660667419434, + "loss": 0.7918, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": -7.144995212554932, + "rewards/margins": 13.345664978027344, + "rewards/rejected": -20.490659713745117, + "step": 535 + }, + { + "epoch": 1.180327868852459, + "grad_norm": 49.36114026978983, + "learning_rate": 4.295304652806592e-07, + "logits/chosen": -1.121539831161499, + "logits/rejected": -1.0707148313522339, + "logps/chosen": -0.5932679772377014, + "logps/rejected": -1.8668245077133179, + "loss": 0.8286, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -5.932679176330566, + "rewards/margins": 12.735565185546875, + "rewards/rejected": -18.668243408203125, + "step": 540 + }, + { + "epoch": 1.1912568306010929, + "grad_norm": 43.64434028835421, + "learning_rate": 4.200845577585826e-07, + "logits/chosen": -1.1462314128875732, + "logits/rejected": -1.060794711112976, + "logps/chosen": -0.6785440444946289, + "logps/rejected": -1.9171119928359985, + "loss": 0.8796, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": -6.785439968109131, + "rewards/margins": 12.385680198669434, + "rewards/rejected": -19.171123504638672, + "step": 545 + }, + { + "epoch": 1.2021857923497268, + "grad_norm": 53.05809969738917, + "learning_rate": 4.106678321395433e-07, + "logits/chosen": -1.10367751121521, + "logits/rejected": -1.0523195266723633, + "logps/chosen": -0.6541143655776978, + "logps/rejected": -2.2258107662200928, + "loss": 0.7416, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -6.541143894195557, + "rewards/margins": 15.71696662902832, + "rewards/rejected": -22.25811004638672, + "step": 550 + }, + { + "epoch": 1.2131147540983607, + "grad_norm": 43.47249256936393, + "learning_rate": 4.012837270327288e-07, + "logits/chosen": -1.06898832321167, + "logits/rejected": -1.0365402698516846, + "logps/chosen": -0.6559886932373047, + "logps/rejected": -1.8688675165176392, + "loss": 0.8523, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": -6.559887886047363, + "rewards/margins": 12.128788948059082, + "rewards/rejected": -18.688674926757812, + "step": 555 + }, + { + "epoch": 1.2240437158469946, + "grad_norm": 46.63956894655285, + "learning_rate": 3.9193566913562915e-07, + "logits/chosen": -1.1477298736572266, + "logits/rejected": -1.0738533735275269, + "logps/chosen": -0.7739165425300598, + "logps/rejected": -2.0762743949890137, + "loss": 0.8547, + "rewards/accuracies": 0.90625, + "rewards/chosen": -7.739165306091309, + "rewards/margins": 13.023576736450195, + "rewards/rejected": -20.76274299621582, + "step": 560 + }, + { + "epoch": 1.2349726775956285, + "grad_norm": 41.86222647859532, + "learning_rate": 3.826270719827435e-07, + "logits/chosen": -1.1392501592636108, + "logits/rejected": -1.0661112070083618, + "logps/chosen": -0.7170458436012268, + "logps/rejected": -2.319844961166382, + "loss": 0.8634, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -7.170458793640137, + "rewards/margins": 16.027990341186523, + "rewards/rejected": -23.19845199584961, + "step": 565 + }, + { + "epoch": 1.2459016393442623, + "grad_norm": 46.46396612123512, + "learning_rate": 3.7336133469909623e-07, + "logits/chosen": -1.1651411056518555, + "logits/rejected": -1.123337984085083, + "logps/chosen": -0.6901602745056152, + "logps/rejected": -1.889410376548767, + "loss": 0.9312, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": -6.901603698730469, + "rewards/margins": 11.992501258850098, + "rewards/rejected": -18.89410400390625, + "step": 570 + }, + { + "epoch": 1.2568306010928962, + "grad_norm": 44.10455274659941, + "learning_rate": 3.64141840759012e-07, + "logits/chosen": -1.071542739868164, + "logits/rejected": -1.001315712928772, + "logps/chosen": -0.6726012229919434, + "logps/rejected": -2.025498867034912, + "loss": 0.7803, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": -6.726011753082275, + "rewards/margins": 13.52897834777832, + "rewards/rejected": -20.254989624023438, + "step": 575 + }, + { + "epoch": 1.2677595628415301, + "grad_norm": 57.66393146233383, + "learning_rate": 3.549719567506076e-07, + "logits/chosen": -1.0479185581207275, + "logits/rejected": -1.0177090167999268, + "logps/chosen": -0.7158246636390686, + "logps/rejected": -1.8856004476547241, + "loss": 0.834, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -7.1582465171813965, + "rewards/margins": 11.697754859924316, + "rewards/rejected": -18.856000900268555, + "step": 580 + }, + { + "epoch": 1.278688524590164, + "grad_norm": 37.27610585053919, + "learning_rate": 3.4585503114644996e-07, + "logits/chosen": -1.192347764968872, + "logits/rejected": -1.096699833869934, + "logps/chosen": -0.7132994532585144, + "logps/rejected": -2.139136552810669, + "loss": 0.8319, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -7.13299560546875, + "rewards/margins": 14.258369445800781, + "rewards/rejected": -21.39136505126953, + "step": 585 + }, + { + "epoch": 1.289617486338798, + "grad_norm": 38.5581266054859, + "learning_rate": 3.3679439308082774e-07, + "logits/chosen": -1.1517072916030884, + "logits/rejected": -1.1123689413070679, + "logps/chosen": -0.5742620229721069, + "logps/rejected": -1.8333053588867188, + "loss": 0.7982, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": -5.74261999130249, + "rewards/margins": 12.590433120727539, + "rewards/rejected": -18.333051681518555, + "step": 590 + }, + { + "epoch": 1.3005464480874318, + "grad_norm": 51.162638469265886, + "learning_rate": 3.2779335113408646e-07, + "logits/chosen": -1.1484695672988892, + "logits/rejected": -1.0942023992538452, + "logps/chosen": -0.7073885202407837, + "logps/rejected": -2.2211270332336426, + "loss": 0.8581, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -7.073885440826416, + "rewards/margins": 15.137385368347168, + "rewards/rejected": -22.21126937866211, + "step": 595 + }, + { + "epoch": 1.3114754098360657, + "grad_norm": 52.482834865002914, + "learning_rate": 3.1885519212446716e-07, + "logits/chosen": -1.2034729719161987, + "logits/rejected": -1.1127815246582031, + "logps/chosen": -0.6728402376174927, + "logps/rejected": -2.121699571609497, + "loss": 0.7897, + "rewards/accuracies": 0.96875, + "rewards/chosen": -6.728402137756348, + "rewards/margins": 14.488592147827148, + "rewards/rejected": -21.216995239257812, + "step": 600 + }, + { + "epoch": 1.3224043715846996, + "grad_norm": 58.20643669987731, + "learning_rate": 3.0998317990789376e-07, + "logits/chosen": -1.1773592233657837, + "logits/rejected": -1.0987378358840942, + "logps/chosen": -0.6642512083053589, + "logps/rejected": -1.81928288936615, + "loss": 0.8446, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -6.642512321472168, + "rewards/margins": 11.550317764282227, + "rewards/rejected": -18.192829132080078, + "step": 605 + }, + { + "epoch": 1.3333333333333333, + "grad_norm": 39.062955594441966, + "learning_rate": 3.0118055418614295e-07, + "logits/chosen": -1.2293663024902344, + "logits/rejected": -1.1513426303863525, + "logps/chosen": -0.7614090442657471, + "logps/rejected": -2.2579236030578613, + "loss": 0.8045, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": -7.6140899658203125, + "rewards/margins": 14.96514892578125, + "rewards/rejected": -22.579238891601562, + "step": 610 + }, + { + "epoch": 1.3442622950819672, + "grad_norm": 63.44233426139689, + "learning_rate": 2.9245052932383707e-07, + "logits/chosen": -1.1708543300628662, + "logits/rejected": -1.0607430934906006, + "logps/chosen": -0.7385894656181335, + "logps/rejected": -2.120573043823242, + "loss": 0.8963, + "rewards/accuracies": 0.90625, + "rewards/chosen": -7.385894775390625, + "rewards/margins": 13.819836616516113, + "rewards/rejected": -21.205730438232422, + "step": 615 + }, + { + "epoch": 1.355191256830601, + "grad_norm": 44.7493718163325, + "learning_rate": 2.83796293174686e-07, + "logits/chosen": -1.0966622829437256, + "logits/rejected": -1.0379483699798584, + "logps/chosen": -0.7269963026046753, + "logps/rejected": -2.2072062492370605, + "loss": 0.8585, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": -7.269963264465332, + "rewards/margins": 14.802099227905273, + "rewards/rejected": -22.072063446044922, + "step": 620 + }, + { + "epoch": 1.366120218579235, + "grad_norm": 46.49208772158338, + "learning_rate": 2.7522100591741217e-07, + "logits/chosen": -1.1784470081329346, + "logits/rejected": -1.117315649986267, + "logps/chosen": -0.6299835443496704, + "logps/rejected": -2.1423325538635254, + "loss": 0.8395, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": -6.299835681915283, + "rewards/margins": 15.123489379882812, + "rewards/rejected": -21.423324584960938, + "step": 625 + }, + { + "epoch": 1.3770491803278688, + "grad_norm": 45.8083524660278, + "learning_rate": 2.6672779890178046e-07, + "logits/chosen": -1.1133753061294556, + "logits/rejected": -1.0135900974273682, + "logps/chosen": -0.7410155534744263, + "logps/rejected": -2.0132102966308594, + "loss": 0.8282, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": -7.410154819488525, + "rewards/margins": 12.721948623657227, + "rewards/rejected": -20.132104873657227, + "step": 630 + }, + { + "epoch": 1.3879781420765027, + "grad_norm": 64.21787454795688, + "learning_rate": 2.5831977350515454e-07, + "logits/chosen": -1.0689822435379028, + "logits/rejected": -1.0277864933013916, + "logps/chosen": -0.7177176475524902, + "logps/rejected": -2.1230201721191406, + "loss": 0.8661, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": -7.177175998687744, + "rewards/margins": 14.053024291992188, + "rewards/rejected": -21.230199813842773, + "step": 635 + }, + { + "epoch": 1.3989071038251366, + "grad_norm": 45.50472870594569, + "learning_rate": 2.500000000000001e-07, + "logits/chosen": -1.1423081159591675, + "logits/rejected": -1.1058489084243774, + "logps/chosen": -0.6915294528007507, + "logps/rejected": -2.324897289276123, + "loss": 0.7746, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": -6.915294647216797, + "rewards/margins": 16.33367919921875, + "rewards/rejected": -23.24897003173828, + "step": 640 + }, + { + "epoch": 1.4098360655737705, + "grad_norm": 50.195590158524524, + "learning_rate": 2.4177151643274307e-07, + "logits/chosen": -1.1015839576721191, + "logits/rejected": -1.0532914400100708, + "logps/chosen": -0.672285258769989, + "logps/rejected": -2.207167387008667, + "loss": 0.7889, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -6.722853183746338, + "rewards/margins": 15.348821640014648, + "rewards/rejected": -22.071674346923828, + "step": 645 + }, + { + "epoch": 1.4207650273224044, + "grad_norm": 49.484361007454886, + "learning_rate": 2.3363732751439923e-07, + "logits/chosen": -1.2066833972930908, + "logits/rejected": -1.125514268875122, + "logps/chosen": -0.7560944557189941, + "logps/rejected": -2.1213643550872803, + "loss": 0.8624, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -7.560944557189941, + "rewards/margins": 13.65269947052002, + "rewards/rejected": -21.21364402770996, + "step": 650 + }, + { + "epoch": 1.4316939890710383, + "grad_norm": 71.24778468818619, + "learning_rate": 2.2560040352337307e-07, + "logits/chosen": -1.1408228874206543, + "logits/rejected": -1.058091402053833, + "logps/chosen": -0.7638077139854431, + "logps/rejected": -2.3721306324005127, + "loss": 0.8676, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": -7.6380767822265625, + "rewards/margins": 16.08323097229004, + "rewards/rejected": -23.721309661865234, + "step": 655 + }, + { + "epoch": 1.4426229508196722, + "grad_norm": 47.51003694942907, + "learning_rate": 2.1766367922083283e-07, + "logits/chosen": -1.1455005407333374, + "logits/rejected": -1.088330864906311, + "logps/chosen": -0.6874858736991882, + "logps/rejected": -2.227865695953369, + "loss": 0.7364, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -6.874858856201172, + "rewards/margins": 15.40379810333252, + "rewards/rejected": -22.27865982055664, + "step": 660 + }, + { + "epoch": 1.453551912568306, + "grad_norm": 55.0401674528743, + "learning_rate": 2.0983005277905347e-07, + "logits/chosen": -1.1971169710159302, + "logits/rejected": -1.132368803024292, + "logps/chosen": -0.7266300320625305, + "logps/rejected": -2.1779634952545166, + "loss": 0.8288, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -7.266301155090332, + "rewards/margins": 14.513336181640625, + "rewards/rejected": -21.77963638305664, + "step": 665 + }, + { + "epoch": 1.46448087431694, + "grad_norm": 45.61973729091029, + "learning_rate": 2.021023847231202e-07, + "logits/chosen": -1.1578538417816162, + "logits/rejected": -1.0902925729751587, + "logps/chosen": -0.7618427872657776, + "logps/rejected": -2.1782405376434326, + "loss": 0.8251, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -7.6184282302856445, + "rewards/margins": 14.163976669311523, + "rewards/rejected": -21.782405853271484, + "step": 670 + }, + { + "epoch": 1.4754098360655736, + "grad_norm": 60.45804101244339, + "learning_rate": 1.94483496886381e-07, + "logits/chosen": -1.1042829751968384, + "logits/rejected": -1.0539348125457764, + "logps/chosen": -0.6615833640098572, + "logps/rejected": -2.251406192779541, + "loss": 0.7358, + "rewards/accuracies": 0.9375, + "rewards/chosen": -6.6158342361450195, + "rewards/margins": 15.898228645324707, + "rewards/rejected": -22.51406478881836, + "step": 675 + }, + { + "epoch": 1.4863387978142075, + "grad_norm": 48.14980683798255, + "learning_rate": 1.869761713800254e-07, + "logits/chosen": -1.1603379249572754, + "logits/rejected": -1.0794651508331299, + "logps/chosen": -0.7725010514259338, + "logps/rejected": -2.2856264114379883, + "loss": 0.8279, + "rewards/accuracies": 0.9375, + "rewards/chosen": -7.725010871887207, + "rewards/margins": 15.131253242492676, + "rewards/rejected": -22.85626220703125, + "step": 680 + }, + { + "epoch": 1.4972677595628414, + "grad_norm": 42.61569176571103, + "learning_rate": 1.7958314957717064e-07, + "logits/chosen": -1.1512095928192139, + "logits/rejected": -1.1126072406768799, + "logps/chosen": -0.6438918709754944, + "logps/rejected": -2.071455717086792, + "loss": 0.8299, + "rewards/accuracies": 0.96875, + "rewards/chosen": -6.4389190673828125, + "rewards/margins": 14.275640487670898, + "rewards/rejected": -20.714557647705078, + "step": 685 + }, + { + "epoch": 1.5081967213114753, + "grad_norm": 42.953950301305156, + "learning_rate": 1.7230713111182164e-07, + "logits/chosen": -1.2084314823150635, + "logits/rejected": -1.1416782140731812, + "logps/chosen": -0.626567542552948, + "logps/rejected": -2.2436633110046387, + "loss": 0.7812, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -6.265676021575928, + "rewards/margins": 16.170955657958984, + "rewards/rejected": -22.436630249023438, + "step": 690 + }, + { + "epoch": 1.5191256830601092, + "grad_norm": 40.560641175853355, + "learning_rate": 1.651507728930739e-07, + "logits/chosen": -1.13517165184021, + "logits/rejected": -1.079472303390503, + "logps/chosen": -0.6764928102493286, + "logps/rejected": -1.9952529668807983, + "loss": 0.8179, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -6.764927864074707, + "rewards/margins": 13.187602043151855, + "rewards/rejected": -19.95252799987793, + "step": 695 + }, + { + "epoch": 1.530054644808743, + "grad_norm": 43.147397995901486, + "learning_rate": 1.5811668813491696e-07, + "logits/chosen": -1.2506606578826904, + "logits/rejected": -1.1565546989440918, + "logps/chosen": -0.7353881597518921, + "logps/rejected": -2.194483757019043, + "loss": 0.8118, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -7.353880882263184, + "rewards/margins": 14.59095573425293, + "rewards/rejected": -21.944839477539062, + "step": 700 + }, + { + "epoch": 1.540983606557377, + "grad_norm": 62.57740103030882, + "learning_rate": 1.5120744540199343e-07, + "logits/chosen": -1.1544785499572754, + "logits/rejected": -1.0858932733535767, + "logps/chosen": -0.6997456550598145, + "logps/rejected": -2.130126476287842, + "loss": 0.7627, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -6.9974565505981445, + "rewards/margins": 14.303808212280273, + "rewards/rejected": -21.3012638092041, + "step": 705 + }, + { + "epoch": 1.5519125683060109, + "grad_norm": 52.64313236503675, + "learning_rate": 1.4442556767166369e-07, + "logits/chosen": -1.1379203796386719, + "logits/rejected": -1.0858924388885498, + "logps/chosen": -0.7317473292350769, + "logps/rejected": -2.252659320831299, + "loss": 0.8515, + "rewards/accuracies": 0.90625, + "rewards/chosen": -7.3174729347229, + "rewards/margins": 15.20911693572998, + "rewards/rejected": -22.52659034729004, + "step": 710 + }, + { + "epoch": 1.5628415300546448, + "grad_norm": 38.895227790104606, + "learning_rate": 1.377735314127148e-07, + "logits/chosen": -1.1292599439620972, + "logits/rejected": -1.0353094339370728, + "logps/chosen": -0.7267423868179321, + "logps/rejected": -2.1561272144317627, + "loss": 0.7422, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -7.267423152923584, + "rewards/margins": 14.293848991394043, + "rewards/rejected": -21.5612735748291, + "step": 715 + }, + { + "epoch": 1.5737704918032787, + "grad_norm": 56.98341311224862, + "learning_rate": 1.312537656810549e-07, + "logits/chosen": -1.108595609664917, + "logits/rejected": -1.065575361251831, + "logps/chosen": -0.8170570135116577, + "logps/rejected": -2.3806543350219727, + "loss": 0.8782, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": -8.170571327209473, + "rewards/margins": 15.635971069335938, + "rewards/rejected": -23.806543350219727, + "step": 720 + }, + { + "epoch": 1.5846994535519126, + "grad_norm": 49.11661386514687, + "learning_rate": 1.2486865123271866e-07, + "logits/chosen": -1.170401692390442, + "logits/rejected": -1.084576964378357, + "logps/chosen": -0.7355960607528687, + "logps/rejected": -2.196117401123047, + "loss": 0.8153, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -7.355960845947266, + "rewards/margins": 14.605212211608887, + "rewards/rejected": -21.961172103881836, + "step": 725 + }, + { + "epoch": 1.5956284153005464, + "grad_norm": 55.74657113607268, + "learning_rate": 1.1862051965451214e-07, + "logits/chosen": -1.1835774183273315, + "logits/rejected": -1.0799705982208252, + "logps/chosen": -0.6676367521286011, + "logps/rejected": -2.179206609725952, + "loss": 0.8178, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": -6.67636775970459, + "rewards/margins": 15.115699768066406, + "rewards/rejected": -21.79206657409668, + "step": 730 + }, + { + "epoch": 1.6065573770491803, + "grad_norm": 51.541880192438846, + "learning_rate": 1.1251165251261047e-07, + "logits/chosen": -1.1200164556503296, + "logits/rejected": -1.0568726062774658, + "logps/chosen": -0.645528256893158, + "logps/rejected": -2.1150240898132324, + "loss": 0.7418, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -6.455282688140869, + "rewards/margins": 14.694958686828613, + "rewards/rejected": -21.15024185180664, + "step": 735 + }, + { + "epoch": 1.6174863387978142, + "grad_norm": 50.10821238721753, + "learning_rate": 1.0654428051942138e-07, + "logits/chosen": -1.1239360570907593, + "logits/rejected": -1.0723472833633423, + "logps/chosen": -0.7925865054130554, + "logps/rejected": -2.1674091815948486, + "loss": 0.8513, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -7.92586612701416, + "rewards/margins": 13.748224258422852, + "rewards/rejected": -21.674091339111328, + "step": 740 + }, + { + "epoch": 1.6284153005464481, + "grad_norm": 43.56101653661629, + "learning_rate": 1.0072058271901978e-07, + "logits/chosen": -1.1107392311096191, + "logits/rejected": -1.0339866876602173, + "logps/chosen": -0.7230111956596375, + "logps/rejected": -2.215801954269409, + "loss": 0.8124, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": -7.230112552642822, + "rewards/margins": 14.92790699005127, + "rewards/rejected": -22.158018112182617, + "step": 745 + }, + { + "epoch": 1.639344262295082, + "grad_norm": 44.251434071556815, + "learning_rate": 9.504268569144763e-08, + "logits/chosen": -1.1945059299468994, + "logits/rejected": -1.1036922931671143, + "logps/chosen": -0.6170369982719421, + "logps/rejected": -2.2507286071777344, + "loss": 0.7391, + "rewards/accuracies": 0.96875, + "rewards/chosen": -6.170370101928711, + "rewards/margins": 16.336917877197266, + "rewards/rejected": -22.50728988647461, + "step": 750 + }, + { + "epoch": 1.650273224043716, + "grad_norm": 49.60678271012646, + "learning_rate": 8.951266277617325e-08, + "logits/chosen": -1.110953688621521, + "logits/rejected": -1.0356338024139404, + "logps/chosen": -0.6501866579055786, + "logps/rejected": -2.0613694190979004, + "loss": 0.7668, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": -6.501866817474365, + "rewards/margins": 14.111828804016113, + "rewards/rejected": -20.61369514465332, + "step": 755 + }, + { + "epoch": 1.6612021857923498, + "grad_norm": 56.33934467747551, + "learning_rate": 8.413253331499049e-08, + "logits/chosen": -1.185937762260437, + "logits/rejected": -1.1122770309448242, + "logps/chosen": -0.737058699131012, + "logps/rejected": -2.13063383102417, + "loss": 0.7883, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -7.370587348937988, + "rewards/margins": 13.935750961303711, + "rewards/rejected": -21.306339263916016, + "step": 760 + }, + { + "epoch": 1.6721311475409837, + "grad_norm": 44.94644265056986, + "learning_rate": 7.8904261914637e-08, + "logits/chosen": -1.1913588047027588, + "logits/rejected": -1.142197847366333, + "logps/chosen": -0.7348116636276245, + "logps/rejected": -2.11796236038208, + "loss": 0.8475, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": -7.348116397857666, + "rewards/margins": 13.831507682800293, + "rewards/rejected": -21.179622650146484, + "step": 765 + }, + { + "epoch": 1.6830601092896176, + "grad_norm": 44.33346730845319, + "learning_rate": 7.382975772939865e-08, + "logits/chosen": -1.1930986642837524, + "logits/rejected": -1.1425340175628662, + "logps/chosen": -0.6687518358230591, + "logps/rejected": -2.18174409866333, + "loss": 0.8317, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -6.6875176429748535, + "rewards/margins": 15.129925727844238, + "rewards/rejected": -21.81744384765625, + "step": 770 + }, + { + "epoch": 1.6939890710382515, + "grad_norm": 62.98906395437277, + "learning_rate": 6.891087376396315e-08, + "logits/chosen": -1.107334852218628, + "logits/rejected": -1.0646486282348633, + "logps/chosen": -0.6770855784416199, + "logps/rejected": -1.9466581344604492, + "loss": 0.8939, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": -6.770855903625488, + "rewards/margins": 12.695725440979004, + "rewards/rejected": -19.466583251953125, + "step": 775 + }, + { + "epoch": 1.7049180327868854, + "grad_norm": 58.192179469870524, + "learning_rate": 6.414940619677734e-08, + "logits/chosen": -1.1423081159591675, + "logits/rejected": -1.0822668075561523, + "logps/chosen": -0.7497803568840027, + "logps/rejected": -2.090430736541748, + "loss": 0.895, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -7.497802734375, + "rewards/margins": 13.406506538391113, + "rewards/rejected": -20.90431022644043, + "step": 780 + }, + { + "epoch": 1.7158469945355193, + "grad_norm": 47.51392910975651, + "learning_rate": 5.954709372415523e-08, + "logits/chosen": -1.1490824222564697, + "logits/rejected": -1.0730870962142944, + "logps/chosen": -0.7593728303909302, + "logps/rejected": -2.2705507278442383, + "loss": 0.808, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": -7.593728065490723, + "rewards/margins": 15.111780166625977, + "rewards/rejected": -22.70550537109375, + "step": 785 + }, + { + "epoch": 1.7267759562841531, + "grad_norm": 48.831632577538024, + "learning_rate": 5.5105616925376296e-08, + "logits/chosen": -1.2490391731262207, + "logits/rejected": -1.1065878868103027, + "logps/chosen": -0.6748226881027222, + "logps/rejected": -2.066803455352783, + "loss": 0.8208, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -6.748227119445801, + "rewards/margins": 13.919805526733398, + "rewards/rejected": -20.668033599853516, + "step": 790 + }, + { + "epoch": 1.737704918032787, + "grad_norm": 58.14259346920981, + "learning_rate": 5.082659764900482e-08, + "logits/chosen": -1.1919060945510864, + "logits/rejected": -1.1228830814361572, + "logps/chosen": -0.6102501749992371, + "logps/rejected": -1.8674005270004272, + "loss": 0.851, + "rewards/accuracies": 0.9375, + "rewards/chosen": -6.102501392364502, + "rewards/margins": 12.571504592895508, + "rewards/rejected": -18.674007415771484, + "step": 795 + }, + { + "epoch": 1.748633879781421, + "grad_norm": 58.75906827764335, + "learning_rate": 4.6711598420656976e-08, + "logits/chosen": -1.165160894393921, + "logits/rejected": -1.0898354053497314, + "logps/chosen": -0.6838083863258362, + "logps/rejected": -2.08567476272583, + "loss": 0.7858, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": -6.838083744049072, + "rewards/margins": 14.01866340637207, + "rewards/rejected": -20.856746673583984, + "step": 800 + }, + { + "epoch": 1.748633879781421, + "eval_logits/chosen": -1.3754932880401611, + "eval_logits/rejected": -1.2976948022842407, + "eval_logps/chosen": -0.7933117747306824, + "eval_logps/rejected": -1.9287370443344116, + "eval_loss": 1.2716172933578491, + "eval_rewards/accuracies": 0.8614457845687866, + "eval_rewards/chosen": -7.933117389678955, + "eval_rewards/margins": 11.35425090789795, + "eval_rewards/rejected": -19.287368774414062, + "eval_runtime": 34.0159, + "eval_samples_per_second": 38.747, + "eval_steps_per_second": 2.44, + "step": 800 + }, + { + "epoch": 1.7595628415300546, + "grad_norm": 42.21959698595358, + "learning_rate": 4.2762121872428615e-08, + "logits/chosen": -1.1773040294647217, + "logits/rejected": -1.129456639289856, + "logps/chosen": -0.6802318692207336, + "logps/rejected": -1.888240098953247, + "loss": 0.8525, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -6.8023176193237305, + "rewards/margins": 12.080080032348633, + "rewards/rejected": -18.882396697998047, + "step": 805 + }, + { + "epoch": 1.7704918032786885, + "grad_norm": 58.528554345128654, + "learning_rate": 3.897961019419516e-08, + "logits/chosen": -1.1610840559005737, + "logits/rejected": -1.0498443841934204, + "logps/chosen": -0.6537138819694519, + "logps/rejected": -2.291470766067505, + "loss": 0.8496, + "rewards/accuracies": 0.9375, + "rewards/chosen": -6.537138938903809, + "rewards/margins": 16.377567291259766, + "rewards/rejected": -22.91470718383789, + "step": 810 + }, + { + "epoch": 1.7814207650273224, + "grad_norm": 54.90972417331509, + "learning_rate": 3.536544460698143e-08, + "logits/chosen": -1.1778395175933838, + "logits/rejected": -1.1455332040786743, + "logps/chosen": -0.7220278978347778, + "logps/rejected": -2.2299325466156006, + "loss": 0.885, + "rewards/accuracies": 0.9375, + "rewards/chosen": -7.220279693603516, + "rewards/margins": 15.079046249389648, + "rewards/rejected": -22.299325942993164, + "step": 815 + }, + { + "epoch": 1.7923497267759563, + "grad_norm": 56.09383885571275, + "learning_rate": 3.192094485859526e-08, + "logits/chosen": -1.1215145587921143, + "logits/rejected": -1.073870301246643, + "logps/chosen": -0.7387723326683044, + "logps/rejected": -2.0020480155944824, + "loss": 0.7983, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": -7.387722969055176, + "rewards/margins": 12.632759094238281, + "rewards/rejected": -20.020483016967773, + "step": 820 + }, + { + "epoch": 1.8032786885245902, + "grad_norm": 53.60121299218721, + "learning_rate": 2.8647368741709367e-08, + "logits/chosen": -1.2346153259277344, + "logits/rejected": -1.1147099733352661, + "logps/chosen": -0.7686562538146973, + "logps/rejected": -2.255371570587158, + "loss": 0.8238, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -7.686563014984131, + "rewards/margins": 14.867155075073242, + "rewards/rejected": -22.5537166595459, + "step": 825 + }, + { + "epoch": 1.814207650273224, + "grad_norm": 70.95861218592107, + "learning_rate": 2.5545911634565265e-08, + "logits/chosen": -1.2187676429748535, + "logits/rejected": -1.10321044921875, + "logps/chosen": -0.7037076950073242, + "logps/rejected": -2.5123300552368164, + "loss": 0.7911, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": -7.0370774269104, + "rewards/margins": 18.086223602294922, + "rewards/rejected": -25.123300552368164, + "step": 830 + }, + { + "epoch": 1.825136612021858, + "grad_norm": 52.68881901646796, + "learning_rate": 2.261770606446983e-08, + "logits/chosen": -1.2337759733200073, + "logits/rejected": -1.1664550304412842, + "logps/chosen": -0.6923700571060181, + "logps/rejected": -1.8173946142196655, + "loss": 0.7758, + "rewards/accuracies": 0.9375, + "rewards/chosen": -6.923699855804443, + "rewards/margins": 11.250245094299316, + "rewards/rejected": -18.173946380615234, + "step": 835 + }, + { + "epoch": 1.8360655737704918, + "grad_norm": 49.635844220613414, + "learning_rate": 1.9863821294241522e-08, + "logits/chosen": -1.140126347541809, + "logits/rejected": -1.0474215745925903, + "logps/chosen": -0.6625608801841736, + "logps/rejected": -2.1753034591674805, + "loss": 0.8089, + "rewards/accuracies": 0.96875, + "rewards/chosen": -6.625608921051025, + "rewards/margins": 15.12742805480957, + "rewards/rejected": -21.75303840637207, + "step": 840 + }, + { + "epoch": 1.8469945355191257, + "grad_norm": 43.19571532603005, + "learning_rate": 1.7285262931759082e-08, + "logits/chosen": -1.0827378034591675, + "logits/rejected": -1.040972352027893, + "logps/chosen": -0.6810778379440308, + "logps/rejected": -2.233325242996216, + "loss": 0.8335, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -6.810777187347412, + "rewards/margins": 15.52247142791748, + "rewards/rejected": -22.333250045776367, + "step": 845 + }, + { + "epoch": 1.8579234972677594, + "grad_norm": 44.932248498556426, + "learning_rate": 1.4882972562753615e-08, + "logits/chosen": -1.1539499759674072, + "logits/rejected": -1.0578666925430298, + "logps/chosen": -0.6092909574508667, + "logps/rejected": -2.176682949066162, + "loss": 0.7694, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -6.092909812927246, + "rewards/margins": 15.673917770385742, + "rewards/rejected": -21.766826629638672, + "step": 850 + }, + { + "epoch": 1.8688524590163933, + "grad_norm": 42.913562748722356, + "learning_rate": 1.2657827406979404e-08, + "logits/chosen": -1.192744493484497, + "logits/rejected": -1.1254050731658936, + "logps/chosen": -0.6767739653587341, + "logps/rejected": -2.08142352104187, + "loss": 0.7891, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": -6.767739772796631, + "rewards/margins": 14.04649543762207, + "rewards/rejected": -20.81423568725586, + "step": 855 + }, + { + "epoch": 1.8797814207650272, + "grad_norm": 41.804821801782175, + "learning_rate": 1.0610639997888915e-08, + "logits/chosen": -1.0884257555007935, + "logits/rejected": -1.0512984991073608, + "logps/chosen": -0.6403497457504272, + "logps/rejected": -1.8734140396118164, + "loss": 0.8247, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": -6.40349817276001, + "rewards/margins": 12.330641746520996, + "rewards/rejected": -18.734140396118164, + "step": 860 + }, + { + "epoch": 1.890710382513661, + "grad_norm": 44.35333055569577, + "learning_rate": 8.742157885927804e-09, + "logits/chosen": -1.1908656358718872, + "logits/rejected": -1.1213476657867432, + "logps/chosen": -0.7420295476913452, + "logps/rejected": -2.280778646469116, + "loss": 0.7435, + "rewards/accuracies": 0.96875, + "rewards/chosen": -7.420294761657715, + "rewards/margins": 15.387492179870605, + "rewards/rejected": -22.807788848876953, + "step": 865 + }, + { + "epoch": 1.901639344262295, + "grad_norm": 47.53352705675185, + "learning_rate": 7.053063365559997e-09, + "logits/chosen": -1.1726518869400024, + "logits/rejected": -1.1312017440795898, + "logps/chosen": -0.624583899974823, + "logps/rejected": -2.1973233222961426, + "loss": 0.711, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -6.2458391189575195, + "rewards/margins": 15.727396965026855, + "rewards/rejected": -21.973236083984375, + "step": 870 + }, + { + "epoch": 1.9125683060109289, + "grad_norm": 49.95076655476162, + "learning_rate": 5.543973226120935e-09, + "logits/chosen": -1.1566675901412964, + "logits/rejected": -1.0951449871063232, + "logps/chosen": -0.6883528828620911, + "logps/rejected": -1.9860836267471313, + "loss": 0.7853, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -6.883528709411621, + "rewards/margins": 12.977307319641113, + "rewards/rejected": -19.860836029052734, + "step": 875 + }, + { + "epoch": 1.9234972677595628, + "grad_norm": 46.10249074996564, + "learning_rate": 4.215438526591064e-09, + "logits/chosen": -1.1910879611968994, + "logits/rejected": -1.1287364959716797, + "logps/chosen": -0.6650462746620178, + "logps/rejected": -2.1194908618927, + "loss": 0.7991, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": -6.6504621505737305, + "rewards/margins": 14.54444408416748, + "rewards/rejected": -21.19490623474121, + "step": 880 + }, + { + "epoch": 1.9344262295081966, + "grad_norm": 62.72289061103836, + "learning_rate": 3.0679443943712467e-09, + "logits/chosen": -1.2184160947799683, + "logits/rejected": -1.1404691934585571, + "logps/chosen": -0.7271493673324585, + "logps/rejected": -2.16991925239563, + "loss": 0.7789, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": -7.271493434906006, + "rewards/margins": 14.427698135375977, + "rewards/rejected": -21.69919204711914, + "step": 885 + }, + { + "epoch": 1.9453551912568305, + "grad_norm": 45.380019039047724, + "learning_rate": 2.1019098481337426e-09, + "logits/chosen": -1.1838113069534302, + "logits/rejected": -1.1087853908538818, + "logps/chosen": -0.6985124349594116, + "logps/rejected": -2.234588861465454, + "loss": 0.8115, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -6.985124111175537, + "rewards/margins": 15.36076545715332, + "rewards/rejected": -22.345890045166016, + "step": 890 + }, + { + "epoch": 1.9562841530054644, + "grad_norm": 52.561200553522546, + "learning_rate": 1.3176876448135477e-09, + "logits/chosen": -1.238799810409546, + "logits/rejected": -1.1316242218017578, + "logps/chosen": -0.7857435345649719, + "logps/rejected": -2.3238725662231445, + "loss": 0.8526, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -7.8574347496032715, + "rewards/margins": 15.3812894821167, + "rewards/rejected": -23.238723754882812, + "step": 895 + }, + { + "epoch": 1.9672131147540983, + "grad_norm": 51.75087517711866, + "learning_rate": 7.155641507955445e-10, + "logits/chosen": -1.138656497001648, + "logits/rejected": -1.0620585680007935, + "logps/chosen": -0.6295305490493774, + "logps/rejected": -1.968534231185913, + "loss": 0.8507, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -6.2953057289123535, + "rewards/margins": 13.390034675598145, + "rewards/rejected": -19.685340881347656, + "step": 900 + }, + { + "epoch": 1.9781420765027322, + "grad_norm": 54.546613938165166, + "learning_rate": 2.957592373452056e-10, + "logits/chosen": -1.141952395439148, + "logits/rejected": -1.078550100326538, + "logps/chosen": -0.695178747177124, + "logps/rejected": -2.1918914318084717, + "loss": 0.8393, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -6.95178747177124, + "rewards/margins": 14.967126846313477, + "rewards/rejected": -21.918914794921875, + "step": 905 + }, + { + "epoch": 1.989071038251366, + "grad_norm": 50.15211263774824, + "learning_rate": 5.842620032053824e-11, + "logits/chosen": -1.2007973194122314, + "logits/rejected": -1.138047456741333, + "logps/chosen": -0.6629818081855774, + "logps/rejected": -1.9467570781707764, + "loss": 0.7506, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -6.629817962646484, + "rewards/margins": 12.837751388549805, + "rewards/rejected": -19.467571258544922, + "step": 910 + }, + { + "epoch": 1.9978142076502732, + "step": 914, + "total_flos": 0.0, + "train_loss": 1.4686054226084402, + "train_runtime": 12099.7941, + "train_samples_per_second": 9.679, + "train_steps_per_second": 0.076 + } + ], + "logging_steps": 5, + "max_steps": 914, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 1000000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}