{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9997172745264349, "eval_steps": 500, "global_step": 442, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0022618037885213456, "grad_norm": 131.355215075171, "learning_rate": 1.7777777777777777e-08, "logits/chosen": -11.149957656860352, "logits/rejected": -11.106039047241211, "logps/chosen": -0.4639046788215637, "logps/rejected": -0.459951788187027, "loss": 4.9394, "rewards/accuracies": 0.375, "rewards/chosen": -4.639046669006348, "rewards/margins": -0.039528995752334595, "rewards/rejected": -4.599517822265625, "step": 1 }, { "epoch": 0.004523607577042691, "grad_norm": 68.75410369813167, "learning_rate": 3.5555555555555554e-08, "logits/chosen": -10.890952110290527, "logits/rejected": -10.69871711730957, "logps/chosen": -0.5820316672325134, "logps/rejected": -0.5644893646240234, "loss": 5.3979, "rewards/accuracies": 0.3125, "rewards/chosen": -5.820316314697266, "rewards/margins": -0.17542320489883423, "rewards/rejected": -5.644893646240234, "step": 2 }, { "epoch": 0.006785411365564037, "grad_norm": 98.10181248017503, "learning_rate": 5.333333333333333e-08, "logits/chosen": -10.386991500854492, "logits/rejected": -10.3389892578125, "logps/chosen": -0.7467580437660217, "logps/rejected": -0.7350905537605286, "loss": 5.3645, "rewards/accuracies": 0.59375, "rewards/chosen": -7.467580318450928, "rewards/margins": -0.11667439341545105, "rewards/rejected": -7.350905418395996, "step": 3 }, { "epoch": 0.009047215154085382, "grad_norm": 85.65000023027866, "learning_rate": 7.111111111111111e-08, "logits/chosen": -10.73530387878418, "logits/rejected": -10.564247131347656, "logps/chosen": -0.5459556579589844, "logps/rejected": -0.4875364899635315, "loss": 5.1509, "rewards/accuracies": 0.46875, "rewards/chosen": -5.459556579589844, "rewards/margins": -0.584191620349884, "rewards/rejected": -4.875364780426025, "step": 4 }, { "epoch": 0.01130901894260673, "grad_norm": 74.9791226991399, "learning_rate": 8.888888888888888e-08, "logits/chosen": -10.528997421264648, "logits/rejected": -9.97251033782959, "logps/chosen": -0.6336177587509155, "logps/rejected": -0.6410748958587646, "loss": 5.2521, "rewards/accuracies": 0.5, "rewards/chosen": -6.336178302764893, "rewards/margins": 0.07457125931978226, "rewards/rejected": -6.410749435424805, "step": 5 }, { "epoch": 0.013570822731128074, "grad_norm": 85.29375010182012, "learning_rate": 1.0666666666666666e-07, "logits/chosen": -10.442211151123047, "logits/rejected": -10.494767189025879, "logps/chosen": -0.5163740515708923, "logps/rejected": -0.5633006691932678, "loss": 5.3068, "rewards/accuracies": 0.53125, "rewards/chosen": -5.163741111755371, "rewards/margins": 0.46926558017730713, "rewards/rejected": -5.6330060958862305, "step": 6 }, { "epoch": 0.01583262651964942, "grad_norm": 72.34857602748069, "learning_rate": 1.2444444444444443e-07, "logits/chosen": -10.570417404174805, "logits/rejected": -10.182153701782227, "logps/chosen": -0.49319958686828613, "logps/rejected": -0.5546016693115234, "loss": 4.7171, "rewards/accuracies": 0.53125, "rewards/chosen": -4.931995868682861, "rewards/margins": 0.6140204668045044, "rewards/rejected": -5.546016693115234, "step": 7 }, { "epoch": 0.018094430308170765, "grad_norm": 96.60351151716078, "learning_rate": 1.4222222222222222e-07, "logits/chosen": -11.686549186706543, "logits/rejected": -11.599323272705078, "logps/chosen": -0.654168963432312, "logps/rejected": -0.7179521322250366, "loss": 5.2239, "rewards/accuracies": 0.53125, "rewards/chosen": -6.541690349578857, "rewards/margins": 0.6378321051597595, "rewards/rejected": -7.179522514343262, "step": 8 }, { "epoch": 0.020356234096692113, "grad_norm": 69.08528348151097, "learning_rate": 1.6e-07, "logits/chosen": -10.81006908416748, "logits/rejected": -10.815924644470215, "logps/chosen": -0.6012924313545227, "logps/rejected": -0.6476706266403198, "loss": 5.1255, "rewards/accuracies": 0.53125, "rewards/chosen": -6.0129241943359375, "rewards/margins": 0.46378093957901, "rewards/rejected": -6.476705074310303, "step": 9 }, { "epoch": 0.02261803788521346, "grad_norm": 98.9910788377654, "learning_rate": 1.7777777777777776e-07, "logits/chosen": -11.031579971313477, "logits/rejected": -10.416993141174316, "logps/chosen": -0.534875750541687, "logps/rejected": -0.5605251789093018, "loss": 5.0563, "rewards/accuracies": 0.625, "rewards/chosen": -5.348757743835449, "rewards/margins": 0.25649386644363403, "rewards/rejected": -5.605251789093018, "step": 10 }, { "epoch": 0.024879841673734804, "grad_norm": 92.47375199683604, "learning_rate": 1.9555555555555555e-07, "logits/chosen": -11.211597442626953, "logits/rejected": -10.974644660949707, "logps/chosen": -0.5139535665512085, "logps/rejected": -0.5967007279396057, "loss": 4.9212, "rewards/accuracies": 0.4375, "rewards/chosen": -5.139535903930664, "rewards/margins": 0.8274715542793274, "rewards/rejected": -5.967007637023926, "step": 11 }, { "epoch": 0.02714164546225615, "grad_norm": 109.2539243326917, "learning_rate": 2.133333333333333e-07, "logits/chosen": -10.3906831741333, "logits/rejected": -10.407954216003418, "logps/chosen": -0.5846738815307617, "logps/rejected": -0.5586296319961548, "loss": 5.2254, "rewards/accuracies": 0.5625, "rewards/chosen": -5.846738338470459, "rewards/margins": -0.26044273376464844, "rewards/rejected": -5.586296081542969, "step": 12 }, { "epoch": 0.029403449250777494, "grad_norm": 93.26949602496981, "learning_rate": 2.3111111111111107e-07, "logits/chosen": -11.434279441833496, "logits/rejected": -11.00756549835205, "logps/chosen": -0.57530277967453, "logps/rejected": -0.5521742105484009, "loss": 5.3681, "rewards/accuracies": 0.65625, "rewards/chosen": -5.753026962280273, "rewards/margins": -0.23128610849380493, "rewards/rejected": -5.52174186706543, "step": 13 }, { "epoch": 0.03166525303929884, "grad_norm": 54.76880243693634, "learning_rate": 2.4888888888888886e-07, "logits/chosen": -11.06928825378418, "logits/rejected": -10.667929649353027, "logps/chosen": -0.49921348690986633, "logps/rejected": -0.5616152882575989, "loss": 4.7488, "rewards/accuracies": 0.5, "rewards/chosen": -4.992135047912598, "rewards/margins": 0.6240180134773254, "rewards/rejected": -5.616153240203857, "step": 14 }, { "epoch": 0.033927056827820185, "grad_norm": 71.5182395693498, "learning_rate": 2.666666666666666e-07, "logits/chosen": -11.895730972290039, "logits/rejected": -11.64004135131836, "logps/chosen": -0.49031415581703186, "logps/rejected": -0.5405735373497009, "loss": 4.9109, "rewards/accuracies": 0.625, "rewards/chosen": -4.903141498565674, "rewards/margins": 0.5025936961174011, "rewards/rejected": -5.405735015869141, "step": 15 }, { "epoch": 0.03618886061634153, "grad_norm": 79.7034591294439, "learning_rate": 2.8444444444444443e-07, "logits/chosen": -10.60659122467041, "logits/rejected": -10.282760620117188, "logps/chosen": -0.6062531471252441, "logps/rejected": -0.5618928670883179, "loss": 5.1619, "rewards/accuracies": 0.5, "rewards/chosen": -6.062531471252441, "rewards/margins": -0.44360262155532837, "rewards/rejected": -5.618928909301758, "step": 16 }, { "epoch": 0.038450664404862875, "grad_norm": 87.84593204498888, "learning_rate": 3.022222222222222e-07, "logits/chosen": -12.490971565246582, "logits/rejected": -12.19153881072998, "logps/chosen": -0.41767269372940063, "logps/rejected": -0.40474578738212585, "loss": 5.3782, "rewards/accuracies": 0.4375, "rewards/chosen": -4.176727294921875, "rewards/margins": -0.12926939129829407, "rewards/rejected": -4.047458171844482, "step": 17 }, { "epoch": 0.04071246819338423, "grad_norm": 110.95316335628588, "learning_rate": 3.2e-07, "logits/chosen": -11.399320602416992, "logits/rejected": -11.420541763305664, "logps/chosen": -0.6453328728675842, "logps/rejected": -0.6450071334838867, "loss": 5.0468, "rewards/accuracies": 0.3125, "rewards/chosen": -6.453329086303711, "rewards/margins": -0.0032582059502601624, "rewards/rejected": -6.450070381164551, "step": 18 }, { "epoch": 0.04297427198190557, "grad_norm": 86.46973948808649, "learning_rate": 3.3777777777777777e-07, "logits/chosen": -12.149145126342773, "logits/rejected": -12.085639953613281, "logps/chosen": -0.49516329169273376, "logps/rejected": -0.41885480284690857, "loss": 5.052, "rewards/accuracies": 0.34375, "rewards/chosen": -4.951632499694824, "rewards/margins": -0.7630849480628967, "rewards/rejected": -4.188547611236572, "step": 19 }, { "epoch": 0.04523607577042692, "grad_norm": 71.36928249846149, "learning_rate": 3.5555555555555553e-07, "logits/chosen": -10.735100746154785, "logits/rejected": -10.853598594665527, "logps/chosen": -0.5569137930870056, "logps/rejected": -0.5804443955421448, "loss": 4.6835, "rewards/accuracies": 0.53125, "rewards/chosen": -5.5691375732421875, "rewards/margins": 0.2353065013885498, "rewards/rejected": -5.804443836212158, "step": 20 }, { "epoch": 0.04749787955894826, "grad_norm": 61.042781292962395, "learning_rate": 3.7333333333333334e-07, "logits/chosen": -11.995122909545898, "logits/rejected": -11.363445281982422, "logps/chosen": -0.4467337131500244, "logps/rejected": -0.4861924648284912, "loss": 5.0424, "rewards/accuracies": 0.6875, "rewards/chosen": -4.467337131500244, "rewards/margins": 0.3945879340171814, "rewards/rejected": -4.86192512512207, "step": 21 }, { "epoch": 0.04975968334746961, "grad_norm": 89.07868369848414, "learning_rate": 3.911111111111111e-07, "logits/chosen": -11.574544906616211, "logits/rejected": -11.458039283752441, "logps/chosen": -0.5010548233985901, "logps/rejected": -0.5385463833808899, "loss": 5.0061, "rewards/accuracies": 0.40625, "rewards/chosen": -5.010547637939453, "rewards/margins": 0.37491610646247864, "rewards/rejected": -5.385463714599609, "step": 22 }, { "epoch": 0.05202148713599095, "grad_norm": 169.3436229453717, "learning_rate": 4.0888888888888886e-07, "logits/chosen": -10.919013977050781, "logits/rejected": -10.827438354492188, "logps/chosen": -0.5667375326156616, "logps/rejected": -0.5423346161842346, "loss": 4.978, "rewards/accuracies": 0.5, "rewards/chosen": -5.667375564575195, "rewards/margins": -0.2440294474363327, "rewards/rejected": -5.423345565795898, "step": 23 }, { "epoch": 0.0542832909245123, "grad_norm": 58.12255640669961, "learning_rate": 4.266666666666666e-07, "logits/chosen": -12.290349960327148, "logits/rejected": -12.1292142868042, "logps/chosen": -0.3345947861671448, "logps/rejected": -0.37946847081184387, "loss": 4.7173, "rewards/accuracies": 0.53125, "rewards/chosen": -3.3459479808807373, "rewards/margins": 0.44873636960983276, "rewards/rejected": -3.794684410095215, "step": 24 }, { "epoch": 0.05654509471303364, "grad_norm": 93.15623268640158, "learning_rate": 4.4444444444444444e-07, "logits/chosen": -11.1387939453125, "logits/rejected": -10.918662071228027, "logps/chosen": -0.48852428793907166, "logps/rejected": -0.5354989767074585, "loss": 4.8657, "rewards/accuracies": 0.59375, "rewards/chosen": -4.8852434158325195, "rewards/margins": 0.46974682807922363, "rewards/rejected": -5.354990005493164, "step": 25 }, { "epoch": 0.05880689850155499, "grad_norm": 64.22760084086403, "learning_rate": 4.6222222222222214e-07, "logits/chosen": -11.181513786315918, "logits/rejected": -10.725030899047852, "logps/chosen": -0.43731987476348877, "logps/rejected": -0.48887380957603455, "loss": 4.7432, "rewards/accuracies": 0.59375, "rewards/chosen": -4.373198986053467, "rewards/margins": 0.5155391097068787, "rewards/rejected": -4.88873815536499, "step": 26 }, { "epoch": 0.061068702290076333, "grad_norm": 86.04795850026115, "learning_rate": 4.8e-07, "logits/chosen": -10.508721351623535, "logits/rejected": -10.471704483032227, "logps/chosen": -0.4588507413864136, "logps/rejected": -0.5172092914581299, "loss": 4.8766, "rewards/accuracies": 0.5, "rewards/chosen": -4.588507175445557, "rewards/margins": 0.5835859775543213, "rewards/rejected": -5.172093391418457, "step": 27 }, { "epoch": 0.06333050607859768, "grad_norm": 74.38153566770265, "learning_rate": 4.977777777777777e-07, "logits/chosen": -10.403984069824219, "logits/rejected": -10.611076354980469, "logps/chosen": -0.47997862100601196, "logps/rejected": -0.4639643728733063, "loss": 4.9471, "rewards/accuracies": 0.40625, "rewards/chosen": -4.79978609085083, "rewards/margins": -0.16014233231544495, "rewards/rejected": -4.639643669128418, "step": 28 }, { "epoch": 0.06559230986711903, "grad_norm": 71.67345566558542, "learning_rate": 5.155555555555556e-07, "logits/chosen": -11.623980522155762, "logits/rejected": -11.187301635742188, "logps/chosen": -0.47259610891342163, "logps/rejected": -0.4480085074901581, "loss": 4.9723, "rewards/accuracies": 0.5625, "rewards/chosen": -4.725960731506348, "rewards/margins": -0.24587592482566833, "rewards/rejected": -4.480085372924805, "step": 29 }, { "epoch": 0.06785411365564037, "grad_norm": 67.06827734638706, "learning_rate": 5.333333333333332e-07, "logits/chosen": -11.207446098327637, "logits/rejected": -10.807114601135254, "logps/chosen": -0.37271052598953247, "logps/rejected": -0.4592744708061218, "loss": 4.7976, "rewards/accuracies": 0.71875, "rewards/chosen": -3.7271053791046143, "rewards/margins": 0.8656396865844727, "rewards/rejected": -4.592744827270508, "step": 30 }, { "epoch": 0.07011591744416172, "grad_norm": 72.20873132579756, "learning_rate": 5.511111111111111e-07, "logits/chosen": -10.57562255859375, "logits/rejected": -10.428007125854492, "logps/chosen": -0.43888184428215027, "logps/rejected": -0.464484840631485, "loss": 4.8315, "rewards/accuracies": 0.65625, "rewards/chosen": -4.388818264007568, "rewards/margins": 0.25603026151657104, "rewards/rejected": -4.644848823547363, "step": 31 }, { "epoch": 0.07237772123268306, "grad_norm": 65.66194705402224, "learning_rate": 5.688888888888889e-07, "logits/chosen": -10.86208724975586, "logits/rejected": -10.499285697937012, "logps/chosen": -0.4337802529335022, "logps/rejected": -0.4744107127189636, "loss": 5.0868, "rewards/accuracies": 0.40625, "rewards/chosen": -4.337802886962891, "rewards/margins": 0.4063045084476471, "rewards/rejected": -4.744107246398926, "step": 32 }, { "epoch": 0.07463952502120441, "grad_norm": 69.10527972763198, "learning_rate": 5.866666666666666e-07, "logits/chosen": -10.774530410766602, "logits/rejected": -10.699942588806152, "logps/chosen": -0.4071800112724304, "logps/rejected": -0.4233216643333435, "loss": 4.9605, "rewards/accuracies": 0.53125, "rewards/chosen": -4.0717997550964355, "rewards/margins": 0.1614171266555786, "rewards/rejected": -4.233217239379883, "step": 33 }, { "epoch": 0.07690132880972575, "grad_norm": 64.69922555278133, "learning_rate": 6.044444444444444e-07, "logits/chosen": -10.982305526733398, "logits/rejected": -10.901609420776367, "logps/chosen": -0.37619659304618835, "logps/rejected": -0.4010980725288391, "loss": 4.7433, "rewards/accuracies": 0.5, "rewards/chosen": -3.7619662284851074, "rewards/margins": 0.2490149885416031, "rewards/rejected": -4.01098108291626, "step": 34 }, { "epoch": 0.0791631325982471, "grad_norm": 64.60258006742473, "learning_rate": 6.222222222222223e-07, "logits/chosen": -10.168670654296875, "logits/rejected": -10.236058235168457, "logps/chosen": -0.46076497435569763, "logps/rejected": -0.4832019805908203, "loss": 4.6019, "rewards/accuracies": 0.46875, "rewards/chosen": -4.607649803161621, "rewards/margins": 0.22437021136283875, "rewards/rejected": -4.832019805908203, "step": 35 }, { "epoch": 0.08142493638676845, "grad_norm": 66.16525403071286, "learning_rate": 6.4e-07, "logits/chosen": -10.984823226928711, "logits/rejected": -10.869633674621582, "logps/chosen": -0.4266025125980377, "logps/rejected": -0.44316184520721436, "loss": 4.6435, "rewards/accuracies": 0.53125, "rewards/chosen": -4.266025066375732, "rewards/margins": 0.16559378802776337, "rewards/rejected": -4.431619167327881, "step": 36 }, { "epoch": 0.08368674017528979, "grad_norm": 62.62054800184612, "learning_rate": 6.577777777777777e-07, "logits/chosen": -11.466747283935547, "logits/rejected": -10.947083473205566, "logps/chosen": -0.4133344292640686, "logps/rejected": -0.46087807416915894, "loss": 4.6937, "rewards/accuracies": 0.65625, "rewards/chosen": -4.1333441734313965, "rewards/margins": 0.47543561458587646, "rewards/rejected": -4.608780384063721, "step": 37 }, { "epoch": 0.08594854396381114, "grad_norm": 68.71159791998826, "learning_rate": 6.755555555555555e-07, "logits/chosen": -10.58063793182373, "logits/rejected": -10.66105842590332, "logps/chosen": -0.4248150587081909, "logps/rejected": -0.4461151957511902, "loss": 4.6967, "rewards/accuracies": 0.34375, "rewards/chosen": -4.248150825500488, "rewards/margins": 0.2130012959241867, "rewards/rejected": -4.461152076721191, "step": 38 }, { "epoch": 0.08821034775233248, "grad_norm": 70.5941913597691, "learning_rate": 6.933333333333333e-07, "logits/chosen": -11.376758575439453, "logits/rejected": -11.398736953735352, "logps/chosen": -0.47147077322006226, "logps/rejected": -0.4626140296459198, "loss": 4.6912, "rewards/accuracies": 0.5625, "rewards/chosen": -4.714707374572754, "rewards/margins": -0.08856695890426636, "rewards/rejected": -4.626140594482422, "step": 39 }, { "epoch": 0.09047215154085383, "grad_norm": 68.46057335163108, "learning_rate": 7.111111111111111e-07, "logits/chosen": -11.606950759887695, "logits/rejected": -11.105400085449219, "logps/chosen": -0.39962151646614075, "logps/rejected": -0.4578825831413269, "loss": 4.5643, "rewards/accuracies": 0.65625, "rewards/chosen": -3.9962148666381836, "rewards/margins": 0.5826107263565063, "rewards/rejected": -4.578825950622559, "step": 40 }, { "epoch": 0.09273395532937517, "grad_norm": 48.972841115050805, "learning_rate": 7.288888888888888e-07, "logits/chosen": -11.43770980834961, "logits/rejected": -11.56243896484375, "logps/chosen": -0.41414573788642883, "logps/rejected": -0.4131737947463989, "loss": 4.6472, "rewards/accuracies": 0.4375, "rewards/chosen": -4.141457557678223, "rewards/margins": -0.009719468653202057, "rewards/rejected": -4.13173770904541, "step": 41 }, { "epoch": 0.09499575911789652, "grad_norm": 143.02756863226023, "learning_rate": 7.466666666666667e-07, "logits/chosen": -11.282739639282227, "logits/rejected": -10.897704124450684, "logps/chosen": -0.4278091490268707, "logps/rejected": -0.4735082983970642, "loss": 4.8335, "rewards/accuracies": 0.5, "rewards/chosen": -4.278091907501221, "rewards/margins": 0.4569913148880005, "rewards/rejected": -4.735082626342773, "step": 42 }, { "epoch": 0.09725756290641786, "grad_norm": 74.71400905339502, "learning_rate": 7.644444444444444e-07, "logits/chosen": -10.040319442749023, "logits/rejected": -9.910164833068848, "logps/chosen": -0.5288741588592529, "logps/rejected": -0.5330761671066284, "loss": 4.9162, "rewards/accuracies": 0.5, "rewards/chosen": -5.2887420654296875, "rewards/margins": 0.04202008247375488, "rewards/rejected": -5.330761909484863, "step": 43 }, { "epoch": 0.09951936669493922, "grad_norm": 81.89498458784372, "learning_rate": 7.822222222222222e-07, "logits/chosen": -11.844489097595215, "logits/rejected": -11.597496032714844, "logps/chosen": -0.3373556435108185, "logps/rejected": -0.42685818672180176, "loss": 4.4339, "rewards/accuracies": 0.625, "rewards/chosen": -3.373556613922119, "rewards/margins": 0.8950251340866089, "rewards/rejected": -4.268581390380859, "step": 44 }, { "epoch": 0.10178117048346055, "grad_norm": 70.48086533057375, "learning_rate": 8e-07, "logits/chosen": -10.962928771972656, "logits/rejected": -10.9669771194458, "logps/chosen": -0.40894240140914917, "logps/rejected": -0.47473400831222534, "loss": 4.6343, "rewards/accuracies": 0.71875, "rewards/chosen": -4.089423656463623, "rewards/margins": 0.657916784286499, "rewards/rejected": -4.747340679168701, "step": 45 }, { "epoch": 0.1040429742719819, "grad_norm": 68.45670029697006, "learning_rate": 7.999874759018868e-07, "logits/chosen": -10.595868110656738, "logits/rejected": -10.306282043457031, "logps/chosen": -0.463877409696579, "logps/rejected": -0.5967152118682861, "loss": 4.6017, "rewards/accuracies": 0.53125, "rewards/chosen": -4.6387739181518555, "rewards/margins": 1.3283770084381104, "rewards/rejected": -5.967151165008545, "step": 46 }, { "epoch": 0.10630477806050326, "grad_norm": 48.95212367492393, "learning_rate": 7.999499043918123e-07, "logits/chosen": -12.154573440551758, "logits/rejected": -12.19536304473877, "logps/chosen": -0.45791739225387573, "logps/rejected": -0.5772292017936707, "loss": 4.7494, "rewards/accuracies": 0.65625, "rewards/chosen": -4.579174041748047, "rewards/margins": 1.1931182146072388, "rewards/rejected": -5.772292137145996, "step": 47 }, { "epoch": 0.1085665818490246, "grad_norm": 78.70326465142523, "learning_rate": 7.998872878225228e-07, "logits/chosen": -11.652605056762695, "logits/rejected": -11.418684005737305, "logps/chosen": -0.4871661365032196, "logps/rejected": -0.5542778372764587, "loss": 4.8017, "rewards/accuracies": 0.5625, "rewards/chosen": -4.871661186218262, "rewards/margins": 0.6711173057556152, "rewards/rejected": -5.542778491973877, "step": 48 }, { "epoch": 0.11082838563754595, "grad_norm": 52.90074896212443, "learning_rate": 7.997996301150987e-07, "logits/chosen": -12.08781623840332, "logits/rejected": -11.528596878051758, "logps/chosen": -0.4144824743270874, "logps/rejected": -0.5010120868682861, "loss": 4.6588, "rewards/accuracies": 0.6875, "rewards/chosen": -4.144824981689453, "rewards/margins": 0.8652949929237366, "rewards/rejected": -5.010120391845703, "step": 49 }, { "epoch": 0.11309018942606729, "grad_norm": 94.26111558590127, "learning_rate": 7.996869367587088e-07, "logits/chosen": -11.582418441772461, "logits/rejected": -10.963386535644531, "logps/chosen": -0.4467932879924774, "logps/rejected": -0.4817226529121399, "loss": 4.7057, "rewards/accuracies": 0.625, "rewards/chosen": -4.46793270111084, "rewards/margins": 0.34929385781288147, "rewards/rejected": -4.817226409912109, "step": 50 }, { "epoch": 0.11535199321458864, "grad_norm": 49.386260226236786, "learning_rate": 7.99549214810266e-07, "logits/chosen": -10.763232231140137, "logits/rejected": -10.5509672164917, "logps/chosen": -0.5276934504508972, "logps/rejected": -0.5668250322341919, "loss": 4.505, "rewards/accuracies": 0.59375, "rewards/chosen": -5.276934623718262, "rewards/margins": 0.39131537079811096, "rewards/rejected": -5.668249607086182, "step": 51 }, { "epoch": 0.11761379700310998, "grad_norm": 75.77039924958046, "learning_rate": 7.993864728939867e-07, "logits/chosen": -10.996638298034668, "logits/rejected": -11.125421524047852, "logps/chosen": -0.4168677031993866, "logps/rejected": -0.4338124394416809, "loss": 4.8877, "rewards/accuracies": 0.5, "rewards/chosen": -4.16867733001709, "rewards/margins": 0.1694469451904297, "rewards/rejected": -4.3381242752075195, "step": 52 }, { "epoch": 0.11987560079163133, "grad_norm": 128.1604406398008, "learning_rate": 7.991987212008491e-07, "logits/chosen": -10.845922470092773, "logits/rejected": -10.981675148010254, "logps/chosen": -0.5582807660102844, "logps/rejected": -0.5974184274673462, "loss": 4.4079, "rewards/accuracies": 0.46875, "rewards/chosen": -5.582807540893555, "rewards/margins": 0.3913762867450714, "rewards/rejected": -5.974184036254883, "step": 53 }, { "epoch": 0.12213740458015267, "grad_norm": 58.15165845926177, "learning_rate": 7.989859714879565e-07, "logits/chosen": -10.547262191772461, "logits/rejected": -10.480108261108398, "logps/chosen": -0.5517194271087646, "logps/rejected": -0.6646666526794434, "loss": 4.9804, "rewards/accuracies": 0.5625, "rewards/chosen": -5.5171942710876465, "rewards/margins": 1.1294726133346558, "rewards/rejected": -6.646667003631592, "step": 54 }, { "epoch": 0.12439920836867402, "grad_norm": 48.309919673308315, "learning_rate": 7.987482370778005e-07, "logits/chosen": -11.610102653503418, "logits/rejected": -11.80359935760498, "logps/chosen": -0.5112394690513611, "logps/rejected": -0.5051109790802002, "loss": 4.7434, "rewards/accuracies": 0.46875, "rewards/chosen": -5.112394332885742, "rewards/margins": -0.061284855008125305, "rewards/rejected": -5.051109790802002, "step": 55 }, { "epoch": 0.12666101215719536, "grad_norm": 92.15090562707765, "learning_rate": 7.984855328574262e-07, "logits/chosen": -11.098040580749512, "logits/rejected": -10.789083480834961, "logps/chosen": -0.489580363035202, "logps/rejected": -0.5100796818733215, "loss": 4.5609, "rewards/accuracies": 0.4375, "rewards/chosen": -4.895802974700928, "rewards/margins": 0.20499347150325775, "rewards/rejected": -5.100796699523926, "step": 56 }, { "epoch": 0.1289228159457167, "grad_norm": 94.24070870623638, "learning_rate": 7.981978752775009e-07, "logits/chosen": -9.92190933227539, "logits/rejected": -9.928149223327637, "logps/chosen": -0.6262676119804382, "logps/rejected": -0.6750127077102661, "loss": 4.5092, "rewards/accuracies": 0.53125, "rewards/chosen": -6.262676239013672, "rewards/margins": 0.48745113611221313, "rewards/rejected": -6.750126838684082, "step": 57 }, { "epoch": 0.13118461973423806, "grad_norm": 73.68507158036604, "learning_rate": 7.978852823512833e-07, "logits/chosen": -10.95576000213623, "logits/rejected": -10.358962059020996, "logps/chosen": -0.4652557671070099, "logps/rejected": -0.4836958050727844, "loss": 4.8084, "rewards/accuracies": 0.65625, "rewards/chosen": -4.652557849884033, "rewards/margins": 0.1843997836112976, "rewards/rejected": -4.8369574546813965, "step": 58 }, { "epoch": 0.1334464235227594, "grad_norm": 66.30964256222111, "learning_rate": 7.975477736534957e-07, "logits/chosen": -12.005413055419922, "logits/rejected": -11.653824806213379, "logps/chosen": -0.46185585856437683, "logps/rejected": -0.5776143670082092, "loss": 4.5199, "rewards/accuracies": 0.65625, "rewards/chosen": -4.618558883666992, "rewards/margins": 1.1575853824615479, "rewards/rejected": -5.776144027709961, "step": 59 }, { "epoch": 0.13570822731128074, "grad_norm": 101.38755828571541, "learning_rate": 7.971853703190986e-07, "logits/chosen": -11.413613319396973, "logits/rejected": -10.73826789855957, "logps/chosen": -0.5611809492111206, "logps/rejected": -0.6577370762825012, "loss": 4.6643, "rewards/accuracies": 0.5, "rewards/chosen": -5.611808776855469, "rewards/margins": 0.9655615091323853, "rewards/rejected": -6.577370643615723, "step": 60 }, { "epoch": 0.1379700310998021, "grad_norm": 68.86595519869128, "learning_rate": 7.967980950419664e-07, "logits/chosen": -11.096137046813965, "logits/rejected": -10.685264587402344, "logps/chosen": -0.4981518089771271, "logps/rejected": -0.665467381477356, "loss": 4.4761, "rewards/accuracies": 0.75, "rewards/chosen": -4.981517791748047, "rewards/margins": 1.673156499862671, "rewards/rejected": -6.654675006866455, "step": 61 }, { "epoch": 0.14023183488832344, "grad_norm": 65.33178558436228, "learning_rate": 7.963859720734669e-07, "logits/chosen": -12.070573806762695, "logits/rejected": -11.637935638427734, "logps/chosen": -0.38139575719833374, "logps/rejected": -0.45192593336105347, "loss": 4.5535, "rewards/accuracies": 0.65625, "rewards/chosen": -3.813957691192627, "rewards/margins": 0.7053009867668152, "rewards/rejected": -4.519258975982666, "step": 62 }, { "epoch": 0.14249363867684478, "grad_norm": 83.62029217364055, "learning_rate": 7.959490272209427e-07, "logits/chosen": -10.89778995513916, "logits/rejected": -10.380571365356445, "logps/chosen": -0.4739726185798645, "logps/rejected": -0.6313707232475281, "loss": 4.5111, "rewards/accuracies": 0.625, "rewards/chosen": -4.7397260665893555, "rewards/margins": 1.5739809274673462, "rewards/rejected": -6.31370735168457, "step": 63 }, { "epoch": 0.14475544246536612, "grad_norm": 54.013979661163816, "learning_rate": 7.954872878460946e-07, "logits/chosen": -11.172213554382324, "logits/rejected": -10.982388496398926, "logps/chosen": -0.4742993414402008, "logps/rejected": -0.6619201898574829, "loss": 4.3085, "rewards/accuracies": 0.6875, "rewards/chosen": -4.742993354797363, "rewards/margins": 1.8762080669403076, "rewards/rejected": -6.619201183319092, "step": 64 }, { "epoch": 0.14701724625388748, "grad_norm": 81.96626905055028, "learning_rate": 7.950007828632691e-07, "logits/chosen": -10.859444618225098, "logits/rejected": -10.706047058105469, "logps/chosen": -0.5983306169509888, "logps/rejected": -0.6700727939605713, "loss": 4.3127, "rewards/accuracies": 0.6875, "rewards/chosen": -5.98330545425415, "rewards/margins": 0.717422366142273, "rewards/rejected": -6.700727462768555, "step": 65 }, { "epoch": 0.14927905004240882, "grad_norm": 58.0945518868609, "learning_rate": 7.944895427376465e-07, "logits/chosen": -10.645411491394043, "logits/rejected": -10.482881546020508, "logps/chosen": -0.5315589904785156, "logps/rejected": -0.7207262516021729, "loss": 4.238, "rewards/accuracies": 0.75, "rewards/chosen": -5.315589904785156, "rewards/margins": 1.8916726112365723, "rewards/rejected": -7.2072625160217285, "step": 66 }, { "epoch": 0.15154085383093016, "grad_norm": 40.98461722424557, "learning_rate": 7.939535994833345e-07, "logits/chosen": -12.032543182373047, "logits/rejected": -11.655169486999512, "logps/chosen": -0.40713435411453247, "logps/rejected": -0.5286428928375244, "loss": 4.1796, "rewards/accuracies": 0.625, "rewards/chosen": -4.071343421936035, "rewards/margins": 1.215085744857788, "rewards/rejected": -5.286429405212402, "step": 67 }, { "epoch": 0.1538026576194515, "grad_norm": 58.64617780662404, "learning_rate": 7.933929866613628e-07, "logits/chosen": -11.718114852905273, "logits/rejected": -11.243300437927246, "logps/chosen": -0.5240508317947388, "logps/rejected": -0.5563682317733765, "loss": 4.6826, "rewards/accuracies": 0.65625, "rewards/chosen": -5.240508556365967, "rewards/margins": 0.3231736421585083, "rewards/rejected": -5.5636820793151855, "step": 68 }, { "epoch": 0.15606446140797287, "grad_norm": 70.31268006854283, "learning_rate": 7.928077393775808e-07, "logits/chosen": -11.418298721313477, "logits/rejected": -11.400525093078613, "logps/chosen": -0.5047922730445862, "logps/rejected": -0.6909648776054382, "loss": 3.9852, "rewards/accuracies": 0.65625, "rewards/chosen": -5.0479230880737305, "rewards/margins": 1.8617255687713623, "rewards/rejected": -6.909648418426514, "step": 69 }, { "epoch": 0.1583262651964942, "grad_norm": 80.21299453365818, "learning_rate": 7.921978942804609e-07, "logits/chosen": -10.426657676696777, "logits/rejected": -10.646503448486328, "logps/chosen": -0.5763324499130249, "logps/rejected": -0.6343460083007812, "loss": 4.159, "rewards/accuracies": 0.625, "rewards/chosen": -5.7633256912231445, "rewards/margins": 0.5801345705986023, "rewards/rejected": -6.3434600830078125, "step": 70 }, { "epoch": 0.16058806898501554, "grad_norm": 80.79055167712902, "learning_rate": 7.915634895588021e-07, "logits/chosen": -11.959595680236816, "logits/rejected": -12.10846996307373, "logps/chosen": -0.5684102177619934, "logps/rejected": -0.5796740055084229, "loss": 4.8753, "rewards/accuracies": 0.53125, "rewards/chosen": -5.684101581573486, "rewards/margins": 0.11263775080442429, "rewards/rejected": -5.796739101409912, "step": 71 }, { "epoch": 0.1628498727735369, "grad_norm": 75.73711259124929, "learning_rate": 7.909045649393394e-07, "logits/chosen": -12.076671600341797, "logits/rejected": -11.380012512207031, "logps/chosen": -0.5402446389198303, "logps/rejected": -0.5312026739120483, "loss": 4.8356, "rewards/accuracies": 0.53125, "rewards/chosen": -5.402446269989014, "rewards/margins": -0.09041957557201385, "rewards/rejected": -5.3120269775390625, "step": 72 }, { "epoch": 0.16511167656205825, "grad_norm": 75.75288370918786, "learning_rate": 7.902211616842556e-07, "logits/chosen": -10.804548263549805, "logits/rejected": -10.961880683898926, "logps/chosen": -0.5909055471420288, "logps/rejected": -0.673595666885376, "loss": 4.3771, "rewards/accuracies": 0.53125, "rewards/chosen": -5.909054756164551, "rewards/margins": 0.8269017934799194, "rewards/rejected": -6.735957145690918, "step": 73 }, { "epoch": 0.16737348035057958, "grad_norm": 62.67088862221453, "learning_rate": 7.89513322588598e-07, "logits/chosen": -12.931127548217773, "logits/rejected": -12.251081466674805, "logps/chosen": -0.4514605700969696, "logps/rejected": -0.5236379504203796, "loss": 4.3108, "rewards/accuracies": 0.6875, "rewards/chosen": -4.51460599899292, "rewards/margins": 0.7217735648155212, "rewards/rejected": -5.236379623413086, "step": 74 }, { "epoch": 0.16963528413910092, "grad_norm": 80.95433872904526, "learning_rate": 7.887810919775976e-07, "logits/chosen": -11.493197441101074, "logits/rejected": -11.485479354858398, "logps/chosen": -0.6065416932106018, "logps/rejected": -0.6815317273139954, "loss": 4.3937, "rewards/accuracies": 0.53125, "rewards/chosen": -6.065417289733887, "rewards/margins": 0.7499004006385803, "rewards/rejected": -6.815317630767822, "step": 75 }, { "epoch": 0.1718970879276223, "grad_norm": 46.993337340785516, "learning_rate": 7.880245157038949e-07, "logits/chosen": -11.63713264465332, "logits/rejected": -11.74251651763916, "logps/chosen": -0.5179169178009033, "logps/rejected": -0.5965338945388794, "loss": 4.3272, "rewards/accuracies": 0.625, "rewards/chosen": -5.179169178009033, "rewards/margins": 0.7861694693565369, "rewards/rejected": -5.965338706970215, "step": 76 }, { "epoch": 0.17415889171614363, "grad_norm": 87.9689908780646, "learning_rate": 7.872436411446671e-07, "logits/chosen": -12.063104629516602, "logits/rejected": -11.92426872253418, "logps/chosen": -0.5937929749488831, "logps/rejected": -0.7317577004432678, "loss": 4.6281, "rewards/accuracies": 0.59375, "rewards/chosen": -5.937929153442383, "rewards/margins": 1.3796474933624268, "rewards/rejected": -7.3175764083862305, "step": 77 }, { "epoch": 0.17642069550466496, "grad_norm": 58.552629911827445, "learning_rate": 7.86438517198662e-07, "logits/chosen": -11.951095581054688, "logits/rejected": -11.894678115844727, "logps/chosen": -0.6496031880378723, "logps/rejected": -0.7183038592338562, "loss": 4.4127, "rewards/accuracies": 0.46875, "rewards/chosen": -6.496031761169434, "rewards/margins": 0.6870064735412598, "rewards/rejected": -7.183038711547852, "step": 78 }, { "epoch": 0.1786824992931863, "grad_norm": 50.220014654534005, "learning_rate": 7.856091942831366e-07, "logits/chosen": -12.49548625946045, "logits/rejected": -12.11488151550293, "logps/chosen": -0.5599091649055481, "logps/rejected": -0.6432383060455322, "loss": 4.5361, "rewards/accuracies": 0.625, "rewards/chosen": -5.599091529846191, "rewards/margins": 0.83329176902771, "rewards/rejected": -6.432382583618164, "step": 79 }, { "epoch": 0.18094430308170767, "grad_norm": 66.70412693533851, "learning_rate": 7.847557243306982e-07, "logits/chosen": -11.657049179077148, "logits/rejected": -11.260804176330566, "logps/chosen": -0.5219194293022156, "logps/rejected": -0.6938945055007935, "loss": 4.3537, "rewards/accuracies": 0.625, "rewards/chosen": -5.219193458557129, "rewards/margins": 1.719750165939331, "rewards/rejected": -6.9389448165893555, "step": 80 }, { "epoch": 0.183206106870229, "grad_norm": 63.80924880121896, "learning_rate": 7.838781607860541e-07, "logits/chosen": -12.70258903503418, "logits/rejected": -12.420878410339355, "logps/chosen": -0.625399112701416, "logps/rejected": -0.7799273133277893, "loss": 4.2639, "rewards/accuracies": 0.6875, "rewards/chosen": -6.25399112701416, "rewards/margins": 1.5452824831008911, "rewards/rejected": -7.7992730140686035, "step": 81 }, { "epoch": 0.18546791065875035, "grad_norm": 57.78934306013499, "learning_rate": 7.82976558602664e-07, "logits/chosen": -11.984509468078613, "logits/rejected": -12.11713695526123, "logps/chosen": -0.5547804832458496, "logps/rejected": -0.6996307373046875, "loss": 4.4859, "rewards/accuracies": 0.65625, "rewards/chosen": -5.547804832458496, "rewards/margins": 1.4485028982162476, "rewards/rejected": -6.996307373046875, "step": 82 }, { "epoch": 0.1877297144472717, "grad_norm": 74.92592513817463, "learning_rate": 7.820509742392988e-07, "logits/chosen": -12.603782653808594, "logits/rejected": -12.160541534423828, "logps/chosen": -0.6248946189880371, "logps/rejected": -0.6692970991134644, "loss": 4.2867, "rewards/accuracies": 0.65625, "rewards/chosen": -6.248946189880371, "rewards/margins": 0.444024920463562, "rewards/rejected": -6.692971229553223, "step": 83 }, { "epoch": 0.18999151823579305, "grad_norm": 88.28010047139081, "learning_rate": 7.811014656565054e-07, "logits/chosen": -12.70538330078125, "logits/rejected": -12.26504898071289, "logps/chosen": -0.5449544191360474, "logps/rejected": -0.7343254685401917, "loss": 4.002, "rewards/accuracies": 0.59375, "rewards/chosen": -5.4495439529418945, "rewards/margins": 1.8937102556228638, "rewards/rejected": -7.343254566192627, "step": 84 }, { "epoch": 0.1922533220243144, "grad_norm": 91.0049552486995, "learning_rate": 7.801280923129773e-07, "logits/chosen": -11.466194152832031, "logits/rejected": -11.01245403289795, "logps/chosen": -0.6134600043296814, "logps/rejected": -0.7226859927177429, "loss": 4.7861, "rewards/accuracies": 0.59375, "rewards/chosen": -6.134600639343262, "rewards/margins": 1.0922595262527466, "rewards/rejected": -7.226860046386719, "step": 85 }, { "epoch": 0.19451512581283573, "grad_norm": 80.46205455624374, "learning_rate": 7.791309151618305e-07, "logits/chosen": -12.178560256958008, "logits/rejected": -12.0822114944458, "logps/chosen": -0.589695930480957, "logps/rejected": -0.6354808211326599, "loss": 4.5683, "rewards/accuracies": 0.46875, "rewards/chosen": -5.8969597816467285, "rewards/margins": 0.45784902572631836, "rewards/rejected": -6.354808330535889, "step": 86 }, { "epoch": 0.1967769296013571, "grad_norm": 65.70377095012364, "learning_rate": 7.781099966467874e-07, "logits/chosen": -14.043821334838867, "logits/rejected": -13.850515365600586, "logps/chosen": -0.5184203386306763, "logps/rejected": -0.5960665345191956, "loss": 4.4942, "rewards/accuracies": 0.65625, "rewards/chosen": -5.184203147888184, "rewards/margins": 0.7764618396759033, "rewards/rejected": -5.960664749145508, "step": 87 }, { "epoch": 0.19903873338987843, "grad_norm": 85.44105817952351, "learning_rate": 7.770654006982664e-07, "logits/chosen": -11.956082344055176, "logits/rejected": -11.715299606323242, "logps/chosen": -0.7433941960334778, "logps/rejected": -0.8704244494438171, "loss": 4.6953, "rewards/accuracies": 0.625, "rewards/chosen": -7.4339423179626465, "rewards/margins": 1.2703025341033936, "rewards/rejected": -8.704244613647461, "step": 88 }, { "epoch": 0.20130053717839977, "grad_norm": 66.70633497431983, "learning_rate": 7.759971927293781e-07, "logits/chosen": -12.565323829650879, "logits/rejected": -12.145037651062012, "logps/chosen": -0.5749909281730652, "logps/rejected": -0.7184647917747498, "loss": 4.1798, "rewards/accuracies": 0.6875, "rewards/chosen": -5.749909400939941, "rewards/margins": 1.4347392320632935, "rewards/rejected": -7.184648513793945, "step": 89 }, { "epoch": 0.2035623409669211, "grad_norm": 61.091515513789844, "learning_rate": 7.749054396318297e-07, "logits/chosen": -11.981965065002441, "logits/rejected": -11.925725936889648, "logps/chosen": -0.6095532178878784, "logps/rejected": -0.7097649574279785, "loss": 4.5833, "rewards/accuracies": 0.625, "rewards/chosen": -6.095531463623047, "rewards/margins": 1.00211763381958, "rewards/rejected": -7.097649097442627, "step": 90 }, { "epoch": 0.20582414475544247, "grad_norm": 98.35174892551164, "learning_rate": 7.737902097717356e-07, "logits/chosen": -12.858875274658203, "logits/rejected": -12.816228866577148, "logps/chosen": -0.5555391907691956, "logps/rejected": -0.6473320722579956, "loss": 4.3657, "rewards/accuracies": 0.46875, "rewards/chosen": -5.555391788482666, "rewards/margins": 0.9179282784461975, "rewards/rejected": -6.473320960998535, "step": 91 }, { "epoch": 0.2080859485439638, "grad_norm": 71.84927934229238, "learning_rate": 7.726515729853367e-07, "logits/chosen": -11.215812683105469, "logits/rejected": -10.828777313232422, "logps/chosen": -0.6313311457633972, "logps/rejected": -0.8224250674247742, "loss": 4.3914, "rewards/accuracies": 0.75, "rewards/chosen": -6.313311576843262, "rewards/margins": 1.9109392166137695, "rewards/rejected": -8.224250793457031, "step": 92 }, { "epoch": 0.21034775233248515, "grad_norm": 63.14236456247472, "learning_rate": 7.714896005746272e-07, "logits/chosen": -12.176814079284668, "logits/rejected": -11.898388862609863, "logps/chosen": -0.5294336080551147, "logps/rejected": -0.6646890044212341, "loss": 4.0327, "rewards/accuracies": 0.71875, "rewards/chosen": -5.294336318969727, "rewards/margins": 1.3525540828704834, "rewards/rejected": -6.646890163421631, "step": 93 }, { "epoch": 0.21260955612100652, "grad_norm": 121.71357306492177, "learning_rate": 7.703043653028896e-07, "logits/chosen": -12.20483684539795, "logits/rejected": -11.81241226196289, "logps/chosen": -0.6999309659004211, "logps/rejected": -0.8097646236419678, "loss": 4.668, "rewards/accuracies": 0.6875, "rewards/chosen": -6.999309539794922, "rewards/margins": 1.0983363389968872, "rewards/rejected": -8.09764575958252, "step": 94 }, { "epoch": 0.21487135990952785, "grad_norm": 104.68871191294573, "learning_rate": 7.690959413901379e-07, "logits/chosen": -13.26455307006836, "logits/rejected": -13.093438148498535, "logps/chosen": -0.6004514694213867, "logps/rejected": -0.7420926690101624, "loss": 4.5244, "rewards/accuracies": 0.625, "rewards/chosen": -6.004515171051025, "rewards/margins": 1.4164113998413086, "rewards/rejected": -7.420926570892334, "step": 95 }, { "epoch": 0.2171331636980492, "grad_norm": 96.3754544871051, "learning_rate": 7.678644045084704e-07, "logits/chosen": -13.176921844482422, "logits/rejected": -12.706074714660645, "logps/chosen": -0.5092126727104187, "logps/rejected": -0.673244833946228, "loss": 4.126, "rewards/accuracies": 0.625, "rewards/chosen": -5.09212589263916, "rewards/margins": 1.6403214931488037, "rewards/rejected": -6.732447624206543, "step": 96 }, { "epoch": 0.21939496748657053, "grad_norm": 64.90730762680234, "learning_rate": 7.666098317773308e-07, "logits/chosen": -12.79755687713623, "logits/rejected": -12.852503776550293, "logps/chosen": -0.730464518070221, "logps/rejected": -0.8265626430511475, "loss": 4.1382, "rewards/accuracies": 0.5625, "rewards/chosen": -7.30464506149292, "rewards/margins": 0.9609812498092651, "rewards/rejected": -8.265625953674316, "step": 97 }, { "epoch": 0.2216567712750919, "grad_norm": 61.06704486908139, "learning_rate": 7.653323017586789e-07, "logits/chosen": -13.87999153137207, "logits/rejected": -13.832286834716797, "logps/chosen": -0.629042387008667, "logps/rejected": -0.607448399066925, "loss": 4.3219, "rewards/accuracies": 0.5, "rewards/chosen": -6.290423393249512, "rewards/margins": -0.21593987941741943, "rewards/rejected": -6.074484348297119, "step": 98 }, { "epoch": 0.22391857506361323, "grad_norm": 68.27361430232956, "learning_rate": 7.640318944520711e-07, "logits/chosen": -12.233078956604004, "logits/rejected": -11.775206565856934, "logps/chosen": -0.7409114241600037, "logps/rejected": -0.9343410134315491, "loss": 4.2391, "rewards/accuracies": 0.75, "rewards/chosen": -7.409113883972168, "rewards/margins": 1.9342964887619019, "rewards/rejected": -9.343409538269043, "step": 99 }, { "epoch": 0.22618037885213457, "grad_norm": 101.95762268614794, "learning_rate": 7.627086912896511e-07, "logits/chosen": -13.06617546081543, "logits/rejected": -12.822061538696289, "logps/chosen": -0.6488937139511108, "logps/rejected": -0.6966894268989563, "loss": 4.3114, "rewards/accuracies": 0.59375, "rewards/chosen": -6.4889373779296875, "rewards/margins": 0.47795701026916504, "rewards/rejected": -6.966893672943115, "step": 100 }, { "epoch": 0.2284421826406559, "grad_norm": 62.41706596840518, "learning_rate": 7.613627751310499e-07, "logits/chosen": -13.649486541748047, "logits/rejected": -13.323113441467285, "logps/chosen": -0.5429686307907104, "logps/rejected": -0.7514999508857727, "loss": 4.1066, "rewards/accuracies": 0.84375, "rewards/chosen": -5.429686546325684, "rewards/margins": 2.085312604904175, "rewards/rejected": -7.5149993896484375, "step": 101 }, { "epoch": 0.23070398642917728, "grad_norm": 98.8807647714299, "learning_rate": 7.599942302581977e-07, "logits/chosen": -13.609407424926758, "logits/rejected": -13.286027908325195, "logps/chosen": -0.6267982721328735, "logps/rejected": -0.8196748495101929, "loss": 4.085, "rewards/accuracies": 0.875, "rewards/chosen": -6.267982006072998, "rewards/margins": 1.9287660121917725, "rewards/rejected": -8.196748733520508, "step": 102 }, { "epoch": 0.23296579021769862, "grad_norm": 69.62248080154978, "learning_rate": 7.586031423700457e-07, "logits/chosen": -13.66258430480957, "logits/rejected": -13.500720024108887, "logps/chosen": -0.67485111951828, "logps/rejected": -0.7922409772872925, "loss": 4.2884, "rewards/accuracies": 0.6875, "rewards/chosen": -6.74851131439209, "rewards/margins": 1.1738990545272827, "rewards/rejected": -7.92241096496582, "step": 103 }, { "epoch": 0.23522759400621995, "grad_norm": 128.9243427342601, "learning_rate": 7.571895985772e-07, "logits/chosen": -13.242142677307129, "logits/rejected": -13.256977081298828, "logps/chosen": -0.6713986396789551, "logps/rejected": -0.823998749256134, "loss": 4.4093, "rewards/accuracies": 0.6875, "rewards/chosen": -6.713986873626709, "rewards/margins": 1.526000738143921, "rewards/rejected": -8.239988327026367, "step": 104 }, { "epoch": 0.23748939779474132, "grad_norm": 90.13452963738787, "learning_rate": 7.557536873964661e-07, "logits/chosen": -13.565170288085938, "logits/rejected": -13.13564682006836, "logps/chosen": -0.6910791993141174, "logps/rejected": -0.9325417280197144, "loss": 4.4509, "rewards/accuracies": 0.78125, "rewards/chosen": -6.910791873931885, "rewards/margins": 2.4146251678466797, "rewards/rejected": -9.325417518615723, "step": 105 }, { "epoch": 0.23975120158326266, "grad_norm": 87.0306411773028, "learning_rate": 7.542954987453069e-07, "logits/chosen": -14.550992012023926, "logits/rejected": -14.176923751831055, "logps/chosen": -0.6862035393714905, "logps/rejected": -0.8450896143913269, "loss": 3.9713, "rewards/accuracies": 0.625, "rewards/chosen": -6.862035751342773, "rewards/margins": 1.5888599157333374, "rewards/rejected": -8.450895309448242, "step": 106 }, { "epoch": 0.242013005371784, "grad_norm": 79.22884465246462, "learning_rate": 7.528151239362108e-07, "logits/chosen": -14.102907180786133, "logits/rejected": -13.666712760925293, "logps/chosen": -0.6612896919250488, "logps/rejected": -0.831270694732666, "loss": 4.1818, "rewards/accuracies": 0.6875, "rewards/chosen": -6.6128973960876465, "rewards/margins": 1.6998090744018555, "rewards/rejected": -8.312705993652344, "step": 107 }, { "epoch": 0.24427480916030533, "grad_norm": 127.69113358221105, "learning_rate": 7.513126556709748e-07, "logits/chosen": -11.86813735961914, "logits/rejected": -11.855137825012207, "logps/chosen": -0.6619610786437988, "logps/rejected": -0.9614608287811279, "loss": 3.5632, "rewards/accuracies": 0.71875, "rewards/chosen": -6.6196112632751465, "rewards/margins": 2.9949963092803955, "rewards/rejected": -9.614606857299805, "step": 108 }, { "epoch": 0.2465366129488267, "grad_norm": 68.99980133964193, "learning_rate": 7.497881880348984e-07, "logits/chosen": -14.216558456420898, "logits/rejected": -13.699520111083984, "logps/chosen": -0.6432782411575317, "logps/rejected": -0.8865514397621155, "loss": 3.6564, "rewards/accuracies": 0.71875, "rewards/chosen": -6.432782173156738, "rewards/margins": 2.432731866836548, "rewards/rejected": -8.865514755249023, "step": 109 }, { "epoch": 0.24879841673734804, "grad_norm": 94.37373909884498, "learning_rate": 7.482418164908931e-07, "logits/chosen": -13.685918807983398, "logits/rejected": -13.722275733947754, "logps/chosen": -0.7590062618255615, "logps/rejected": -0.8727726936340332, "loss": 4.4461, "rewards/accuracies": 0.5625, "rewards/chosen": -7.590063095092773, "rewards/margins": 1.1376643180847168, "rewards/rejected": -8.727726936340332, "step": 110 }, { "epoch": 0.2510602205258694, "grad_norm": 104.73538958533439, "learning_rate": 7.466736378735035e-07, "logits/chosen": -13.90713882446289, "logits/rejected": -13.833703994750977, "logps/chosen": -0.9833253622055054, "logps/rejected": -1.1174235343933105, "loss": 4.0684, "rewards/accuracies": 0.625, "rewards/chosen": -9.833253860473633, "rewards/margins": 1.340980887413025, "rewards/rejected": -11.174234390258789, "step": 111 }, { "epoch": 0.2533220243143907, "grad_norm": 85.78487702365295, "learning_rate": 7.450837503828439e-07, "logits/chosen": -14.123536109924316, "logits/rejected": -14.122791290283203, "logps/chosen": -0.7747003436088562, "logps/rejected": -0.9367392063140869, "loss": 3.7847, "rewards/accuracies": 0.65625, "rewards/chosen": -7.747003555297852, "rewards/margins": 1.6203885078430176, "rewards/rejected": -9.367391586303711, "step": 112 }, { "epoch": 0.2555838281029121, "grad_norm": 79.39379626286185, "learning_rate": 7.43472253578449e-07, "logits/chosen": -15.111526489257812, "logits/rejected": -15.17776870727539, "logps/chosen": -0.6799838542938232, "logps/rejected": -0.7487653493881226, "loss": 4.1428, "rewards/accuracies": 0.53125, "rewards/chosen": -6.799839019775391, "rewards/margins": 0.6878141760826111, "rewards/rejected": -7.487652778625488, "step": 113 }, { "epoch": 0.2578456318914334, "grad_norm": 95.27260892673486, "learning_rate": 7.418392483730389e-07, "logits/chosen": -15.093989372253418, "logits/rejected": -14.798469543457031, "logps/chosen": -0.611186683177948, "logps/rejected": -0.733193039894104, "loss": 3.9567, "rewards/accuracies": 0.75, "rewards/chosen": -6.111865997314453, "rewards/margins": 1.2200640439987183, "rewards/rejected": -7.331930637359619, "step": 114 }, { "epoch": 0.26010743567995476, "grad_norm": 81.84816803138266, "learning_rate": 7.401848370262012e-07, "logits/chosen": -16.052608489990234, "logits/rejected": -15.86906623840332, "logps/chosen": -0.7116187810897827, "logps/rejected": -0.8240950107574463, "loss": 4.2147, "rewards/accuracies": 0.71875, "rewards/chosen": -7.116188049316406, "rewards/margins": 1.1247621774673462, "rewards/rejected": -8.240950584411621, "step": 115 }, { "epoch": 0.2623692394684761, "grad_norm": 86.40835196804031, "learning_rate": 7.385091231379856e-07, "logits/chosen": -15.110920906066895, "logits/rejected": -15.024141311645508, "logps/chosen": -0.7939636707305908, "logps/rejected": -0.9955480098724365, "loss": 4.0034, "rewards/accuracies": 0.65625, "rewards/chosen": -7.93963623046875, "rewards/margins": 2.0158443450927734, "rewards/rejected": -9.955480575561523, "step": 116 }, { "epoch": 0.26463104325699743, "grad_norm": 96.7390682646137, "learning_rate": 7.368122116424182e-07, "logits/chosen": -13.677536964416504, "logits/rejected": -13.632445335388184, "logps/chosen": -0.8173962235450745, "logps/rejected": -0.8863806128501892, "loss": 4.2779, "rewards/accuracies": 0.53125, "rewards/chosen": -8.173962593078613, "rewards/margins": 0.6898432970046997, "rewards/rejected": -8.863805770874023, "step": 117 }, { "epoch": 0.2668928470455188, "grad_norm": 114.76228974717415, "learning_rate": 7.350942088009289e-07, "logits/chosen": -16.132448196411133, "logits/rejected": -15.948546409606934, "logps/chosen": -0.8236314058303833, "logps/rejected": -0.9717513918876648, "loss": 3.7875, "rewards/accuracies": 0.6875, "rewards/chosen": -8.236313819885254, "rewards/margins": 1.4811999797821045, "rewards/rejected": -9.717514038085938, "step": 118 }, { "epoch": 0.26915465083404017, "grad_norm": 124.74702715605902, "learning_rate": 7.333552221956986e-07, "logits/chosen": -14.226578712463379, "logits/rejected": -13.749677658081055, "logps/chosen": -0.9559565782546997, "logps/rejected": -1.1939551830291748, "loss": 3.7927, "rewards/accuracies": 0.71875, "rewards/chosen": -9.559566497802734, "rewards/margins": 2.37998628616333, "rewards/rejected": -11.939552307128906, "step": 119 }, { "epoch": 0.2714164546225615, "grad_norm": 139.60375866242782, "learning_rate": 7.315953607229217e-07, "logits/chosen": -15.55072021484375, "logits/rejected": -15.846210479736328, "logps/chosen": -0.9729312658309937, "logps/rejected": -1.1994317770004272, "loss": 4.0626, "rewards/accuracies": 0.65625, "rewards/chosen": -9.729312896728516, "rewards/margins": 2.265005588531494, "rewards/rejected": -11.994318962097168, "step": 120 }, { "epoch": 0.27367825841108284, "grad_norm": 90.79159608076576, "learning_rate": 7.298147345859869e-07, "logits/chosen": -15.140702247619629, "logits/rejected": -14.700098037719727, "logps/chosen": -0.8421116471290588, "logps/rejected": -1.0816903114318848, "loss": 4.0231, "rewards/accuracies": 0.78125, "rewards/chosen": -8.421116828918457, "rewards/margins": 2.3957865238189697, "rewards/rejected": -10.816903114318848, "step": 121 }, { "epoch": 0.2759400621996042, "grad_norm": 100.537933010007, "learning_rate": 7.280134552885762e-07, "logits/chosen": -16.38404083251953, "logits/rejected": -15.996622085571289, "logps/chosen": -0.7793571949005127, "logps/rejected": -0.9447546005249023, "loss": 4.1454, "rewards/accuracies": 0.6875, "rewards/chosen": -7.793571949005127, "rewards/margins": 1.6539742946624756, "rewards/rejected": -9.44754695892334, "step": 122 }, { "epoch": 0.2782018659881255, "grad_norm": 92.3597151880949, "learning_rate": 7.261916356276831e-07, "logits/chosen": -16.811389923095703, "logits/rejected": -16.297218322753906, "logps/chosen": -1.1431193351745605, "logps/rejected": -1.4224827289581299, "loss": 3.5578, "rewards/accuracies": 0.75, "rewards/chosen": -11.431194305419922, "rewards/margins": 2.7936320304870605, "rewards/rejected": -14.22482681274414, "step": 123 }, { "epoch": 0.2804636697766469, "grad_norm": 85.41024914421368, "learning_rate": 7.243493896865486e-07, "logits/chosen": -16.567768096923828, "logits/rejected": -16.550174713134766, "logps/chosen": -0.7305294275283813, "logps/rejected": -0.8769953846931458, "loss": 3.8388, "rewards/accuracies": 0.625, "rewards/chosen": -7.305294990539551, "rewards/margins": 1.4646586179733276, "rewards/rejected": -8.769953727722168, "step": 124 }, { "epoch": 0.2827254735651682, "grad_norm": 116.2256817022879, "learning_rate": 7.224868328275169e-07, "logits/chosen": -15.456303596496582, "logits/rejected": -15.146892547607422, "logps/chosen": -0.8332209587097168, "logps/rejected": -1.0618098974227905, "loss": 3.8379, "rewards/accuracies": 0.78125, "rewards/chosen": -8.332210540771484, "rewards/margins": 2.285888433456421, "rewards/rejected": -10.618098258972168, "step": 125 }, { "epoch": 0.28498727735368956, "grad_norm": 171.39099454139836, "learning_rate": 7.206040816848126e-07, "logits/chosen": -13.179584503173828, "logits/rejected": -13.4354887008667, "logps/chosen": -0.7763444781303406, "logps/rejected": -1.0508021116256714, "loss": 4.0712, "rewards/accuracies": 0.65625, "rewards/chosen": -7.763444423675537, "rewards/margins": 2.7445759773254395, "rewards/rejected": -10.508020401000977, "step": 126 }, { "epoch": 0.2872490811422109, "grad_norm": 113.59593931675269, "learning_rate": 7.187012541572356e-07, "logits/chosen": -16.92714500427246, "logits/rejected": -16.797697067260742, "logps/chosen": -0.8984054923057556, "logps/rejected": -1.2400306463241577, "loss": 3.9837, "rewards/accuracies": 0.625, "rewards/chosen": -8.984055519104004, "rewards/margins": 3.416250467300415, "rewards/rejected": -12.400304794311523, "step": 127 }, { "epoch": 0.28951088493073224, "grad_norm": 106.75878513488918, "learning_rate": 7.167784694007791e-07, "logits/chosen": -17.011579513549805, "logits/rejected": -16.66845703125, "logps/chosen": -0.8532888889312744, "logps/rejected": -1.0494173765182495, "loss": 3.6795, "rewards/accuracies": 0.625, "rewards/chosen": -8.532888412475586, "rewards/margins": 1.9612853527069092, "rewards/rejected": -10.494174003601074, "step": 128 }, { "epoch": 0.2917726887192536, "grad_norm": 100.82906895906213, "learning_rate": 7.148358478211682e-07, "logits/chosen": -17.22789764404297, "logits/rejected": -16.70311737060547, "logps/chosen": -1.0011430978775024, "logps/rejected": -1.2051304578781128, "loss": 3.8174, "rewards/accuracies": 0.65625, "rewards/chosen": -10.011430740356445, "rewards/margins": 2.039872646331787, "rewards/rejected": -12.05130386352539, "step": 129 }, { "epoch": 0.29403449250777497, "grad_norm": 70.96241822690538, "learning_rate": 7.128735110663187e-07, "logits/chosen": -16.649568557739258, "logits/rejected": -16.656688690185547, "logps/chosen": -0.9126195907592773, "logps/rejected": -1.2946593761444092, "loss": 3.4187, "rewards/accuracies": 0.75, "rewards/chosen": -9.12619686126709, "rewards/margins": 3.8203978538513184, "rewards/rejected": -12.946593284606934, "step": 130 }, { "epoch": 0.2962962962962963, "grad_norm": 101.80550264504929, "learning_rate": 7.108915820187211e-07, "logits/chosen": -14.773153305053711, "logits/rejected": -14.596695899963379, "logps/chosen": -0.9897314310073853, "logps/rejected": -1.3485132455825806, "loss": 3.2847, "rewards/accuracies": 0.8125, "rewards/chosen": -9.89731502532959, "rewards/margins": 3.587818145751953, "rewards/rejected": -13.48513126373291, "step": 131 }, { "epoch": 0.29855810008481765, "grad_norm": 118.07277179415749, "learning_rate": 7.088901847877447e-07, "logits/chosen": -15.380992889404297, "logits/rejected": -15.349335670471191, "logps/chosen": -0.9675842523574829, "logps/rejected": -1.2969377040863037, "loss": 4.5014, "rewards/accuracies": 0.75, "rewards/chosen": -9.67584228515625, "rewards/margins": 3.293534755706787, "rewards/rejected": -12.969377517700195, "step": 132 }, { "epoch": 0.300819903873339, "grad_norm": 126.2514417468051, "learning_rate": 7.068694447018658e-07, "logits/chosen": -16.715206146240234, "logits/rejected": -16.719194412231445, "logps/chosen": -0.8660197854042053, "logps/rejected": -0.9733752608299255, "loss": 3.7002, "rewards/accuracies": 0.625, "rewards/chosen": -8.660198211669922, "rewards/margins": 1.073554515838623, "rewards/rejected": -9.733752250671387, "step": 133 }, { "epoch": 0.3030817076618603, "grad_norm": 115.59595983452179, "learning_rate": 7.048294883008199e-07, "logits/chosen": -17.525606155395508, "logits/rejected": -17.24049186706543, "logps/chosen": -0.9643100500106812, "logps/rejected": -1.2070279121398926, "loss": 3.5993, "rewards/accuracies": 0.78125, "rewards/chosen": -9.64310073852539, "rewards/margins": 2.4271788597106934, "rewards/rejected": -12.070280075073242, "step": 134 }, { "epoch": 0.3053435114503817, "grad_norm": 135.03715755778867, "learning_rate": 7.027704433276776e-07, "logits/chosen": -18.083145141601562, "logits/rejected": -17.37511444091797, "logps/chosen": -0.9572893381118774, "logps/rejected": -1.3515632152557373, "loss": 3.6303, "rewards/accuracies": 0.78125, "rewards/chosen": -9.572893142700195, "rewards/margins": 3.9427390098571777, "rewards/rejected": -13.515631675720215, "step": 135 }, { "epoch": 0.307605315238903, "grad_norm": 123.3145878922545, "learning_rate": 7.006924387208452e-07, "logits/chosen": -16.337995529174805, "logits/rejected": -16.18612289428711, "logps/chosen": -0.7492311596870422, "logps/rejected": -0.9364847540855408, "loss": 3.8638, "rewards/accuracies": 0.6875, "rewards/chosen": -7.492311000823975, "rewards/margins": 1.8725361824035645, "rewards/rejected": -9.364849090576172, "step": 136 }, { "epoch": 0.30986711902742436, "grad_norm": 110.47458672576012, "learning_rate": 6.985956046059904e-07, "logits/chosen": -15.281987190246582, "logits/rejected": -15.222082138061523, "logps/chosen": -0.8718824982643127, "logps/rejected": -1.2887756824493408, "loss": 3.8853, "rewards/accuracies": 0.75, "rewards/chosen": -8.718825340270996, "rewards/margins": 4.168931007385254, "rewards/rejected": -12.887757301330566, "step": 137 }, { "epoch": 0.31212892281594573, "grad_norm": 93.15535964190208, "learning_rate": 6.964800722878945e-07, "logits/chosen": -16.852909088134766, "logits/rejected": -16.60708999633789, "logps/chosen": -0.8743470907211304, "logps/rejected": -1.0895050764083862, "loss": 3.3061, "rewards/accuracies": 0.59375, "rewards/chosen": -8.743470191955566, "rewards/margins": 2.151580810546875, "rewards/rejected": -10.895051002502441, "step": 138 }, { "epoch": 0.31439072660446704, "grad_norm": 116.25516493400698, "learning_rate": 6.943459742422287e-07, "logits/chosen": -16.162519454956055, "logits/rejected": -15.761104583740234, "logps/chosen": -1.1220320463180542, "logps/rejected": -1.5008246898651123, "loss": 3.8586, "rewards/accuracies": 0.71875, "rewards/chosen": -11.220320701599121, "rewards/margins": 3.7879250049591064, "rewards/rejected": -15.008245468139648, "step": 139 }, { "epoch": 0.3166525303929884, "grad_norm": 115.45171227467374, "learning_rate": 6.921934441072597e-07, "logits/chosen": -17.508764266967773, "logits/rejected": -17.509645462036133, "logps/chosen": -1.0907777547836304, "logps/rejected": -1.355396032333374, "loss": 3.8912, "rewards/accuracies": 0.65625, "rewards/chosen": -10.90777587890625, "rewards/margins": 2.646184206008911, "rewards/rejected": -13.553960800170898, "step": 140 }, { "epoch": 0.3189143341815098, "grad_norm": 145.15074217143749, "learning_rate": 6.900226166754807e-07, "logits/chosen": -16.634740829467773, "logits/rejected": -16.976970672607422, "logps/chosen": -1.3348404169082642, "logps/rejected": -1.4817439317703247, "loss": 4.6661, "rewards/accuracies": 0.78125, "rewards/chosen": -13.348404884338379, "rewards/margins": 1.469035029411316, "rewards/rejected": -14.817439079284668, "step": 141 }, { "epoch": 0.3211761379700311, "grad_norm": 108.45634524328419, "learning_rate": 6.8783362788517e-07, "logits/chosen": -16.96560287475586, "logits/rejected": -16.87692642211914, "logps/chosen": -1.285522699356079, "logps/rejected": -1.6592097282409668, "loss": 3.4579, "rewards/accuracies": 0.75, "rewards/chosen": -12.855226516723633, "rewards/margins": 3.7368712425231934, "rewards/rejected": -16.592098236083984, "step": 142 }, { "epoch": 0.32343794175855245, "grad_norm": 108.50675097342489, "learning_rate": 6.856266148118796e-07, "logits/chosen": -16.803754806518555, "logits/rejected": -17.20581817626953, "logps/chosen": -1.0886934995651245, "logps/rejected": -1.5176775455474854, "loss": 3.3977, "rewards/accuracies": 0.8125, "rewards/chosen": -10.886935234069824, "rewards/margins": 4.2898406982421875, "rewards/rejected": -15.176775932312012, "step": 143 }, { "epoch": 0.3256997455470738, "grad_norm": 135.30929548671452, "learning_rate": 6.834017156598512e-07, "logits/chosen": -17.159934997558594, "logits/rejected": -16.915081024169922, "logps/chosen": -1.0447005033493042, "logps/rejected": -1.5800331830978394, "loss": 3.5652, "rewards/accuracies": 0.875, "rewards/chosen": -10.447005271911621, "rewards/margins": 5.353327751159668, "rewards/rejected": -15.800333023071289, "step": 144 }, { "epoch": 0.3279615493355951, "grad_norm": 92.79875046989964, "learning_rate": 6.811590697533607e-07, "logits/chosen": -18.941160202026367, "logits/rejected": -18.880695343017578, "logps/chosen": -1.2164652347564697, "logps/rejected": -1.3915354013442993, "loss": 3.8421, "rewards/accuracies": 0.625, "rewards/chosen": -12.164650917053223, "rewards/margins": 1.7507033348083496, "rewards/rejected": -13.915353775024414, "step": 145 }, { "epoch": 0.3302233531241165, "grad_norm": 152.87216949016505, "learning_rate": 6.788988175279951e-07, "logits/chosen": -17.32018280029297, "logits/rejected": -17.33584976196289, "logps/chosen": -1.1823511123657227, "logps/rejected": -1.55608069896698, "loss": 3.9364, "rewards/accuracies": 0.71875, "rewards/chosen": -11.823511123657227, "rewards/margins": 3.7372941970825195, "rewards/rejected": -15.560805320739746, "step": 146 }, { "epoch": 0.3324851569126378, "grad_norm": 112.74886884591804, "learning_rate": 6.766211005218577e-07, "logits/chosen": -17.034011840820312, "logits/rejected": -16.896516799926758, "logps/chosen": -1.0992332696914673, "logps/rejected": -1.6042219400405884, "loss": 3.2047, "rewards/accuracies": 0.75, "rewards/chosen": -10.992332458496094, "rewards/margins": 5.049887180328369, "rewards/rejected": -16.042219161987305, "step": 147 }, { "epoch": 0.33474696070115917, "grad_norm": 102.37494798795927, "learning_rate": 6.743260613667047e-07, "logits/chosen": -20.16105079650879, "logits/rejected": -20.097129821777344, "logps/chosen": -1.3072175979614258, "logps/rejected": -1.7791482210159302, "loss": 3.6159, "rewards/accuracies": 0.84375, "rewards/chosen": -13.072174072265625, "rewards/margins": 4.719306945800781, "rewards/rejected": -17.79148292541504, "step": 148 }, { "epoch": 0.33700876448968053, "grad_norm": 92.8889637457248, "learning_rate": 6.720138437790139e-07, "logits/chosen": -18.851449966430664, "logits/rejected": -19.006174087524414, "logps/chosen": -1.1900596618652344, "logps/rejected": -1.6059695482254028, "loss": 3.0262, "rewards/accuracies": 0.71875, "rewards/chosen": -11.900595664978027, "rewards/margins": 4.1590986251831055, "rewards/rejected": -16.059694290161133, "step": 149 }, { "epoch": 0.33927056827820185, "grad_norm": 128.41082842918968, "learning_rate": 6.696845925509848e-07, "logits/chosen": -17.904184341430664, "logits/rejected": -17.44746208190918, "logps/chosen": -1.3113856315612793, "logps/rejected": -1.5605354309082031, "loss": 3.9659, "rewards/accuracies": 0.625, "rewards/chosen": -13.113856315612793, "rewards/margins": 2.491497755050659, "rewards/rejected": -15.605354309082031, "step": 150 }, { "epoch": 0.3415323720667232, "grad_norm": 120.26296110157324, "learning_rate": 6.673384535414718e-07, "logits/chosen": -18.292760848999023, "logits/rejected": -18.07993507385254, "logps/chosen": -1.2153687477111816, "logps/rejected": -1.442406415939331, "loss": 4.4772, "rewards/accuracies": 0.5, "rewards/chosen": -12.153687477111816, "rewards/margins": 2.270376205444336, "rewards/rejected": -14.424064636230469, "step": 151 }, { "epoch": 0.3437941758552446, "grad_norm": 108.61106396576423, "learning_rate": 6.649755736668511e-07, "logits/chosen": -16.841121673583984, "logits/rejected": -16.501068115234375, "logps/chosen": -1.1176737546920776, "logps/rejected": -1.6219063997268677, "loss": 2.9905, "rewards/accuracies": 0.90625, "rewards/chosen": -11.176735877990723, "rewards/margins": 5.042326927185059, "rewards/rejected": -16.21906280517578, "step": 152 }, { "epoch": 0.3460559796437659, "grad_norm": 128.2647564085794, "learning_rate": 6.625961008918192e-07, "logits/chosen": -18.715444564819336, "logits/rejected": -18.511932373046875, "logps/chosen": -1.2932363748550415, "logps/rejected": -1.488201379776001, "loss": 3.3092, "rewards/accuracies": 0.625, "rewards/chosen": -12.932364463806152, "rewards/margins": 1.9496493339538574, "rewards/rejected": -14.882014274597168, "step": 153 }, { "epoch": 0.34831778343228725, "grad_norm": 119.57670893186517, "learning_rate": 6.602001842201289e-07, "logits/chosen": -16.945566177368164, "logits/rejected": -17.048381805419922, "logps/chosen": -1.1972765922546387, "logps/rejected": -1.4838308095932007, "loss": 3.6387, "rewards/accuracies": 0.75, "rewards/chosen": -11.972765922546387, "rewards/margins": 2.8655428886413574, "rewards/rejected": -14.838308334350586, "step": 154 }, { "epoch": 0.3505795872208086, "grad_norm": 133.44404947093932, "learning_rate": 6.577879736852571e-07, "logits/chosen": -17.318836212158203, "logits/rejected": -17.2701473236084, "logps/chosen": -1.3334428071975708, "logps/rejected": -1.554374098777771, "loss": 3.9227, "rewards/accuracies": 0.71875, "rewards/chosen": -13.334427833557129, "rewards/margins": 2.2093122005462646, "rewards/rejected": -15.543739318847656, "step": 155 }, { "epoch": 0.35284139100932993, "grad_norm": 108.58327489587623, "learning_rate": 6.553596203410112e-07, "logits/chosen": -16.674957275390625, "logits/rejected": -16.426877975463867, "logps/chosen": -1.0741811990737915, "logps/rejected": -1.548369288444519, "loss": 3.0135, "rewards/accuracies": 0.84375, "rewards/chosen": -10.741811752319336, "rewards/margins": 4.741880893707275, "rewards/rejected": -15.483692169189453, "step": 156 }, { "epoch": 0.3551031947978513, "grad_norm": 114.21004986072782, "learning_rate": 6.529152762520688e-07, "logits/chosen": -18.453733444213867, "logits/rejected": -18.391489028930664, "logps/chosen": -1.3322038650512695, "logps/rejected": -1.5323253870010376, "loss": 3.8828, "rewards/accuracies": 0.625, "rewards/chosen": -13.322039604187012, "rewards/margins": 2.0012147426605225, "rewards/rejected": -15.323253631591797, "step": 157 }, { "epoch": 0.3573649985863726, "grad_norm": 140.31074175994047, "learning_rate": 6.504550944844558e-07, "logits/chosen": -16.778514862060547, "logits/rejected": -16.645111083984375, "logps/chosen": -1.3454078435897827, "logps/rejected": -1.8546462059020996, "loss": 3.6056, "rewards/accuracies": 0.8125, "rewards/chosen": -13.454076766967773, "rewards/margins": 5.092383861541748, "rewards/rejected": -18.546463012695312, "step": 158 }, { "epoch": 0.359626802374894, "grad_norm": 137.59691231656896, "learning_rate": 6.479792290959613e-07, "logits/chosen": -16.80532455444336, "logits/rejected": -16.812410354614258, "logps/chosen": -1.4085781574249268, "logps/rejected": -1.807809591293335, "loss": 3.3238, "rewards/accuracies": 0.75, "rewards/chosen": -14.085782051086426, "rewards/margins": 3.9923133850097656, "rewards/rejected": -18.078096389770508, "step": 159 }, { "epoch": 0.36188860616341534, "grad_norm": 122.31193582134684, "learning_rate": 6.454878351264906e-07, "logits/chosen": -17.32520866394043, "logits/rejected": -17.327320098876953, "logps/chosen": -1.2008891105651855, "logps/rejected": -1.3473906517028809, "loss": 3.8266, "rewards/accuracies": 0.65625, "rewards/chosen": -12.008890151977539, "rewards/margins": 1.4650166034698486, "rewards/rejected": -13.473907470703125, "step": 160 }, { "epoch": 0.36415040995193665, "grad_norm": 131.3884611405496, "learning_rate": 6.429810685883565e-07, "logits/chosen": -16.57655143737793, "logits/rejected": -16.592082977294922, "logps/chosen": -1.295839786529541, "logps/rejected": -1.7100400924682617, "loss": 3.3158, "rewards/accuracies": 0.71875, "rewards/chosen": -12.958398818969727, "rewards/margins": 4.142002582550049, "rewards/rejected": -17.100400924682617, "step": 161 }, { "epoch": 0.366412213740458, "grad_norm": 131.23411742145908, "learning_rate": 6.404590864565088e-07, "logits/chosen": -17.949878692626953, "logits/rejected": -18.036605834960938, "logps/chosen": -1.171650767326355, "logps/rejected": -1.3193507194519043, "loss": 3.9769, "rewards/accuracies": 0.53125, "rewards/chosen": -11.716507911682129, "rewards/margins": 1.4770005941390991, "rewards/rejected": -13.19350814819336, "step": 162 }, { "epoch": 0.3686740175289794, "grad_norm": 118.65175820446706, "learning_rate": 6.379220466587063e-07, "logits/chosen": -19.885251998901367, "logits/rejected": -19.330791473388672, "logps/chosen": -1.2915012836456299, "logps/rejected": -1.452092170715332, "loss": 3.4104, "rewards/accuracies": 0.71875, "rewards/chosen": -12.91501235961914, "rewards/margins": 1.6059094667434692, "rewards/rejected": -14.520920753479004, "step": 163 }, { "epoch": 0.3709358213175007, "grad_norm": 115.53093238478114, "learning_rate": 6.353701080656254e-07, "logits/chosen": -18.273351669311523, "logits/rejected": -18.36724090576172, "logps/chosen": -1.3921793699264526, "logps/rejected": -1.6417995691299438, "loss": 3.3543, "rewards/accuracies": 0.6875, "rewards/chosen": -13.921795845031738, "rewards/margins": 2.496201276779175, "rewards/rejected": -16.41799545288086, "step": 164 }, { "epoch": 0.37319762510602206, "grad_norm": 124.5033686839827, "learning_rate": 6.32803430480913e-07, "logits/chosen": -18.828781127929688, "logits/rejected": -18.2823429107666, "logps/chosen": -1.3336889743804932, "logps/rejected": -1.6634752750396729, "loss": 3.7914, "rewards/accuracies": 0.625, "rewards/chosen": -13.336889266967773, "rewards/margins": 3.297863721847534, "rewards/rejected": -16.63475227355957, "step": 165 }, { "epoch": 0.3754594288945434, "grad_norm": 147.97213663568766, "learning_rate": 6.302221746311782e-07, "logits/chosen": -16.221012115478516, "logits/rejected": -15.638816833496094, "logps/chosen": -1.24800443649292, "logps/rejected": -1.5752636194229126, "loss": 3.9877, "rewards/accuracies": 0.65625, "rewards/chosen": -12.480045318603516, "rewards/margins": 3.272590398788452, "rewards/rejected": -15.752635955810547, "step": 166 }, { "epoch": 0.37772123268306473, "grad_norm": 117.72793765671894, "learning_rate": 6.276265021559288e-07, "logits/chosen": -17.692129135131836, "logits/rejected": -17.678510665893555, "logps/chosen": -1.4085066318511963, "logps/rejected": -1.5413269996643066, "loss": 3.7453, "rewards/accuracies": 0.59375, "rewards/chosen": -14.085065841674805, "rewards/margins": 1.3282032012939453, "rewards/rejected": -15.41326904296875, "step": 167 }, { "epoch": 0.3799830364715861, "grad_norm": 123.6576549431064, "learning_rate": 6.250165755974487e-07, "logits/chosen": -18.768051147460938, "logits/rejected": -18.692035675048828, "logps/chosen": -1.2646162509918213, "logps/rejected": -1.4007298946380615, "loss": 3.4565, "rewards/accuracies": 0.59375, "rewards/chosen": -12.646160125732422, "rewards/margins": 1.361138939857483, "rewards/rejected": -14.00730037689209, "step": 168 }, { "epoch": 0.3822448402601074, "grad_norm": 111.39581567132997, "learning_rate": 6.223925583906192e-07, "logits/chosen": -18.353418350219727, "logits/rejected": -17.735267639160156, "logps/chosen": -1.327247142791748, "logps/rejected": -1.6749954223632812, "loss": 3.351, "rewards/accuracies": 0.65625, "rewards/chosen": -13.272473335266113, "rewards/margins": 3.477482318878174, "rewards/rejected": -16.749956130981445, "step": 169 }, { "epoch": 0.3845066440486288, "grad_norm": 115.58098332357874, "learning_rate": 6.19754614852685e-07, "logits/chosen": -17.524866104125977, "logits/rejected": -17.590625762939453, "logps/chosen": -1.1806279420852661, "logps/rejected": -1.5952324867248535, "loss": 3.6054, "rewards/accuracies": 0.75, "rewards/chosen": -11.806280136108398, "rewards/margins": 4.146044731140137, "rewards/rejected": -15.952325820922852, "step": 170 }, { "epoch": 0.38676844783715014, "grad_norm": 107.33538738128671, "learning_rate": 6.171029101729644e-07, "logits/chosen": -17.01272964477539, "logits/rejected": -17.01720428466797, "logps/chosen": -1.2651264667510986, "logps/rejected": -1.5476595163345337, "loss": 3.5012, "rewards/accuracies": 0.65625, "rewards/chosen": -12.651265144348145, "rewards/margins": 2.8253297805786133, "rewards/rejected": -15.476594924926758, "step": 171 }, { "epoch": 0.38903025162567145, "grad_norm": 125.94524393171766, "learning_rate": 6.144376104025055e-07, "logits/chosen": -16.98217010498047, "logits/rejected": -16.957050323486328, "logps/chosen": -1.1778655052185059, "logps/rejected": -1.4646776914596558, "loss": 3.2286, "rewards/accuracies": 0.71875, "rewards/chosen": -11.778654098510742, "rewards/margins": 2.8681228160858154, "rewards/rejected": -14.64677619934082, "step": 172 }, { "epoch": 0.3912920554141928, "grad_norm": 115.30228389834534, "learning_rate": 6.117588824436873e-07, "logits/chosen": -17.979293823242188, "logits/rejected": -17.900651931762695, "logps/chosen": -1.158996820449829, "logps/rejected": -1.3833591938018799, "loss": 3.6577, "rewards/accuracies": 0.59375, "rewards/chosen": -11.589967727661133, "rewards/margins": 2.243624448776245, "rewards/rejected": -13.83359146118164, "step": 173 }, { "epoch": 0.3935538592027142, "grad_norm": 136.92412625465224, "learning_rate": 6.090668940397688e-07, "logits/chosen": -17.41019058227539, "logits/rejected": -17.082996368408203, "logps/chosen": -1.200378179550171, "logps/rejected": -1.5303970575332642, "loss": 3.3817, "rewards/accuracies": 0.75, "rewards/chosen": -12.003782272338867, "rewards/margins": 3.3001890182495117, "rewards/rejected": -15.303971290588379, "step": 174 }, { "epoch": 0.3958156629912355, "grad_norm": 97.25464380215708, "learning_rate": 6.063618137643844e-07, "logits/chosen": -17.9305419921875, "logits/rejected": -17.675626754760742, "logps/chosen": -1.0930852890014648, "logps/rejected": -1.3468796014785767, "loss": 3.0673, "rewards/accuracies": 0.65625, "rewards/chosen": -10.930851936340332, "rewards/margins": 2.5379440784454346, "rewards/rejected": -13.468796730041504, "step": 175 }, { "epoch": 0.39807746677975686, "grad_norm": 109.33547647771188, "learning_rate": 6.03643811010988e-07, "logits/chosen": -18.435937881469727, "logits/rejected": -18.244789123535156, "logps/chosen": -1.421694278717041, "logps/rejected": -1.6195881366729736, "loss": 3.4374, "rewards/accuracies": 0.625, "rewards/chosen": -14.216943740844727, "rewards/margins": 1.9789376258850098, "rewards/rejected": -16.195880889892578, "step": 176 }, { "epoch": 0.4003392705682782, "grad_norm": 124.54512017176646, "learning_rate": 6.009130559822453e-07, "logits/chosen": -18.292753219604492, "logits/rejected": -18.013795852661133, "logps/chosen": -1.2830564975738525, "logps/rejected": -1.6480597257614136, "loss": 3.5887, "rewards/accuracies": 0.625, "rewards/chosen": -12.830564498901367, "rewards/margins": 3.6500320434570312, "rewards/rejected": -16.4805965423584, "step": 177 }, { "epoch": 0.40260107435679954, "grad_norm": 117.11523823927375, "learning_rate": 5.981697196793758e-07, "logits/chosen": -17.92546844482422, "logits/rejected": -17.709861755371094, "logps/chosen": -1.328133225440979, "logps/rejected": -1.6237107515335083, "loss": 3.5286, "rewards/accuracies": 0.71875, "rewards/chosen": -13.281332015991211, "rewards/margins": 2.9557762145996094, "rewards/rejected": -16.23710823059082, "step": 178 }, { "epoch": 0.4048628781453209, "grad_norm": 135.21248374147646, "learning_rate": 5.954139738914446e-07, "logits/chosen": -16.00653648376465, "logits/rejected": -16.205612182617188, "logps/chosen": -1.5146088600158691, "logps/rejected": -1.722532868385315, "loss": 3.3813, "rewards/accuracies": 0.78125, "rewards/chosen": -15.146087646484375, "rewards/margins": 2.079242467880249, "rewards/rejected": -17.225330352783203, "step": 179 }, { "epoch": 0.4071246819338422, "grad_norm": 118.75409568384903, "learning_rate": 5.92645991184605e-07, "logits/chosen": -18.836807250976562, "logits/rejected": -18.096431732177734, "logps/chosen": -1.328464388847351, "logps/rejected": -1.7455822229385376, "loss": 3.0959, "rewards/accuracies": 0.71875, "rewards/chosen": -13.284643173217773, "rewards/margins": 4.171177387237549, "rewards/rejected": -17.455821990966797, "step": 180 }, { "epoch": 0.4093864857223636, "grad_norm": 149.61294643110665, "learning_rate": 5.898659448912917e-07, "logits/chosen": -19.704021453857422, "logits/rejected": -19.59141731262207, "logps/chosen": -1.322415828704834, "logps/rejected": -1.6818199157714844, "loss": 3.99, "rewards/accuracies": 0.65625, "rewards/chosen": -13.224160194396973, "rewards/margins": 3.5940399169921875, "rewards/rejected": -16.818199157714844, "step": 181 }, { "epoch": 0.41164828951088495, "grad_norm": 122.65957791889335, "learning_rate": 5.870740090993676e-07, "logits/chosen": -18.265836715698242, "logits/rejected": -18.42105484008789, "logps/chosen": -1.5347073078155518, "logps/rejected": -1.7209949493408203, "loss": 3.171, "rewards/accuracies": 0.625, "rewards/chosen": -15.347073554992676, "rewards/margins": 1.8628755807876587, "rewards/rejected": -17.209949493408203, "step": 182 }, { "epoch": 0.41391009329940626, "grad_norm": 135.01368423233092, "learning_rate": 5.842703586412214e-07, "logits/chosen": -18.84733009338379, "logits/rejected": -18.886695861816406, "logps/chosen": -1.4349242448806763, "logps/rejected": -1.703200340270996, "loss": 3.7337, "rewards/accuracies": 0.75, "rewards/chosen": -14.3492431640625, "rewards/margins": 2.6827609539031982, "rewards/rejected": -17.03200340270996, "step": 183 }, { "epoch": 0.4161718970879276, "grad_norm": 106.57687497552105, "learning_rate": 5.814551690828203e-07, "logits/chosen": -18.91258430480957, "logits/rejected": -18.3468017578125, "logps/chosen": -1.2373554706573486, "logps/rejected": -1.6082764863967896, "loss": 3.0833, "rewards/accuracies": 0.78125, "rewards/chosen": -12.373554229736328, "rewards/margins": 3.709210157394409, "rewards/rejected": -16.082765579223633, "step": 184 }, { "epoch": 0.418433700876449, "grad_norm": 139.70919536594133, "learning_rate": 5.786286167127155e-07, "logits/chosen": -18.484155654907227, "logits/rejected": -18.344846725463867, "logps/chosen": -1.4353551864624023, "logps/rejected": -1.933018684387207, "loss": 3.4633, "rewards/accuracies": 0.8125, "rewards/chosen": -14.353551864624023, "rewards/margins": 4.976635932922363, "rewards/rejected": -19.33018684387207, "step": 185 }, { "epoch": 0.4206955046649703, "grad_norm": 128.37055319468598, "learning_rate": 5.757908785310031e-07, "logits/chosen": -17.390743255615234, "logits/rejected": -17.32730484008789, "logps/chosen": -1.4048118591308594, "logps/rejected": -1.8442890644073486, "loss": 3.7353, "rewards/accuracies": 0.78125, "rewards/chosen": -14.048118591308594, "rewards/margins": 4.394770622253418, "rewards/rejected": -18.442888259887695, "step": 186 }, { "epoch": 0.42295730845349166, "grad_norm": 125.14297515296605, "learning_rate": 5.729421322382399e-07, "logits/chosen": -16.7410888671875, "logits/rejected": -16.759994506835938, "logps/chosen": -1.1190869808197021, "logps/rejected": -1.3602699041366577, "loss": 3.4939, "rewards/accuracies": 0.71875, "rewards/chosen": -11.19087028503418, "rewards/margins": 2.4118287563323975, "rewards/rejected": -13.602697372436523, "step": 187 }, { "epoch": 0.42521911224201303, "grad_norm": 121.636291085123, "learning_rate": 5.700825562243163e-07, "logits/chosen": -17.965713500976562, "logits/rejected": -17.82717514038086, "logps/chosen": -1.3181742429733276, "logps/rejected": -1.5977309942245483, "loss": 3.2506, "rewards/accuracies": 0.75, "rewards/chosen": -13.181741714477539, "rewards/margins": 2.7955687046051025, "rewards/rejected": -15.977310180664062, "step": 188 }, { "epoch": 0.42748091603053434, "grad_norm": 115.58317098156645, "learning_rate": 5.672123295572854e-07, "logits/chosen": -16.11078453063965, "logits/rejected": -15.989376068115234, "logps/chosen": -1.318684458732605, "logps/rejected": -1.7406071424484253, "loss": 3.2275, "rewards/accuracies": 0.71875, "rewards/chosen": -13.186845779418945, "rewards/margins": 4.219226837158203, "rewards/rejected": -17.40607261657715, "step": 189 }, { "epoch": 0.4297427198190557, "grad_norm": 110.76706668825072, "learning_rate": 5.643316319721487e-07, "logits/chosen": -21.12446403503418, "logits/rejected": -21.084503173828125, "logps/chosen": -1.7526358366012573, "logps/rejected": -1.8315396308898926, "loss": 3.7724, "rewards/accuracies": 0.59375, "rewards/chosen": -17.52635955810547, "rewards/margins": 0.789039134979248, "rewards/rejected": -18.315397262573242, "step": 190 }, { "epoch": 0.432004523607577, "grad_norm": 128.02917137545202, "learning_rate": 5.614406438596026e-07, "logits/chosen": -18.617406845092773, "logits/rejected": -18.170780181884766, "logps/chosen": -1.6118402481079102, "logps/rejected": -1.8639785051345825, "loss": 3.8619, "rewards/accuracies": 0.65625, "rewards/chosen": -16.1184024810791, "rewards/margins": 2.521383285522461, "rewards/rejected": -18.639785766601562, "step": 191 }, { "epoch": 0.4342663273960984, "grad_norm": 128.62006828713473, "learning_rate": 5.585395462547406e-07, "logits/chosen": -17.90593719482422, "logits/rejected": -18.07216453552246, "logps/chosen": -1.6061056852340698, "logps/rejected": -1.8645837306976318, "loss": 3.6755, "rewards/accuracies": 0.65625, "rewards/chosen": -16.06105613708496, "rewards/margins": 2.5847792625427246, "rewards/rejected": -18.645837783813477, "step": 192 }, { "epoch": 0.43652813118461975, "grad_norm": 115.59247682163482, "learning_rate": 5.55628520825718e-07, "logits/chosen": -17.22564125061035, "logits/rejected": -17.25448989868164, "logps/chosen": -1.4208124876022339, "logps/rejected": -1.9312503337860107, "loss": 3.3126, "rewards/accuracies": 0.8125, "rewards/chosen": -14.208124160766602, "rewards/margins": 5.104379177093506, "rewards/rejected": -19.312503814697266, "step": 193 }, { "epoch": 0.43878993497314106, "grad_norm": 125.18412016900433, "learning_rate": 5.527077498623752e-07, "logits/chosen": -16.629623413085938, "logits/rejected": -16.644580841064453, "logps/chosen": -1.4364906549453735, "logps/rejected": -1.7086538076400757, "loss": 3.5968, "rewards/accuracies": 0.6875, "rewards/chosen": -14.364906311035156, "rewards/margins": 2.7216320037841797, "rewards/rejected": -17.086536407470703, "step": 194 }, { "epoch": 0.4410517387616624, "grad_norm": 95.71934222963769, "learning_rate": 5.497774162648228e-07, "logits/chosen": -17.9370059967041, "logits/rejected": -17.442899703979492, "logps/chosen": -1.521501064300537, "logps/rejected": -2.050462484359741, "loss": 2.8302, "rewards/accuracies": 0.8125, "rewards/chosen": -15.215010643005371, "rewards/margins": 5.289614677429199, "rewards/rejected": -20.50462532043457, "step": 195 }, { "epoch": 0.4433135425501838, "grad_norm": 118.37601502769598, "learning_rate": 5.468377035319882e-07, "logits/chosen": -18.176965713500977, "logits/rejected": -17.80450439453125, "logps/chosen": -1.5505520105361938, "logps/rejected": -2.088866949081421, "loss": 3.251, "rewards/accuracies": 0.6875, "rewards/chosen": -15.505517959594727, "rewards/margins": 5.383152008056641, "rewards/rejected": -20.888671875, "step": 196 }, { "epoch": 0.4455753463387051, "grad_norm": 114.61725176586181, "learning_rate": 5.438887957501248e-07, "logits/chosen": -18.173625946044922, "logits/rejected": -18.10104751586914, "logps/chosen": -1.6463440656661987, "logps/rejected": -1.9076793193817139, "loss": 3.6095, "rewards/accuracies": 0.71875, "rewards/chosen": -16.46343994140625, "rewards/margins": 2.6133527755737305, "rewards/rejected": -19.076791763305664, "step": 197 }, { "epoch": 0.44783715012722647, "grad_norm": 131.97444146862202, "learning_rate": 5.409308775812844e-07, "logits/chosen": -17.92854118347168, "logits/rejected": -18.174327850341797, "logps/chosen": -1.7027937173843384, "logps/rejected": -1.9399760961532593, "loss": 3.6652, "rewards/accuracies": 0.65625, "rewards/chosen": -17.027938842773438, "rewards/margins": 2.3718223571777344, "rewards/rejected": -19.39975929260254, "step": 198 }, { "epoch": 0.45009895391574783, "grad_norm": 112.00245200831505, "learning_rate": 5.379641342517541e-07, "logits/chosen": -17.90815544128418, "logits/rejected": -17.855358123779297, "logps/chosen": -1.369492530822754, "logps/rejected": -1.7647912502288818, "loss": 3.6049, "rewards/accuracies": 0.65625, "rewards/chosen": -13.694926261901855, "rewards/margins": 3.9529881477355957, "rewards/rejected": -17.64791488647461, "step": 199 }, { "epoch": 0.45236075770426915, "grad_norm": 110.39676860636777, "learning_rate": 5.349887515404564e-07, "logits/chosen": -19.14180564880371, "logits/rejected": -18.728225708007812, "logps/chosen": -1.567470669746399, "logps/rejected": -1.8445343971252441, "loss": 3.0688, "rewards/accuracies": 0.75, "rewards/chosen": -15.674705505371094, "rewards/margins": 2.7706375122070312, "rewards/rejected": -18.445341110229492, "step": 200 }, { "epoch": 0.4546225614927905, "grad_norm": 112.66708655546361, "learning_rate": 5.320049157673163e-07, "logits/chosen": -19.58551597595215, "logits/rejected": -19.09307289123535, "logps/chosen": -1.4883310794830322, "logps/rejected": -1.705352783203125, "loss": 3.2351, "rewards/accuracies": 0.625, "rewards/chosen": -14.88331127166748, "rewards/margins": 2.1702170372009277, "rewards/rejected": -17.05352783203125, "step": 201 }, { "epoch": 0.4568843652813118, "grad_norm": 140.2319323230872, "learning_rate": 5.290128137815938e-07, "logits/chosen": -18.30789566040039, "logits/rejected": -18.139724731445312, "logps/chosen": -1.4757699966430664, "logps/rejected": -1.8749364614486694, "loss": 3.259, "rewards/accuracies": 0.78125, "rewards/chosen": -14.75770092010498, "rewards/margins": 3.991664409637451, "rewards/rejected": -18.749366760253906, "step": 202 }, { "epoch": 0.4591461690698332, "grad_norm": 124.01739804659297, "learning_rate": 5.260126329501828e-07, "logits/chosen": -18.476545333862305, "logits/rejected": -18.295202255249023, "logps/chosen": -1.3175721168518066, "logps/rejected": -1.8804268836975098, "loss": 2.8801, "rewards/accuracies": 0.84375, "rewards/chosen": -13.175721168518066, "rewards/margins": 5.628549098968506, "rewards/rejected": -18.804269790649414, "step": 203 }, { "epoch": 0.46140797285835455, "grad_norm": 110.8800697134872, "learning_rate": 5.230045611458789e-07, "logits/chosen": -19.596027374267578, "logits/rejected": -19.41700553894043, "logps/chosen": -1.2929422855377197, "logps/rejected": -1.7962149381637573, "loss": 3.0127, "rewards/accuracies": 0.8125, "rewards/chosen": -12.929424285888672, "rewards/margins": 5.032725811004639, "rewards/rejected": -17.962148666381836, "step": 204 }, { "epoch": 0.46366977664687586, "grad_norm": 130.9926070201208, "learning_rate": 5.199887867356143e-07, "logits/chosen": -18.125207901000977, "logits/rejected": -18.14871597290039, "logps/chosen": -1.4900728464126587, "logps/rejected": -1.9675356149673462, "loss": 3.1097, "rewards/accuracies": 0.75, "rewards/chosen": -14.900728225708008, "rewards/margins": 4.774627685546875, "rewards/rejected": -19.675355911254883, "step": 205 }, { "epoch": 0.46593158043539723, "grad_norm": 120.13166100619164, "learning_rate": 5.16965498568662e-07, "logits/chosen": -18.744281768798828, "logits/rejected": -18.2139835357666, "logps/chosen": -1.6062824726104736, "logps/rejected": -2.2554736137390137, "loss": 3.0776, "rewards/accuracies": 0.84375, "rewards/chosen": -16.062824249267578, "rewards/margins": 6.491910934448242, "rewards/rejected": -22.554733276367188, "step": 206 }, { "epoch": 0.4681933842239186, "grad_norm": 120.48417226561662, "learning_rate": 5.139348859648098e-07, "logits/chosen": -18.755203247070312, "logits/rejected": -18.599624633789062, "logps/chosen": -1.2531086206436157, "logps/rejected": -1.658897042274475, "loss": 3.4743, "rewards/accuracies": 0.71875, "rewards/chosen": -12.531085968017578, "rewards/margins": 4.05788516998291, "rewards/rejected": -16.588970184326172, "step": 207 }, { "epoch": 0.4704551880124399, "grad_norm": 108.6230474054679, "learning_rate": 5.10897138702506e-07, "logits/chosen": -19.518808364868164, "logits/rejected": -19.44478416442871, "logps/chosen": -1.500132441520691, "logps/rejected": -1.9223947525024414, "loss": 2.9948, "rewards/accuracies": 0.75, "rewards/chosen": -15.001323699951172, "rewards/margins": 4.2226243019104, "rewards/rejected": -19.223947525024414, "step": 208 }, { "epoch": 0.4727169918009613, "grad_norm": 137.90239347147582, "learning_rate": 5.078524470069743e-07, "logits/chosen": -20.06089973449707, "logits/rejected": -19.945919036865234, "logps/chosen": -1.5609779357910156, "logps/rejected": -2.011448860168457, "loss": 3.3272, "rewards/accuracies": 0.78125, "rewards/chosen": -15.609781265258789, "rewards/margins": 4.504709720611572, "rewards/rejected": -20.114490509033203, "step": 209 }, { "epoch": 0.47497879558948264, "grad_norm": 122.82261924752474, "learning_rate": 5.048010015383021e-07, "logits/chosen": -20.49317169189453, "logits/rejected": -20.084096908569336, "logps/chosen": -1.8195672035217285, "logps/rejected": -2.4391162395477295, "loss": 3.0204, "rewards/accuracies": 0.875, "rewards/chosen": -18.1956729888916, "rewards/margins": 6.195489406585693, "rewards/rejected": -24.39116096496582, "step": 210 }, { "epoch": 0.47724059937800395, "grad_norm": 133.40914227876348, "learning_rate": 5.01742993379502e-07, "logits/chosen": -20.487716674804688, "logits/rejected": -20.366500854492188, "logps/chosen": -1.6833590269088745, "logps/rejected": -1.998462200164795, "loss": 3.3079, "rewards/accuracies": 0.625, "rewards/chosen": -16.833589553833008, "rewards/margins": 3.151031017303467, "rewards/rejected": -19.984619140625, "step": 211 }, { "epoch": 0.4795024031665253, "grad_norm": 121.21962647812443, "learning_rate": 4.986786140245446e-07, "logits/chosen": -18.02095603942871, "logits/rejected": -18.15108299255371, "logps/chosen": -1.5042206048965454, "logps/rejected": -1.7882376909255981, "loss": 3.3782, "rewards/accuracies": 0.8125, "rewards/chosen": -15.042205810546875, "rewards/margins": 2.8401715755462646, "rewards/rejected": -17.88237762451172, "step": 212 }, { "epoch": 0.4817642069550466, "grad_norm": 138.8873467278285, "learning_rate": 4.956080553663687e-07, "logits/chosen": -19.02423095703125, "logits/rejected": -18.977191925048828, "logps/chosen": -1.8565832376480103, "logps/rejected": -2.2188644409179688, "loss": 3.2943, "rewards/accuracies": 0.75, "rewards/chosen": -18.565834045410156, "rewards/margins": 3.622810125350952, "rewards/rejected": -22.188644409179688, "step": 213 }, { "epoch": 0.484026010743568, "grad_norm": 120.38379558424705, "learning_rate": 4.925315096848636e-07, "logits/chosen": -17.092893600463867, "logits/rejected": -17.469482421875, "logps/chosen": -1.5928668975830078, "logps/rejected": -2.1185383796691895, "loss": 3.2942, "rewards/accuracies": 0.6875, "rewards/chosen": -15.928670883178711, "rewards/margins": 5.256712913513184, "rewards/rejected": -21.185382843017578, "step": 214 }, { "epoch": 0.48628781453208936, "grad_norm": 141.47624101645354, "learning_rate": 4.894491696348293e-07, "logits/chosen": -18.64133644104004, "logits/rejected": -18.508480072021484, "logps/chosen": -1.7584447860717773, "logps/rejected": -1.9836417436599731, "loss": 3.8231, "rewards/accuracies": 0.65625, "rewards/chosen": -17.58444595336914, "rewards/margins": 2.2519688606262207, "rewards/rejected": -19.836416244506836, "step": 215 }, { "epoch": 0.48854961832061067, "grad_norm": 100.6834936019963, "learning_rate": 4.863612282339116e-07, "logits/chosen": -18.912193298339844, "logits/rejected": -18.51605224609375, "logps/chosen": -1.4050565958023071, "logps/rejected": -1.7702962160110474, "loss": 3.2087, "rewards/accuracies": 0.71875, "rewards/chosen": -14.050565719604492, "rewards/margins": 3.6523966789245605, "rewards/rejected": -17.70296287536621, "step": 216 }, { "epoch": 0.49081142210913203, "grad_norm": 126.90139310026458, "learning_rate": 4.832678788505161e-07, "logits/chosen": -20.156646728515625, "logits/rejected": -20.06357192993164, "logps/chosen": -1.8053876161575317, "logps/rejected": -2.161924362182617, "loss": 3.2957, "rewards/accuracies": 0.625, "rewards/chosen": -18.053876876831055, "rewards/margins": 3.565363883972168, "rewards/rejected": -21.619239807128906, "step": 217 }, { "epoch": 0.4930732258976534, "grad_norm": 112.75413161542235, "learning_rate": 4.801693151916985e-07, "logits/chosen": -18.10401153564453, "logits/rejected": -18.311925888061523, "logps/chosen": -1.7536580562591553, "logps/rejected": -2.2070531845092773, "loss": 3.1204, "rewards/accuracies": 0.78125, "rewards/chosen": -17.53658103942871, "rewards/margins": 4.533949375152588, "rewards/rejected": -22.070531845092773, "step": 218 }, { "epoch": 0.4953350296861747, "grad_norm": 120.36363917960784, "learning_rate": 4.770657312910354e-07, "logits/chosen": -19.36819839477539, "logits/rejected": -19.35448455810547, "logps/chosen": -1.7574162483215332, "logps/rejected": -2.383183002471924, "loss": 3.6944, "rewards/accuracies": 0.75, "rewards/chosen": -17.57416343688965, "rewards/margins": 6.257665634155273, "rewards/rejected": -23.831829071044922, "step": 219 }, { "epoch": 0.4975968334746961, "grad_norm": 123.44879094752798, "learning_rate": 4.739573214964729e-07, "logits/chosen": -18.070960998535156, "logits/rejected": -17.84283447265625, "logps/chosen": -1.280721664428711, "logps/rejected": -1.6026414632797241, "loss": 2.9387, "rewards/accuracies": 0.71875, "rewards/chosen": -12.80721664428711, "rewards/margins": 3.2191972732543945, "rewards/rejected": -16.02641487121582, "step": 220 }, { "epoch": 0.49985863726321744, "grad_norm": 170.10224468225707, "learning_rate": 4.7084428045815733e-07, "logits/chosen": -19.83563995361328, "logits/rejected": -19.78580665588379, "logps/chosen": -1.7471987009048462, "logps/rejected": -2.135683059692383, "loss": 3.8038, "rewards/accuracies": 0.65625, "rewards/chosen": -17.471986770629883, "rewards/margins": 3.884843349456787, "rewards/rejected": -21.356828689575195, "step": 221 }, { "epoch": 0.5021204410517388, "grad_norm": 133.7068988108329, "learning_rate": 4.677268031162457e-07, "logits/chosen": -18.73598861694336, "logits/rejected": -18.53685188293457, "logps/chosen": -1.8922369480133057, "logps/rejected": -2.408550977706909, "loss": 3.5777, "rewards/accuracies": 0.71875, "rewards/chosen": -18.9223690032959, "rewards/margins": 5.163141250610352, "rewards/rejected": -24.085508346557617, "step": 222 }, { "epoch": 0.5043822448402601, "grad_norm": 124.33610987958505, "learning_rate": 4.646050846886985e-07, "logits/chosen": -17.20172882080078, "logits/rejected": -17.508880615234375, "logps/chosen": -1.3590439558029175, "logps/rejected": -1.6888562440872192, "loss": 3.4386, "rewards/accuracies": 0.59375, "rewards/chosen": -13.59044075012207, "rewards/margins": 3.2981221675872803, "rewards/rejected": -16.88856315612793, "step": 223 }, { "epoch": 0.5066440486287814, "grad_norm": 142.45176890673545, "learning_rate": 4.6147932065905494e-07, "logits/chosen": -18.200490951538086, "logits/rejected": -17.826982498168945, "logps/chosen": -1.4690287113189697, "logps/rejected": -2.0389232635498047, "loss": 3.4706, "rewards/accuracies": 0.6875, "rewards/chosen": -14.690287590026855, "rewards/margins": 5.698945999145508, "rewards/rejected": -20.38923454284668, "step": 224 }, { "epoch": 0.5089058524173028, "grad_norm": 122.4324863969919, "learning_rate": 4.5834970676419214e-07, "logits/chosen": -18.388614654541016, "logits/rejected": -18.511974334716797, "logps/chosen": -1.8286144733428955, "logps/rejected": -2.2092440128326416, "loss": 3.3465, "rewards/accuracies": 0.71875, "rewards/chosen": -18.28614616394043, "rewards/margins": 3.8062963485717773, "rewards/rejected": -22.09244155883789, "step": 225 }, { "epoch": 0.5111676562058242, "grad_norm": 129.2266776919472, "learning_rate": 4.552164389820673e-07, "logits/chosen": -19.732501983642578, "logits/rejected": -19.419715881347656, "logps/chosen": -1.65671706199646, "logps/rejected": -1.9091390371322632, "loss": 3.195, "rewards/accuracies": 0.65625, "rewards/chosen": -16.567171096801758, "rewards/margins": 2.5242207050323486, "rewards/rejected": -19.09139060974121, "step": 226 }, { "epoch": 0.5134294599943455, "grad_norm": 133.10160051050664, "learning_rate": 4.5207971351944605e-07, "logits/chosen": -18.20813751220703, "logits/rejected": -17.936553955078125, "logps/chosen": -1.5020235776901245, "logps/rejected": -2.2087697982788086, "loss": 3.6883, "rewards/accuracies": 0.65625, "rewards/chosen": -15.02023696899414, "rewards/margins": 7.067460536956787, "rewards/rejected": -22.087697982788086, "step": 227 }, { "epoch": 0.5156912637828668, "grad_norm": 115.75597435267949, "learning_rate": 4.489397267996157e-07, "logits/chosen": -18.27444839477539, "logits/rejected": -18.018796920776367, "logps/chosen": -1.620775580406189, "logps/rejected": -2.2751684188842773, "loss": 3.0586, "rewards/accuracies": 0.84375, "rewards/chosen": -16.2077579498291, "rewards/margins": 6.543926239013672, "rewards/rejected": -22.75168228149414, "step": 228 }, { "epoch": 0.5179530675713881, "grad_norm": 119.59564297014721, "learning_rate": 4.45796675450085e-07, "logits/chosen": -19.229650497436523, "logits/rejected": -19.270090103149414, "logps/chosen": -1.7479207515716553, "logps/rejected": -1.9961845874786377, "loss": 3.46, "rewards/accuracies": 0.65625, "rewards/chosen": -17.47920799255371, "rewards/margins": 2.482638120651245, "rewards/rejected": -19.96184730529785, "step": 229 }, { "epoch": 0.5202148713599095, "grad_norm": 123.01980120189968, "learning_rate": 4.4265075629027126e-07, "logits/chosen": -20.439476013183594, "logits/rejected": -20.24441146850586, "logps/chosen": -1.8146342039108276, "logps/rejected": -2.263535976409912, "loss": 3.4224, "rewards/accuracies": 0.65625, "rewards/chosen": -18.146343231201172, "rewards/margins": 4.489017486572266, "rewards/rejected": -22.635360717773438, "step": 230 }, { "epoch": 0.5224766751484309, "grad_norm": 107.58798664449722, "learning_rate": 4.3950216631917563e-07, "logits/chosen": -19.177587509155273, "logits/rejected": -18.99146270751953, "logps/chosen": -1.755948543548584, "logps/rejected": -2.040555477142334, "loss": 2.9987, "rewards/accuracies": 0.625, "rewards/chosen": -17.559486389160156, "rewards/margins": 2.846068859100342, "rewards/rejected": -20.405555725097656, "step": 231 }, { "epoch": 0.5247384789369522, "grad_norm": 109.78374887710692, "learning_rate": 4.3635110270304676e-07, "logits/chosen": -18.429826736450195, "logits/rejected": -18.861085891723633, "logps/chosen": -1.559441328048706, "logps/rejected": -1.994086742401123, "loss": 2.6686, "rewards/accuracies": 0.78125, "rewards/chosen": -15.59441089630127, "rewards/margins": 4.3464555740356445, "rewards/rejected": -19.940868377685547, "step": 232 }, { "epoch": 0.5270002827254736, "grad_norm": 116.99222466276613, "learning_rate": 4.331977627630339e-07, "logits/chosen": -17.902793884277344, "logits/rejected": -18.096345901489258, "logps/chosen": -1.382148265838623, "logps/rejected": -1.986084222793579, "loss": 2.6777, "rewards/accuracies": 0.875, "rewards/chosen": -13.82148265838623, "rewards/margins": 6.039360046386719, "rewards/rejected": -19.860841751098633, "step": 233 }, { "epoch": 0.5292620865139949, "grad_norm": 111.82478148043788, "learning_rate": 4.300423439628313e-07, "logits/chosen": -18.652141571044922, "logits/rejected": -18.740739822387695, "logps/chosen": -1.7629899978637695, "logps/rejected": -2.2177019119262695, "loss": 2.6484, "rewards/accuracies": 0.78125, "rewards/chosen": -17.629901885986328, "rewards/margins": 4.547117233276367, "rewards/rejected": -22.177017211914062, "step": 234 }, { "epoch": 0.5315238903025162, "grad_norm": 124.7698836200117, "learning_rate": 4.268850438963118e-07, "logits/chosen": -19.93507957458496, "logits/rejected": -19.908302307128906, "logps/chosen": -1.6999324560165405, "logps/rejected": -2.0379884243011475, "loss": 3.3517, "rewards/accuracies": 0.75, "rewards/chosen": -16.999324798583984, "rewards/margins": 3.380560874938965, "rewards/rejected": -20.379884719848633, "step": 235 }, { "epoch": 0.5337856940910376, "grad_norm": 130.0202969035073, "learning_rate": 4.2372606027515463e-07, "logits/chosen": -16.75927734375, "logits/rejected": -16.6347713470459, "logps/chosen": -1.7100436687469482, "logps/rejected": -2.0675430297851562, "loss": 3.3574, "rewards/accuracies": 0.75, "rewards/chosen": -17.100439071655273, "rewards/margins": 3.574993133544922, "rewards/rejected": -20.675430297851562, "step": 236 }, { "epoch": 0.536047497879559, "grad_norm": 142.3393610446344, "learning_rate": 4.2056559091646387e-07, "logits/chosen": -19.50155258178711, "logits/rejected": -19.233562469482422, "logps/chosen": -1.7303612232208252, "logps/rejected": -1.9693727493286133, "loss": 3.8961, "rewards/accuracies": 0.71875, "rewards/chosen": -17.303613662719727, "rewards/margins": 2.390113353729248, "rewards/rejected": -19.693727493286133, "step": 237 }, { "epoch": 0.5383093016680803, "grad_norm": 132.7580435084033, "learning_rate": 4.1740383373038116e-07, "logits/chosen": -19.230268478393555, "logits/rejected": -19.026260375976562, "logps/chosen": -1.7516860961914062, "logps/rejected": -2.302055835723877, "loss": 2.958, "rewards/accuracies": 0.84375, "rewards/chosen": -17.516862869262695, "rewards/margins": 5.503696441650391, "rewards/rejected": -23.020557403564453, "step": 238 }, { "epoch": 0.5405711054566016, "grad_norm": 146.2605729576581, "learning_rate": 4.1424098670769255e-07, "logits/chosen": -16.876083374023438, "logits/rejected": -16.858041763305664, "logps/chosen": -1.4845470190048218, "logps/rejected": -1.819509506225586, "loss": 3.4692, "rewards/accuracies": 0.75, "rewards/chosen": -14.845470428466797, "rewards/margins": 3.349626302719116, "rewards/rejected": -18.195096969604492, "step": 239 }, { "epoch": 0.542832909245123, "grad_norm": 103.3850975548694, "learning_rate": 4.1107724790743007e-07, "logits/chosen": -19.218101501464844, "logits/rejected": -19.07253646850586, "logps/chosen": -1.6440057754516602, "logps/rejected": -2.002814531326294, "loss": 2.7882, "rewards/accuracies": 0.875, "rewards/chosen": -16.4400577545166, "rewards/margins": 3.5880849361419678, "rewards/rejected": -20.028141021728516, "step": 240 }, { "epoch": 0.5450947130336443, "grad_norm": 108.59641705360205, "learning_rate": 4.0791281544446947e-07, "logits/chosen": -18.29979705810547, "logits/rejected": -18.103771209716797, "logps/chosen": -1.5138269662857056, "logps/rejected": -1.9497716426849365, "loss": 2.4937, "rewards/accuracies": 0.84375, "rewards/chosen": -15.138269424438477, "rewards/margins": 4.3594465255737305, "rewards/rejected": -19.49771499633789, "step": 241 }, { "epoch": 0.5473565168221657, "grad_norm": 118.47748367490442, "learning_rate": 4.0474788747712416e-07, "logits/chosen": -16.5893611907959, "logits/rejected": -16.470932006835938, "logps/chosen": -1.4450711011886597, "logps/rejected": -1.7044168710708618, "loss": 3.6487, "rewards/accuracies": 0.65625, "rewards/chosen": -14.450712203979492, "rewards/margins": 2.5934557914733887, "rewards/rejected": -17.04416847229004, "step": 242 }, { "epoch": 0.549618320610687, "grad_norm": 127.18536302103081, "learning_rate": 4.0158266219473573e-07, "logits/chosen": -19.53125762939453, "logits/rejected": -19.866533279418945, "logps/chosen": -1.3576165437698364, "logps/rejected": -1.6340572834014893, "loss": 2.8172, "rewards/accuracies": 0.75, "rewards/chosen": -13.576166152954102, "rewards/margins": 2.7644076347351074, "rewards/rejected": -16.340572357177734, "step": 243 }, { "epoch": 0.5518801243992084, "grad_norm": 144.24875011893795, "learning_rate": 3.984173378052643e-07, "logits/chosen": -17.787691116333008, "logits/rejected": -17.381996154785156, "logps/chosen": -1.4772697687149048, "logps/rejected": -1.9439443349838257, "loss": 2.7811, "rewards/accuracies": 0.78125, "rewards/chosen": -14.772696495056152, "rewards/margins": 4.666746139526367, "rewards/rejected": -19.439443588256836, "step": 244 }, { "epoch": 0.5541419281877297, "grad_norm": 130.02624554472013, "learning_rate": 3.9525211252287585e-07, "logits/chosen": -17.32337760925293, "logits/rejected": -17.11507797241211, "logps/chosen": -1.8103289604187012, "logps/rejected": -2.3584718704223633, "loss": 2.8682, "rewards/accuracies": 0.75, "rewards/chosen": -18.103288650512695, "rewards/margins": 5.4814324378967285, "rewards/rejected": -23.584720611572266, "step": 245 }, { "epoch": 0.556403731976251, "grad_norm": 112.27330200229477, "learning_rate": 3.920871845555305e-07, "logits/chosen": -19.526695251464844, "logits/rejected": -19.684539794921875, "logps/chosen": -1.6799914836883545, "logps/rejected": -2.0144991874694824, "loss": 3.2575, "rewards/accuracies": 0.75, "rewards/chosen": -16.799915313720703, "rewards/margins": 3.3450779914855957, "rewards/rejected": -20.14499282836914, "step": 246 }, { "epoch": 0.5586655357647724, "grad_norm": 123.49123400492937, "learning_rate": 3.8892275209256984e-07, "logits/chosen": -17.89940643310547, "logits/rejected": -18.42051124572754, "logps/chosen": -1.7134385108947754, "logps/rejected": -2.140021324157715, "loss": 3.0396, "rewards/accuracies": 0.78125, "rewards/chosen": -17.134387969970703, "rewards/margins": 4.26582670211792, "rewards/rejected": -21.40021324157715, "step": 247 }, { "epoch": 0.5609273395532938, "grad_norm": 135.4244307593356, "learning_rate": 3.8575901329230747e-07, "logits/chosen": -19.93168067932129, "logits/rejected": -19.66005516052246, "logps/chosen": -2.0851705074310303, "logps/rejected": -2.714930534362793, "loss": 3.4958, "rewards/accuracies": 0.71875, "rewards/chosen": -20.85170555114746, "rewards/margins": 6.297600746154785, "rewards/rejected": -27.14930534362793, "step": 248 }, { "epoch": 0.5631891433418151, "grad_norm": 126.1113881766967, "learning_rate": 3.8259616626961886e-07, "logits/chosen": -19.275150299072266, "logits/rejected": -19.156465530395508, "logps/chosen": -1.4787800312042236, "logps/rejected": -1.7590197324752808, "loss": 3.2882, "rewards/accuracies": 0.78125, "rewards/chosen": -14.787800788879395, "rewards/margins": 2.8023955821990967, "rewards/rejected": -17.59019660949707, "step": 249 }, { "epoch": 0.5654509471303364, "grad_norm": 143.11865197858194, "learning_rate": 3.794344090835362e-07, "logits/chosen": -19.26443099975586, "logits/rejected": -18.946765899658203, "logps/chosen": -1.713568925857544, "logps/rejected": -2.3817105293273926, "loss": 3.9405, "rewards/accuracies": 0.75, "rewards/chosen": -17.13568878173828, "rewards/margins": 6.681418418884277, "rewards/rejected": -23.817108154296875, "step": 250 }, { "epoch": 0.5677127509188578, "grad_norm": 135.07580917666817, "learning_rate": 3.7627393972484534e-07, "logits/chosen": -19.917835235595703, "logits/rejected": -19.873958587646484, "logps/chosen": -1.6692298650741577, "logps/rejected": -1.9403576850891113, "loss": 3.9729, "rewards/accuracies": 0.71875, "rewards/chosen": -16.692298889160156, "rewards/margins": 2.711277961730957, "rewards/rejected": -19.40357780456543, "step": 251 }, { "epoch": 0.5699745547073791, "grad_norm": 121.70724549497747, "learning_rate": 3.7311495610368823e-07, "logits/chosen": -19.70743179321289, "logits/rejected": -19.559131622314453, "logps/chosen": -1.725498914718628, "logps/rejected": -1.9476670026779175, "loss": 3.2505, "rewards/accuracies": 0.6875, "rewards/chosen": -17.254987716674805, "rewards/margins": 2.221681833267212, "rewards/rejected": -19.476669311523438, "step": 252 }, { "epoch": 0.5722363584959005, "grad_norm": 92.60358297039429, "learning_rate": 3.699576560371689e-07, "logits/chosen": -19.507673263549805, "logits/rejected": -19.240455627441406, "logps/chosen": -1.6785533428192139, "logps/rejected": -2.16404128074646, "loss": 2.668, "rewards/accuracies": 0.8125, "rewards/chosen": -16.785533905029297, "rewards/margins": 4.854878902435303, "rewards/rejected": -21.640413284301758, "step": 253 }, { "epoch": 0.5744981622844219, "grad_norm": 117.8494457797105, "learning_rate": 3.66802237236966e-07, "logits/chosen": -17.930335998535156, "logits/rejected": -17.775684356689453, "logps/chosen": -1.4922497272491455, "logps/rejected": -1.9657816886901855, "loss": 2.9119, "rewards/accuracies": 0.875, "rewards/chosen": -14.92249584197998, "rewards/margins": 4.735319137573242, "rewards/rejected": -19.65781593322754, "step": 254 }, { "epoch": 0.5767599660729432, "grad_norm": 128.0769904494541, "learning_rate": 3.636488972969532e-07, "logits/chosen": -18.133464813232422, "logits/rejected": -18.1317195892334, "logps/chosen": -1.7718310356140137, "logps/rejected": -2.1774401664733887, "loss": 3.32, "rewards/accuracies": 0.65625, "rewards/chosen": -17.71830940246582, "rewards/margins": 4.056089878082275, "rewards/rejected": -21.77440071105957, "step": 255 }, { "epoch": 0.5790217698614645, "grad_norm": 109.14078747974439, "learning_rate": 3.604978336808244e-07, "logits/chosen": -18.034282684326172, "logits/rejected": -17.809715270996094, "logps/chosen": -1.6046905517578125, "logps/rejected": -2.003675937652588, "loss": 3.0045, "rewards/accuracies": 0.84375, "rewards/chosen": -16.046907424926758, "rewards/margins": 3.9898502826690674, "rewards/rejected": -20.03675651550293, "step": 256 }, { "epoch": 0.5812835736499858, "grad_norm": 143.3661499166921, "learning_rate": 3.5734924370972876e-07, "logits/chosen": -18.07189178466797, "logits/rejected": -17.913497924804688, "logps/chosen": -1.4232511520385742, "logps/rejected": -1.7245614528656006, "loss": 3.0709, "rewards/accuracies": 0.75, "rewards/chosen": -14.232512474060059, "rewards/margins": 3.0131046772003174, "rewards/rejected": -17.245615005493164, "step": 257 }, { "epoch": 0.5835453774385072, "grad_norm": 115.02007432023626, "learning_rate": 3.5420332454991504e-07, "logits/chosen": -18.96527862548828, "logits/rejected": -18.798969268798828, "logps/chosen": -1.7819788455963135, "logps/rejected": -2.11525559425354, "loss": 3.3907, "rewards/accuracies": 0.78125, "rewards/chosen": -17.819787979125977, "rewards/margins": 3.3327670097351074, "rewards/rejected": -21.152557373046875, "step": 258 }, { "epoch": 0.5858071812270286, "grad_norm": 116.46242436554796, "learning_rate": 3.510602732003843e-07, "logits/chosen": -18.923112869262695, "logits/rejected": -19.192468643188477, "logps/chosen": -1.7312378883361816, "logps/rejected": -2.397825002670288, "loss": 2.9306, "rewards/accuracies": 0.71875, "rewards/chosen": -17.312379837036133, "rewards/margins": 6.665870189666748, "rewards/rejected": -23.978248596191406, "step": 259 }, { "epoch": 0.5880689850155499, "grad_norm": 130.25971956929465, "learning_rate": 3.4792028648055396e-07, "logits/chosen": -18.882343292236328, "logits/rejected": -18.996551513671875, "logps/chosen": -1.6002156734466553, "logps/rejected": -2.0233724117279053, "loss": 2.9946, "rewards/accuracies": 0.84375, "rewards/chosen": -16.00215721130371, "rewards/margins": 4.231566905975342, "rewards/rejected": -20.23372459411621, "step": 260 }, { "epoch": 0.5903307888040712, "grad_norm": 116.58608387387143, "learning_rate": 3.447835610179327e-07, "logits/chosen": -18.31661033630371, "logits/rejected": -18.64508819580078, "logps/chosen": -1.852226972579956, "logps/rejected": -2.575822591781616, "loss": 2.6653, "rewards/accuracies": 0.84375, "rewards/chosen": -18.52227020263672, "rewards/margins": 7.235957145690918, "rewards/rejected": -25.75822639465332, "step": 261 }, { "epoch": 0.5925925925925926, "grad_norm": 131.22286636501386, "learning_rate": 3.416502932358079e-07, "logits/chosen": -19.86322021484375, "logits/rejected": -19.80363655090332, "logps/chosen": -1.677018642425537, "logps/rejected": -1.9988960027694702, "loss": 3.2105, "rewards/accuracies": 0.75, "rewards/chosen": -16.770187377929688, "rewards/margins": 3.2187743186950684, "rewards/rejected": -19.98896026611328, "step": 262 }, { "epoch": 0.5948543963811139, "grad_norm": 113.95397777389032, "learning_rate": 3.385206793409451e-07, "logits/chosen": -16.9749813079834, "logits/rejected": -16.70389175415039, "logps/chosen": -1.5683423280715942, "logps/rejected": -1.973022699356079, "loss": 2.895, "rewards/accuracies": 0.8125, "rewards/chosen": -15.68342399597168, "rewards/margins": 4.046802520751953, "rewards/rejected": -19.730226516723633, "step": 263 }, { "epoch": 0.5971162001696353, "grad_norm": 136.6059459741964, "learning_rate": 3.3539491531130163e-07, "logits/chosen": -17.935535430908203, "logits/rejected": -17.720043182373047, "logps/chosen": -1.4503566026687622, "logps/rejected": -1.7065664529800415, "loss": 3.1485, "rewards/accuracies": 0.71875, "rewards/chosen": -14.50356674194336, "rewards/margins": 2.5620980262756348, "rewards/rejected": -17.065662384033203, "step": 264 }, { "epoch": 0.5993780039581567, "grad_norm": 137.85024895532698, "learning_rate": 3.3227319688375426e-07, "logits/chosen": -19.27477264404297, "logits/rejected": -19.29145050048828, "logps/chosen": -1.9958823919296265, "logps/rejected": -2.369366407394409, "loss": 3.4478, "rewards/accuracies": 0.71875, "rewards/chosen": -19.958826065063477, "rewards/margins": 3.7348380088806152, "rewards/rejected": -23.693662643432617, "step": 265 }, { "epoch": 0.601639807746678, "grad_norm": 123.86696879933385, "learning_rate": 3.291557195418427e-07, "logits/chosen": -18.97182273864746, "logits/rejected": -18.623531341552734, "logps/chosen": -1.6371110677719116, "logps/rejected": -2.1235477924346924, "loss": 3.124, "rewards/accuracies": 0.6875, "rewards/chosen": -16.371112823486328, "rewards/margins": 4.864367485046387, "rewards/rejected": -21.235477447509766, "step": 266 }, { "epoch": 0.6039016115351993, "grad_norm": 160.30929493983072, "learning_rate": 3.260426785035272e-07, "logits/chosen": -18.151859283447266, "logits/rejected": -18.189985275268555, "logps/chosen": -1.5115103721618652, "logps/rejected": -1.9283788204193115, "loss": 3.4577, "rewards/accuracies": 0.71875, "rewards/chosen": -15.115103721618652, "rewards/margins": 4.168684959411621, "rewards/rejected": -19.283788681030273, "step": 267 }, { "epoch": 0.6061634153237206, "grad_norm": 118.85038786779553, "learning_rate": 3.229342687089646e-07, "logits/chosen": -17.767433166503906, "logits/rejected": -17.30542755126953, "logps/chosen": -1.7307448387145996, "logps/rejected": -2.2132887840270996, "loss": 3.2675, "rewards/accuracies": 0.75, "rewards/chosen": -17.30744743347168, "rewards/margins": 4.825439453125, "rewards/rejected": -22.132884979248047, "step": 268 }, { "epoch": 0.608425219112242, "grad_norm": 135.72304791395987, "learning_rate": 3.1983068480830143e-07, "logits/chosen": -17.994487762451172, "logits/rejected": -17.969486236572266, "logps/chosen": -1.7408254146575928, "logps/rejected": -2.30222225189209, "loss": 3.0847, "rewards/accuracies": 0.84375, "rewards/chosen": -17.408254623413086, "rewards/margins": 5.613969326019287, "rewards/rejected": -23.0222225189209, "step": 269 }, { "epoch": 0.6106870229007634, "grad_norm": 136.8409835051581, "learning_rate": 3.1673212114948387e-07, "logits/chosen": -18.464635848999023, "logits/rejected": -18.186416625976562, "logps/chosen": -1.8000985383987427, "logps/rejected": -2.355130672454834, "loss": 2.2817, "rewards/accuracies": 0.875, "rewards/chosen": -18.000986099243164, "rewards/margins": 5.550319671630859, "rewards/rejected": -23.551307678222656, "step": 270 }, { "epoch": 0.6129488266892847, "grad_norm": 128.71421710643776, "learning_rate": 3.1363877176608845e-07, "logits/chosen": -18.273387908935547, "logits/rejected": -18.52509117126465, "logps/chosen": -1.7283263206481934, "logps/rejected": -2.170729637145996, "loss": 2.919, "rewards/accuracies": 0.75, "rewards/chosen": -17.28326416015625, "rewards/margins": 4.424034118652344, "rewards/rejected": -21.707298278808594, "step": 271 }, { "epoch": 0.615210630477806, "grad_norm": 130.4882843458114, "learning_rate": 3.1055083036517076e-07, "logits/chosen": -18.288068771362305, "logits/rejected": -17.75768280029297, "logps/chosen": -1.6948351860046387, "logps/rejected": -2.2144925594329834, "loss": 3.3302, "rewards/accuracies": 0.71875, "rewards/chosen": -16.948348999023438, "rewards/margins": 5.196574687957764, "rewards/rejected": -22.14492416381836, "step": 272 }, { "epoch": 0.6174724342663274, "grad_norm": 119.37105820236062, "learning_rate": 3.074684903151364e-07, "logits/chosen": -17.694923400878906, "logits/rejected": -17.45270538330078, "logps/chosen": -1.4144465923309326, "logps/rejected": -1.7332209348678589, "loss": 3.1823, "rewards/accuracies": 0.71875, "rewards/chosen": -14.144466400146484, "rewards/margins": 3.1877427101135254, "rewards/rejected": -17.33220863342285, "step": 273 }, { "epoch": 0.6197342380548487, "grad_norm": 126.9873723041469, "learning_rate": 3.0439194463363136e-07, "logits/chosen": -19.154897689819336, "logits/rejected": -19.074947357177734, "logps/chosen": -1.6273291110992432, "logps/rejected": -2.1156094074249268, "loss": 3.1957, "rewards/accuracies": 0.65625, "rewards/chosen": -16.27328872680664, "rewards/margins": 4.882803916931152, "rewards/rejected": -21.156095504760742, "step": 274 }, { "epoch": 0.6219960418433701, "grad_norm": 98.82421414344171, "learning_rate": 3.0132138597545537e-07, "logits/chosen": -18.89469337463379, "logits/rejected": -18.92743492126465, "logps/chosen": -1.8614561557769775, "logps/rejected": -2.298145294189453, "loss": 2.4871, "rewards/accuracies": 0.625, "rewards/chosen": -18.614561080932617, "rewards/margins": 4.366891860961914, "rewards/rejected": -22.9814510345459, "step": 275 }, { "epoch": 0.6242578456318915, "grad_norm": 116.98836789151454, "learning_rate": 2.982570066204981e-07, "logits/chosen": -17.621952056884766, "logits/rejected": -17.41912841796875, "logps/chosen": -1.7095118761062622, "logps/rejected": -2.267876625061035, "loss": 2.8961, "rewards/accuracies": 0.71875, "rewards/chosen": -17.09511947631836, "rewards/margins": 5.583648204803467, "rewards/rejected": -22.678768157958984, "step": 276 }, { "epoch": 0.6265196494204128, "grad_norm": 139.7828630658467, "learning_rate": 2.951989984616979e-07, "logits/chosen": -18.495176315307617, "logits/rejected": -18.713180541992188, "logps/chosen": -1.794584035873413, "logps/rejected": -2.701646327972412, "loss": 3.3014, "rewards/accuracies": 0.8125, "rewards/chosen": -17.94584083557129, "rewards/margins": 9.070621490478516, "rewards/rejected": -27.016462326049805, "step": 277 }, { "epoch": 0.6287814532089341, "grad_norm": 104.32477594909946, "learning_rate": 2.9214755299302584e-07, "logits/chosen": -18.10324478149414, "logits/rejected": -18.533466339111328, "logps/chosen": -1.4521610736846924, "logps/rejected": -2.136770486831665, "loss": 2.7298, "rewards/accuracies": 0.75, "rewards/chosen": -14.521611213684082, "rewards/margins": 6.84609317779541, "rewards/rejected": -21.367706298828125, "step": 278 }, { "epoch": 0.6310432569974554, "grad_norm": 129.12312580227652, "learning_rate": 2.89102861297494e-07, "logits/chosen": -16.307287216186523, "logits/rejected": -16.62302589416504, "logps/chosen": -1.5291308164596558, "logps/rejected": -1.9218378067016602, "loss": 3.2843, "rewards/accuracies": 0.6875, "rewards/chosen": -15.29130744934082, "rewards/margins": 3.9270708560943604, "rewards/rejected": -19.218379974365234, "step": 279 }, { "epoch": 0.6333050607859768, "grad_norm": 119.86450692791884, "learning_rate": 2.860651140351902e-07, "logits/chosen": -17.81388282775879, "logits/rejected": -17.59682273864746, "logps/chosen": -1.4970345497131348, "logps/rejected": -2.2067017555236816, "loss": 3.0318, "rewards/accuracies": 0.8125, "rewards/chosen": -14.970344543457031, "rewards/margins": 7.096673011779785, "rewards/rejected": -22.0670166015625, "step": 280 }, { "epoch": 0.6355668645744982, "grad_norm": 138.33272392046018, "learning_rate": 2.830345014313381e-07, "logits/chosen": -18.549711227416992, "logits/rejected": -18.178396224975586, "logps/chosen": -1.5726195573806763, "logps/rejected": -2.220799684524536, "loss": 3.0664, "rewards/accuracies": 0.875, "rewards/chosen": -15.726195335388184, "rewards/margins": 6.481801986694336, "rewards/rejected": -22.20799446105957, "step": 281 }, { "epoch": 0.6378286683630195, "grad_norm": 121.49306068303021, "learning_rate": 2.800112132643856e-07, "logits/chosen": -18.666532516479492, "logits/rejected": -18.698705673217773, "logps/chosen": -1.9378407001495361, "logps/rejected": -2.5402820110321045, "loss": 3.0881, "rewards/accuracies": 0.8125, "rewards/chosen": -19.378408432006836, "rewards/margins": 6.024411678314209, "rewards/rejected": -25.40281867980957, "step": 282 }, { "epoch": 0.6400904721515408, "grad_norm": 114.50153480696909, "learning_rate": 2.7699543885412105e-07, "logits/chosen": -18.842344284057617, "logits/rejected": -19.002525329589844, "logps/chosen": -1.7454712390899658, "logps/rejected": -2.214015483856201, "loss": 2.7145, "rewards/accuracies": 0.8125, "rewards/chosen": -17.454710006713867, "rewards/margins": 4.685445308685303, "rewards/rejected": -22.14015769958496, "step": 283 }, { "epoch": 0.6423522759400622, "grad_norm": 126.9731178541442, "learning_rate": 2.7398736704981725e-07, "logits/chosen": -17.94224739074707, "logits/rejected": -18.106706619262695, "logps/chosen": -1.8006949424743652, "logps/rejected": -2.4084813594818115, "loss": 2.7514, "rewards/accuracies": 0.78125, "rewards/chosen": -18.00695037841797, "rewards/margins": 6.077863693237305, "rewards/rejected": -24.084814071655273, "step": 284 }, { "epoch": 0.6446140797285835, "grad_norm": 121.79509986844188, "learning_rate": 2.709871862184063e-07, "logits/chosen": -16.98878288269043, "logits/rejected": -17.01874542236328, "logps/chosen": -1.8407750129699707, "logps/rejected": -2.2451648712158203, "loss": 3.3275, "rewards/accuracies": 0.6875, "rewards/chosen": -18.407751083374023, "rewards/margins": 4.0438995361328125, "rewards/rejected": -22.451650619506836, "step": 285 }, { "epoch": 0.6468758835171049, "grad_norm": 108.87797931776414, "learning_rate": 2.679950842326837e-07, "logits/chosen": -18.95654296875, "logits/rejected": -18.801700592041016, "logps/chosen": -1.6954306364059448, "logps/rejected": -2.5442655086517334, "loss": 2.5918, "rewards/accuracies": 0.78125, "rewards/chosen": -16.95430564880371, "rewards/margins": 8.488348960876465, "rewards/rejected": -25.442655563354492, "step": 286 }, { "epoch": 0.6491376873056263, "grad_norm": 111.41714747114163, "learning_rate": 2.6501124845954363e-07, "logits/chosen": -16.922765731811523, "logits/rejected": -16.570079803466797, "logps/chosen": -1.5942519903182983, "logps/rejected": -2.0841176509857178, "loss": 2.7103, "rewards/accuracies": 0.75, "rewards/chosen": -15.942520141601562, "rewards/margins": 4.898656368255615, "rewards/rejected": -20.841175079345703, "step": 287 }, { "epoch": 0.6513994910941476, "grad_norm": 111.54831143350387, "learning_rate": 2.62035865748246e-07, "logits/chosen": -19.410310745239258, "logits/rejected": -19.599136352539062, "logps/chosen": -1.7219384908676147, "logps/rejected": -2.0403764247894287, "loss": 3.257, "rewards/accuracies": 0.8125, "rewards/chosen": -17.219385147094727, "rewards/margins": 3.184377670288086, "rewards/rejected": -20.403764724731445, "step": 288 }, { "epoch": 0.6536612948826689, "grad_norm": 129.1650835585718, "learning_rate": 2.5906912241871554e-07, "logits/chosen": -19.173494338989258, "logits/rejected": -19.192523956298828, "logps/chosen": -1.6308451890945435, "logps/rejected": -2.0142531394958496, "loss": 3.5351, "rewards/accuracies": 0.8125, "rewards/chosen": -16.308452606201172, "rewards/margins": 3.8340790271759033, "rewards/rejected": -20.142528533935547, "step": 289 }, { "epoch": 0.6559230986711903, "grad_norm": 114.97395819625001, "learning_rate": 2.561112042498753e-07, "logits/chosen": -17.663278579711914, "logits/rejected": -17.458215713500977, "logps/chosen": -1.433032751083374, "logps/rejected": -1.9120677709579468, "loss": 3.1725, "rewards/accuracies": 0.75, "rewards/chosen": -14.330328941345215, "rewards/margins": 4.79034948348999, "rewards/rejected": -19.12067985534668, "step": 290 }, { "epoch": 0.6581849024597116, "grad_norm": 118.80435385837328, "learning_rate": 2.5316229646801195e-07, "logits/chosen": -19.93079948425293, "logits/rejected": -19.657909393310547, "logps/chosen": -1.6756254434585571, "logps/rejected": -2.254075288772583, "loss": 2.7255, "rewards/accuracies": 0.78125, "rewards/chosen": -16.756254196166992, "rewards/margins": 5.7845001220703125, "rewards/rejected": -22.540752410888672, "step": 291 }, { "epoch": 0.660446706248233, "grad_norm": 120.26763355250853, "learning_rate": 2.5022258373517714e-07, "logits/chosen": -18.864389419555664, "logits/rejected": -18.524669647216797, "logps/chosen": -1.6191332340240479, "logps/rejected": -2.0536766052246094, "loss": 2.8735, "rewards/accuracies": 0.71875, "rewards/chosen": -16.191333770751953, "rewards/margins": 4.345433235168457, "rewards/rejected": -20.536766052246094, "step": 292 }, { "epoch": 0.6627085100367544, "grad_norm": 147.0811073810554, "learning_rate": 2.4729225013762474e-07, "logits/chosen": -18.751914978027344, "logits/rejected": -18.83761215209961, "logps/chosen": -1.7884494066238403, "logps/rejected": -2.187746524810791, "loss": 3.8995, "rewards/accuracies": 0.65625, "rewards/chosen": -17.884492874145508, "rewards/margins": 3.9929721355438232, "rewards/rejected": -21.877464294433594, "step": 293 }, { "epoch": 0.6649703138252756, "grad_norm": 148.90534716218426, "learning_rate": 2.4437147917428203e-07, "logits/chosen": -18.826107025146484, "logits/rejected": -18.503259658813477, "logps/chosen": -1.7195425033569336, "logps/rejected": -2.187434434890747, "loss": 3.0299, "rewards/accuracies": 0.8125, "rewards/chosen": -17.195425033569336, "rewards/margins": 4.678918838500977, "rewards/rejected": -21.87434196472168, "step": 294 }, { "epoch": 0.667232117613797, "grad_norm": 127.90476462202498, "learning_rate": 2.414604537452595e-07, "logits/chosen": -18.674943923950195, "logits/rejected": -18.60759735107422, "logps/chosen": -1.7201087474822998, "logps/rejected": -2.014519214630127, "loss": 3.128, "rewards/accuracies": 0.625, "rewards/chosen": -17.201087951660156, "rewards/margins": 2.9441049098968506, "rewards/rejected": -20.145193099975586, "step": 295 }, { "epoch": 0.6694939214023183, "grad_norm": 108.9145846232443, "learning_rate": 2.385593561403974e-07, "logits/chosen": -19.400646209716797, "logits/rejected": -19.273517608642578, "logps/chosen": -1.726077914237976, "logps/rejected": -2.1319644451141357, "loss": 2.882, "rewards/accuracies": 0.65625, "rewards/chosen": -17.260780334472656, "rewards/margins": 4.058864593505859, "rewards/rejected": -21.319643020629883, "step": 296 }, { "epoch": 0.6717557251908397, "grad_norm": 112.86761120489062, "learning_rate": 2.3566836802785119e-07, "logits/chosen": -18.81859016418457, "logits/rejected": -18.859493255615234, "logps/chosen": -1.9398648738861084, "logps/rejected": -2.3005058765411377, "loss": 2.8161, "rewards/accuracies": 0.6875, "rewards/chosen": -19.39864730834961, "rewards/margins": 3.60640811920166, "rewards/rejected": -23.005056381225586, "step": 297 }, { "epoch": 0.6740175289793611, "grad_norm": 118.40226934077113, "learning_rate": 2.327876704427146e-07, "logits/chosen": -18.128990173339844, "logits/rejected": -18.05478858947754, "logps/chosen": -1.7885990142822266, "logps/rejected": -2.1967928409576416, "loss": 3.3258, "rewards/accuracies": 0.71875, "rewards/chosen": -17.885990142822266, "rewards/margins": 4.08193826675415, "rewards/rejected": -21.96792984008789, "step": 298 }, { "epoch": 0.6762793327678824, "grad_norm": 153.7897090271479, "learning_rate": 2.2991744377568358e-07, "logits/chosen": -17.88959312438965, "logits/rejected": -17.185943603515625, "logps/chosen": -1.6429543495178223, "logps/rejected": -2.067078113555908, "loss": 3.74, "rewards/accuracies": 0.75, "rewards/chosen": -16.429542541503906, "rewards/margins": 4.241240501403809, "rewards/rejected": -20.67078399658203, "step": 299 }, { "epoch": 0.6785411365564037, "grad_norm": 133.58644267056656, "learning_rate": 2.270578677617601e-07, "logits/chosen": -18.508695602416992, "logits/rejected": -18.557147979736328, "logps/chosen": -1.617353916168213, "logps/rejected": -2.0458405017852783, "loss": 3.4581, "rewards/accuracies": 0.71875, "rewards/chosen": -16.173538208007812, "rewards/margins": 4.2848663330078125, "rewards/rejected": -20.458406448364258, "step": 300 }, { "epoch": 0.6808029403449251, "grad_norm": 116.99592631499642, "learning_rate": 2.242091214689971e-07, "logits/chosen": -18.887380599975586, "logits/rejected": -18.278608322143555, "logps/chosen": -1.7732751369476318, "logps/rejected": -2.1223254203796387, "loss": 2.8134, "rewards/accuracies": 0.78125, "rewards/chosen": -17.732751846313477, "rewards/margins": 3.4905025959014893, "rewards/rejected": -21.223255157470703, "step": 301 }, { "epoch": 0.6830647441334464, "grad_norm": 129.92140054225982, "learning_rate": 2.2137138328728456e-07, "logits/chosen": -18.269765853881836, "logits/rejected": -17.92385482788086, "logps/chosen": -1.8314377069473267, "logps/rejected": -2.02689266204834, "loss": 3.2214, "rewards/accuracies": 0.625, "rewards/chosen": -18.314376831054688, "rewards/margins": 1.9545530080795288, "rewards/rejected": -20.26892852783203, "step": 302 }, { "epoch": 0.6853265479219678, "grad_norm": 120.44838708614284, "learning_rate": 2.1854483091717974e-07, "logits/chosen": -17.881437301635742, "logits/rejected": -17.71358299255371, "logps/chosen": -1.6855335235595703, "logps/rejected": -2.2010655403137207, "loss": 2.7716, "rewards/accuracies": 0.8125, "rewards/chosen": -16.855335235595703, "rewards/margins": 5.155317783355713, "rewards/rejected": -22.010652542114258, "step": 303 }, { "epoch": 0.6875883517104892, "grad_norm": 142.94697133093283, "learning_rate": 2.1572964135877863e-07, "logits/chosen": -17.533218383789062, "logits/rejected": -17.355274200439453, "logps/chosen": -1.5027070045471191, "logps/rejected": -2.0504214763641357, "loss": 3.376, "rewards/accuracies": 0.75, "rewards/chosen": -15.027069091796875, "rewards/margins": 5.477144241333008, "rewards/rejected": -20.504213333129883, "step": 304 }, { "epoch": 0.6898501554990104, "grad_norm": 114.71461725743285, "learning_rate": 2.1292599090063245e-07, "logits/chosen": -18.869152069091797, "logits/rejected": -18.81059455871582, "logps/chosen": -1.6714305877685547, "logps/rejected": -2.1169075965881348, "loss": 2.7332, "rewards/accuracies": 0.78125, "rewards/chosen": -16.714305877685547, "rewards/margins": 4.454771518707275, "rewards/rejected": -21.169076919555664, "step": 305 }, { "epoch": 0.6921119592875318, "grad_norm": 111.93201937391214, "learning_rate": 2.1013405510870824e-07, "logits/chosen": -18.295650482177734, "logits/rejected": -18.45261573791504, "logps/chosen": -1.8770661354064941, "logps/rejected": -2.236387252807617, "loss": 3.3382, "rewards/accuracies": 0.625, "rewards/chosen": -18.770662307739258, "rewards/margins": 3.593210220336914, "rewards/rejected": -22.36387062072754, "step": 306 }, { "epoch": 0.6943737630760531, "grad_norm": 129.49294218978284, "learning_rate": 2.0735400881539494e-07, "logits/chosen": -20.06885528564453, "logits/rejected": -20.67595672607422, "logps/chosen": -1.699569821357727, "logps/rejected": -2.1293015480041504, "loss": 3.2719, "rewards/accuracies": 0.78125, "rewards/chosen": -16.995698928833008, "rewards/margins": 4.29731559753418, "rewards/rejected": -21.293014526367188, "step": 307 }, { "epoch": 0.6966355668645745, "grad_norm": 126.68283700786048, "learning_rate": 2.0458602610855536e-07, "logits/chosen": -16.85354995727539, "logits/rejected": -17.0955753326416, "logps/chosen": -1.633847713470459, "logps/rejected": -2.1871325969696045, "loss": 2.746, "rewards/accuracies": 0.875, "rewards/chosen": -16.338476181030273, "rewards/margins": 5.532848834991455, "rewards/rejected": -21.87132453918457, "step": 308 }, { "epoch": 0.6988973706530959, "grad_norm": 121.46567102959813, "learning_rate": 2.0183028032062422e-07, "logits/chosen": -18.197134017944336, "logits/rejected": -18.313335418701172, "logps/chosen": -1.7123744487762451, "logps/rejected": -2.3391215801239014, "loss": 3.301, "rewards/accuracies": 0.6875, "rewards/chosen": -17.123743057250977, "rewards/margins": 6.267471790313721, "rewards/rejected": -23.391216278076172, "step": 309 }, { "epoch": 0.7011591744416172, "grad_norm": 124.40646143719266, "learning_rate": 1.9908694401775473e-07, "logits/chosen": -19.83294677734375, "logits/rejected": -20.086868286132812, "logps/chosen": -2.0055930614471436, "logps/rejected": -2.405197858810425, "loss": 3.0917, "rewards/accuracies": 0.8125, "rewards/chosen": -20.055932998657227, "rewards/margins": 3.996046781539917, "rewards/rejected": -24.051979064941406, "step": 310 }, { "epoch": 0.7034209782301385, "grad_norm": 126.95244540601655, "learning_rate": 1.9635618898901196e-07, "logits/chosen": -19.060583114624023, "logits/rejected": -19.149402618408203, "logps/chosen": -1.9011938571929932, "logps/rejected": -2.284623146057129, "loss": 3.0781, "rewards/accuracies": 0.65625, "rewards/chosen": -19.011938095092773, "rewards/margins": 3.834294080734253, "rewards/rejected": -22.846233367919922, "step": 311 }, { "epoch": 0.7056827820186599, "grad_norm": 131.1304953783553, "learning_rate": 1.9363818623561565e-07, "logits/chosen": -18.06791114807129, "logits/rejected": -17.94767189025879, "logps/chosen": -1.763685941696167, "logps/rejected": -2.0710325241088867, "loss": 3.5313, "rewards/accuracies": 0.6875, "rewards/chosen": -17.636857986450195, "rewards/margins": 3.0734646320343018, "rewards/rejected": -20.710325241088867, "step": 312 }, { "epoch": 0.7079445858071812, "grad_norm": 116.35057290069598, "learning_rate": 1.9093310596023108e-07, "logits/chosen": -18.00191307067871, "logits/rejected": -18.05375099182129, "logps/chosen": -1.9328045845031738, "logps/rejected": -2.434842348098755, "loss": 2.5712, "rewards/accuracies": 0.78125, "rewards/chosen": -19.328044891357422, "rewards/margins": 5.020379543304443, "rewards/rejected": -24.348424911499023, "step": 313 }, { "epoch": 0.7102063895957026, "grad_norm": 107.72415682056965, "learning_rate": 1.8824111755631274e-07, "logits/chosen": -17.74974250793457, "logits/rejected": -17.714256286621094, "logps/chosen": -1.6889841556549072, "logps/rejected": -2.1979172229766846, "loss": 3.324, "rewards/accuracies": 0.84375, "rewards/chosen": -16.889841079711914, "rewards/margins": 5.089331150054932, "rewards/rejected": -21.97917366027832, "step": 314 }, { "epoch": 0.712468193384224, "grad_norm": 175.2751395769359, "learning_rate": 1.8556238959749457e-07, "logits/chosen": -20.16362762451172, "logits/rejected": -20.45577049255371, "logps/chosen": -1.9660546779632568, "logps/rejected": -2.3499526977539062, "loss": 3.5857, "rewards/accuracies": 0.71875, "rewards/chosen": -19.660547256469727, "rewards/margins": 3.8389804363250732, "rewards/rejected": -23.49952507019043, "step": 315 }, { "epoch": 0.7147299971727452, "grad_norm": 119.01395158521336, "learning_rate": 1.8289708982703562e-07, "logits/chosen": -18.191469192504883, "logits/rejected": -18.05630111694336, "logps/chosen": -1.6164871454238892, "logps/rejected": -2.0442566871643066, "loss": 3.1091, "rewards/accuracies": 0.78125, "rewards/chosen": -16.164871215820312, "rewards/margins": 4.277695655822754, "rewards/rejected": -20.442567825317383, "step": 316 }, { "epoch": 0.7169918009612666, "grad_norm": 110.39790952014322, "learning_rate": 1.802453851473151e-07, "logits/chosen": -18.25019073486328, "logits/rejected": -18.184785842895508, "logps/chosen": -1.9268181324005127, "logps/rejected": -2.5494563579559326, "loss": 2.5819, "rewards/accuracies": 0.75, "rewards/chosen": -19.26818084716797, "rewards/margins": 6.226382732391357, "rewards/rejected": -25.494564056396484, "step": 317 }, { "epoch": 0.719253604749788, "grad_norm": 111.59669789809674, "learning_rate": 1.7760744160938093e-07, "logits/chosen": -19.184326171875, "logits/rejected": -19.069150924682617, "logps/chosen": -1.9187712669372559, "logps/rejected": -2.417238473892212, "loss": 2.5089, "rewards/accuracies": 0.90625, "rewards/chosen": -19.187713623046875, "rewards/margins": 4.984671592712402, "rewards/rejected": -24.172386169433594, "step": 318 }, { "epoch": 0.7215154085383093, "grad_norm": 111.40164812220848, "learning_rate": 1.7498342440255135e-07, "logits/chosen": -17.5487060546875, "logits/rejected": -17.807178497314453, "logps/chosen": -1.8459084033966064, "logps/rejected": -2.48807430267334, "loss": 3.5431, "rewards/accuracies": 0.84375, "rewards/chosen": -18.459083557128906, "rewards/margins": 6.42165994644165, "rewards/rejected": -24.88074493408203, "step": 319 }, { "epoch": 0.7237772123268307, "grad_norm": 124.12971999012613, "learning_rate": 1.7237349784407115e-07, "logits/chosen": -17.986967086791992, "logits/rejected": -18.200471878051758, "logps/chosen": -2.0284173488616943, "logps/rejected": -2.5237107276916504, "loss": 3.7015, "rewards/accuracies": 0.59375, "rewards/chosen": -20.2841739654541, "rewards/margins": 4.952933311462402, "rewards/rejected": -25.237106323242188, "step": 320 }, { "epoch": 0.726039016115352, "grad_norm": 141.09967606925343, "learning_rate": 1.6977782536882178e-07, "logits/chosen": -16.887096405029297, "logits/rejected": -16.802282333374023, "logps/chosen": -1.782692790031433, "logps/rejected": -2.303356170654297, "loss": 3.1257, "rewards/accuracies": 0.8125, "rewards/chosen": -17.82692527770996, "rewards/margins": 5.206636428833008, "rewards/rejected": -23.0335636138916, "step": 321 }, { "epoch": 0.7283008199038733, "grad_norm": 116.38070332785638, "learning_rate": 1.6719656951908708e-07, "logits/chosen": -17.198162078857422, "logits/rejected": -16.910192489624023, "logps/chosen": -1.3144806623458862, "logps/rejected": -1.8561556339263916, "loss": 2.7638, "rewards/accuracies": 0.75, "rewards/chosen": -13.144807815551758, "rewards/margins": 5.416749954223633, "rewards/rejected": -18.561553955078125, "step": 322 }, { "epoch": 0.7305626236923947, "grad_norm": 114.11746852210314, "learning_rate": 1.6462989193437453e-07, "logits/chosen": -17.512184143066406, "logits/rejected": -17.920053482055664, "logps/chosen": -1.9415867328643799, "logps/rejected": -2.1908974647521973, "loss": 3.6178, "rewards/accuracies": 0.59375, "rewards/chosen": -19.415868759155273, "rewards/margins": 2.493105888366699, "rewards/rejected": -21.908973693847656, "step": 323 }, { "epoch": 0.732824427480916, "grad_norm": 113.99856611080652, "learning_rate": 1.6207795334129365e-07, "logits/chosen": -19.32732582092285, "logits/rejected": -19.14191246032715, "logps/chosen": -1.6971518993377686, "logps/rejected": -2.295579433441162, "loss": 2.8077, "rewards/accuracies": 0.75, "rewards/chosen": -16.971519470214844, "rewards/margins": 5.98427677154541, "rewards/rejected": -22.955795288085938, "step": 324 }, { "epoch": 0.7350862312694374, "grad_norm": 111.98084422068199, "learning_rate": 1.5954091354349121e-07, "logits/chosen": -17.98455238342285, "logits/rejected": -17.884262084960938, "logps/chosen": -1.7183001041412354, "logps/rejected": -2.17026424407959, "loss": 2.8751, "rewards/accuracies": 0.75, "rewards/chosen": -17.183000564575195, "rewards/margins": 4.519641876220703, "rewards/rejected": -21.7026424407959, "step": 325 }, { "epoch": 0.7373480350579588, "grad_norm": 232.39770149735068, "learning_rate": 1.5701893141164364e-07, "logits/chosen": -18.812040328979492, "logits/rejected": -18.633085250854492, "logps/chosen": -1.676187515258789, "logps/rejected": -2.3979151248931885, "loss": 3.0272, "rewards/accuracies": 0.875, "rewards/chosen": -16.761873245239258, "rewards/margins": 7.217278480529785, "rewards/rejected": -23.979150772094727, "step": 326 }, { "epoch": 0.73960983884648, "grad_norm": 120.7387881104363, "learning_rate": 1.545121648735093e-07, "logits/chosen": -18.34151840209961, "logits/rejected": -18.516704559326172, "logps/chosen": -1.6959328651428223, "logps/rejected": -2.0101141929626465, "loss": 3.0935, "rewards/accuracies": 0.625, "rewards/chosen": -16.95932960510254, "rewards/margins": 3.1418118476867676, "rewards/rejected": -20.10114097595215, "step": 327 }, { "epoch": 0.7418716426350014, "grad_norm": 116.46893556878314, "learning_rate": 1.5202077090403863e-07, "logits/chosen": -16.94781494140625, "logits/rejected": -16.98765754699707, "logps/chosen": -1.6322197914123535, "logps/rejected": -2.0489912033081055, "loss": 2.9396, "rewards/accuracies": 0.8125, "rewards/chosen": -16.32219696044922, "rewards/margins": 4.1677141189575195, "rewards/rejected": -20.489913940429688, "step": 328 }, { "epoch": 0.7441334464235227, "grad_norm": 146.64902887166238, "learning_rate": 1.495449055155443e-07, "logits/chosen": -16.194108963012695, "logits/rejected": -16.39000701904297, "logps/chosen": -1.4700114727020264, "logps/rejected": -1.9279454946517944, "loss": 3.1195, "rewards/accuracies": 0.78125, "rewards/chosen": -14.700116157531738, "rewards/margins": 4.579338550567627, "rewards/rejected": -19.279455184936523, "step": 329 }, { "epoch": 0.7463952502120441, "grad_norm": 128.77629572500186, "learning_rate": 1.4708472374793112e-07, "logits/chosen": -18.751955032348633, "logits/rejected": -18.25170135498047, "logps/chosen": -1.6594680547714233, "logps/rejected": -2.1272830963134766, "loss": 3.568, "rewards/accuracies": 0.75, "rewards/chosen": -16.594682693481445, "rewards/margins": 4.67814826965332, "rewards/rejected": -21.272830963134766, "step": 330 }, { "epoch": 0.7486570540005655, "grad_norm": 116.85830667937259, "learning_rate": 1.4464037965898878e-07, "logits/chosen": -19.034826278686523, "logits/rejected": -18.4996395111084, "logps/chosen": -1.6908671855926514, "logps/rejected": -2.253139019012451, "loss": 3.1892, "rewards/accuracies": 0.84375, "rewards/chosen": -16.908668518066406, "rewards/margins": 5.622718811035156, "rewards/rejected": -22.531389236450195, "step": 331 }, { "epoch": 0.7509188577890868, "grad_norm": 128.75404728697518, "learning_rate": 1.4221202631474282e-07, "logits/chosen": -18.098434448242188, "logits/rejected": -18.39754867553711, "logps/chosen": -1.7314308881759644, "logps/rejected": -2.248021125793457, "loss": 3.2902, "rewards/accuracies": 0.71875, "rewards/chosen": -17.314308166503906, "rewards/margins": 5.165902137756348, "rewards/rejected": -22.480209350585938, "step": 332 }, { "epoch": 0.7531806615776081, "grad_norm": 112.23640069941514, "learning_rate": 1.3979981577987113e-07, "logits/chosen": -17.052106857299805, "logits/rejected": -17.000144958496094, "logps/chosen": -1.8308027982711792, "logps/rejected": -2.2044148445129395, "loss": 2.8652, "rewards/accuracies": 0.75, "rewards/chosen": -18.308027267456055, "rewards/margins": 3.73612117767334, "rewards/rejected": -22.04414939880371, "step": 333 }, { "epoch": 0.7554424653661295, "grad_norm": 120.8440355705056, "learning_rate": 1.374038991081807e-07, "logits/chosen": -17.621667861938477, "logits/rejected": -17.636934280395508, "logps/chosen": -1.6944687366485596, "logps/rejected": -2.1847054958343506, "loss": 2.8811, "rewards/accuracies": 0.75, "rewards/chosen": -16.944686889648438, "rewards/margins": 4.902366638183594, "rewards/rejected": -21.84705352783203, "step": 334 }, { "epoch": 0.7577042691546508, "grad_norm": 126.66430839817343, "learning_rate": 1.3502442633314882e-07, "logits/chosen": -16.78680992126465, "logits/rejected": -16.919919967651367, "logps/chosen": -1.619330883026123, "logps/rejected": -2.069342851638794, "loss": 2.8007, "rewards/accuracies": 0.84375, "rewards/chosen": -16.193309783935547, "rewards/margins": 4.500118255615234, "rewards/rejected": -20.69342803955078, "step": 335 }, { "epoch": 0.7599660729431722, "grad_norm": 107.65531245820142, "learning_rate": 1.3266154645852815e-07, "logits/chosen": -18.672462463378906, "logits/rejected": -18.989612579345703, "logps/chosen": -1.6392340660095215, "logps/rejected": -2.228325366973877, "loss": 2.9037, "rewards/accuracies": 0.75, "rewards/chosen": -16.39234161376953, "rewards/margins": 5.890913009643555, "rewards/rejected": -22.283252716064453, "step": 336 }, { "epoch": 0.7622278767316936, "grad_norm": 104.66828130875484, "learning_rate": 1.303154074490152e-07, "logits/chosen": -17.055776596069336, "logits/rejected": -16.77735137939453, "logps/chosen": -1.4908720254898071, "logps/rejected": -1.799816608428955, "loss": 2.7556, "rewards/accuracies": 0.75, "rewards/chosen": -14.908721923828125, "rewards/margins": 3.089444875717163, "rewards/rejected": -17.998165130615234, "step": 337 }, { "epoch": 0.7644896805202148, "grad_norm": 117.88462800241341, "learning_rate": 1.2798615622098616e-07, "logits/chosen": -17.4500732421875, "logits/rejected": -17.197757720947266, "logps/chosen": -1.6963038444519043, "logps/rejected": -2.2104175090789795, "loss": 2.8171, "rewards/accuracies": 0.6875, "rewards/chosen": -16.96303939819336, "rewards/margins": 5.141136646270752, "rewards/rejected": -22.104175567626953, "step": 338 }, { "epoch": 0.7667514843087362, "grad_norm": 106.90568181230161, "learning_rate": 1.2567393863329523e-07, "logits/chosen": -18.870460510253906, "logits/rejected": -18.87204360961914, "logps/chosen": -1.8440241813659668, "logps/rejected": -2.3904788494110107, "loss": 2.8668, "rewards/accuracies": 0.71875, "rewards/chosen": -18.44024085998535, "rewards/margins": 5.464546203613281, "rewards/rejected": -23.904788970947266, "step": 339 }, { "epoch": 0.7690132880972576, "grad_norm": 135.41776419808772, "learning_rate": 1.233788994781423e-07, "logits/chosen": -17.011192321777344, "logits/rejected": -17.013751983642578, "logps/chosen": -1.4503196477890015, "logps/rejected": -2.010632276535034, "loss": 3.3367, "rewards/accuracies": 0.78125, "rewards/chosen": -14.50319766998291, "rewards/margins": 5.603124618530273, "rewards/rejected": -20.106321334838867, "step": 340 }, { "epoch": 0.7712750918857789, "grad_norm": 136.82224989040978, "learning_rate": 1.2110118247200468e-07, "logits/chosen": -18.286027908325195, "logits/rejected": -18.17170524597168, "logps/chosen": -1.6681314706802368, "logps/rejected": -2.0645222663879395, "loss": 2.9115, "rewards/accuracies": 0.8125, "rewards/chosen": -16.68131446838379, "rewards/margins": 3.963907241821289, "rewards/rejected": -20.645221710205078, "step": 341 }, { "epoch": 0.7735368956743003, "grad_norm": 116.57287182904345, "learning_rate": 1.1884093024663933e-07, "logits/chosen": -16.591590881347656, "logits/rejected": -16.540773391723633, "logps/chosen": -1.636472463607788, "logps/rejected": -2.1073007583618164, "loss": 2.8411, "rewards/accuracies": 0.65625, "rewards/chosen": -16.36472511291504, "rewards/margins": 4.708281517028809, "rewards/rejected": -21.07300567626953, "step": 342 }, { "epoch": 0.7757986994628217, "grad_norm": 118.5480993837639, "learning_rate": 1.1659828434014886e-07, "logits/chosen": -17.95990562438965, "logits/rejected": -17.720420837402344, "logps/chosen": -1.635303020477295, "logps/rejected": -2.0385963916778564, "loss": 3.0565, "rewards/accuracies": 0.6875, "rewards/chosen": -16.353031158447266, "rewards/margins": 4.032935619354248, "rewards/rejected": -20.38596534729004, "step": 343 }, { "epoch": 0.7780605032513429, "grad_norm": 134.89450131828505, "learning_rate": 1.143733851881203e-07, "logits/chosen": -19.509780883789062, "logits/rejected": -19.24722671508789, "logps/chosen": -1.545256495475769, "logps/rejected": -2.114616632461548, "loss": 2.6658, "rewards/accuracies": 0.8125, "rewards/chosen": -15.452564239501953, "rewards/margins": 5.693601608276367, "rewards/rejected": -21.146167755126953, "step": 344 }, { "epoch": 0.7803223070398643, "grad_norm": 123.25488488665061, "learning_rate": 1.1216637211483005e-07, "logits/chosen": -18.04479217529297, "logits/rejected": -18.232608795166016, "logps/chosen": -1.771346092224121, "logps/rejected": -2.349630355834961, "loss": 3.2986, "rewards/accuracies": 0.6875, "rewards/chosen": -17.71346092224121, "rewards/margins": 5.782842636108398, "rewards/rejected": -23.49630355834961, "step": 345 }, { "epoch": 0.7825841108283856, "grad_norm": 114.24822425854607, "learning_rate": 1.0997738332451936e-07, "logits/chosen": -18.91605567932129, "logits/rejected": -18.952014923095703, "logps/chosen": -1.9931975603103638, "logps/rejected": -2.6114182472229004, "loss": 2.7017, "rewards/accuracies": 0.8125, "rewards/chosen": -19.931976318359375, "rewards/margins": 6.182207107543945, "rewards/rejected": -26.11418342590332, "step": 346 }, { "epoch": 0.784845914616907, "grad_norm": 101.71447993811626, "learning_rate": 1.0780655589274031e-07, "logits/chosen": -19.68770408630371, "logits/rejected": -19.459531784057617, "logps/chosen": -1.9268380403518677, "logps/rejected": -2.4731695652008057, "loss": 3.0762, "rewards/accuracies": 0.6875, "rewards/chosen": -19.26837921142578, "rewards/margins": 5.463315010070801, "rewards/rejected": -24.731693267822266, "step": 347 }, { "epoch": 0.7871077184054284, "grad_norm": 144.33485472555742, "learning_rate": 1.056540257577712e-07, "logits/chosen": -19.237892150878906, "logits/rejected": -19.197168350219727, "logps/chosen": -2.0048787593841553, "logps/rejected": -2.5931644439697266, "loss": 2.7076, "rewards/accuracies": 0.75, "rewards/chosen": -20.048786163330078, "rewards/margins": 5.882862091064453, "rewards/rejected": -25.93164825439453, "step": 348 }, { "epoch": 0.7893695221939496, "grad_norm": 112.5748530177231, "learning_rate": 1.0351992771210554e-07, "logits/chosen": -18.623769760131836, "logits/rejected": -19.13688850402832, "logps/chosen": -1.8369648456573486, "logps/rejected": -2.3405067920684814, "loss": 3.2742, "rewards/accuracies": 0.71875, "rewards/chosen": -18.369647979736328, "rewards/margins": 5.035419464111328, "rewards/rejected": -23.405067443847656, "step": 349 }, { "epoch": 0.791631325982471, "grad_norm": 132.898356490048, "learning_rate": 1.0140439539400953e-07, "logits/chosen": -18.275182723999023, "logits/rejected": -18.35052490234375, "logps/chosen": -2.0253753662109375, "logps/rejected": -2.4307503700256348, "loss": 3.2702, "rewards/accuracies": 0.75, "rewards/chosen": -20.253755569458008, "rewards/margins": 4.053745746612549, "rewards/rejected": -24.307498931884766, "step": 350 }, { "epoch": 0.7938931297709924, "grad_norm": 119.8096502311399, "learning_rate": 9.930756127915488e-08, "logits/chosen": -20.30059051513672, "logits/rejected": -20.41552734375, "logps/chosen": -1.89390230178833, "logps/rejected": -2.320417642593384, "loss": 2.8442, "rewards/accuracies": 0.75, "rewards/chosen": -18.939023971557617, "rewards/margins": 4.265152454376221, "rewards/rejected": -23.204177856445312, "step": 351 }, { "epoch": 0.7961549335595137, "grad_norm": 126.98574065425882, "learning_rate": 9.722955667232242e-08, "logits/chosen": -16.467742919921875, "logits/rejected": -16.47926902770996, "logps/chosen": -1.5406644344329834, "logps/rejected": -2.026040554046631, "loss": 3.5477, "rewards/accuracies": 0.75, "rewards/chosen": -15.406644821166992, "rewards/margins": 4.853761672973633, "rewards/rejected": -20.260406494140625, "step": 352 }, { "epoch": 0.7984167373480351, "grad_norm": 136.73209013329856, "learning_rate": 9.517051169918016e-08, "logits/chosen": -17.076210021972656, "logits/rejected": -17.026386260986328, "logps/chosen": -1.750929832458496, "logps/rejected": -2.2710447311401367, "loss": 3.3139, "rewards/accuracies": 0.75, "rewards/chosen": -17.509296417236328, "rewards/margins": 5.2011494636535645, "rewards/rejected": -22.71044921875, "step": 353 }, { "epoch": 0.8006785411365565, "grad_norm": 130.1019327736892, "learning_rate": 9.313055529813412e-08, "logits/chosen": -18.05898666381836, "logits/rejected": -18.257951736450195, "logps/chosen": -1.6652144193649292, "logps/rejected": -1.9794695377349854, "loss": 3.2462, "rewards/accuracies": 0.6875, "rewards/chosen": -16.652145385742188, "rewards/margins": 3.142549991607666, "rewards/rejected": -19.794694900512695, "step": 354 }, { "epoch": 0.8029403449250777, "grad_norm": 135.42727334061703, "learning_rate": 9.110981521225532e-08, "logits/chosen": -17.604793548583984, "logits/rejected": -17.597965240478516, "logps/chosen": -1.6499242782592773, "logps/rejected": -2.056095600128174, "loss": 3.3403, "rewards/accuracies": 0.6875, "rewards/chosen": -16.499242782592773, "rewards/margins": 4.061714172363281, "rewards/rejected": -20.560955047607422, "step": 355 }, { "epoch": 0.8052021487135991, "grad_norm": 119.05199787120601, "learning_rate": 8.910841798127884e-08, "logits/chosen": -17.721969604492188, "logits/rejected": -18.05399513244629, "logps/chosen": -1.4811893701553345, "logps/rejected": -1.954594612121582, "loss": 3.4896, "rewards/accuracies": 0.71875, "rewards/chosen": -14.811893463134766, "rewards/margins": 4.734054088592529, "rewards/rejected": -19.545948028564453, "step": 356 }, { "epoch": 0.8074639525021204, "grad_norm": 115.38476702241651, "learning_rate": 8.712648893368139e-08, "logits/chosen": -18.29002571105957, "logits/rejected": -18.369548797607422, "logps/chosen": -2.055358409881592, "logps/rejected": -2.6065731048583984, "loss": 3.0386, "rewards/accuracies": 0.6875, "rewards/chosen": -20.553585052490234, "rewards/margins": 5.512145042419434, "rewards/rejected": -26.065731048583984, "step": 357 }, { "epoch": 0.8097257562906418, "grad_norm": 136.37770427632142, "learning_rate": 8.516415217883186e-08, "logits/chosen": -20.281574249267578, "logits/rejected": -20.5699405670166, "logps/chosen": -1.7435851097106934, "logps/rejected": -2.0679283142089844, "loss": 3.2075, "rewards/accuracies": 0.75, "rewards/chosen": -17.435850143432617, "rewards/margins": 3.2434327602386475, "rewards/rejected": -20.679283142089844, "step": 358 }, { "epoch": 0.8119875600791632, "grad_norm": 135.87264464627583, "learning_rate": 8.32215305992209e-08, "logits/chosen": -18.004587173461914, "logits/rejected": -17.975828170776367, "logps/chosen": -1.6789181232452393, "logps/rejected": -2.0826172828674316, "loss": 3.5249, "rewards/accuracies": 0.71875, "rewards/chosen": -16.789180755615234, "rewards/margins": 4.036990165710449, "rewards/rejected": -20.826171875, "step": 359 }, { "epoch": 0.8142493638676844, "grad_norm": 102.16900938424125, "learning_rate": 8.129874584276448e-08, "logits/chosen": -19.01247787475586, "logits/rejected": -18.749008178710938, "logps/chosen": -1.819298505783081, "logps/rejected": -2.235302209854126, "loss": 2.6286, "rewards/accuracies": 0.75, "rewards/chosen": -18.19298553466797, "rewards/margins": 4.160037517547607, "rewards/rejected": -22.353023529052734, "step": 360 }, { "epoch": 0.8165111676562058, "grad_norm": 118.97964507525137, "learning_rate": 7.939591831518746e-08, "logits/chosen": -18.50431251525879, "logits/rejected": -18.590957641601562, "logps/chosen": -1.4700491428375244, "logps/rejected": -1.7618913650512695, "loss": 3.4084, "rewards/accuracies": 0.75, "rewards/chosen": -14.700489044189453, "rewards/margins": 2.9184250831604004, "rewards/rejected": -17.618913650512695, "step": 361 }, { "epoch": 0.8187729714447272, "grad_norm": 116.66310673813236, "learning_rate": 7.751316717248304e-08, "logits/chosen": -17.54795265197754, "logits/rejected": -17.81288719177246, "logps/chosen": -1.8244284391403198, "logps/rejected": -2.45658016204834, "loss": 2.5685, "rewards/accuracies": 0.78125, "rewards/chosen": -18.244285583496094, "rewards/margins": 6.321516990661621, "rewards/rejected": -24.565799713134766, "step": 362 }, { "epoch": 0.8210347752332485, "grad_norm": 135.45249586464112, "learning_rate": 7.565061031345142e-08, "logits/chosen": -17.44479751586914, "logits/rejected": -17.825634002685547, "logps/chosen": -1.5290935039520264, "logps/rejected": -2.0649290084838867, "loss": 2.6049, "rewards/accuracies": 0.71875, "rewards/chosen": -15.290933609008789, "rewards/margins": 5.358358860015869, "rewards/rejected": -20.6492919921875, "step": 363 }, { "epoch": 0.8232965790217699, "grad_norm": 156.30830120318217, "learning_rate": 7.380836437231686e-08, "logits/chosen": -17.245084762573242, "logits/rejected": -17.161640167236328, "logps/chosen": -1.7969930171966553, "logps/rejected": -2.382586717605591, "loss": 3.0618, "rewards/accuracies": 0.6875, "rewards/chosen": -17.969932556152344, "rewards/margins": 5.8559346199035645, "rewards/rejected": -23.82586669921875, "step": 364 }, { "epoch": 0.8255583828102913, "grad_norm": 102.01777871530572, "learning_rate": 7.198654471142371e-08, "logits/chosen": -15.612630844116211, "logits/rejected": -15.740878105163574, "logps/chosen": -1.6975904703140259, "logps/rejected": -2.106090545654297, "loss": 2.2693, "rewards/accuracies": 0.78125, "rewards/chosen": -16.97590446472168, "rewards/margins": 4.084999084472656, "rewards/rejected": -21.06090545654297, "step": 365 }, { "epoch": 0.8278201865988125, "grad_norm": 143.32145816638297, "learning_rate": 7.01852654140132e-08, "logits/chosen": -16.77008056640625, "logits/rejected": -16.502208709716797, "logps/chosen": -1.930238962173462, "logps/rejected": -2.095974922180176, "loss": 3.3324, "rewards/accuracies": 0.65625, "rewards/chosen": -19.302391052246094, "rewards/margins": 1.6573582887649536, "rewards/rejected": -20.959747314453125, "step": 366 }, { "epoch": 0.8300819903873339, "grad_norm": 104.17056966782296, "learning_rate": 6.840463927707833e-08, "logits/chosen": -19.091691970825195, "logits/rejected": -18.677135467529297, "logps/chosen": -1.845261573791504, "logps/rejected": -2.5466208457946777, "loss": 2.7259, "rewards/accuracies": 0.75, "rewards/chosen": -18.45261573791504, "rewards/margins": 7.0135955810546875, "rewards/rejected": -25.46621322631836, "step": 367 }, { "epoch": 0.8323437941758552, "grad_norm": 108.38820652434924, "learning_rate": 6.664477780430138e-08, "logits/chosen": -18.874038696289062, "logits/rejected": -18.718944549560547, "logps/chosen": -1.7501500844955444, "logps/rejected": -2.1758456230163574, "loss": 3.0257, "rewards/accuracies": 0.71875, "rewards/chosen": -17.50149917602539, "rewards/margins": 4.256957530975342, "rewards/rejected": -21.758455276489258, "step": 368 }, { "epoch": 0.8346055979643766, "grad_norm": 132.12208838073096, "learning_rate": 6.49057911990711e-08, "logits/chosen": -20.289962768554688, "logits/rejected": -20.444843292236328, "logps/chosen": -1.7201656103134155, "logps/rejected": -2.0844554901123047, "loss": 3.4713, "rewards/accuracies": 0.75, "rewards/chosen": -17.201656341552734, "rewards/margins": 3.642897605895996, "rewards/rejected": -20.84455108642578, "step": 369 }, { "epoch": 0.836867401752898, "grad_norm": 109.87950428664602, "learning_rate": 6.318778835758189e-08, "logits/chosen": -19.79219627380371, "logits/rejected": -19.581737518310547, "logps/chosen": -1.884903907775879, "logps/rejected": -2.484591007232666, "loss": 2.5576, "rewards/accuracies": 0.8125, "rewards/chosen": -18.84903907775879, "rewards/margins": 5.996870994567871, "rewards/rejected": -24.845909118652344, "step": 370 }, { "epoch": 0.8391292055414192, "grad_norm": 132.43862675142938, "learning_rate": 6.149087686201433e-08, "logits/chosen": -17.08002471923828, "logits/rejected": -17.242450714111328, "logps/chosen": -1.42905592918396, "logps/rejected": -1.849015712738037, "loss": 3.3921, "rewards/accuracies": 0.78125, "rewards/chosen": -14.290557861328125, "rewards/margins": 4.199598789215088, "rewards/rejected": -18.490156173706055, "step": 371 }, { "epoch": 0.8413910093299406, "grad_norm": 125.15917954045561, "learning_rate": 5.98151629737988e-08, "logits/chosen": -18.34781265258789, "logits/rejected": -18.730987548828125, "logps/chosen": -1.8118157386779785, "logps/rejected": -2.531665802001953, "loss": 2.9388, "rewards/accuracies": 0.84375, "rewards/chosen": -18.1181583404541, "rewards/margins": 7.19849967956543, "rewards/rejected": -25.31665802001953, "step": 372 }, { "epoch": 0.843652813118462, "grad_norm": 103.52444879535686, "learning_rate": 5.816075162696097e-08, "logits/chosen": -17.558019638061523, "logits/rejected": -17.425859451293945, "logps/chosen": -1.3931989669799805, "logps/rejected": -2.0169644355773926, "loss": 2.5297, "rewards/accuracies": 0.90625, "rewards/chosen": -13.931989669799805, "rewards/margins": 6.237652778625488, "rewards/rejected": -20.16964340209961, "step": 373 }, { "epoch": 0.8459146169069833, "grad_norm": 96.8291597215048, "learning_rate": 5.6527746421551046e-08, "logits/chosen": -19.054040908813477, "logits/rejected": -18.775854110717773, "logps/chosen": -1.6858869791030884, "logps/rejected": -2.047640800476074, "loss": 3.141, "rewards/accuracies": 0.6875, "rewards/chosen": -16.858869552612305, "rewards/margins": 3.617537260055542, "rewards/rejected": -20.47640609741211, "step": 374 }, { "epoch": 0.8481764206955047, "grad_norm": 111.02813607168243, "learning_rate": 5.4916249617156064e-08, "logits/chosen": -18.283065795898438, "logits/rejected": -17.85355567932129, "logps/chosen": -1.6720712184906006, "logps/rejected": -2.2261600494384766, "loss": 3.0392, "rewards/accuracies": 0.78125, "rewards/chosen": -16.720712661743164, "rewards/margins": 5.540886402130127, "rewards/rejected": -22.261598587036133, "step": 375 }, { "epoch": 0.8504382244840261, "grad_norm": 113.59608952374487, "learning_rate": 5.332636212649646e-08, "logits/chosen": -17.41098976135254, "logits/rejected": -17.46200180053711, "logps/chosen": -1.504152774810791, "logps/rejected": -1.9162389039993286, "loss": 3.0122, "rewards/accuracies": 0.875, "rewards/chosen": -15.041528701782227, "rewards/margins": 4.120862007141113, "rewards/rejected": -19.162389755249023, "step": 376 }, { "epoch": 0.8527000282725473, "grad_norm": 113.17297439644031, "learning_rate": 5.17581835091069e-08, "logits/chosen": -18.82924461364746, "logits/rejected": -19.1933536529541, "logps/chosen": -1.8666414022445679, "logps/rejected": -2.4097957611083984, "loss": 3.0043, "rewards/accuracies": 0.6875, "rewards/chosen": -18.666414260864258, "rewards/margins": 5.431545257568359, "rewards/rejected": -24.097959518432617, "step": 377 }, { "epoch": 0.8549618320610687, "grad_norm": 127.65744686159971, "learning_rate": 5.02118119651016e-08, "logits/chosen": -15.968871116638184, "logits/rejected": -16.066356658935547, "logps/chosen": -1.7414379119873047, "logps/rejected": -2.270836353302002, "loss": 2.8791, "rewards/accuracies": 0.6875, "rewards/chosen": -17.41438102722168, "rewards/margins": 5.293981075286865, "rewards/rejected": -22.708358764648438, "step": 378 }, { "epoch": 0.85722363584959, "grad_norm": 128.0416423580519, "learning_rate": 4.868734432902526e-08, "logits/chosen": -15.94872760772705, "logits/rejected": -15.913581848144531, "logps/chosen": -1.6559503078460693, "logps/rejected": -2.2543857097625732, "loss": 3.2796, "rewards/accuracies": 0.75, "rewards/chosen": -16.55950355529785, "rewards/margins": 5.984354496002197, "rewards/rejected": -22.543859481811523, "step": 379 }, { "epoch": 0.8594854396381114, "grad_norm": 137.44029074786243, "learning_rate": 4.7184876063789134e-08, "logits/chosen": -16.0161075592041, "logits/rejected": -16.125635147094727, "logps/chosen": -1.7773343324661255, "logps/rejected": -2.296335458755493, "loss": 3.0373, "rewards/accuracies": 0.71875, "rewards/chosen": -17.773344039916992, "rewards/margins": 5.190011024475098, "rewards/rejected": -22.963354110717773, "step": 380 }, { "epoch": 0.8617472434266328, "grad_norm": 97.95472528249057, "learning_rate": 4.570450125469314e-08, "logits/chosen": -18.46808624267578, "logits/rejected": -18.239410400390625, "logps/chosen": -1.8006447553634644, "logps/rejected": -2.444537878036499, "loss": 2.4475, "rewards/accuracies": 0.75, "rewards/chosen": -18.006446838378906, "rewards/margins": 6.4389328956604, "rewards/rejected": -24.44538116455078, "step": 381 }, { "epoch": 0.864009047215154, "grad_norm": 122.46444272567702, "learning_rate": 4.424631260353378e-08, "logits/chosen": -16.532258987426758, "logits/rejected": -16.949682235717773, "logps/chosen": -1.4457993507385254, "logps/rejected": -1.9387412071228027, "loss": 3.2723, "rewards/accuracies": 0.8125, "rewards/chosen": -14.457992553710938, "rewards/margins": 4.929420471191406, "rewards/rejected": -19.387413024902344, "step": 382 }, { "epoch": 0.8662708510036754, "grad_norm": 134.6321015186439, "learning_rate": 4.281040142280008e-08, "logits/chosen": -17.987564086914062, "logits/rejected": -17.679292678833008, "logps/chosen": -1.4789788722991943, "logps/rejected": -1.9366073608398438, "loss": 2.4358, "rewards/accuracies": 0.8125, "rewards/chosen": -14.789788246154785, "rewards/margins": 4.5762858390808105, "rewards/rejected": -19.366071701049805, "step": 383 }, { "epoch": 0.8685326547921968, "grad_norm": 145.23607203406814, "learning_rate": 4.1396857629954286e-08, "logits/chosen": -19.37828254699707, "logits/rejected": -19.550512313842773, "logps/chosen": -2.0876235961914062, "logps/rejected": -2.755703926086426, "loss": 3.0501, "rewards/accuracies": 0.6875, "rewards/chosen": -20.876237869262695, "rewards/margins": 6.6808037757873535, "rewards/rejected": -27.55704116821289, "step": 384 }, { "epoch": 0.8707944585807181, "grad_norm": 95.46804083715921, "learning_rate": 4.000576974180232e-08, "logits/chosen": -17.347396850585938, "logits/rejected": -17.552875518798828, "logps/chosen": -1.7520135641098022, "logps/rejected": -2.1543803215026855, "loss": 2.6766, "rewards/accuracies": 0.6875, "rewards/chosen": -17.5201358795166, "rewards/margins": 4.023664474487305, "rewards/rejected": -21.543800354003906, "step": 385 }, { "epoch": 0.8730562623692395, "grad_norm": 104.95835540317567, "learning_rate": 3.8637224868950066e-08, "logits/chosen": -18.283233642578125, "logits/rejected": -18.088119506835938, "logps/chosen": -1.747474193572998, "logps/rejected": -2.1311683654785156, "loss": 2.9522, "rewards/accuracies": 0.8125, "rewards/chosen": -17.474742889404297, "rewards/margins": 3.8369407653808594, "rewards/rejected": -21.311681747436523, "step": 386 }, { "epoch": 0.8753180661577609, "grad_norm": 109.46707890564362, "learning_rate": 3.729130871034885e-08, "logits/chosen": -17.69164276123047, "logits/rejected": -17.34153938293457, "logps/chosen": -1.60804283618927, "logps/rejected": -2.0555553436279297, "loss": 2.7504, "rewards/accuracies": 0.75, "rewards/chosen": -16.080427169799805, "rewards/margins": 4.475124835968018, "rewards/rejected": -20.555551528930664, "step": 387 }, { "epoch": 0.8775798699462821, "grad_norm": 124.79737385782774, "learning_rate": 3.596810554792888e-08, "logits/chosen": -19.268070220947266, "logits/rejected": -19.46062469482422, "logps/chosen": -2.0554943084716797, "logps/rejected": -2.616269826889038, "loss": 3.2245, "rewards/accuracies": 0.8125, "rewards/chosen": -20.554943084716797, "rewards/margins": 5.607754230499268, "rewards/rejected": -26.162696838378906, "step": 388 }, { "epoch": 0.8798416737348035, "grad_norm": 122.92689968096272, "learning_rate": 3.466769824132116e-08, "logits/chosen": -19.075674057006836, "logits/rejected": -19.091054916381836, "logps/chosen": -1.9592108726501465, "logps/rejected": -2.5127036571502686, "loss": 2.9941, "rewards/accuracies": 0.8125, "rewards/chosen": -19.59210968017578, "rewards/margins": 5.534926414489746, "rewards/rejected": -25.12703514099121, "step": 389 }, { "epoch": 0.8821034775233249, "grad_norm": 129.9790697640605, "learning_rate": 3.339016822266925e-08, "logits/chosen": -17.7946834564209, "logits/rejected": -17.704071044921875, "logps/chosen": -1.8595997095108032, "logps/rejected": -2.2788615226745605, "loss": 2.2966, "rewards/accuracies": 0.84375, "rewards/chosen": -18.595996856689453, "rewards/margins": 4.192615985870361, "rewards/rejected": -22.788612365722656, "step": 390 }, { "epoch": 0.8843652813118462, "grad_norm": 165.30674333822296, "learning_rate": 3.213559549152958e-08, "logits/chosen": -17.560834884643555, "logits/rejected": -17.52518081665039, "logps/chosen": -1.4146690368652344, "logps/rejected": -2.0134286880493164, "loss": 3.5443, "rewards/accuracies": 0.875, "rewards/chosen": -14.146690368652344, "rewards/margins": 5.987596035003662, "rewards/rejected": -20.134286880493164, "step": 391 }, { "epoch": 0.8866270851003676, "grad_norm": 125.85896431798938, "learning_rate": 3.090405860986203e-08, "logits/chosen": -19.027587890625, "logits/rejected": -19.275127410888672, "logps/chosen": -2.2725133895874023, "logps/rejected": -2.906642436981201, "loss": 2.978, "rewards/accuracies": 0.8125, "rewards/chosen": -22.72513198852539, "rewards/margins": 6.3412885665893555, "rewards/rejected": -29.066425323486328, "step": 392 }, { "epoch": 0.8888888888888888, "grad_norm": 128.26387031170609, "learning_rate": 2.9695634697110315e-08, "logits/chosen": -17.81490135192871, "logits/rejected": -17.856733322143555, "logps/chosen": -1.7137458324432373, "logps/rejected": -2.1935040950775146, "loss": 3.3315, "rewards/accuracies": 0.75, "rewards/chosen": -17.13745880126953, "rewards/margins": 4.797582149505615, "rewards/rejected": -21.935041427612305, "step": 393 }, { "epoch": 0.8911506926774102, "grad_norm": 133.54866529087667, "learning_rate": 2.8510399425372766e-08, "logits/chosen": -17.018762588500977, "logits/rejected": -17.156879425048828, "logps/chosen": -1.6060205698013306, "logps/rejected": -2.3053719997406006, "loss": 2.6726, "rewards/accuracies": 0.90625, "rewards/chosen": -16.060205459594727, "rewards/margins": 6.993513584136963, "rewards/rejected": -23.05371856689453, "step": 394 }, { "epoch": 0.8934124964659316, "grad_norm": 133.68043982311156, "learning_rate": 2.734842701466329e-08, "logits/chosen": -19.136150360107422, "logits/rejected": -19.161407470703125, "logps/chosen": -1.6094989776611328, "logps/rejected": -1.981292486190796, "loss": 3.0683, "rewards/accuracies": 0.75, "rewards/chosen": -16.094987869262695, "rewards/margins": 3.7179362773895264, "rewards/rejected": -19.812923431396484, "step": 395 }, { "epoch": 0.8956743002544529, "grad_norm": 135.95939370288835, "learning_rate": 2.6209790228264438e-08, "logits/chosen": -17.601152420043945, "logits/rejected": -17.81585693359375, "logps/chosen": -2.0195772647857666, "logps/rejected": -2.464128017425537, "loss": 3.0865, "rewards/accuracies": 0.71875, "rewards/chosen": -20.19577407836914, "rewards/margins": 4.445508003234863, "rewards/rejected": -24.64128303527832, "step": 396 }, { "epoch": 0.8979361040429743, "grad_norm": 113.2954374562523, "learning_rate": 2.5094560368170305e-08, "logits/chosen": -17.47164535522461, "logits/rejected": -17.385438919067383, "logps/chosen": -1.6676338911056519, "logps/rejected": -2.1376919746398926, "loss": 2.9399, "rewards/accuracies": 0.6875, "rewards/chosen": -16.676340103149414, "rewards/margins": 4.7005791664123535, "rewards/rejected": -21.376916885375977, "step": 397 }, { "epoch": 0.9001979078314957, "grad_norm": 109.04886917574666, "learning_rate": 2.4002807270621893e-08, "logits/chosen": -19.112560272216797, "logits/rejected": -19.00596046447754, "logps/chosen": -1.6903434991836548, "logps/rejected": -2.2326083183288574, "loss": 2.8406, "rewards/accuracies": 0.75, "rewards/chosen": -16.90343475341797, "rewards/margins": 5.422649383544922, "rewards/rejected": -22.32608413696289, "step": 398 }, { "epoch": 0.9024597116200169, "grad_norm": 118.17958359886174, "learning_rate": 2.293459930173354e-08, "logits/chosen": -19.049705505371094, "logits/rejected": -19.230878829956055, "logps/chosen": -1.889772891998291, "logps/rejected": -2.3055403232574463, "loss": 3.2104, "rewards/accuracies": 0.71875, "rewards/chosen": -18.897727966308594, "rewards/margins": 4.157675743103027, "rewards/rejected": -23.055404663085938, "step": 399 }, { "epoch": 0.9047215154085383, "grad_norm": 118.64459034636026, "learning_rate": 2.189000335321256e-08, "logits/chosen": -16.958843231201172, "logits/rejected": -16.984729766845703, "logps/chosen": -1.7274019718170166, "logps/rejected": -2.112072467803955, "loss": 3.2286, "rewards/accuracies": 0.75, "rewards/chosen": -17.274019241333008, "rewards/margins": 3.8467049598693848, "rewards/rejected": -21.120725631713867, "step": 400 }, { "epoch": 0.9069833191970597, "grad_norm": 136.85158094391608, "learning_rate": 2.086908483816954e-08, "logits/chosen": -18.37076187133789, "logits/rejected": -18.361103057861328, "logps/chosen": -2.0497868061065674, "logps/rejected": -2.2769107818603516, "loss": 3.0689, "rewards/accuracies": 0.75, "rewards/chosen": -20.497867584228516, "rewards/margins": 2.2712390422821045, "rewards/rejected": -22.769105911254883, "step": 401 }, { "epoch": 0.909245122985581, "grad_norm": 123.9239261032842, "learning_rate": 1.9871907687022717e-08, "logits/chosen": -16.64785385131836, "logits/rejected": -16.638736724853516, "logps/chosen": -1.5310957431793213, "logps/rejected": -2.1431174278259277, "loss": 3.429, "rewards/accuracies": 0.8125, "rewards/chosen": -15.310956954956055, "rewards/margins": 6.120217323303223, "rewards/rejected": -21.43117332458496, "step": 402 }, { "epoch": 0.9115069267741024, "grad_norm": 112.34416737954284, "learning_rate": 1.889853434349451e-08, "logits/chosen": -18.59053611755371, "logits/rejected": -18.618457794189453, "logps/chosen": -1.6284946203231812, "logps/rejected": -2.1159729957580566, "loss": 3.0198, "rewards/accuracies": 0.90625, "rewards/chosen": -16.28494644165039, "rewards/margins": 4.874783992767334, "rewards/rejected": -21.159730911254883, "step": 403 }, { "epoch": 0.9137687305626236, "grad_norm": 124.44884198830263, "learning_rate": 1.7949025760701164e-08, "logits/chosen": -18.346927642822266, "logits/rejected": -17.982345581054688, "logps/chosen": -1.8247474431991577, "logps/rejected": -2.060636043548584, "loss": 3.2447, "rewards/accuracies": 0.65625, "rewards/chosen": -18.247474670410156, "rewards/margins": 2.358887195587158, "rewards/rejected": -20.606359481811523, "step": 404 }, { "epoch": 0.916030534351145, "grad_norm": 98.85780040911116, "learning_rate": 1.7023441397336023e-08, "logits/chosen": -16.48066520690918, "logits/rejected": -16.472976684570312, "logps/chosen": -1.3015496730804443, "logps/rejected": -2.006908416748047, "loss": 3.0345, "rewards/accuracies": 0.78125, "rewards/chosen": -13.015497207641602, "rewards/margins": 7.053586006164551, "rewards/rejected": -20.069082260131836, "step": 405 }, { "epoch": 0.9182923381396664, "grad_norm": 155.0460005854357, "learning_rate": 1.6121839213945854e-08, "logits/chosen": -19.387483596801758, "logits/rejected": -19.041393280029297, "logps/chosen": -1.9884074926376343, "logps/rejected": -2.7867484092712402, "loss": 2.8585, "rewards/accuracies": 0.8125, "rewards/chosen": -19.884077072143555, "rewards/margins": 7.983407974243164, "rewards/rejected": -27.867483139038086, "step": 406 }, { "epoch": 0.9205541419281877, "grad_norm": 115.17173619747005, "learning_rate": 1.5244275669301777e-08, "logits/chosen": -18.749929428100586, "logits/rejected": -18.742963790893555, "logps/chosen": -1.801578164100647, "logps/rejected": -2.3081681728363037, "loss": 2.7902, "rewards/accuracies": 0.71875, "rewards/chosen": -18.015783309936523, "rewards/margins": 5.06589937210083, "rewards/rejected": -23.081682205200195, "step": 407 }, { "epoch": 0.9228159457167091, "grad_norm": 128.6499296049485, "learning_rate": 1.4390805716863398e-08, "logits/chosen": -15.303824424743652, "logits/rejected": -15.512129783630371, "logps/chosen": -1.629596471786499, "logps/rejected": -2.028701066970825, "loss": 3.3744, "rewards/accuracies": 0.625, "rewards/chosen": -16.29596519470215, "rewards/margins": 3.991044521331787, "rewards/rejected": -20.287010192871094, "step": 408 }, { "epoch": 0.9250777495052305, "grad_norm": 119.27282282658015, "learning_rate": 1.3561482801337908e-08, "logits/chosen": -20.444072723388672, "logits/rejected": -20.429006576538086, "logps/chosen": -1.7733900547027588, "logps/rejected": -2.1421661376953125, "loss": 2.9589, "rewards/accuracies": 0.6875, "rewards/chosen": -17.73390007019043, "rewards/margins": 3.687760353088379, "rewards/rejected": -21.421661376953125, "step": 409 }, { "epoch": 0.9273395532937517, "grad_norm": 128.38615627616883, "learning_rate": 1.2756358855332904e-08, "logits/chosen": -19.65103530883789, "logits/rejected": -19.755090713500977, "logps/chosen": -2.041804552078247, "logps/rejected": -2.587003231048584, "loss": 3.5126, "rewards/accuracies": 0.78125, "rewards/chosen": -20.418046951293945, "rewards/margins": 5.45198917388916, "rewards/rejected": -25.87003517150879, "step": 410 }, { "epoch": 0.9296013570822731, "grad_norm": 124.92752994294887, "learning_rate": 1.1975484296105154e-08, "logits/chosen": -18.58397102355957, "logits/rejected": -18.57633399963379, "logps/chosen": -1.8586208820343018, "logps/rejected": -2.2781119346618652, "loss": 2.6842, "rewards/accuracies": 0.71875, "rewards/chosen": -18.58620834350586, "rewards/margins": 4.194911956787109, "rewards/rejected": -22.78112030029297, "step": 411 }, { "epoch": 0.9318631608707945, "grad_norm": 125.7273927211634, "learning_rate": 1.1218908022402374e-08, "logits/chosen": -17.77056121826172, "logits/rejected": -17.610393524169922, "logps/chosen": -1.4044468402862549, "logps/rejected": -1.957669973373413, "loss": 2.8821, "rewards/accuracies": 0.8125, "rewards/chosen": -14.04446792602539, "rewards/margins": 5.532229900360107, "rewards/rejected": -19.576698303222656, "step": 412 }, { "epoch": 0.9341249646593158, "grad_norm": 126.86404645931887, "learning_rate": 1.0486677411402079e-08, "logits/chosen": -18.397602081298828, "logits/rejected": -18.570650100708008, "logps/chosen": -1.7708725929260254, "logps/rejected": -2.3376171588897705, "loss": 3.1541, "rewards/accuracies": 0.65625, "rewards/chosen": -17.708724975585938, "rewards/margins": 5.66744327545166, "rewards/rejected": -23.376169204711914, "step": 413 }, { "epoch": 0.9363867684478372, "grad_norm": 124.4303497435617, "learning_rate": 9.778838315744353e-09, "logits/chosen": -18.644079208374023, "logits/rejected": -18.6198673248291, "logps/chosen": -1.7886202335357666, "logps/rejected": -2.106006383895874, "loss": 3.3825, "rewards/accuracies": 0.75, "rewards/chosen": -17.88620376586914, "rewards/margins": 3.1738624572753906, "rewards/rejected": -21.06006622314453, "step": 414 }, { "epoch": 0.9386485722363584, "grad_norm": 118.54959081850222, "learning_rate": 9.095435060660595e-09, "logits/chosen": -18.39704132080078, "logits/rejected": -18.45808982849121, "logps/chosen": -1.6519393920898438, "logps/rejected": -1.9818757772445679, "loss": 3.0169, "rewards/accuracies": 0.75, "rewards/chosen": -16.519393920898438, "rewards/margins": 3.299362897872925, "rewards/rejected": -19.818758010864258, "step": 415 }, { "epoch": 0.9409103760248798, "grad_norm": 115.40228572150829, "learning_rate": 8.436510441197864e-09, "logits/chosen": -20.38088607788086, "logits/rejected": -20.029354095458984, "logps/chosen": -1.6879628896713257, "logps/rejected": -1.8838168382644653, "loss": 3.2592, "rewards/accuracies": 0.6875, "rewards/chosen": -16.879629135131836, "rewards/margins": 1.9585394859313965, "rewards/rejected": -18.83816909790039, "step": 416 }, { "epoch": 0.9431721798134012, "grad_norm": 167.10931134069494, "learning_rate": 7.802105719539076e-09, "logits/chosen": -18.494495391845703, "logits/rejected": -18.538818359375, "logps/chosen": -1.9037325382232666, "logps/rejected": -2.5261826515197754, "loss": 3.6266, "rewards/accuracies": 0.75, "rewards/chosen": -19.037324905395508, "rewards/margins": 6.224499225616455, "rewards/rejected": -25.261825561523438, "step": 417 }, { "epoch": 0.9454339836019225, "grad_norm": 119.62949752586249, "learning_rate": 7.1922606224192e-09, "logits/chosen": -18.683055877685547, "logits/rejected": -18.973587036132812, "logps/chosen": -1.7631547451019287, "logps/rejected": -2.3043551445007324, "loss": 2.9354, "rewards/accuracies": 0.78125, "rewards/chosen": -17.631547927856445, "rewards/margins": 5.4120049476623535, "rewards/rejected": -23.043554306030273, "step": 418 }, { "epoch": 0.9476957873904439, "grad_norm": 110.69406219801077, "learning_rate": 6.6070133386372906e-09, "logits/chosen": -16.96223258972168, "logits/rejected": -17.232624053955078, "logps/chosen": -1.7214587926864624, "logps/rejected": -2.0780019760131836, "loss": 3.1999, "rewards/accuracies": 0.75, "rewards/chosen": -17.214588165283203, "rewards/margins": 3.5654308795928955, "rewards/rejected": -20.780017852783203, "step": 419 }, { "epoch": 0.9499575911789653, "grad_norm": 133.93900888371826, "learning_rate": 6.046400516665384e-09, "logits/chosen": -18.921737670898438, "logits/rejected": -19.057086944580078, "logps/chosen": -1.8862426280975342, "logps/rejected": -2.456373453140259, "loss": 3.1087, "rewards/accuracies": 0.78125, "rewards/chosen": -18.8624267578125, "rewards/margins": 5.70130729675293, "rewards/rejected": -24.56373405456543, "step": 420 }, { "epoch": 0.9522193949674865, "grad_norm": 116.67059208076354, "learning_rate": 5.510457262353396e-09, "logits/chosen": -18.74356460571289, "logits/rejected": -18.647714614868164, "logps/chosen": -1.5839942693710327, "logps/rejected": -2.068876028060913, "loss": 3.0213, "rewards/accuracies": 0.78125, "rewards/chosen": -15.839942932128906, "rewards/margins": 4.848816871643066, "rewards/rejected": -20.68876075744629, "step": 421 }, { "epoch": 0.9544811987560079, "grad_norm": 137.6503435987508, "learning_rate": 4.9992171367309265e-09, "logits/chosen": -17.830699920654297, "logits/rejected": -17.30995750427246, "logps/chosen": -1.6017370223999023, "logps/rejected": -2.2726082801818848, "loss": 2.7492, "rewards/accuracies": 0.84375, "rewards/chosen": -16.017372131347656, "rewards/margins": 6.708712577819824, "rewards/rejected": -22.726083755493164, "step": 422 }, { "epoch": 0.9567430025445293, "grad_norm": 122.50350701504888, "learning_rate": 4.5127121539052955e-09, "logits/chosen": -18.987272262573242, "logits/rejected": -18.7191162109375, "logps/chosen": -1.7801018953323364, "logps/rejected": -2.5131754875183105, "loss": 2.6308, "rewards/accuracies": 0.71875, "rewards/chosen": -17.80101776123047, "rewards/margins": 7.330737590789795, "rewards/rejected": -25.131757736206055, "step": 423 }, { "epoch": 0.9590048063330506, "grad_norm": 105.12784722468204, "learning_rate": 4.050972779057327e-09, "logits/chosen": -17.278427124023438, "logits/rejected": -17.121200561523438, "logps/chosen": -1.702017903327942, "logps/rejected": -2.172736883163452, "loss": 2.6915, "rewards/accuracies": 0.75, "rewards/chosen": -17.020179748535156, "rewards/margins": 4.707189559936523, "rewards/rejected": -21.727367401123047, "step": 424 }, { "epoch": 0.961266610121572, "grad_norm": 122.22803042526128, "learning_rate": 3.6140279265330477e-09, "logits/chosen": -18.193286895751953, "logits/rejected": -17.90346908569336, "logps/chosen": -1.8119601011276245, "logps/rejected": -2.272505283355713, "loss": 2.935, "rewards/accuracies": 0.75, "rewards/chosen": -18.119600296020508, "rewards/margins": 4.605450630187988, "rewards/rejected": -22.725051879882812, "step": 425 }, { "epoch": 0.9635284139100933, "grad_norm": 140.15479399613614, "learning_rate": 3.2019049580335853e-09, "logits/chosen": -17.40700340270996, "logits/rejected": -17.39166259765625, "logps/chosen": -1.8650894165039062, "logps/rejected": -2.274355888366699, "loss": 3.5647, "rewards/accuracies": 0.78125, "rewards/chosen": -18.65089225769043, "rewards/margins": 4.092666149139404, "rewards/rejected": -22.743558883666992, "step": 426 }, { "epoch": 0.9657902176986146, "grad_norm": 102.93156958129578, "learning_rate": 2.814629680901337e-09, "logits/chosen": -19.251096725463867, "logits/rejected": -19.292316436767578, "logps/chosen": -1.6867254972457886, "logps/rejected": -2.0900285243988037, "loss": 2.4974, "rewards/accuracies": 0.84375, "rewards/chosen": -16.86725425720215, "rewards/margins": 4.0330305099487305, "rewards/rejected": -20.900283813476562, "step": 427 }, { "epoch": 0.968052021487136, "grad_norm": 111.34250544518655, "learning_rate": 2.4522263465041937e-09, "logits/chosen": -19.024517059326172, "logits/rejected": -18.74802017211914, "logps/chosen": -2.0575406551361084, "logps/rejected": -2.7982211112976074, "loss": 2.5955, "rewards/accuracies": 0.875, "rewards/chosen": -20.575408935546875, "rewards/margins": 7.406803131103516, "rewards/rejected": -27.982210159301758, "step": 428 }, { "epoch": 0.9703138252756573, "grad_norm": 98.51128989688017, "learning_rate": 2.114717648716713e-09, "logits/chosen": -16.984386444091797, "logits/rejected": -16.8139591217041, "logps/chosen": -1.8296539783477783, "logps/rejected": -2.573110818862915, "loss": 3.0284, "rewards/accuracies": 0.84375, "rewards/chosen": -18.296539306640625, "rewards/margins": 7.434567451477051, "rewards/rejected": -25.731107711791992, "step": 429 }, { "epoch": 0.9725756290641787, "grad_norm": 124.4940263604112, "learning_rate": 1.802124722499121e-09, "logits/chosen": -18.865802764892578, "logits/rejected": -18.73249626159668, "logps/chosen": -1.7756928205490112, "logps/rejected": -2.6194663047790527, "loss": 2.5879, "rewards/accuracies": 0.875, "rewards/chosen": -17.75693130493164, "rewards/margins": 8.437736511230469, "rewards/rejected": -26.194665908813477, "step": 430 }, { "epoch": 0.9748374328527001, "grad_norm": 119.13158902037044, "learning_rate": 1.5144671425737499e-09, "logits/chosen": -17.51629638671875, "logits/rejected": -17.642141342163086, "logps/chosen": -1.799952745437622, "logps/rejected": -2.451775550842285, "loss": 3.0634, "rewards/accuracies": 0.75, "rewards/chosen": -17.999526977539062, "rewards/margins": 6.518229007720947, "rewards/rejected": -24.517757415771484, "step": 431 }, { "epoch": 0.9770992366412213, "grad_norm": 100.6745496186136, "learning_rate": 1.251762922199484e-09, "logits/chosen": -18.572729110717773, "logits/rejected": -19.301191329956055, "logps/chosen": -1.8852096796035767, "logps/rejected": -2.454303503036499, "loss": 2.4105, "rewards/accuracies": 0.8125, "rewards/chosen": -18.852096557617188, "rewards/margins": 5.690939903259277, "rewards/rejected": -24.54303550720215, "step": 432 }, { "epoch": 0.9793610404297427, "grad_norm": 119.30288980428828, "learning_rate": 1.0140285120433744e-09, "logits/chosen": -18.9143009185791, "logits/rejected": -18.95807456970215, "logps/chosen": -1.8828755617141724, "logps/rejected": -2.437493085861206, "loss": 3.3873, "rewards/accuracies": 0.75, "rewards/chosen": -18.828754425048828, "rewards/margins": 5.546175003051758, "rewards/rejected": -24.374929428100586, "step": 433 }, { "epoch": 0.9816228442182641, "grad_norm": 119.93045050852022, "learning_rate": 8.012787991508396e-10, "logits/chosen": -18.035734176635742, "logits/rejected": -17.416671752929688, "logps/chosen": -1.7183349132537842, "logps/rejected": -2.451599359512329, "loss": 2.4103, "rewards/accuracies": 0.78125, "rewards/chosen": -17.183349609375, "rewards/margins": 7.332643508911133, "rewards/rejected": -24.5159912109375, "step": 434 }, { "epoch": 0.9838846480067854, "grad_norm": 127.1741271306872, "learning_rate": 6.135271060133007e-10, "logits/chosen": -17.5001277923584, "logits/rejected": -17.65492057800293, "logps/chosen": -1.74495530128479, "logps/rejected": -2.323106527328491, "loss": 3.0668, "rewards/accuracies": 0.8125, "rewards/chosen": -17.449552536010742, "rewards/margins": 5.781513214111328, "rewards/rejected": -23.23106575012207, "step": 435 }, { "epoch": 0.9861464517953068, "grad_norm": 115.3718910974279, "learning_rate": 4.50785189733871e-10, "logits/chosen": -17.362075805664062, "logits/rejected": -17.160686492919922, "logps/chosen": -1.3833808898925781, "logps/rejected": -1.7379635572433472, "loss": 2.7748, "rewards/accuracies": 0.78125, "rewards/chosen": -13.833809852600098, "rewards/margins": 3.5458261966705322, "rewards/rejected": -17.379636764526367, "step": 436 }, { "epoch": 0.988408255583828, "grad_norm": 110.49410393455729, "learning_rate": 3.1306324129118935e-10, "logits/chosen": -17.78763198852539, "logits/rejected": -17.5814151763916, "logps/chosen": -1.6376947164535522, "logps/rejected": -2.1998562812805176, "loss": 3.0113, "rewards/accuracies": 0.8125, "rewards/chosen": -16.3769474029541, "rewards/margins": 5.621615886688232, "rewards/rejected": -21.99856185913086, "step": 437 }, { "epoch": 0.9906700593723494, "grad_norm": 142.4763338483451, "learning_rate": 2.003698849011748e-10, "logits/chosen": -19.646331787109375, "logits/rejected": -19.66240119934082, "logps/chosen": -2.0467026233673096, "logps/rejected": -2.477294921875, "loss": 3.3739, "rewards/accuracies": 0.6875, "rewards/chosen": -20.467025756835938, "rewards/margins": 4.305922985076904, "rewards/rejected": -24.772947311401367, "step": 438 }, { "epoch": 0.9929318631608708, "grad_norm": 130.7419382757851, "learning_rate": 1.1271217747714779e-10, "logits/chosen": -17.93435287475586, "logits/rejected": -17.90981674194336, "logps/chosen": -1.883331298828125, "logps/rejected": -2.1619515419006348, "loss": 3.3682, "rewards/accuracies": 0.65625, "rewards/chosen": -18.83331298828125, "rewards/margins": 2.786202907562256, "rewards/rejected": -21.619516372680664, "step": 439 }, { "epoch": 0.9951936669493922, "grad_norm": 125.07489041195862, "learning_rate": 5.0095608187739055e-11, "logits/chosen": -19.032190322875977, "logits/rejected": -19.182344436645508, "logps/chosen": -1.578109622001648, "logps/rejected": -1.948418378829956, "loss": 2.7569, "rewards/accuracies": 0.8125, "rewards/chosen": -15.781095504760742, "rewards/margins": 3.703087329864502, "rewards/rejected": -19.48418426513672, "step": 440 }, { "epoch": 0.9974554707379135, "grad_norm": 119.11218694159568, "learning_rate": 1.2524098113209092e-11, "logits/chosen": -16.846660614013672, "logits/rejected": -17.356082916259766, "logps/chosen": -1.736297845840454, "logps/rejected": -2.1138105392456055, "loss": 3.4049, "rewards/accuracies": 0.59375, "rewards/chosen": -17.36298179626465, "rewards/margins": 3.7751266956329346, "rewards/rejected": -21.138107299804688, "step": 441 }, { "epoch": 0.9997172745264349, "grad_norm": 120.08290315715726, "learning_rate": 0.0, "logits/chosen": -18.770984649658203, "logits/rejected": -18.760494232177734, "logps/chosen": -1.659979224205017, "logps/rejected": -2.181823492050171, "loss": 2.8512, "rewards/accuracies": 0.71875, "rewards/chosen": -16.59979248046875, "rewards/margins": 5.218443393707275, "rewards/rejected": -21.818235397338867, "step": 442 }, { "epoch": 0.9997172745264349, "step": 442, "total_flos": 227674672136192.0, "train_loss": 0.0, "train_runtime": 1.6273, "train_samples_per_second": 34774.982, "train_steps_per_second": 271.612 } ], "logging_steps": 1, "max_steps": 442, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 227674672136192.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }