diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -10,12 +10,12 @@ "log_history": [ { "epoch": 0.00031456432840515884, - "grad_norm": 0.0479649193584919, + "grad_norm": 0.04916731268167496, "learning_rate": 1.5723270440251573e-08, - "logits/chosen": -1.9399988651275635, - "logits/rejected": -1.95430588722229, - "logps/chosen": -37.35533905029297, - "logps/rejected": -35.944679260253906, + "logits/chosen": -1.942791223526001, + "logits/rejected": -1.9583369493484497, + "logps/chosen": -37.350425720214844, + "logps/rejected": -35.84906005859375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, @@ -25,5263 +25,5263 @@ }, { "epoch": 0.0031456432840515887, - "grad_norm": 0.04902639612555504, + "grad_norm": 0.050606776028871536, "learning_rate": 1.5723270440251575e-07, - "logits/chosen": -1.8399639129638672, - "logits/rejected": -1.9113829135894775, - "logps/chosen": -33.15123748779297, - "logps/rejected": -34.97999572753906, + "logits/chosen": -1.8411260843276978, + "logits/rejected": -1.9121019840240479, + "logps/chosen": -33.17695617675781, + "logps/rejected": -35.013282775878906, "loss": 0.6932, - "rewards/accuracies": 0.4097222089767456, - "rewards/chosen": -0.00022805675689596683, - "rewards/margins": -0.00019479618640616536, - "rewards/rejected": -3.326057776575908e-05, + "rewards/accuracies": 0.4652777910232544, + "rewards/chosen": -0.000264569855062291, + "rewards/margins": 5.910781055717962e-06, + "rewards/rejected": -0.00027048063930124044, "step": 10 }, { "epoch": 0.0062912865681031774, - "grad_norm": 0.04345833510160446, + "grad_norm": 0.045017264783382416, "learning_rate": 3.144654088050315e-07, - "logits/chosen": -1.8433700799942017, - "logits/rejected": -1.8701088428497314, - "logps/chosen": -32.37605667114258, - "logps/rejected": -35.17049026489258, + "logits/chosen": -1.8443002700805664, + "logits/rejected": -1.8706448078155518, + "logps/chosen": -32.37808609008789, + "logps/rejected": -35.20783233642578, "loss": 0.6931, - "rewards/accuracies": 0.45625001192092896, - "rewards/chosen": -0.0001617725647520274, - "rewards/margins": -0.00015225948300212622, - "rewards/rejected": -9.513064469501842e-06, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": 0.00013874513388145715, + "rewards/margins": 0.0003704810806084424, + "rewards/rejected": -0.00023173594672698528, "step": 20 }, { "epoch": 0.009436929852154765, - "grad_norm": 0.04357537627220154, + "grad_norm": 0.0454835444688797, "learning_rate": 4.716981132075472e-07, - "logits/chosen": -1.8218624591827393, - "logits/rejected": -1.8491008281707764, - "logps/chosen": -32.927635192871094, - "logps/rejected": -34.23331069946289, - "loss": 0.693, - "rewards/accuracies": 0.518750011920929, - "rewards/chosen": 1.8031179934041575e-05, - "rewards/margins": 0.00020070603932254016, - "rewards/rejected": -0.00018267489213030785, + "logits/chosen": -1.8223037719726562, + "logits/rejected": -1.8500335216522217, + "logps/chosen": -32.97340774536133, + "logps/rejected": -34.259952545166016, + "loss": 0.6931, + "rewards/accuracies": 0.5, + "rewards/chosen": 1.1447194083302747e-05, + "rewards/margins": 0.00015726670972071588, + "rewards/rejected": -0.00014581947471015155, "step": 30 }, { "epoch": 0.012582573136206355, - "grad_norm": 0.04584033414721489, + "grad_norm": 0.04719178006052971, "learning_rate": 6.28930817610063e-07, - "logits/chosen": -1.8482955694198608, - "logits/rejected": -1.844745397567749, - "logps/chosen": -33.68798828125, - "logps/rejected": -37.151004791259766, + "logits/chosen": -1.8486772775650024, + "logits/rejected": -1.8450359106063843, + "logps/chosen": -33.700775146484375, + "logps/rejected": -37.18444061279297, "loss": 0.693, - "rewards/accuracies": 0.4937500059604645, - "rewards/chosen": -0.0003600932250265032, - "rewards/margins": -7.291980000445619e-05, - "rewards/rejected": -0.00028717340319417417, + "rewards/accuracies": 0.53125, + "rewards/chosen": -0.00020007109560538083, + "rewards/margins": 0.00011606378393480554, + "rewards/rejected": -0.0003161348286084831, "step": 40 }, { "epoch": 0.015728216420257943, - "grad_norm": 0.04268745705485344, + "grad_norm": 0.0442265048623085, "learning_rate": 7.861635220125787e-07, - "logits/chosen": -1.8830944299697876, - "logits/rejected": -1.912956953048706, - "logps/chosen": -33.78385925292969, - "logps/rejected": -34.29187774658203, - "loss": 0.6928, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": -0.0002359933714615181, - "rewards/margins": 0.000701805402059108, - "rewards/rejected": -0.0009377988171763718, + "logits/chosen": -1.8833131790161133, + "logits/rejected": -1.913037657737732, + "logps/chosen": -33.82078170776367, + "logps/rejected": -34.307395935058594, + "loss": 0.6929, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -0.00041835257434286177, + "rewards/margins": 0.00020049764134455472, + "rewards/rejected": -0.0006188501720316708, "step": 50 }, { "epoch": 0.01887385970430953, - "grad_norm": 0.04893243685364723, + "grad_norm": 0.05058182030916214, "learning_rate": 9.433962264150944e-07, - "logits/chosen": -1.7625595331192017, - "logits/rejected": -1.8228267431259155, - "logps/chosen": -33.46589279174805, - "logps/rejected": -36.136573791503906, + "logits/chosen": -1.7616949081420898, + "logits/rejected": -1.8223813772201538, + "logps/chosen": -33.48662567138672, + "logps/rejected": -36.167518615722656, "loss": 0.6927, - "rewards/accuracies": 0.581250011920929, - "rewards/chosen": 0.0001076062471838668, - "rewards/margins": 0.001043324125930667, - "rewards/rejected": -0.0009357180679216981, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 6.629432027693838e-05, + "rewards/margins": 0.001108249882236123, + "rewards/rejected": -0.0010419555474072695, "step": 60 }, { "epoch": 0.02201950298836112, - "grad_norm": 0.046153198927640915, + "grad_norm": 0.04823003336787224, "learning_rate": 1.1006289308176102e-06, - "logits/chosen": -1.7840734720230103, - "logits/rejected": -1.8304617404937744, - "logps/chosen": -33.84288787841797, - "logps/rejected": -36.21353530883789, + "logits/chosen": -1.782994270324707, + "logits/rejected": -1.8296699523925781, + "logps/chosen": -33.87117004394531, + "logps/rejected": -36.236976623535156, "loss": 0.6925, - "rewards/accuracies": 0.612500011920929, - "rewards/chosen": -0.00046643256791867316, - "rewards/margins": 0.0013198342639952898, - "rewards/rejected": -0.0017862668028101325, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.00032349638058803976, + "rewards/margins": 0.0015318433288484812, + "rewards/rejected": -0.0018553396221250296, "step": 70 }, { "epoch": 0.02516514627241271, - "grad_norm": 0.05236299708485603, + "grad_norm": 0.05422540009021759, "learning_rate": 1.257861635220126e-06, - "logits/chosen": -1.7575123310089111, - "logits/rejected": -1.7971513271331787, - "logps/chosen": -32.27585220336914, - "logps/rejected": -34.206329345703125, - "loss": 0.6923, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -0.00034783725277520716, - "rewards/margins": 0.0015550373354926705, - "rewards/rejected": -0.001902874791994691, + "logits/chosen": -1.7576888799667358, + "logits/rejected": -1.7962068319320679, + "logps/chosen": -32.26203155517578, + "logps/rejected": -34.199798583984375, + "loss": 0.6922, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -6.839539855718613e-05, + "rewards/margins": 0.0013875927543267608, + "rewards/rejected": -0.0014559882692992687, "step": 80 }, { "epoch": 0.028310789556464298, - "grad_norm": 0.05319148302078247, + "grad_norm": 0.055276062339544296, "learning_rate": 1.4150943396226415e-06, - "logits/chosen": -1.8102385997772217, - "logits/rejected": -1.8448301553726196, - "logps/chosen": -31.791019439697266, - "logps/rejected": -34.05156707763672, - "loss": 0.6919, - "rewards/accuracies": 0.5687500238418579, - "rewards/chosen": -0.001236448297277093, - "rewards/margins": 0.001894423388876021, - "rewards/rejected": -0.0031308718025684357, + "logits/chosen": -1.8102718591690063, + "logits/rejected": -1.844451904296875, + "logps/chosen": -31.800710678100586, + "logps/rejected": -34.06111526489258, + "loss": 0.6918, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.0007771268719807267, + "rewards/margins": 0.0020528400782495737, + "rewards/rejected": -0.0028299668338149786, "step": 90 }, { "epoch": 0.031456432840515886, - "grad_norm": 0.054018791764974594, + "grad_norm": 0.04816555231809616, "learning_rate": 1.5723270440251573e-06, - "logits/chosen": -1.8018757104873657, - "logits/rejected": -1.843665361404419, - "logps/chosen": -34.18579864501953, - "logps/rejected": -35.275360107421875, - "loss": 0.6914, - "rewards/accuracies": 0.6875, - "rewards/chosen": 0.001453344477340579, - "rewards/margins": 0.004754130728542805, - "rewards/rejected": -0.003300786716863513, + "logits/chosen": -1.801669716835022, + "logits/rejected": -1.8445507287979126, + "logps/chosen": -34.190731048583984, + "logps/rejected": -35.301910400390625, + "loss": 0.6913, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.0014222835889086127, + "rewards/margins": 0.004616752732545137, + "rewards/rejected": -0.003194468794390559, "step": 100 }, { "epoch": 0.031456432840515886, - "eval_logits/chosen": -1.6366691589355469, - "eval_logits/rejected": -1.6841371059417725, - "eval_logps/chosen": -32.712493896484375, - "eval_logps/rejected": -36.258174896240234, - "eval_loss": 0.6911582946777344, - "eval_rewards/accuracies": 0.6339552402496338, - "eval_rewards/chosen": 0.0005853726179338992, - "eval_rewards/margins": 0.0041807363741099834, - "eval_rewards/rejected": -0.0035953635815531015, - "eval_runtime": 219.2745, - "eval_samples_per_second": 97.672, - "eval_steps_per_second": 1.528, + "eval_logits/chosen": -1.6348028182983398, + "eval_logits/rejected": -1.6823533773422241, + "eval_logps/chosen": -32.72854995727539, + "eval_logps/rejected": -36.27182388305664, + "eval_loss": 0.6911075115203857, + "eval_rewards/accuracies": 0.6220149397850037, + "eval_rewards/chosen": 0.0006819414556957781, + "eval_rewards/margins": 0.00424056826159358, + "eval_rewards/rejected": -0.0035586270969361067, + "eval_runtime": 220.3412, + "eval_samples_per_second": 97.199, + "eval_steps_per_second": 1.52, "step": 100 }, { "epoch": 0.03460207612456748, - "grad_norm": 0.05021243169903755, + "grad_norm": 0.0516941212117672, "learning_rate": 1.7295597484276729e-06, - "logits/chosen": -1.800484299659729, - "logits/rejected": -1.8205846548080444, - "logps/chosen": -33.69269943237305, - "logps/rejected": -37.17658233642578, - "loss": 0.6908, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": -0.0005980390124022961, - "rewards/margins": 0.0038381360936909914, - "rewards/rejected": -0.004436175338923931, + "logits/chosen": -1.7992357015609741, + "logits/rejected": -1.819049596786499, + "logps/chosen": -33.700836181640625, + "logps/rejected": -37.1832275390625, + "loss": 0.6907, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.0007088495185598731, + "rewards/margins": 0.0035199751146137714, + "rewards/rejected": -0.004228824749588966, "step": 110 }, { "epoch": 0.03774771940861906, - "grad_norm": 0.05514535307884216, + "grad_norm": 0.05639781430363655, "learning_rate": 1.8867924528301889e-06, - "logits/chosen": -1.799552321434021, - "logits/rejected": -1.8297306299209595, - "logps/chosen": -32.277740478515625, - "logps/rejected": -34.390567779541016, - "loss": 0.6904, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": 0.001782993203960359, - "rewards/margins": 0.008893580175936222, - "rewards/rejected": -0.007110586855560541, + "logits/chosen": -1.7982286214828491, + "logits/rejected": -1.8285636901855469, + "logps/chosen": -32.256858825683594, + "logps/rejected": -34.39281463623047, + "loss": 0.6903, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": 0.0023287234362214804, + "rewards/margins": 0.009236618876457214, + "rewards/rejected": -0.00690789520740509, "step": 120 }, { "epoch": 0.04089336269267065, - "grad_norm": 0.05100173130631447, + "grad_norm": 0.052410390228033066, "learning_rate": 2.044025157232705e-06, - "logits/chosen": -1.8046401739120483, - "logits/rejected": -1.817368507385254, - "logps/chosen": -32.66090393066406, - "logps/rejected": -35.909088134765625, - "loss": 0.6903, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": 0.003670961130410433, - "rewards/margins": 0.011476712301373482, - "rewards/rejected": -0.007805750705301762, + "logits/chosen": -1.8025617599487305, + "logits/rejected": -1.8154237270355225, + "logps/chosen": -32.6540641784668, + "logps/rejected": -35.91321563720703, + "loss": 0.6902, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.0039227548986673355, + "rewards/margins": 0.011436911299824715, + "rewards/rejected": -0.007514156401157379, "step": 130 }, { "epoch": 0.04403900597672224, - "grad_norm": 0.057663802057504654, + "grad_norm": 0.05888905003666878, "learning_rate": 2.2012578616352204e-06, - "logits/chosen": -1.7695449590682983, - "logits/rejected": -1.8342878818511963, - "logps/chosen": -32.138465881347656, - "logps/rejected": -38.76416778564453, - "loss": 0.6877, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": 0.00406468054279685, - "rewards/margins": 0.016144271939992905, - "rewards/rejected": -0.012079590931534767, + "logits/chosen": -1.7658809423446655, + "logits/rejected": -1.8309358358383179, + "logps/chosen": -32.095314025878906, + "logps/rejected": -38.73040771484375, + "loss": 0.6876, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.004927013069391251, + "rewards/margins": 0.016417894512414932, + "rewards/rejected": -0.011490881443023682, "step": 140 }, { "epoch": 0.04718464926077383, - "grad_norm": 0.06243397668004036, + "grad_norm": 0.06400807201862335, "learning_rate": 2.358490566037736e-06, - "logits/chosen": -1.760240912437439, - "logits/rejected": -1.7993634939193726, - "logps/chosen": -34.989463806152344, - "logps/rejected": -36.670860290527344, + "logits/chosen": -1.7594534158706665, + "logits/rejected": -1.798018455505371, + "logps/chosen": -34.921791076660156, + "logps/rejected": -36.6182861328125, "loss": 0.6881, - "rewards/accuracies": 0.625, - "rewards/chosen": -0.0018432287033647299, - "rewards/margins": 0.01047598011791706, - "rewards/rejected": -0.012319209054112434, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.0009333366760984063, + "rewards/margins": 0.010404362343251705, + "rewards/rejected": -0.011337699368596077, "step": 150 }, { "epoch": 0.05033029254482542, - "grad_norm": 0.0660770907998085, + "grad_norm": 0.06766606122255325, "learning_rate": 2.515723270440252e-06, - "logits/chosen": -1.732052206993103, - "logits/rejected": -1.7964776754379272, - "logps/chosen": -30.869558334350586, - "logps/rejected": -36.11768341064453, - "loss": 0.6841, + "logits/chosen": -1.730738878250122, + "logits/rejected": -1.795153260231018, + "logps/chosen": -30.81954002380371, + "logps/rejected": -36.08769607543945, + "loss": 0.684, "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": 0.00874429289251566, - "rewards/margins": 0.01892954111099243, - "rewards/rejected": -0.010185247287154198, + "rewards/chosen": 0.009594108909368515, + "rewards/margins": 0.0193181075155735, + "rewards/rejected": -0.009723997674882412, "step": 160 }, { "epoch": 0.053475935828877004, - "grad_norm": 0.07039070129394531, + "grad_norm": 0.07456081360578537, "learning_rate": 2.6729559748427675e-06, - "logits/chosen": -1.7254928350448608, - "logits/rejected": -1.7313286066055298, - "logps/chosen": -31.875972747802734, - "logps/rejected": -36.539859771728516, - "loss": 0.6829, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": 0.012786077335476875, - "rewards/margins": 0.02013152278959751, - "rewards/rejected": -0.007345445454120636, + "logits/chosen": -1.7231314182281494, + "logits/rejected": -1.7290071249008179, + "logps/chosen": -31.769222259521484, + "logps/rejected": -36.441688537597656, + "loss": 0.6827, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.014062667265534401, + "rewards/margins": 0.020307175815105438, + "rewards/rejected": -0.006244509480893612, "step": 170 }, { "epoch": 0.056621579112928595, - "grad_norm": 0.06682829558849335, + "grad_norm": 0.06826143711805344, "learning_rate": 2.830188679245283e-06, - "logits/chosen": -1.7438074350357056, - "logits/rejected": -1.761460542678833, - "logps/chosen": -31.37579345703125, - "logps/rejected": -37.373321533203125, + "logits/chosen": -1.739983320236206, + "logits/rejected": -1.7577593326568604, + "logps/chosen": -31.265216827392578, + "logps/rejected": -37.26996994018555, "loss": 0.6828, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": 0.014985946007072926, - "rewards/margins": 0.030825484544038773, - "rewards/rejected": -0.015839537605643272, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.016319114714860916, + "rewards/margins": 0.030752727761864662, + "rewards/rejected": -0.014433610253036022, "step": 180 }, { "epoch": 0.05976722239698018, - "grad_norm": 0.08220777660608292, + "grad_norm": 0.0847531110048294, "learning_rate": 2.987421383647799e-06, - "logits/chosen": -1.7134368419647217, - "logits/rejected": -1.7409776449203491, - "logps/chosen": -30.14202308654785, - "logps/rejected": -38.0638542175293, - "loss": 0.6786, - "rewards/accuracies": 0.625, - "rewards/chosen": 0.008612741716206074, - "rewards/margins": 0.040046971291303635, - "rewards/rejected": -0.031434230506420135, + "logits/chosen": -1.7099155187606812, + "logits/rejected": -1.7371807098388672, + "logps/chosen": -30.013824462890625, + "logps/rejected": -37.95855712890625, + "loss": 0.6785, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": 0.009831647388637066, + "rewards/margins": 0.03985407203435898, + "rewards/rejected": -0.030022427439689636, "step": 190 }, { "epoch": 0.06291286568103177, - "grad_norm": 0.08102578669786453, + "grad_norm": 0.0835421085357666, "learning_rate": 3.1446540880503146e-06, - "logits/chosen": -1.6293474435806274, - "logits/rejected": -1.715287208557129, - "logps/chosen": -27.261890411376953, - "logps/rejected": -36.432151794433594, - "loss": 0.6743, - "rewards/accuracies": 0.606249988079071, - "rewards/chosen": 0.0217665433883667, - "rewards/margins": 0.03480926901102066, - "rewards/rejected": -0.013042723760008812, + "logits/chosen": -1.6245222091674805, + "logits/rejected": -1.711726427078247, + "logps/chosen": -27.14463233947754, + "logps/rejected": -36.3680305480957, + "loss": 0.6742, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.023222357034683228, + "rewards/margins": 0.03531500697135925, + "rewards/rejected": -0.01209265273064375, "step": 200 }, { "epoch": 0.06291286568103177, - "eval_logits/chosen": -1.4648962020874023, - "eval_logits/rejected": -1.5154403448104858, - "eval_logps/chosen": -32.85725784301758, - "eval_logps/rejected": -40.52320098876953, - "eval_loss": 0.6753061413764954, - "eval_rewards/accuracies": 0.6320895552635193, - "eval_rewards/chosen": -0.0008622497553005815, - "eval_rewards/margins": 0.04538334161043167, - "eval_rewards/rejected": -0.04624559357762337, - "eval_runtime": 214.8723, - "eval_samples_per_second": 99.673, - "eval_steps_per_second": 1.559, + "eval_logits/chosen": -1.4586008787155151, + "eval_logits/rejected": -1.5097368955612183, + "eval_logps/chosen": -32.76309585571289, + "eval_logps/rejected": -40.45964050292969, + "eval_loss": 0.6750917434692383, + "eval_rewards/accuracies": 0.6276119351387024, + "eval_rewards/chosen": 0.0003364614094607532, + "eval_rewards/margins": 0.0457732118666172, + "eval_rewards/rejected": -0.04543674364686012, + "eval_runtime": 216.3229, + "eval_samples_per_second": 99.005, + "eval_steps_per_second": 1.549, "step": 200 }, { "epoch": 0.06605850896508336, - "grad_norm": 0.08751504868268967, + "grad_norm": 0.09109613299369812, "learning_rate": 3.30188679245283e-06, - "logits/chosen": -1.6138765811920166, - "logits/rejected": -1.6505515575408936, - "logps/chosen": -36.41826629638672, - "logps/rejected": -38.340946197509766, - "loss": 0.6782, + "logits/chosen": -1.6076081991195679, + "logits/rejected": -1.6447021961212158, + "logps/chosen": -36.27522277832031, + "logps/rejected": -38.27228546142578, + "loss": 0.678, "rewards/accuracies": 0.5562499761581421, - "rewards/chosen": -0.016763882711529732, - "rewards/margins": 0.015965834259986877, - "rewards/rejected": -0.03272971510887146, + "rewards/chosen": -0.015005774796009064, + "rewards/margins": 0.01694519817829132, + "rewards/rejected": -0.031950972974300385, "step": 210 }, { "epoch": 0.06920415224913495, - "grad_norm": 0.10140910744667053, + "grad_norm": 0.1037890687584877, "learning_rate": 3.4591194968553458e-06, - "logits/chosen": -1.572196364402771, - "logits/rejected": -1.608331322669983, - "logps/chosen": -35.66874694824219, - "logps/rejected": -40.756690979003906, - "loss": 0.6719, + "logits/chosen": -1.5653865337371826, + "logits/rejected": -1.602521300315857, + "logps/chosen": -35.69440460205078, + "logps/rejected": -40.78470230102539, + "loss": 0.6718, "rewards/accuracies": 0.5874999761581421, - "rewards/chosen": -0.025554969906806946, - "rewards/margins": 0.03226945921778679, - "rewards/rejected": -0.057824425399303436, + "rewards/chosen": -0.025828268378973007, + "rewards/margins": 0.031778041273355484, + "rewards/rejected": -0.057606302201747894, "step": 220 }, { "epoch": 0.07234979553318653, - "grad_norm": 0.11134755611419678, + "grad_norm": 0.11334498226642609, "learning_rate": 3.6163522012578618e-06, - "logits/chosen": -1.6366207599639893, - "logits/rejected": -1.6505063772201538, - "logps/chosen": -37.33563232421875, - "logps/rejected": -43.3948974609375, - "loss": 0.6707, - "rewards/accuracies": 0.5874999761581421, - "rewards/chosen": -0.045298900455236435, - "rewards/margins": 0.026216819882392883, - "rewards/rejected": -0.07151572406291962, + "logits/chosen": -1.6311982870101929, + "logits/rejected": -1.6453990936279297, + "logps/chosen": -37.407859802246094, + "logps/rejected": -43.464874267578125, + "loss": 0.6706, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.04616266116499901, + "rewards/margins": 0.025804489850997925, + "rewards/rejected": -0.07196714729070663, "step": 230 }, { "epoch": 0.07549543881723812, - "grad_norm": 0.1564127802848816, + "grad_norm": 0.157041534781456, "learning_rate": 3.7735849056603777e-06, - "logits/chosen": -1.5884507894515991, - "logits/rejected": -1.6262544393539429, - "logps/chosen": -37.20269775390625, - "logps/rejected": -45.897308349609375, - "loss": 0.6668, + "logits/chosen": -1.5806419849395752, + "logits/rejected": -1.6178245544433594, + "logps/chosen": -37.05290985107422, + "logps/rejected": -45.771514892578125, + "loss": 0.6665, "rewards/accuracies": 0.5874999761581421, - "rewards/chosen": -0.03480329364538193, - "rewards/margins": 0.06560282409191132, - "rewards/rejected": -0.10040611028671265, + "rewards/chosen": -0.033209070563316345, + "rewards/margins": 0.0657048299908638, + "rewards/rejected": -0.09891389310359955, "step": 240 }, { "epoch": 0.07864108210128971, - "grad_norm": 0.15535807609558105, + "grad_norm": 0.16409841179847717, "learning_rate": 3.930817610062894e-06, - "logits/chosen": -1.6724646091461182, - "logits/rejected": -1.6687465906143188, - "logps/chosen": -43.27534484863281, - "logps/rejected": -45.803104400634766, - "loss": 0.6624, + "logits/chosen": -1.6633002758026123, + "logits/rejected": -1.6584806442260742, + "logps/chosen": -43.37720489501953, + "logps/rejected": -45.945579528808594, + "loss": 0.6619, "rewards/accuracies": 0.5625, - "rewards/chosen": -0.10215433686971664, - "rewards/margins": 0.013999072834849358, - "rewards/rejected": -0.11615340411663055, + "rewards/chosen": -0.1028851717710495, + "rewards/margins": 0.014506662264466286, + "rewards/rejected": -0.11739183962345123, "step": 250 }, { "epoch": 0.0817867253853413, - "grad_norm": 0.18862789869308472, + "grad_norm": 0.2012060135602951, "learning_rate": 4.08805031446541e-06, - "logits/chosen": -1.624436378479004, - "logits/rejected": -1.695433259010315, - "logps/chosen": -44.69486999511719, - "logps/rejected": -51.40407180786133, - "loss": 0.6532, + "logits/chosen": -1.6090081930160522, + "logits/rejected": -1.6804996728897095, + "logps/chosen": -44.958030700683594, + "logps/rejected": -51.669647216796875, + "loss": 0.6527, "rewards/accuracies": 0.606249988079071, - "rewards/chosen": -0.11470717191696167, - "rewards/margins": 0.04582948237657547, - "rewards/rejected": -0.16053664684295654, + "rewards/chosen": -0.11712668091058731, + "rewards/margins": 0.04562956839799881, + "rewards/rejected": -0.16275624930858612, "step": 260 }, { "epoch": 0.0849323686693929, - "grad_norm": 0.19437934458255768, + "grad_norm": 0.20246672630310059, "learning_rate": 4.245283018867925e-06, - "logits/chosen": -1.4597517251968384, - "logits/rejected": -1.5478241443634033, - "logps/chosen": -43.16393280029297, - "logps/rejected": -59.01338577270508, - "loss": 0.6427, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -0.09754286706447601, - "rewards/margins": 0.1444810926914215, - "rewards/rejected": -0.24202391505241394, + "logits/chosen": -1.430743932723999, + "logits/rejected": -1.520498514175415, + "logps/chosen": -43.613075256347656, + "logps/rejected": -59.78533172607422, + "loss": 0.6418, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.10202650725841522, + "rewards/margins": 0.14727506041526794, + "rewards/rejected": -0.24930159747600555, "step": 270 }, { "epoch": 0.08807801195344447, - "grad_norm": 0.2875131070613861, + "grad_norm": 0.30764126777648926, "learning_rate": 4.402515723270441e-06, - "logits/chosen": -1.4205321073532104, - "logits/rejected": -1.4631447792053223, - "logps/chosen": -54.769493103027344, - "logps/rejected": -60.56072998046875, - "loss": 0.6426, - "rewards/accuracies": 0.550000011920929, - "rewards/chosen": -0.21598923206329346, - "rewards/margins": 0.04702833294868469, - "rewards/rejected": -0.26301756501197815, + "logits/chosen": -1.369045615196228, + "logits/rejected": -1.4158138036727905, + "logps/chosen": -56.09346389770508, + "logps/rejected": -61.998687744140625, + "loss": 0.6408, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -0.2292211800813675, + "rewards/margins": 0.048012204468250275, + "rewards/rejected": -0.2772333323955536, "step": 280 }, { "epoch": 0.09122365523749607, - "grad_norm": 0.5252009034156799, + "grad_norm": 0.529365062713623, "learning_rate": 4.559748427672957e-06, - "logits/chosen": -1.124021053314209, - "logits/rejected": -1.2152128219604492, - "logps/chosen": -69.76283264160156, - "logps/rejected": -94.03047943115234, - "loss": 0.6124, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": -0.37234413623809814, - "rewards/margins": 0.22571516036987305, - "rewards/rejected": -0.5980592966079712, + "logits/chosen": -1.0204923152923584, + "logits/rejected": -1.1075626611709595, + "logps/chosen": -75.31793212890625, + "logps/rejected": -101.33333587646484, + "loss": 0.6085, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4275882840156555, + "rewards/margins": 0.2434498369693756, + "rewards/rejected": -0.6710380911827087, "step": 290 }, { "epoch": 0.09436929852154766, - "grad_norm": 0.5838403105735779, + "grad_norm": 0.5780752301216125, "learning_rate": 4.716981132075472e-06, - "logits/chosen": -0.8352106809616089, - "logits/rejected": -0.8499029278755188, - "logps/chosen": -92.40535736083984, - "logps/rejected": -115.4690933227539, - "loss": 0.6112, - "rewards/accuracies": 0.612500011920929, - "rewards/chosen": -0.5981942415237427, - "rewards/margins": 0.19615033268928528, - "rewards/rejected": -0.7943445444107056, + "logits/chosen": -0.8171870112419128, + "logits/rejected": -0.8328253626823425, + "logps/chosen": -90.27127838134766, + "logps/rejected": -113.9675521850586, + "loss": 0.6081, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.5765252709388733, + "rewards/margins": 0.2026272714138031, + "rewards/rejected": -0.779152512550354, "step": 300 }, { "epoch": 0.09436929852154766, - "eval_logits/chosen": -0.4325442612171173, - "eval_logits/rejected": -0.5166311264038086, - "eval_logps/chosen": -82.86697387695312, - "eval_logps/rejected": -119.5517578125, - "eval_loss": 0.5905027389526367, - "eval_rewards/accuracies": 0.6630597114562988, - "eval_rewards/chosen": -0.5009594559669495, - "eval_rewards/margins": 0.33557161688804626, - "eval_rewards/rejected": -0.8365311026573181, - "eval_runtime": 215.0864, - "eval_samples_per_second": 99.574, - "eval_steps_per_second": 1.558, + "eval_logits/chosen": -0.38301563262939453, + "eval_logits/rejected": -0.47008848190307617, + "eval_logps/chosen": -84.73033142089844, + "eval_logps/rejected": -122.3552474975586, + "eval_loss": 0.5872128009796143, + "eval_rewards/accuracies": 0.6619402766227722, + "eval_rewards/chosen": -0.5193358659744263, + "eval_rewards/margins": 0.34505695104599, + "eval_rewards/rejected": -0.8643926978111267, + "eval_runtime": 216.4352, + "eval_samples_per_second": 98.953, + "eval_steps_per_second": 1.548, "step": 300 }, { "epoch": 0.09751494180559925, - "grad_norm": 0.7127551436424255, + "grad_norm": 0.7804479002952576, "learning_rate": 4.874213836477988e-06, - "logits/chosen": -0.6658456921577454, - "logits/rejected": -0.7291407585144043, - "logps/chosen": -92.9823226928711, - "logps/rejected": -138.02853393554688, - "loss": 0.5956, - "rewards/accuracies": 0.6875, - "rewards/chosen": -0.6104769706726074, - "rewards/margins": 0.4045206904411316, - "rewards/rejected": -1.0149977207183838, + "logits/chosen": -0.6313827037811279, + "logits/rejected": -0.6968099474906921, + "logps/chosen": -96.16786193847656, + "logps/rejected": -143.1085968017578, + "loss": 0.5923, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.6421507596969604, + "rewards/margins": 0.423184335231781, + "rewards/rejected": -1.0653350353240967, "step": 310 }, { "epoch": 0.10066058508965084, - "grad_norm": 0.8495454788208008, + "grad_norm": 0.9274519085884094, "learning_rate": 4.999993971158594e-06, - "logits/chosen": -0.7028144598007202, - "logits/rejected": -0.7583300471305847, - "logps/chosen": -128.24002075195312, - "logps/rejected": -170.99911499023438, - "loss": 0.5863, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -0.9406751394271851, - "rewards/margins": 0.40160757303237915, - "rewards/rejected": -1.3422826528549194, + "logits/chosen": -0.6999994516372681, + "logits/rejected": -0.7624176144599915, + "logps/chosen": -130.17660522460938, + "logps/rejected": -174.801513671875, + "loss": 0.5817, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.9600201845169067, + "rewards/margins": 0.4198983609676361, + "rewards/rejected": -1.3799186944961548, "step": 320 }, { "epoch": 0.10380622837370242, - "grad_norm": 0.7416212558746338, + "grad_norm": 0.7980369925498962, "learning_rate": 4.9997829647624885e-06, - "logits/chosen": -0.5096332430839539, - "logits/rejected": -0.5753281712532043, - "logps/chosen": -136.32276916503906, - "logps/rejected": -190.13681030273438, - "loss": 0.5714, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -1.0311200618743896, - "rewards/margins": 0.5132460594177246, - "rewards/rejected": -1.5443661212921143, + "logits/chosen": -0.5264394879341125, + "logits/rejected": -0.5980736017227173, + "logps/chosen": -137.11154174804688, + "logps/rejected": -191.84661865234375, + "loss": 0.5692, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.0387710332870483, + "rewards/margins": 0.5226823091506958, + "rewards/rejected": -1.5614532232284546, "step": 330 }, { "epoch": 0.10695187165775401, - "grad_norm": 0.8752725720405579, + "grad_norm": 0.898904025554657, "learning_rate": 4.999270545372964e-06, - "logits/chosen": -0.7201881408691406, - "logits/rejected": -0.8150702714920044, - "logps/chosen": -124.40911865234375, - "logps/rejected": -179.231201171875, - "loss": 0.5488, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -0.9127928018569946, - "rewards/margins": 0.5067313313484192, - "rewards/rejected": -1.419524073600769, + "logits/chosen": -0.8326930999755859, + "logits/rejected": -0.9346593618392944, + "logps/chosen": -126.531005859375, + "logps/rejected": -181.36660766601562, + "loss": 0.5452, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.9337248802185059, + "rewards/margins": 0.5069459676742554, + "rewards/rejected": -1.4406708478927612, "step": 340 }, { "epoch": 0.1100975149418056, - "grad_norm": 1.2527039051055908, + "grad_norm": 1.0713428258895874, "learning_rate": 4.998456774775329e-06, - "logits/chosen": -0.7363126277923584, - "logits/rejected": -0.7730661630630493, - "logps/chosen": -173.3235321044922, - "logps/rejected": -224.24801635742188, - "loss": 0.5328, - "rewards/accuracies": 0.59375, - "rewards/chosen": -1.3973206281661987, - "rewards/margins": 0.46879178285598755, - "rewards/rejected": -1.8661121129989624, + "logits/chosen": -0.9333699941635132, + "logits/rejected": -0.9694175720214844, + "logps/chosen": -173.23915100097656, + "logps/rejected": -227.34555053710938, + "loss": 0.5273, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.3962385654449463, + "rewards/margins": 0.5004255175590515, + "rewards/rejected": -1.896664023399353, "step": 350 }, { "epoch": 0.11324315822585719, - "grad_norm": 0.9629844427108765, + "grad_norm": 0.9318549036979675, "learning_rate": 4.997341751090515e-06, - "logits/chosen": -0.8685577511787415, - "logits/rejected": -0.9733338356018066, - "logps/chosen": -174.112060546875, - "logps/rejected": -247.428466796875, - "loss": 0.4908, + "logits/chosen": -0.9799184799194336, + "logits/rejected": -1.085832118988037, + "logps/chosen": -178.21102905273438, + "logps/rejected": -254.60488891601562, + "loss": 0.4851, "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -1.4022471904754639, - "rewards/margins": 0.7104827761650085, - "rewards/rejected": -2.112730026245117, + "rewards/chosen": -1.4430134296417236, + "rewards/margins": 0.741083562374115, + "rewards/rejected": -2.1840968132019043, "step": 360 }, { "epoch": 0.11638880150990878, - "grad_norm": 1.126010775566101, + "grad_norm": 1.1828168630599976, "learning_rate": 4.995925608763244e-06, - "logits/chosen": -1.114950180053711, - "logits/rejected": -1.1976020336151123, - "logps/chosen": -169.6937255859375, - "logps/rejected": -253.0226593017578, - "loss": 0.5253, - "rewards/accuracies": 0.625, - "rewards/chosen": -1.3709325790405273, - "rewards/margins": 0.8127344250679016, - "rewards/rejected": -2.183666944503784, + "logits/chosen": -1.0449360609054565, + "logits/rejected": -1.1332659721374512, + "logps/chosen": -176.7970428466797, + "logps/rejected": -264.9158020019531, + "loss": 0.5227, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.4418004751205444, + "rewards/margins": 0.8605502843856812, + "rewards/rejected": -2.3023507595062256, "step": 370 }, { "epoch": 0.11953444479396036, - "grad_norm": 1.2758018970489502, + "grad_norm": 1.250595211982727, "learning_rate": 4.994208518545819e-06, - "logits/chosen": -1.1732008457183838, - "logits/rejected": -1.3207279443740845, - "logps/chosen": -182.38046264648438, - "logps/rejected": -262.43511962890625, - "loss": 0.4701, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -1.4985719919204712, - "rewards/margins": 0.7734432816505432, - "rewards/rejected": -2.27201509475708, + "logits/chosen": -1.1048619747161865, + "logits/rejected": -1.2548190355300903, + "logps/chosen": -191.09576416015625, + "logps/rejected": -273.36431884765625, + "loss": 0.4691, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.585456371307373, + "rewards/margins": 0.7954282164573669, + "rewards/rejected": -2.3808846473693848, "step": 380 }, { "epoch": 0.12268008807801195, - "grad_norm": 1.0300933122634888, + "grad_norm": 1.1175463199615479, "learning_rate": 4.992190687477535e-06, - "logits/chosen": -1.2033292055130005, - "logits/rejected": -1.3128149509429932, - "logps/chosen": -187.9945526123047, - "logps/rejected": -290.4914245605469, - "loss": 0.4584, + "logits/chosen": -1.1619799137115479, + "logits/rejected": -1.284158706665039, + "logps/chosen": -190.9915008544922, + "logps/rejected": -292.0174865722656, + "loss": 0.4549, "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -1.5572750568389893, - "rewards/margins": 0.9893418550491333, - "rewards/rejected": -2.546616792678833, + "rewards/chosen": -1.5870094299316406, + "rewards/margins": 0.9745893478393555, + "rewards/rejected": -2.561598539352417, "step": 390 }, { "epoch": 0.12582573136206354, - "grad_norm": 1.2100883722305298, + "grad_norm": 1.3056316375732422, "learning_rate": 4.989872358859716e-06, - "logits/chosen": -0.8574434518814087, - "logits/rejected": -1.0649316310882568, - "logps/chosen": -218.46463012695312, - "logps/rejected": -331.96588134765625, - "loss": 0.4477, + "logits/chosen": -0.7161710858345032, + "logits/rejected": -0.9287319183349609, + "logps/chosen": -227.08779907226562, + "logps/rejected": -343.8471374511719, + "loss": 0.4463, "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -1.8527570962905884, - "rewards/margins": 1.106737732887268, - "rewards/rejected": -2.9594950675964355, + "rewards/chosen": -1.938988447189331, + "rewards/margins": 1.1392673254013062, + "rewards/rejected": -3.078256130218506, "step": 400 }, { "epoch": 0.12582573136206354, - "eval_logits/chosen": -0.3494086265563965, - "eval_logits/rejected": -0.5023281574249268, - "eval_logps/chosen": -225.44276428222656, - "eval_logps/rejected": -344.3971862792969, - "eval_loss": 0.40257528424263, - "eval_rewards/accuracies": 0.7201492786407471, - "eval_rewards/chosen": -1.9267174005508423, - "eval_rewards/margins": 1.1582682132720947, - "eval_rewards/rejected": -3.0849857330322266, - "eval_runtime": 215.0471, - "eval_samples_per_second": 99.592, - "eval_steps_per_second": 1.558, + "eval_logits/chosen": -0.21009895205497742, + "eval_logits/rejected": -0.36726853251457214, + "eval_logps/chosen": -235.92169189453125, + "eval_logps/rejected": -358.0407409667969, + "eval_loss": 0.39779898524284363, + "eval_rewards/accuracies": 0.7190298438072205, + "eval_rewards/chosen": -2.0312490463256836, + "eval_rewards/margins": 1.1899985074996948, + "eval_rewards/rejected": -3.221247911453247, + "eval_runtime": 216.5347, + "eval_samples_per_second": 98.908, + "eval_steps_per_second": 1.547, "step": 400 }, { "epoch": 0.12897137464611513, - "grad_norm": 1.2617757320404053, + "grad_norm": 1.398200273513794, "learning_rate": 4.987253812226373e-06, - "logits/chosen": -0.9884117245674133, - "logits/rejected": -1.162603735923767, - "logps/chosen": -231.59756469726562, - "logps/rejected": -362.80120849609375, - "loss": 0.4239, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -1.992637276649475, - "rewards/margins": 1.289119005203247, - "rewards/rejected": -3.281756639480591, + "logits/chosen": -0.9023457765579224, + "logits/rejected": -1.0780017375946045, + "logps/chosen": -238.050048828125, + "logps/rejected": -369.2995300292969, + "loss": 0.4253, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -2.0568206310272217, + "rewards/margins": 1.2894903421401978, + "rewards/rejected": -3.346311092376709, "step": 410 }, { "epoch": 0.13211701793016672, - "grad_norm": 1.5599896907806396, + "grad_norm": 1.451423168182373, "learning_rate": 4.984335363310513e-06, - "logits/chosen": -0.8311988115310669, - "logits/rejected": -0.9811903238296509, - "logps/chosen": -213.430908203125, - "logps/rejected": -332.53253173828125, - "loss": 0.4375, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -1.8192352056503296, - "rewards/margins": 1.1662169694900513, - "rewards/rejected": -2.985452175140381, + "logits/chosen": -0.7545775771141052, + "logits/rejected": -0.9126666784286499, + "logps/chosen": -201.9867706298828, + "logps/rejected": -319.11279296875, + "loss": 0.4364, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.7043613195419312, + "rewards/margins": 1.1467878818511963, + "rewards/rejected": -2.851149082183838, "step": 420 }, { "epoch": 0.13526266121421832, - "grad_norm": 1.3952319622039795, + "grad_norm": 1.5594825744628906, "learning_rate": 4.9811173640060516e-06, - "logits/chosen": -0.8797961473464966, - "logits/rejected": -0.876280665397644, - "logps/chosen": -261.06622314453125, - "logps/rejected": -368.8479309082031, - "loss": 0.4203, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -2.2755675315856934, - "rewards/margins": 1.0598723888397217, - "rewards/rejected": -3.335440158843994, + "logits/chosen": -0.8629263043403625, + "logits/rejected": -0.8639553189277649, + "logps/chosen": -271.3802795410156, + "logps/rejected": -379.85430908203125, + "loss": 0.4211, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -2.378561019897461, + "rewards/margins": 1.0667835474014282, + "rewards/rejected": -3.4453444480895996, "step": 430 }, { "epoch": 0.1384083044982699, - "grad_norm": 1.354478359222412, + "grad_norm": 1.3122005462646484, "learning_rate": 4.977600202325396e-06, - "logits/chosen": -0.9967167973518372, - "logits/rejected": -1.0930196046829224, - "logps/chosen": -235.45474243164062, - "logps/rejected": -352.99542236328125, - "loss": 0.3928, + "logits/chosen": -1.0775721073150635, + "logits/rejected": -1.1867996454238892, + "logps/chosen": -253.1031036376953, + "logps/rejected": -368.8877868652344, + "loss": 0.3914, "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -2.014740467071533, - "rewards/margins": 1.1414014101028442, - "rewards/rejected": -3.156141757965088, + "rewards/chosen": -2.1909329891204834, + "rewards/margins": 1.1237446069717407, + "rewards/rejected": -3.3146774768829346, "step": 440 }, { "epoch": 0.14155394778232147, - "grad_norm": 1.211749792098999, + "grad_norm": 1.3066484928131104, "learning_rate": 4.973784302352654e-06, - "logits/chosen": -0.7338708639144897, - "logits/rejected": -0.8988674283027649, - "logps/chosen": -251.37356567382812, - "logps/rejected": -366.8519592285156, - "loss": 0.4014, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -2.1910181045532227, - "rewards/margins": 1.1455790996551514, - "rewards/rejected": -3.336597442626953, + "logits/chosen": -0.8042716979980469, + "logits/rejected": -0.9650095105171204, + "logps/chosen": -248.71920776367188, + "logps/rejected": -361.9927673339844, + "loss": 0.3995, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.164215564727783, + "rewards/margins": 1.123422622680664, + "rewards/rejected": -3.2876381874084473, "step": 450 }, { "epoch": 0.14469959106637306, - "grad_norm": 1.581602692604065, + "grad_norm": 1.8094184398651123, "learning_rate": 4.969670124192504e-06, - "logits/chosen": -0.4252908229827881, - "logits/rejected": -0.5512481927871704, - "logps/chosen": -241.5343780517578, - "logps/rejected": -368.2862243652344, - "loss": 0.3892, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -2.076319694519043, - "rewards/margins": 1.2542181015014648, - "rewards/rejected": -3.330537796020508, + "logits/chosen": -0.4726603627204895, + "logits/rejected": -0.5924113988876343, + "logps/chosen": -238.7507781982422, + "logps/rejected": -370.9023742675781, + "loss": 0.3829, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.0481951236724854, + "rewards/margins": 1.3083794116973877, + "rewards/rejected": -3.356574296951294, "step": 460 }, { "epoch": 0.14784523435042465, - "grad_norm": 1.7681437730789185, + "grad_norm": 1.7695521116256714, "learning_rate": 4.965258163914713e-06, - "logits/chosen": -0.5902543067932129, - "logits/rejected": -0.5962556600570679, - "logps/chosen": -286.5151062011719, - "logps/rejected": -404.4659729003906, - "loss": 0.383, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -2.528414249420166, - "rewards/margins": 1.1718274354934692, - "rewards/rejected": -3.7002415657043457, + "logits/chosen": -0.552959144115448, + "logits/rejected": -0.5529184341430664, + "logps/chosen": -275.89312744140625, + "logps/rejected": -398.515380859375, + "loss": 0.3778, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -2.421806812286377, + "rewards/margins": 1.2187397480010986, + "rewards/rejected": -3.6405467987060547, "step": 470 }, { "epoch": 0.15099087763447624, - "grad_norm": 1.6546568870544434, + "grad_norm": 1.5262759923934937, "learning_rate": 4.960548953494325e-06, - "logits/chosen": -0.7005417943000793, - "logits/rejected": -0.7231167554855347, - "logps/chosen": -279.2669372558594, - "logps/rejected": -405.3627014160156, - "loss": 0.3813, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -2.462043523788452, - "rewards/margins": 1.2482386827468872, - "rewards/rejected": -3.7102818489074707, + "logits/chosen": -0.6184431314468384, + "logits/rejected": -0.6488745212554932, + "logps/chosen": -279.20599365234375, + "logps/rejected": -406.8487548828125, + "loss": 0.3824, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.461134672164917, + "rewards/margins": 1.263789415359497, + "rewards/rejected": -3.724924087524414, "step": 480 }, { "epoch": 0.15413652091852784, - "grad_norm": 1.8185548782348633, + "grad_norm": 1.7929073572158813, "learning_rate": 4.9555430607475194e-06, - "logits/chosen": -0.3447544276714325, - "logits/rejected": -0.43131861090660095, - "logps/chosen": -268.96234130859375, - "logps/rejected": -403.81939697265625, - "loss": 0.3635, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -2.3665878772735596, - "rewards/margins": 1.3263323307037354, - "rewards/rejected": -3.692920684814453, + "logits/chosen": -0.2868812382221222, + "logits/rejected": -0.37936121225357056, + "logps/chosen": -271.26873779296875, + "logps/rejected": -406.4844665527344, + "loss": 0.3623, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.3893320560455322, + "rewards/margins": 1.3300443887710571, + "rewards/rejected": -3.7193763256073, "step": 490 }, { "epoch": 0.15728216420257943, - "grad_norm": 1.902738332748413, + "grad_norm": 2.1217312812805176, "learning_rate": 4.9502410892631426e-06, - "logits/chosen": -0.28083157539367676, - "logits/rejected": -0.32724082469940186, - "logps/chosen": -280.0065002441406, - "logps/rejected": -418.2118225097656, - "loss": 0.3583, + "logits/chosen": -0.17514923214912415, + "logits/rejected": -0.22318892180919647, + "logps/chosen": -285.77001953125, + "logps/rejected": -431.67822265625, + "loss": 0.3548, "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -2.4585459232330322, - "rewards/margins": 1.3503986597061157, - "rewards/rejected": -3.8089442253112793, + "rewards/chosen": -2.51606822013855, + "rewards/margins": 1.4269219636917114, + "rewards/rejected": -3.9429900646209717, "step": 500 }, { "epoch": 0.15728216420257943, - "eval_logits/chosen": 0.47167521715164185, - "eval_logits/rejected": 0.31242406368255615, - "eval_logps/chosen": -281.4605407714844, - "eval_logps/rejected": -449.56976318359375, - "eval_loss": 0.3062981069087982, - "eval_rewards/accuracies": 0.7645522356033325, - "eval_rewards/chosen": -2.4868950843811035, - "eval_rewards/margins": 1.6498165130615234, - "eval_rewards/rejected": -4.136711597442627, - "eval_runtime": 215.1599, - "eval_samples_per_second": 99.54, - "eval_steps_per_second": 1.557, + "eval_logits/chosen": 0.6032934188842773, + "eval_logits/rejected": 0.4417119324207306, + "eval_logps/chosen": -284.2136535644531, + "eval_logps/rejected": -451.9689025878906, + "eval_loss": 0.3047660291194916, + "eval_rewards/accuracies": 0.7697761058807373, + "eval_rewards/chosen": -2.514169216156006, + "eval_rewards/margins": 1.6463606357574463, + "eval_rewards/rejected": -4.160529613494873, + "eval_runtime": 216.4531, + "eval_samples_per_second": 98.945, + "eval_steps_per_second": 1.548, "step": 500 }, { "epoch": 0.16042780748663102, - "grad_norm": 2.6762752532958984, + "grad_norm": 2.802795648574829, "learning_rate": 4.9446436783299315e-06, - "logits/chosen": -0.21420426666736603, - "logits/rejected": -0.31396135687828064, - "logps/chosen": -295.9054870605469, - "logps/rejected": -435.9479064941406, - "loss": 0.3801, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -2.6259713172912598, - "rewards/margins": 1.3752977848052979, - "rewards/rejected": -4.001269340515137, + "logits/chosen": -0.009161519818007946, + "logits/rejected": -0.1047726422548294, + "logps/chosen": -297.81243896484375, + "logps/rejected": -442.32080078125, + "loss": 0.3762, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.6446642875671387, + "rewards/margins": 1.4199548959732056, + "rewards/rejected": -4.064619064331055, "step": 510 }, { "epoch": 0.1635734507706826, - "grad_norm": 1.6177635192871094, + "grad_norm": 1.8657071590423584, "learning_rate": 4.938751502859433e-06, - "logits/chosen": -0.3818402886390686, - "logits/rejected": -0.4854033589363098, - "logps/chosen": -307.38507080078125, - "logps/rejected": -450.1014709472656, - "loss": 0.3959, - "rewards/accuracies": 0.71875, - "rewards/chosen": -2.744161367416382, - "rewards/margins": 1.3961646556854248, - "rewards/rejected": -4.140326499938965, + "logits/chosen": -0.21130414307117462, + "logits/rejected": -0.31424680352211, + "logps/chosen": -301.1555480957031, + "logps/rejected": -447.023193359375, + "loss": 0.3961, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -2.681652069091797, + "rewards/margins": 1.4277719259262085, + "rewards/rejected": -4.109424114227295, "step": 520 }, { "epoch": 0.1667190940547342, - "grad_norm": 1.5721980333328247, + "grad_norm": 1.8475096225738525, "learning_rate": 4.932565273304623e-06, - "logits/chosen": -0.30099183320999146, - "logits/rejected": -0.31828540563583374, - "logps/chosen": -302.99053955078125, - "logps/rejected": -423.03729248046875, - "loss": 0.3326, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -2.6925716400146484, - "rewards/margins": 1.1963403224945068, - "rewards/rejected": -3.888911724090576, + "logits/chosen": -0.23862802982330322, + "logits/rejected": -0.27317532896995544, + "logps/chosen": -288.6937561035156, + "logps/rejected": -411.35443115234375, + "loss": 0.3354, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.549204111099243, + "rewards/margins": 1.2228602170944214, + "rewards/rejected": -3.772064685821533, "step": 530 }, { "epoch": 0.1698647373387858, - "grad_norm": 2.8931405544281006, + "grad_norm": 2.7880358695983887, "learning_rate": 4.926085735574244e-06, - "logits/chosen": -0.07565931975841522, - "logits/rejected": -0.26495999097824097, - "logps/chosen": -333.55584716796875, - "logps/rejected": -520.9510498046875, - "loss": 0.3571, + "logits/chosen": -0.12997403740882874, + "logits/rejected": -0.3530030846595764, + "logps/chosen": -315.6742248535156, + "logps/rejected": -505.62261962890625, + "loss": 0.3532, "rewards/accuracies": 0.768750011920929, - "rewards/chosen": -2.99123215675354, - "rewards/margins": 1.855472207069397, - "rewards/rejected": -4.846704483032227, + "rewards/chosen": -2.8122756481170654, + "rewards/margins": 1.881052017211914, + "rewards/rejected": -4.693327903747559, "step": 540 }, { "epoch": 0.17301038062283736, - "grad_norm": 2.259403705596924, + "grad_norm": 2.3125181198120117, "learning_rate": 4.9193136709428666e-06, - "logits/chosen": -0.05604839324951172, - "logits/rejected": -0.09852688759565353, - "logps/chosen": -319.73895263671875, - "logps/rejected": -462.59490966796875, - "loss": 0.3559, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -2.861781597137451, - "rewards/margins": 1.3996176719665527, - "rewards/rejected": -4.261399269104004, + "logits/chosen": 0.041652340441942215, + "logits/rejected": -0.007311803288757801, + "logps/chosen": -301.4400634765625, + "logps/rejected": -451.60888671875, + "loss": 0.3412, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.6784582138061523, + "rewards/margins": 1.4726378917694092, + "rewards/rejected": -4.151095867156982, "step": 550 }, { "epoch": 0.17615602390688895, - "grad_norm": 2.100226640701294, + "grad_norm": 2.2698020935058594, "learning_rate": 4.912249895956687e-06, - "logits/chosen": 0.10179214179515839, - "logits/rejected": -0.02760564163327217, - "logps/chosen": -284.9942626953125, - "logps/rejected": -476.2345275878906, - "loss": 0.3428, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": -2.523716449737549, - "rewards/margins": 1.8726260662078857, - "rewards/rejected": -4.3963422775268555, + "logits/chosen": 0.3050948977470398, + "logits/rejected": 0.16047334671020508, + "logps/chosen": -294.1412353515625, + "logps/rejected": -494.5099182128906, + "loss": 0.3382, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -2.614988327026367, + "rewards/margins": 1.9638454914093018, + "rewards/rejected": -4.57883358001709, "step": 560 }, { "epoch": 0.17930166719094054, - "grad_norm": 2.5998682975769043, + "grad_norm": 2.4010682106018066, "learning_rate": 4.904895262335072e-06, - "logits/chosen": 0.22700171172618866, - "logits/rejected": 0.09037125110626221, - "logps/chosen": -315.6595764160156, - "logps/rejected": -516.5098876953125, - "loss": 0.3242, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": -2.8289144039154053, - "rewards/margins": 1.9916881322860718, - "rewards/rejected": -4.8206024169921875, + "logits/chosen": 0.34259656071662903, + "logits/rejected": 0.16924947500228882, + "logps/chosen": -317.7947692871094, + "logps/rejected": -521.7095336914062, + "loss": 0.3234, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.8498454093933105, + "rewards/margins": 2.0226728916168213, + "rewards/rejected": -4.872518062591553, "step": 570 }, { "epoch": 0.18244731047499213, - "grad_norm": 2.4419095516204834, + "grad_norm": 2.444711208343506, "learning_rate": 4.897250656867863e-06, - "logits/chosen": 0.32371488213539124, - "logits/rejected": 0.17300409078598022, - "logps/chosen": -358.229248046875, - "logps/rejected": -524.1956176757812, - "loss": 0.3043, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -3.2328643798828125, - "rewards/margins": 1.6491279602050781, - "rewards/rejected": -4.881992340087891, + "logits/chosen": 0.4256654381752014, + "logits/rejected": 0.26078343391418457, + "logps/chosen": -328.4891052246094, + "logps/rejected": -486.4072265625, + "loss": 0.3038, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -2.9353702068328857, + "rewards/margins": 1.5686007738113403, + "rewards/rejected": -4.503971576690674, "step": 580 }, { "epoch": 0.18559295375904372, - "grad_norm": 2.5324759483337402, + "grad_norm": 2.8254048824310303, "learning_rate": 4.889317001308447e-06, - "logits/chosen": 0.3359132707118988, - "logits/rejected": 0.19779124855995178, - "logps/chosen": -359.0341491699219, - "logps/rejected": -536.9963989257812, - "loss": 0.3334, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -3.257742404937744, - "rewards/margins": 1.7748934030532837, - "rewards/rejected": -5.032635688781738, + "logits/chosen": 0.36937472224235535, + "logits/rejected": 0.22195684909820557, + "logps/chosen": -374.08203125, + "logps/rejected": -554.9903564453125, + "loss": 0.3311, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.4078540802001953, + "rewards/margins": 1.8045555353164673, + "rewards/rejected": -5.212409496307373, "step": 590 }, { "epoch": 0.1887385970430953, - "grad_norm": 2.23077392578125, + "grad_norm": 2.11535382270813, "learning_rate": 4.881095252262619e-06, - "logits/chosen": 0.21948488056659698, - "logits/rejected": 0.23094649612903595, - "logps/chosen": -353.52838134765625, - "logps/rejected": -525.2556762695312, - "loss": 0.3041, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -3.2166409492492676, - "rewards/margins": 1.6981933116912842, - "rewards/rejected": -4.914834499359131, + "logits/chosen": 0.2527967095375061, + "logits/rejected": 0.26717156171798706, + "logps/chosen": -343.63336181640625, + "logps/rejected": -514.2552490234375, + "loss": 0.3014, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -3.1175334453582764, + "rewards/margins": 1.6867754459381104, + "rewards/rejected": -4.804308891296387, "step": 600 }, { "epoch": 0.1887385970430953, - "eval_logits/chosen": 1.1113409996032715, - "eval_logits/rejected": 0.964361846446991, - "eval_logps/chosen": -323.46649169921875, - "eval_logps/rejected": -533.2188720703125, - "eval_loss": 0.24049775302410126, - "eval_rewards/accuracies": 0.7917910218238831, - "eval_rewards/chosen": -2.906954765319824, - "eval_rewards/margins": 2.0662477016448975, - "eval_rewards/rejected": -4.973201751708984, - "eval_runtime": 215.0565, - "eval_samples_per_second": 99.588, - "eval_steps_per_second": 1.558, + "eval_logits/chosen": 1.1670206785202026, + "eval_logits/rejected": 1.002642273902893, + "eval_logps/chosen": -309.413818359375, + "eval_logps/rejected": -516.2450561523438, + "eval_loss": 0.2395239621400833, + "eval_rewards/accuracies": 0.7962686419487, + "eval_rewards/chosen": -2.7661707401275635, + "eval_rewards/margins": 2.0371201038360596, + "eval_rewards/rejected": -4.803291320800781, + "eval_runtime": 216.5737, + "eval_samples_per_second": 98.89, + "eval_steps_per_second": 1.547, "step": 600 }, { "epoch": 0.1918842403271469, - "grad_norm": 1.878097653388977, + "grad_norm": 1.8349930047988892, "learning_rate": 4.872586401073238e-06, - "logits/chosen": 0.34480124711990356, - "logits/rejected": 0.36164969205856323, - "logps/chosen": -299.5352478027344, - "logps/rejected": -532.43701171875, - "loss": 0.3015, - "rewards/accuracies": 0.8187500238418579, - "rewards/chosen": -2.655369520187378, - "rewards/margins": 2.305147409439087, - "rewards/rejected": -4.960516452789307, + "logits/chosen": 0.5030576586723328, + "logits/rejected": 0.5036323070526123, + "logps/chosen": -293.0291442871094, + "logps/rejected": -525.871826171875, + "loss": 0.302, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.590064525604248, + "rewards/margins": 2.304649829864502, + "rewards/rejected": -4.89471435546875, "step": 610 }, { "epoch": 0.1950298836111985, - "grad_norm": 2.0585927963256836, + "grad_norm": 2.018960475921631, "learning_rate": 4.863791473700695e-06, - "logits/chosen": 0.30810683965682983, - "logits/rejected": 0.3117186427116394, - "logps/chosen": -315.50518798828125, - "logps/rejected": -502.65057373046875, - "loss": 0.3237, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -2.825150966644287, - "rewards/margins": 1.8449840545654297, - "rewards/rejected": -4.670135021209717, + "logits/chosen": 0.6549355983734131, + "logits/rejected": 0.6538249254226685, + "logps/chosen": -299.74774169921875, + "logps/rejected": -482.9994201660156, + "loss": 0.3224, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.6672844886779785, + "rewards/margins": 1.806138277053833, + "rewards/rejected": -4.473422050476074, "step": 620 }, { "epoch": 0.1981755268952501, - "grad_norm": 2.595250129699707, + "grad_norm": 2.5601086616516113, "learning_rate": 4.854711530599207e-06, - "logits/chosen": 0.17247377336025238, - "logits/rejected": 0.185434028506279, - "logps/chosen": -350.2391662597656, - "logps/rejected": -565.4362182617188, - "loss": 0.3099, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -3.168469190597534, - "rewards/margins": 2.1261777877807617, - "rewards/rejected": -5.294647216796875, + "logits/chosen": 0.4226033091545105, + "logits/rejected": 0.4217701852321625, + "logps/chosen": -349.31597900390625, + "logps/rejected": -569.6148681640625, + "loss": 0.3058, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -3.159015417098999, + "rewards/margins": 2.176971435546875, + "rewards/rejected": -5.335987091064453, "step": 630 }, { "epoch": 0.20132117017930168, - "grad_norm": 2.5605061054229736, + "grad_norm": 2.710289478302002, "learning_rate": 4.845347666588952e-06, - "logits/chosen": 0.36663442850112915, - "logits/rejected": 0.3369132876396179, - "logps/chosen": -355.25396728515625, - "logps/rejected": -588.9056396484375, - "loss": 0.3052, - "rewards/accuracies": 0.793749988079071, - "rewards/chosen": -3.2272791862487793, - "rewards/margins": 2.284496545791626, - "rewards/rejected": -5.511775970458984, + "logits/chosen": 0.6457022428512573, + "logits/rejected": 0.5944028496742249, + "logps/chosen": -350.6747131347656, + "logps/rejected": -578.0604858398438, + "loss": 0.2957, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.1813666820526123, + "rewards/margins": 2.2217352390289307, + "rewards/rejected": -5.403101921081543, "step": 640 }, { "epoch": 0.20446681346335324, - "grad_norm": 2.2802627086639404, + "grad_norm": 2.493267059326172, "learning_rate": 4.835701010724061e-06, - "logits/chosen": 0.32938310503959656, - "logits/rejected": 0.1386842280626297, - "logps/chosen": -336.86077880859375, - "logps/rejected": -585.5232543945312, - "loss": 0.2897, - "rewards/accuracies": 0.78125, - "rewards/chosen": -3.0445141792297363, - "rewards/margins": 2.4435367584228516, - "rewards/rejected": -5.488050937652588, + "logits/chosen": 0.6264899373054504, + "logits/rejected": 0.43023762106895447, + "logps/chosen": -328.0311279296875, + "logps/rejected": -580.6969604492188, + "loss": 0.287, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.9558825492858887, + "rewards/margins": 2.48372483253479, + "rewards/rejected": -5.4396071434021, "step": 650 }, { "epoch": 0.20761245674740483, - "grad_norm": 2.4260733127593994, + "grad_norm": 2.2407076358795166, "learning_rate": 4.825772726156479e-06, - "logits/chosen": 0.5395032167434692, - "logits/rejected": 0.4053524434566498, - "logps/chosen": -383.67041015625, - "logps/rejected": -576.008056640625, - "loss": 0.2512, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -3.505573272705078, - "rewards/margins": 1.9211244583129883, - "rewards/rejected": -5.426698207855225, + "logits/chosen": 0.6440389752388, + "logits/rejected": 0.4974190294742584, + "logps/chosen": -372.54498291015625, + "logps/rejected": -562.779541015625, + "loss": 0.2556, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -3.39410662651062, + "rewards/margins": 1.8999735116958618, + "rewards/rejected": -5.294079780578613, "step": 660 }, { "epoch": 0.21075810003145642, - "grad_norm": 2.286827325820923, + "grad_norm": 2.0860157012939453, "learning_rate": 4.8155640099957206e-06, - "logits/chosen": 0.4676589071750641, - "logits/rejected": 0.3794856071472168, - "logps/chosen": -355.155517578125, - "logps/rejected": -585.6317138671875, - "loss": 0.2664, + "logits/chosen": 0.5455148816108704, + "logits/rejected": 0.4411854147911072, + "logps/chosen": -334.1847229003906, + "logps/rejected": -556.5390625, + "loss": 0.265, "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -3.2314720153808594, - "rewards/margins": 2.29856538772583, - "rewards/rejected": -5.5300374031066895, + "rewards/chosen": -3.0214390754699707, + "rewards/margins": 2.2172813415527344, + "rewards/rejected": -5.238720893859863, "step": 670 }, { "epoch": 0.21390374331550802, - "grad_norm": 2.673321008682251, + "grad_norm": 2.7552173137664795, "learning_rate": 4.805076093164527e-06, - "logits/chosen": 0.4926396310329437, - "logits/rejected": 0.39895009994506836, - "logps/chosen": -373.0772705078125, - "logps/rejected": -613.6629638671875, - "loss": 0.2543, - "rewards/accuracies": 0.8062499761581421, - "rewards/chosen": -3.402231216430664, - "rewards/margins": 2.3745970726013184, - "rewards/rejected": -5.776828289031982, + "logits/chosen": 0.6582053899765015, + "logits/rejected": 0.5497349500656128, + "logps/chosen": -365.2132873535156, + "logps/rejected": -603.5557250976562, + "loss": 0.2555, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -3.3236758708953857, + "rewards/margins": 2.3519177436828613, + "rewards/rejected": -5.675593376159668, "step": 680 }, { "epoch": 0.2170493865995596, - "grad_norm": 2.462273359298706, + "grad_norm": 2.692828893661499, "learning_rate": 4.794310240250444e-06, - "logits/chosen": 0.47542086243629456, - "logits/rejected": 0.526648223400116, - "logps/chosen": -397.76947021484375, - "logps/rejected": -614.8250732421875, - "loss": 0.2645, - "rewards/accuracies": 0.793749988079071, - "rewards/chosen": -3.626183271408081, - "rewards/margins": 2.162733554840088, - "rewards/rejected": -5.78891658782959, + "logits/chosen": 0.671535849571228, + "logits/rejected": 0.7082802653312683, + "logps/chosen": -412.1849060058594, + "logps/rejected": -631.167724609375, + "loss": 0.2636, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -3.7698707580566406, + "rewards/margins": 2.1822402477264404, + "rewards/rejected": -5.95211124420166, "step": 690 }, { "epoch": 0.2201950298836112, - "grad_norm": 3.2976531982421875, + "grad_norm": 3.005434274673462, "learning_rate": 4.783267749353346e-06, - "logits/chosen": 0.8178389668464661, - "logits/rejected": 0.6482642889022827, - "logps/chosen": -347.7313232421875, - "logps/rejected": -585.7216186523438, - "loss": 0.2487, - "rewards/accuracies": 0.793749988079071, - "rewards/chosen": -3.1436398029327393, - "rewards/margins": 2.362039089202881, - "rewards/rejected": -5.505678653717041, + "logits/chosen": 1.0438460111618042, + "logits/rejected": 0.8579059839248657, + "logps/chosen": -337.77716064453125, + "logps/rejected": -573.6937255859375, + "loss": 0.25, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -3.0438902378082275, + "rewards/margins": 2.341344118118286, + "rewards/rejected": -5.385234355926514, "step": 700 }, { "epoch": 0.2201950298836112, - "eval_logits/chosen": 1.2932829856872559, - "eval_logits/rejected": 1.1342921257019043, - "eval_logps/chosen": -373.9985046386719, - "eval_logps/rejected": -617.6231079101562, - "eval_loss": 0.1963878720998764, - "eval_rewards/accuracies": 0.8208954930305481, - "eval_rewards/chosen": -3.4122743606567383, - "eval_rewards/margins": 2.4049696922302246, - "eval_rewards/rejected": -5.817244529724121, - "eval_runtime": 214.689, - "eval_samples_per_second": 99.758, - "eval_steps_per_second": 1.56, + "eval_logits/chosen": 1.505146861076355, + "eval_logits/rejected": 1.3420602083206177, + "eval_logps/chosen": -343.1828308105469, + "eval_logps/rejected": -577.8538208007812, + "eval_loss": 0.19892141222953796, + "eval_rewards/accuracies": 0.8235074877738953, + "eval_rewards/chosen": -3.103861093521118, + "eval_rewards/margins": 2.3155174255371094, + "eval_rewards/rejected": -5.41937780380249, + "eval_runtime": 216.5153, + "eval_samples_per_second": 98.917, + "eval_steps_per_second": 1.547, "step": 700 }, { "epoch": 0.2233406731676628, - "grad_norm": 2.3596227169036865, + "grad_norm": 2.0825021266937256, "learning_rate": 4.771949951928918e-06, - "logits/chosen": 0.4875836968421936, - "logits/rejected": 0.3758041262626648, - "logps/chosen": -386.7015380859375, - "logps/rejected": -659.1187744140625, - "loss": 0.2576, - "rewards/accuracies": 0.8062499761581421, - "rewards/chosen": -3.5298709869384766, - "rewards/margins": 2.6897921562194824, - "rewards/rejected": -6.219663619995117, + "logits/chosen": 0.7122886776924133, + "logits/rejected": 0.5764984488487244, + "logps/chosen": -368.8441162109375, + "logps/rejected": -645.1354370117188, + "loss": 0.2539, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -3.350966215133667, + "rewards/margins": 2.7285890579223633, + "rewards/rejected": -6.079554557800293, "step": 710 }, { "epoch": 0.22648631645171438, - "grad_norm": 2.243406295776367, + "grad_norm": 2.4290201663970947, "learning_rate": 4.76035821262811e-06, - "logits/chosen": 0.7452244162559509, - "logits/rejected": 0.5989497900009155, - "logps/chosen": -355.85284423828125, - "logps/rejected": -639.9850463867188, - "loss": 0.2265, - "rewards/accuracies": 0.831250011920929, - "rewards/chosen": -3.235308885574341, - "rewards/margins": 2.7962398529052734, - "rewards/rejected": -6.031548976898193, + "logits/chosen": 0.9211057424545288, + "logits/rejected": 0.761319637298584, + "logps/chosen": -340.1125183105469, + "logps/rejected": -624.1261596679688, + "loss": 0.2178, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -3.0774872303009033, + "rewards/margins": 2.7953667640686035, + "rewards/rejected": -5.872854232788086, "step": 720 }, { "epoch": 0.22963195973576597, - "grad_norm": 3.3983983993530273, + "grad_norm": 2.927579879760742, "learning_rate": 4.748493929132599e-06, - "logits/chosen": 0.8924552798271179, - "logits/rejected": 0.6032952070236206, - "logps/chosen": -359.5245056152344, - "logps/rejected": -623.7483520507812, - "loss": 0.268, - "rewards/accuracies": 0.824999988079071, - "rewards/chosen": -3.2640464305877686, - "rewards/margins": 2.6267881393432617, - "rewards/rejected": -5.890834331512451, + "logits/chosen": 0.9934293031692505, + "logits/rejected": 0.7011948227882385, + "logps/chosen": -343.99798583984375, + "logps/rejected": -611.0202026367188, + "loss": 0.2574, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.108729839324951, + "rewards/margins": 2.6548655033111572, + "rewards/rejected": -5.763595104217529, "step": 730 }, { "epoch": 0.23277760301981756, - "grad_norm": 2.4341931343078613, + "grad_norm": 2.413174629211426, "learning_rate": 4.7363585319862535e-06, - "logits/chosen": 0.8101499676704407, - "logits/rejected": 0.7433942556381226, - "logps/chosen": -332.5082702636719, - "logps/rejected": -583.41650390625, - "loss": 0.2235, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -2.9981846809387207, - "rewards/margins": 2.4985713958740234, - "rewards/rejected": -5.496755599975586, + "logits/chosen": 0.8890258073806763, + "logits/rejected": 0.8161128759384155, + "logps/chosen": -297.27734375, + "logps/rejected": -534.4481201171875, + "loss": 0.2185, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.6455435752868652, + "rewards/margins": 2.361142158508301, + "rewards/rejected": -5.006685256958008, "step": 740 }, { "epoch": 0.23592324630386913, - "grad_norm": 1.8014791011810303, + "grad_norm": 1.7433052062988281, "learning_rate": 4.7239534844226595e-06, - "logits/chosen": 0.9954848289489746, - "logits/rejected": 0.9288710355758667, - "logps/chosen": -386.96783447265625, - "logps/rejected": -663.5922241210938, - "loss": 0.2359, - "rewards/accuracies": 0.84375, - "rewards/chosen": -3.52067232131958, - "rewards/margins": 2.7473063468933105, - "rewards/rejected": -6.267977714538574, + "logits/chosen": 1.1142632961273193, + "logits/rejected": 1.041259527206421, + "logps/chosen": -389.9774169921875, + "logps/rejected": -677.3299560546875, + "loss": 0.2308, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -3.550323009490967, + "rewards/margins": 2.8546836376190186, + "rewards/rejected": -6.405006408691406, "step": 750 }, { "epoch": 0.23906888958792072, - "grad_norm": 2.4365100860595703, + "grad_norm": 2.963017225265503, "learning_rate": 4.711280282188674e-06, - "logits/chosen": 1.1280491352081299, - "logits/rejected": 0.9587424397468567, - "logps/chosen": -416.06427001953125, - "logps/rejected": -652.4948120117188, - "loss": 0.245, - "rewards/accuracies": 0.78125, - "rewards/chosen": -3.8194878101348877, - "rewards/margins": 2.3404040336608887, - "rewards/rejected": -6.159891128540039, + "logits/chosen": 1.2721364498138428, + "logits/rejected": 1.072861909866333, + "logps/chosen": -407.93896484375, + "logps/rejected": -637.9373779296875, + "loss": 0.2364, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -3.7379825115203857, + "rewards/margins": 2.2759547233581543, + "rewards/rejected": -6.013937950134277, "step": 760 }, { "epoch": 0.2422145328719723, - "grad_norm": 2.6265649795532227, + "grad_norm": 2.531205654144287, "learning_rate": 4.698340453364087e-06, - "logits/chosen": 1.0067460536956787, - "logits/rejected": 0.7757904529571533, - "logps/chosen": -403.1853942871094, - "logps/rejected": -655.3627319335938, - "loss": 0.2227, - "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": -3.702298641204834, - "rewards/margins": 2.4899773597717285, - "rewards/rejected": -6.192275524139404, + "logits/chosen": 1.2041015625, + "logits/rejected": 0.9627124667167664, + "logps/chosen": -381.7981262207031, + "logps/rejected": -627.6697998046875, + "loss": 0.225, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -3.4880454540252686, + "rewards/margins": 2.4269821643829346, + "rewards/rejected": -5.915027618408203, "step": 770 }, { "epoch": 0.2453601761560239, - "grad_norm": 3.4549660682678223, + "grad_norm": 2.7467708587646484, "learning_rate": 4.685135558177361e-06, - "logits/chosen": 0.8524907827377319, - "logits/rejected": 0.7574479579925537, - "logps/chosen": -394.0924987792969, - "logps/rejected": -652.1280517578125, - "loss": 0.2464, - "rewards/accuracies": 0.856249988079071, - "rewards/chosen": -3.604322910308838, - "rewards/margins": 2.561311721801758, - "rewards/rejected": -6.165635108947754, + "logits/chosen": 1.0979419946670532, + "logits/rejected": 0.9988969564437866, + "logps/chosen": -381.86737060546875, + "logps/rejected": -638.2721557617188, + "loss": 0.2466, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -3.4817283153533936, + "rewards/margins": 2.545217990875244, + "rewards/rejected": -6.026947021484375, "step": 780 }, { "epoch": 0.2485058194400755, - "grad_norm": 2.0555849075317383, + "grad_norm": 2.4694933891296387, "learning_rate": 4.671667188817516e-06, - "logits/chosen": 0.7523924112319946, - "logits/rejected": 0.791477382183075, - "logps/chosen": -409.0593566894531, - "logps/rejected": -673.9398193359375, - "loss": 0.2144, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": -3.750147581100464, - "rewards/margins": 2.626615047454834, - "rewards/rejected": -6.376762866973877, + "logits/chosen": 0.9596866369247437, + "logits/rejected": 0.988632321357727, + "logps/chosen": -394.10980224609375, + "logps/rejected": -659.1544799804688, + "loss": 0.2089, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -3.600590944290161, + "rewards/margins": 2.62831711769104, + "rewards/rejected": -6.228908538818359, "step": 790 }, { "epoch": 0.2516514627241271, - "grad_norm": 3.2837741374969482, + "grad_norm": 3.156921863555908, "learning_rate": 4.657936969242146e-06, - "logits/chosen": 0.8540847897529602, - "logits/rejected": 0.8441425561904907, - "logps/chosen": -394.26251220703125, - "logps/rejected": -657.2059326171875, - "loss": 0.218, - "rewards/accuracies": 0.824999988079071, - "rewards/chosen": -3.610989809036255, - "rewards/margins": 2.6211609840393066, - "rewards/rejected": -6.232151031494141, + "logits/chosen": 1.0584884881973267, + "logits/rejected": 1.056359052658081, + "logps/chosen": -367.99749755859375, + "logps/rejected": -622.4884033203125, + "loss": 0.2163, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -3.3479487895965576, + "rewards/margins": 2.537045955657959, + "rewards/rejected": -5.8849945068359375, "step": 800 }, { "epoch": 0.2516514627241271, - "eval_logits/chosen": 1.7290374040603638, - "eval_logits/rejected": 1.5710214376449585, - "eval_logps/chosen": -400.4794921875, - "eval_logps/rejected": -698.409423828125, - "eval_loss": 0.15468811988830566, - "eval_rewards/accuracies": 0.8335821032524109, - "eval_rewards/chosen": -3.67708420753479, - "eval_rewards/margins": 2.9480228424072266, - "eval_rewards/rejected": -6.625107288360596, - "eval_runtime": 215.1039, - "eval_samples_per_second": 99.566, - "eval_steps_per_second": 1.557, + "eval_logits/chosen": 1.969736933708191, + "eval_logits/rejected": 1.8083586692810059, + "eval_logps/chosen": -378.15106201171875, + "eval_logps/rejected": -674.7254638671875, + "eval_loss": 0.1564020812511444, + "eval_rewards/accuracies": 0.8369402885437012, + "eval_rewards/chosen": -3.453542947769165, + "eval_rewards/margins": 2.9345521926879883, + "eval_rewards/rejected": -6.388094902038574, + "eval_runtime": 216.5645, + "eval_samples_per_second": 98.894, + "eval_steps_per_second": 1.547, "step": 800 }, { "epoch": 0.2547971060081787, - "grad_norm": 2.1503520011901855, + "grad_norm": 2.260498523712158, "learning_rate": 4.643946554981607e-06, - "logits/chosen": 1.1990272998809814, - "logits/rejected": 1.1896309852600098, - "logps/chosen": -422.6075744628906, - "logps/rejected": -712.6632080078125, - "loss": 0.2211, + "logits/chosen": 1.3992656469345093, + "logits/rejected": 1.3807123899459839, + "logps/chosen": -410.2210388183594, + "logps/rejected": -693.8885498046875, + "loss": 0.2184, "rewards/accuracies": 0.831250011920929, - "rewards/chosen": -3.8861594200134277, - "rewards/margins": 2.8886961936950684, - "rewards/rejected": -6.774855613708496, + "rewards/chosen": -3.762162685394287, + "rewards/margins": 2.8245062828063965, + "rewards/rejected": -6.586669921875, "step": 810 }, { "epoch": 0.25794274929223027, - "grad_norm": 2.367893934249878, + "grad_norm": 2.421734571456909, "learning_rate": 4.629697632939402e-06, - "logits/chosen": 0.8573128581047058, - "logits/rejected": 0.8127277493476868, - "logps/chosen": -448.3675231933594, - "logps/rejected": -742.9185180664062, - "loss": 0.2021, - "rewards/accuracies": 0.8374999761581421, - "rewards/chosen": -4.144953727722168, - "rewards/margins": 2.9193053245544434, - "rewards/rejected": -7.064258575439453, + "logits/chosen": 0.9631272554397583, + "logits/rejected": 0.9137741923332214, + "logps/chosen": -403.3345642089844, + "logps/rejected": -690.8912963867188, + "loss": 0.1978, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -3.6943416595458984, + "rewards/margins": 2.8497314453125, + "rewards/rejected": -6.544073581695557, "step": 820 }, { "epoch": 0.26108839257628186, - "grad_norm": 2.6760129928588867, + "grad_norm": 2.466891050338745, "learning_rate": 4.615191921188782e-06, - "logits/chosen": 1.047659993171692, - "logits/rejected": 0.8862239122390747, - "logps/chosen": -430.19622802734375, - "logps/rejected": -718.34033203125, - "loss": 0.1965, - "rewards/accuracies": 0.831250011920929, - "rewards/chosen": -3.9595940113067627, - "rewards/margins": 2.861283779144287, - "rewards/rejected": -6.8208770751953125, + "logits/chosen": 1.216644287109375, + "logits/rejected": 1.0390194654464722, + "logps/chosen": -462.14080810546875, + "logps/rejected": -755.9129028320312, + "loss": 0.1893, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -4.278713226318359, + "rewards/margins": 2.9179956912994385, + "rewards/rejected": -7.196709632873535, "step": 830 }, { "epoch": 0.26423403586033345, - "grad_norm": 3.5069732666015625, + "grad_norm": 2.796055793762207, "learning_rate": 4.600431168765588e-06, - "logits/chosen": 1.041211724281311, - "logits/rejected": 1.060903549194336, - "logps/chosen": -402.40496826171875, - "logps/rejected": -657.80859375, - "loss": 0.2057, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": -3.7000668048858643, - "rewards/margins": 2.5350348949432373, - "rewards/rejected": -6.235101222991943, + "logits/chosen": 1.23114013671875, + "logits/rejected": 1.243263840675354, + "logps/chosen": -384.63330078125, + "logps/rejected": -629.2211303710938, + "loss": 0.2038, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.522122859954834, + "rewards/margins": 2.426643133163452, + "rewards/rejected": -5.948765754699707, "step": 840 }, { "epoch": 0.26737967914438504, - "grad_norm": 2.710855484008789, + "grad_norm": 3.352532148361206, "learning_rate": 4.58541715545736e-06, - "logits/chosen": 1.0758662223815918, - "logits/rejected": 1.0488075017929077, - "logps/chosen": -438.24676513671875, - "logps/rejected": -723.1414184570312, - "loss": 0.213, - "rewards/accuracies": 0.8125, - "rewards/chosen": -4.031523704528809, - "rewards/margins": 2.824887752532959, - "rewards/rejected": -6.856411933898926, + "logits/chosen": 1.1673449277877808, + "logits/rejected": 1.1369606256484985, + "logps/chosen": -424.13214111328125, + "logps/rejected": -703.1793212890625, + "loss": 0.2075, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -3.89024019241333, + "rewards/margins": 2.7663023471832275, + "rewards/rejected": -6.656541347503662, "step": 850 }, { "epoch": 0.27052532242843663, - "grad_norm": 2.7011239528656006, + "grad_norm": 2.815277338027954, "learning_rate": 4.570151691588739e-06, - "logits/chosen": 1.2816145420074463, - "logits/rejected": 1.1929352283477783, - "logps/chosen": -421.4190368652344, - "logps/rejected": -684.8802490234375, - "loss": 0.2145, + "logits/chosen": 1.4445253610610962, + "logits/rejected": 1.3472189903259277, + "logps/chosen": -411.64410400390625, + "logps/rejected": -683.1400756835938, + "loss": 0.2071, "rewards/accuracies": 0.831250011920929, - "rewards/chosen": -3.8643555641174316, - "rewards/margins": 2.6349289417266846, - "rewards/rejected": -6.499284267425537, + "rewards/chosen": -3.766723155975342, + "rewards/margins": 2.7148866653442383, + "rewards/rejected": -6.481610298156738, "step": 860 }, { "epoch": 0.2736709657124882, - "grad_norm": 2.7440969944000244, + "grad_norm": 2.5210723876953125, "learning_rate": 4.554636617803182e-06, - "logits/chosen": 1.342369556427002, - "logits/rejected": 1.008675217628479, - "logps/chosen": -391.8333435058594, - "logps/rejected": -700.033203125, - "loss": 0.1982, - "rewards/accuracies": 0.8187500238418579, - "rewards/chosen": -3.576009750366211, - "rewards/margins": 3.0503413677215576, - "rewards/rejected": -6.626351833343506, + "logits/chosen": 1.489781141281128, + "logits/rejected": 1.1546220779418945, + "logps/chosen": -364.3757019042969, + "logps/rejected": -662.89990234375, + "loss": 0.1936, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.301492691040039, + "rewards/margins": 2.953282117843628, + "rewards/rejected": -6.254774570465088, "step": 870 }, { "epoch": 0.2768166089965398, - "grad_norm": 2.3672075271606445, + "grad_norm": 2.1922731399536133, "learning_rate": 4.538873804841028e-06, - "logits/chosen": 1.249786615371704, - "logits/rejected": 1.0435155630111694, - "logps/chosen": -476.52423095703125, - "logps/rejected": -823.1788330078125, - "loss": 0.1808, - "rewards/accuracies": 0.831250011920929, - "rewards/chosen": -4.4410505294799805, - "rewards/margins": 3.4214394092559814, - "rewards/rejected": -7.862489223480225, + "logits/chosen": 1.289786696434021, + "logits/rejected": 1.0749728679656982, + "logps/chosen": -441.4300842285156, + "logps/rejected": -771.0155029296875, + "loss": 0.1595, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -4.090059757232666, + "rewards/margins": 3.2505059242248535, + "rewards/rejected": -7.3405656814575195, "step": 880 }, { "epoch": 0.2799622522805914, - "grad_norm": 2.971510648727417, + "grad_norm": 3.1142098903656006, "learning_rate": 4.522865153313932e-06, - "logits/chosen": 1.4465210437774658, - "logits/rejected": 1.3960046768188477, - "logps/chosen": -461.5182189941406, - "logps/rejected": -782.6722412109375, - "loss": 0.2097, - "rewards/accuracies": 0.84375, - "rewards/chosen": -4.27487850189209, - "rewards/margins": 3.2032265663146973, - "rewards/rejected": -7.4781060218811035, + "logits/chosen": 1.5812687873840332, + "logits/rejected": 1.544809103012085, + "logps/chosen": -445.30047607421875, + "logps/rejected": -758.8344116210938, + "loss": 0.1982, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -4.112375259399414, + "rewards/margins": 3.127253770828247, + "rewards/rejected": -7.239628791809082, "step": 890 }, { "epoch": 0.28310789556464294, - "grad_norm": 3.615593910217285, + "grad_norm": 3.9424490928649902, "learning_rate": 4.506612593475701e-06, - "logits/chosen": 1.137704610824585, - "logits/rejected": 1.0405908823013306, - "logps/chosen": -378.880859375, - "logps/rejected": -688.6226806640625, - "loss": 0.1858, - "rewards/accuracies": 0.856249988079071, - "rewards/chosen": -3.4608230590820312, - "rewards/margins": 3.0565378665924072, - "rewards/rejected": -6.517360687255859, + "logits/chosen": 1.2013763189315796, + "logits/rejected": 1.0818957090377808, + "logps/chosen": -384.22625732421875, + "logps/rejected": -698.1456909179688, + "loss": 0.178, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -3.513953447341919, + "rewards/margins": 3.0984597206115723, + "rewards/rejected": -6.612412929534912, "step": 900 }, { "epoch": 0.28310789556464294, - "eval_logits/chosen": 1.8630539178848267, - "eval_logits/rejected": 1.6987581253051758, - "eval_logps/chosen": -387.6122741699219, - "eval_logps/rejected": -703.9799194335938, - "eval_loss": 0.139369398355484, - "eval_rewards/accuracies": 0.8485074639320374, - "eval_rewards/chosen": -3.5484120845794678, - "eval_rewards/margins": 3.1324002742767334, - "eval_rewards/rejected": -6.680812835693359, - "eval_runtime": 215.0603, - "eval_samples_per_second": 99.586, - "eval_steps_per_second": 1.558, + "eval_logits/chosen": 1.9492088556289673, + "eval_logits/rejected": 1.768805742263794, + "eval_logps/chosen": -376.3503112792969, + "eval_logps/rejected": -690.027587890625, + "eval_loss": 0.13491272926330566, + "eval_rewards/accuracies": 0.858582079410553, + "eval_rewards/chosen": -3.4355356693267822, + "eval_rewards/margins": 3.1055805683135986, + "eval_rewards/rejected": -6.541116237640381, + "eval_runtime": 216.5072, + "eval_samples_per_second": 98.92, + "eval_steps_per_second": 1.547, "step": 900 }, { "epoch": 0.28625353884869453, - "grad_norm": 2.4971940517425537, + "grad_norm": 2.6901655197143555, "learning_rate": 4.490118084989544e-06, - "logits/chosen": 1.0084787607192993, - "logits/rejected": 1.003114938735962, - "logps/chosen": -420.396484375, - "logps/rejected": -712.1617431640625, - "loss": 0.2153, - "rewards/accuracies": 0.8062499761581421, - "rewards/chosen": -3.869481325149536, - "rewards/margins": 2.8973114490509033, - "rewards/rejected": -6.766793727874756, + "logits/chosen": 1.189724326133728, + "logits/rejected": 1.1710891723632812, + "logps/chosen": -429.6070251464844, + "logps/rejected": -719.17236328125, + "loss": 0.2161, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -3.961094379425049, + "rewards/margins": 2.875734329223633, + "rewards/rejected": -6.836828708648682, "step": 910 }, { "epoch": 0.2893991821327461, - "grad_norm": 2.809347629547119, + "grad_norm": 2.5591769218444824, "learning_rate": 4.473383616691792e-06, - "logits/chosen": 1.2141597270965576, - "logits/rejected": 1.092398762702942, - "logps/chosen": -439.409912109375, - "logps/rejected": -735.4099731445312, - "loss": 0.201, - "rewards/accuracies": 0.831250011920929, - "rewards/chosen": -4.070156097412109, - "rewards/margins": 2.93392276763916, - "rewards/rejected": -7.0040788650512695, + "logits/chosen": 1.467657446861267, + "logits/rejected": 1.3278765678405762, + "logps/chosen": -419.4930114746094, + "logps/rejected": -711.3831787109375, + "loss": 0.193, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -3.870819091796875, + "rewards/margins": 2.892895221710205, + "rewards/rejected": -6.763714790344238, "step": 920 }, { "epoch": 0.2925448254167977, - "grad_norm": 2.333805561065674, + "grad_norm": 2.1629209518432617, "learning_rate": 4.456411206352088e-06, - "logits/chosen": 0.8967952728271484, - "logits/rejected": 0.8582611083984375, - "logps/chosen": -407.14739990234375, - "logps/rejected": -692.2764892578125, - "loss": 0.2086, - "rewards/accuracies": 0.8062499761581421, - "rewards/chosen": -3.749049425125122, - "rewards/margins": 2.817676067352295, - "rewards/rejected": -6.566725730895996, + "logits/chosen": 1.140165090560913, + "logits/rejected": 1.0872342586517334, + "logps/chosen": -381.86505126953125, + "logps/rejected": -660.9152221679688, + "loss": 0.1876, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -3.495915651321411, + "rewards/margins": 2.757035493850708, + "rewards/rejected": -6.252951145172119, "step": 930 }, { "epoch": 0.2956904687008493, - "grad_norm": 3.1295571327209473, + "grad_norm": 3.120842933654785, "learning_rate": 4.439202900430098e-06, - "logits/chosen": 1.0088117122650146, - "logits/rejected": 0.8580893278121948, - "logps/chosen": -401.88525390625, - "logps/rejected": -715.9742431640625, - "loss": 0.2032, - "rewards/accuracies": 0.8062499761581421, - "rewards/chosen": -3.6845085620880127, - "rewards/margins": 3.113250970840454, - "rewards/rejected": -6.797759056091309, + "logits/chosen": 1.248095154762268, + "logits/rejected": 1.0922152996063232, + "logps/chosen": -402.4951171875, + "logps/rejected": -733.9520263671875, + "loss": 0.2028, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -3.690582275390625, + "rewards/margins": 3.286928176879883, + "rewards/rejected": -6.97750997543335, "step": 940 }, { "epoch": 0.2988361119849009, - "grad_norm": 3.3793976306915283, + "grad_norm": 3.5598092079162598, "learning_rate": 4.421760773828749e-06, - "logits/chosen": 0.9543240666389465, - "logits/rejected": 0.8352963328361511, - "logps/chosen": -427.1727600097656, - "logps/rejected": -727.7288208007812, - "loss": 0.2395, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": -3.9340949058532715, - "rewards/margins": 3.0020058155059814, - "rewards/rejected": -6.936100006103516, + "logits/chosen": 1.1661925315856934, + "logits/rejected": 1.0520888566970825, + "logps/chosen": -386.32763671875, + "logps/rejected": -687.5399169921875, + "loss": 0.227, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -3.5252959728240967, + "rewards/margins": 3.0088343620300293, + "rewards/rejected": -6.534131050109863, "step": 950 }, { "epoch": 0.3019817552689525, - "grad_norm": 2.7082152366638184, + "grad_norm": 2.8260598182678223, "learning_rate": 4.4040869296440595e-06, - "logits/chosen": 0.8623906970024109, - "logits/rejected": 0.835192859172821, - "logps/chosen": -421.047607421875, - "logps/rejected": -760.1714477539062, - "loss": 0.1768, - "rewards/accuracies": 0.875, - "rewards/chosen": -3.8686699867248535, - "rewards/margins": 3.3722710609436035, - "rewards/rejected": -7.240941047668457, + "logits/chosen": 1.0562385320663452, + "logits/rejected": 1.0342681407928467, + "logps/chosen": -418.41790771484375, + "logps/rejected": -755.587158203125, + "loss": 0.1787, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -3.8422763347625732, + "rewards/margins": 3.352571964263916, + "rewards/rejected": -7.19484806060791, "step": 960 }, { "epoch": 0.3051273985530041, - "grad_norm": 3.3928000926971436, + "grad_norm": 2.8546972274780273, "learning_rate": 4.3861834989115435e-06, - "logits/chosen": 0.8181372880935669, - "logits/rejected": 0.8542720079421997, - "logps/chosen": -388.176513671875, - "logps/rejected": -731.9312744140625, - "loss": 0.1659, - "rewards/accuracies": 0.8812500238418579, - "rewards/chosen": -3.5451221466064453, - "rewards/margins": 3.4101932048797607, - "rewards/rejected": -6.955314636230469, + "logits/chosen": 1.0627679824829102, + "logits/rejected": 1.1002824306488037, + "logps/chosen": -389.6894226074219, + "logps/rejected": -745.2379760742188, + "loss": 0.1601, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.560076951980591, + "rewards/margins": 3.527824878692627, + "rewards/rejected": -7.0879011154174805, "step": 970 }, { "epoch": 0.3082730418370557, - "grad_norm": 3.535090446472168, + "grad_norm": 3.4414689540863037, "learning_rate": 4.368052640349269e-06, - "logits/chosen": 1.3438990116119385, - "logits/rejected": 1.2530713081359863, - "logps/chosen": -406.4024658203125, - "logps/rejected": -737.8275146484375, - "loss": 0.1641, - "rewards/accuracies": 0.84375, - "rewards/chosen": -3.737173557281494, - "rewards/margins": 3.2970378398895264, - "rewards/rejected": -7.034211158752441, + "logits/chosen": 1.4899530410766602, + "logits/rejected": 1.3966710567474365, + "logps/chosen": -402.5296630859375, + "logps/rejected": -745.9603271484375, + "loss": 0.1528, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -3.698240280151367, + "rewards/margins": 3.4171810150146484, + "rewards/rejected": -7.115421295166016, "step": 980 }, { "epoch": 0.31141868512110726, - "grad_norm": 3.4729607105255127, + "grad_norm": 3.587970018386841, "learning_rate": 4.349696540097564e-06, - "logits/chosen": 1.3546165227890015, - "logits/rejected": 1.2128336429595947, - "logps/chosen": -454.58148193359375, - "logps/rejected": -771.8201293945312, - "loss": 0.1636, + "logits/chosen": 1.3497339487075806, + "logits/rejected": 1.1979681253433228, + "logps/chosen": -443.26507568359375, + "logps/rejected": -760.2127685546875, + "loss": 0.1554, "rewards/accuracies": 0.793749988079071, - "rewards/chosen": -4.219397068023682, - "rewards/margins": 3.1577281951904297, - "rewards/rejected": -7.3771257400512695, + "rewards/chosen": -4.1059465408325195, + "rewards/margins": 3.154737949371338, + "rewards/rejected": -7.260683536529541, "step": 990 }, { "epoch": 0.31456432840515886, - "grad_norm": 2.7894742488861084, + "grad_norm": 2.6379456520080566, "learning_rate": 4.331117411455425e-06, - "logits/chosen": 1.3166215419769287, - "logits/rejected": 1.3613556623458862, - "logps/chosen": -435.62359619140625, - "logps/rejected": -732.1107177734375, - "loss": 0.173, - "rewards/accuracies": 0.831250011920929, - "rewards/chosen": -4.024202823638916, - "rewards/margins": 2.9517178535461426, - "rewards/rejected": -6.9759202003479, + "logits/chosen": 1.482219934463501, + "logits/rejected": 1.5086220502853394, + "logps/chosen": -444.00848388671875, + "logps/rejected": -734.5386962890625, + "loss": 0.1736, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -4.107657432556152, + "rewards/margins": 2.892225980758667, + "rewards/rejected": -6.99988317489624, "step": 1000 }, { "epoch": 0.31456432840515886, - "eval_logits/chosen": 1.9776256084442139, - "eval_logits/rejected": 1.8189852237701416, - "eval_logps/chosen": -381.0118408203125, - "eval_logps/rejected": -712.953125, - "eval_loss": 0.11758408695459366, - "eval_rewards/accuracies": 0.8649253845214844, - "eval_rewards/chosen": -3.482407569885254, - "eval_rewards/margins": 3.2881364822387695, - "eval_rewards/rejected": -6.770544052124023, - "eval_runtime": 214.5734, - "eval_samples_per_second": 99.812, - "eval_steps_per_second": 1.561, + "eval_logits/chosen": 2.243989944458008, + "eval_logits/rejected": 2.084845542907715, + "eval_logps/chosen": -387.5068664550781, + "eval_logps/rejected": -731.9054565429688, + "eval_loss": 0.11270873993635178, + "eval_rewards/accuracies": 0.8667910695075989, + "eval_rewards/chosen": -3.5471012592315674, + "eval_rewards/margins": 3.4127936363220215, + "eval_rewards/rejected": -6.95989465713501, + "eval_runtime": 216.499, + "eval_samples_per_second": 98.924, + "eval_steps_per_second": 1.547, "step": 1000 }, { "epoch": 0.31770997168921045, - "grad_norm": 2.152683734893799, + "grad_norm": 2.6611509323120117, "learning_rate": 4.312317494613642e-06, - "logits/chosen": 1.2920253276824951, - "logits/rejected": 1.169878602027893, - "logps/chosen": -423.5650939941406, - "logps/rejected": -764.8018798828125, - "loss": 0.1572, - "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": -3.894944429397583, - "rewards/margins": 3.4041748046875, - "rewards/rejected": -7.299118995666504, + "logits/chosen": 1.5053398609161377, + "logits/rejected": 1.3990230560302734, + "logps/chosen": -433.1009826660156, + "logps/rejected": -780.1348266601562, + "loss": 0.1505, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -3.9901912212371826, + "rewards/margins": 3.4618980884552, + "rewards/rejected": -7.452090263366699, "step": 1010 }, { "epoch": 0.32085561497326204, - "grad_norm": 2.5282208919525146, + "grad_norm": 3.133005380630493, "learning_rate": 4.293299056384692e-06, - "logits/chosen": 1.3552110195159912, - "logits/rejected": 1.1271814107894897, - "logps/chosen": -462.6878356933594, - "logps/rejected": -754.1079711914062, - "loss": 0.1873, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": -4.283886909484863, - "rewards/margins": 2.8990678787231445, - "rewards/rejected": -7.18295431137085, + "logits/chosen": 1.581313133239746, + "logits/rejected": 1.3655680418014526, + "logps/chosen": -428.97039794921875, + "logps/rejected": -712.6785278320312, + "loss": 0.1824, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -3.9461722373962402, + "rewards/margins": 2.822526454925537, + "rewards/rejected": -6.768698215484619, "step": 1020 }, { "epoch": 0.32400125825731363, - "grad_norm": 6.489924430847168, + "grad_norm": 2.3489255905151367, "learning_rate": 4.274064389929412e-06, - "logits/chosen": 1.4782826900482178, - "logits/rejected": 1.2579948902130127, - "logps/chosen": -387.65777587890625, - "logps/rejected": -723.7617797851562, - "loss": 0.1682, - "rewards/accuracies": 0.862500011920929, - "rewards/chosen": -3.5566749572753906, - "rewards/margins": 3.3422751426696777, - "rewards/rejected": -6.898950099945068, + "logits/chosen": 1.619148850440979, + "logits/rejected": 1.3803061246871948, + "logps/chosen": -380.7969055175781, + "logps/rejected": -708.0203857421875, + "loss": 0.1684, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -3.4878249168395996, + "rewards/margins": 3.2536826133728027, + "rewards/rejected": -6.741507530212402, "step": 1030 }, { "epoch": 0.3271469015413652, - "grad_norm": 3.615981340408325, + "grad_norm": 4.2949042320251465, "learning_rate": 4.254615814480501e-06, - "logits/chosen": 1.3795908689498901, - "logits/rejected": 1.2748284339904785, - "logps/chosen": -461.3394470214844, - "logps/rejected": -786.0704956054688, - "loss": 0.1716, - "rewards/accuracies": 0.887499988079071, - "rewards/chosen": -4.288393974304199, - "rewards/margins": 3.223513126373291, - "rewards/rejected": -7.51190710067749, + "logits/chosen": 1.4520130157470703, + "logits/rejected": 1.3369412422180176, + "logps/chosen": -459.8309631347656, + "logps/rejected": -775.1959838867188, + "loss": 0.165, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.2733917236328125, + "rewards/margins": 3.1295056343078613, + "rewards/rejected": -7.402897834777832, "step": 1040 }, { "epoch": 0.3302925448254168, - "grad_norm": 2.9559316635131836, + "grad_norm": 3.195559024810791, "learning_rate": 4.234955675062881e-06, - "logits/chosen": 1.324539303779602, - "logits/rejected": 1.2243735790252686, - "logps/chosen": -457.6017150878906, - "logps/rejected": -793.406005859375, - "loss": 0.1757, - "rewards/accuracies": 0.8687499761581421, - "rewards/chosen": -4.254838466644287, - "rewards/margins": 3.335017681121826, - "rewards/rejected": -7.5898566246032715, + "logits/chosen": 1.5306510925292969, + "logits/rejected": 1.4327778816223145, + "logps/chosen": -418.94842529296875, + "logps/rejected": -751.5892944335938, + "loss": 0.1749, + "rewards/accuracies": 0.84375, + "rewards/chosen": -3.8681347370147705, + "rewards/margins": 3.3033840656280518, + "rewards/rejected": -7.171518802642822, "step": 1050 }, { "epoch": 0.3334381881094684, - "grad_norm": 2.599055051803589, + "grad_norm": 2.2908482551574707, "learning_rate": 4.215086342210932e-06, - "logits/chosen": 1.4138332605361938, - "logits/rejected": 1.2221533060073853, - "logps/chosen": -416.51092529296875, - "logps/rejected": -771.9613037109375, - "loss": 0.1549, + "logits/chosen": 1.6077830791473389, + "logits/rejected": 1.3920700550079346, + "logps/chosen": -367.6110534667969, + "logps/rejected": -708.1036376953125, + "loss": 0.1536, "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": -3.84822416305542, - "rewards/margins": 3.504741668701172, - "rewards/rejected": -7.352965354919434, + "rewards/chosen": -3.358769655227661, + "rewards/margins": 3.355515718460083, + "rewards/rejected": -6.714285373687744, "step": 1060 }, { "epoch": 0.33658383139352, - "grad_norm": 2.722844362258911, + "grad_norm": 3.3649628162384033, "learning_rate": 4.19501021168268e-06, - "logits/chosen": 1.339404821395874, - "logits/rejected": 1.2510805130004883, - "logps/chosen": -417.2064514160156, - "logps/rejected": -769.0526123046875, - "loss": 0.182, - "rewards/accuracies": 0.875, - "rewards/chosen": -3.8489937782287598, - "rewards/margins": 3.4992358684539795, - "rewards/rejected": -7.348229885101318, + "logits/chosen": 1.5467596054077148, + "logits/rejected": 1.4269657135009766, + "logps/chosen": -465.55694580078125, + "logps/rejected": -830.7658081054688, + "loss": 0.176, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -4.3319597244262695, + "rewards/margins": 3.633349657058716, + "rewards/rejected": -7.965310096740723, "step": 1070 }, { "epoch": 0.3397294746775716, - "grad_norm": 3.07171630859375, + "grad_norm": 2.973006248474121, "learning_rate": 4.174729704170914e-06, - "logits/chosen": 1.3331737518310547, - "logits/rejected": 1.3105535507202148, - "logps/chosen": -443.6220703125, - "logps/rejected": -835.4161987304688, - "loss": 0.1646, + "logits/chosen": 1.5441627502441406, + "logits/rejected": 1.4771041870117188, + "logps/chosen": -426.13592529296875, + "logps/rejected": -808.2399291992188, + "loss": 0.1639, "rewards/accuracies": 0.893750011920929, - "rewards/chosen": -4.104954719543457, - "rewards/margins": 3.9018218517303467, - "rewards/rejected": -8.006775856018066, + "rewards/chosen": -3.9298815727233887, + "rewards/margins": 3.805147171020508, + "rewards/rejected": -7.7350287437438965, "step": 1080 }, { "epoch": 0.3428751179616232, - "grad_norm": 3.2593963146209717, + "grad_norm": 3.8005123138427734, "learning_rate": 4.154247265011313e-06, - "logits/chosen": 1.2752745151519775, - "logits/rejected": 1.0605169534683228, - "logps/chosen": -459.13067626953125, - "logps/rejected": -868.1017456054688, - "loss": 0.1369, - "rewards/accuracies": 0.90625, - "rewards/chosen": -4.2618088722229, - "rewards/margins": 4.040172576904297, - "rewards/rejected": -8.301980972290039, + "logits/chosen": 1.4085278511047363, + "logits/rejected": 1.1824581623077393, + "logps/chosen": -415.9977111816406, + "logps/rejected": -807.1639404296875, + "loss": 0.1373, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": -3.8302745819091797, + "rewards/margins": 3.8619492053985596, + "rewards/rejected": -7.692224025726318, "step": 1090 }, { "epoch": 0.3460207612456747, - "grad_norm": 2.6058199405670166, + "grad_norm": 2.478180170059204, "learning_rate": 4.133565363887602e-06, - "logits/chosen": 1.3846371173858643, - "logits/rejected": 1.2146029472351074, - "logps/chosen": -433.3853454589844, - "logps/rejected": -802.4357299804688, - "loss": 0.1494, - "rewards/accuracies": 0.8812500238418579, - "rewards/chosen": -4.011342525482178, - "rewards/margins": 3.647768497467041, - "rewards/rejected": -7.659111022949219, + "logits/chosen": 1.4384472370147705, + "logits/rejected": 1.2546355724334717, + "logps/chosen": -395.85516357421875, + "logps/rejected": -749.6239013671875, + "loss": 0.1474, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -3.635516405105591, + "rewards/margins": 3.4953994750976562, + "rewards/rejected": -7.130915641784668, "step": 1100 }, { "epoch": 0.3460207612456747, - "eval_logits/chosen": 1.9864978790283203, - "eval_logits/rejected": 1.8179283142089844, - "eval_logps/chosen": -412.18609619140625, - "eval_logps/rejected": -781.1856689453125, - "eval_loss": 0.09792975336313248, - "eval_rewards/accuracies": 0.871268630027771, - "eval_rewards/chosen": -3.7941508293151855, - "eval_rewards/margins": 3.658719778060913, - "eval_rewards/rejected": -7.452869892120361, - "eval_runtime": 215.0997, - "eval_samples_per_second": 99.568, - "eval_steps_per_second": 1.557, + "eval_logits/chosen": 2.0075626373291016, + "eval_logits/rejected": 1.8279625177383423, + "eval_logps/chosen": -394.5699768066406, + "eval_logps/rejected": -759.1402587890625, + "eval_loss": 0.09821247309446335, + "eval_rewards/accuracies": 0.8798507452011108, + "eval_rewards/chosen": -3.617732286453247, + "eval_rewards/margins": 3.6145107746124268, + "eval_rewards/rejected": -7.232241630554199, + "eval_runtime": 216.4313, + "eval_samples_per_second": 98.955, + "eval_steps_per_second": 1.548, "step": 1100 }, { "epoch": 0.3491664045297263, - "grad_norm": 3.2654693126678467, + "grad_norm": 3.5338029861450195, "learning_rate": 4.112686494533762e-06, - "logits/chosen": 1.497064232826233, - "logits/rejected": 1.313239336013794, - "logps/chosen": -445.63079833984375, - "logps/rejected": -807.7423095703125, - "loss": 0.1143, - "rewards/accuracies": 0.893750011920929, - "rewards/chosen": -4.1313910484313965, - "rewards/margins": 3.5983211994171143, - "rewards/rejected": -7.729712009429932, + "logits/chosen": 1.5566942691802979, + "logits/rejected": 1.3428875207901, + "logps/chosen": -441.243896484375, + "logps/rejected": -809.8138427734375, + "loss": 0.1096, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.087360858917236, + "rewards/margins": 3.6627678871154785, + "rewards/rejected": -7.750128746032715, "step": 1110 }, { "epoch": 0.3523120478137779, - "grad_norm": 3.974165201187134, + "grad_norm": 3.721736431121826, "learning_rate": 4.091613174433351e-06, - "logits/chosen": 1.1193442344665527, - "logits/rejected": 1.1738699674606323, - "logps/chosen": -500.4566345214844, - "logps/rejected": -867.6866455078125, - "loss": 0.1575, - "rewards/accuracies": 0.831250011920929, - "rewards/chosen": -4.670734405517578, - "rewards/margins": 3.6689980030059814, - "rewards/rejected": -8.33973217010498, + "logits/chosen": 1.1858751773834229, + "logits/rejected": 1.2289283275604248, + "logps/chosen": -492.10211181640625, + "logps/rejected": -857.7195434570312, + "loss": 0.1577, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -4.586886405944824, + "rewards/margins": 3.6530747413635254, + "rewards/rejected": -8.239961624145508, "step": 1120 }, { "epoch": 0.3554576910978295, - "grad_norm": 3.4348835945129395, + "grad_norm": 2.7472167015075684, "learning_rate": 4.070347944515955e-06, - "logits/chosen": 1.054678201675415, - "logits/rejected": 0.9517561197280884, - "logps/chosen": -465.5830078125, - "logps/rejected": -858.0778198242188, - "loss": 0.1553, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": -4.322167873382568, - "rewards/margins": 3.9099960327148438, - "rewards/rejected": -8.232163429260254, + "logits/chosen": 1.0944794416427612, + "logits/rejected": 0.9718171954154968, + "logps/chosen": -467.4380798339844, + "logps/rejected": -860.9664916992188, + "loss": 0.1574, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -4.340346813201904, + "rewards/margins": 3.920635223388672, + "rewards/rejected": -8.260981559753418, "step": 1130 }, { "epoch": 0.3586033343818811, - "grad_norm": 3.4094433784484863, + "grad_norm": 3.2876129150390625, "learning_rate": 4.048893368850812e-06, - "logits/chosen": 1.299513816833496, - "logits/rejected": 1.1844545602798462, - "logps/chosen": -450.3697204589844, - "logps/rejected": -804.0281982421875, - "loss": 0.1617, - "rewards/accuracies": 0.856249988079071, - "rewards/chosen": -4.180676460266113, - "rewards/margins": 3.5175251960754395, - "rewards/rejected": -7.698201656341553, + "logits/chosen": 1.5173089504241943, + "logits/rejected": 1.392333745956421, + "logps/chosen": -432.60760498046875, + "logps/rejected": -770.94384765625, + "loss": 0.1599, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -4.002845287322998, + "rewards/margins": 3.364281415939331, + "rewards/rejected": -7.36712646484375, "step": 1140 }, { "epoch": 0.36174897766593267, - "grad_norm": 3.1470396518707275, + "grad_norm": 3.355199098587036, "learning_rate": 4.027252034337653e-06, - "logits/chosen": 1.1882685422897339, - "logits/rejected": 1.0305453538894653, - "logps/chosen": -461.49322509765625, - "logps/rejected": -830.1395263671875, - "loss": 0.1468, - "rewards/accuracies": 0.8374999761581421, - "rewards/chosen": -4.267642021179199, - "rewards/margins": 3.6701016426086426, - "rewards/rejected": -7.937744140625, + "logits/chosen": 1.4846436977386475, + "logits/rejected": 1.3371527194976807, + "logps/chosen": -445.65972900390625, + "logps/rejected": -809.9754638671875, + "loss": 0.1504, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -4.108954429626465, + "rewards/margins": 3.627216339111328, + "rewards/rejected": -7.736170291900635, "step": 1150 }, { "epoch": 0.36489462094998426, - "grad_norm": 2.149296998977661, + "grad_norm": 2.369302749633789, "learning_rate": 4.005426550394777e-06, - "logits/chosen": 1.0792266130447388, - "logits/rejected": 1.1189695596694946, - "logps/chosen": -465.95587158203125, - "logps/rejected": -866.3806762695312, - "loss": 0.1431, - "rewards/accuracies": 0.862500011920929, - "rewards/chosen": -4.327679634094238, - "rewards/margins": 4.006646633148193, - "rewards/rejected": -8.334325790405273, + "logits/chosen": 1.3805218935012817, + "logits/rejected": 1.427558422088623, + "logps/chosen": -457.25701904296875, + "logps/rejected": -851.77783203125, + "loss": 0.1477, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -4.240313529968262, + "rewards/margins": 3.947707414627075, + "rewards/rejected": -8.188020706176758, "step": 1160 }, { "epoch": 0.36804026423403585, - "grad_norm": 2.5954060554504395, + "grad_norm": 2.904909133911133, "learning_rate": 3.983419548644427e-06, - "logits/chosen": 1.2456753253936768, - "logits/rejected": 1.0422345399856567, - "logps/chosen": -433.5536193847656, - "logps/rejected": -783.9739990234375, - "loss": 0.1501, - "rewards/accuracies": 0.84375, - "rewards/chosen": -4.011656284332275, - "rewards/margins": 3.4877192974090576, - "rewards/rejected": -7.499375820159912, + "logits/chosen": 1.5399326086044312, + "logits/rejected": 1.3559725284576416, + "logps/chosen": -400.92474365234375, + "logps/rejected": -741.2730712890625, + "loss": 0.1555, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -3.684882640838623, + "rewards/margins": 3.3871047496795654, + "rewards/rejected": -7.071987152099609, "step": 1170 }, { "epoch": 0.37118590751808744, - "grad_norm": 2.630852460861206, + "grad_norm": 2.247695207595825, "learning_rate": 3.961233682595474e-06, - "logits/chosen": 1.4556474685668945, - "logits/rejected": 1.3100624084472656, - "logps/chosen": -450.36737060546875, - "logps/rejected": -859.49658203125, - "loss": 0.1678, - "rewards/accuracies": 0.887499988079071, - "rewards/chosen": -4.17067813873291, - "rewards/margins": 4.0663228034973145, - "rewards/rejected": -8.237001419067383, + "logits/chosen": 1.6838607788085938, + "logits/rejected": 1.5416187047958374, + "logps/chosen": -453.5838928222656, + "logps/rejected": -840.7737426757812, + "loss": 0.161, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -4.202359199523926, + "rewards/margins": 3.8470425605773926, + "rewards/rejected": -8.049402236938477, "step": 1180 }, { "epoch": 0.37433155080213903, - "grad_norm": 1.8810055255889893, + "grad_norm": 1.9283421039581299, "learning_rate": 3.93887162732347e-06, - "logits/chosen": 1.1926202774047852, - "logits/rejected": 1.159652829170227, - "logps/chosen": -428.0986328125, - "logps/rejected": -843.1412963867188, - "loss": 0.1476, - "rewards/accuracies": 0.887499988079071, - "rewards/chosen": -3.963353395462036, - "rewards/margins": 4.120386123657227, - "rewards/rejected": -8.083739280700684, + "logits/chosen": 1.4460527896881104, + "logits/rejected": 1.4111425876617432, + "logps/chosen": -418.48614501953125, + "logps/rejected": -816.6475219726562, + "loss": 0.1523, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.866959810256958, + "rewards/margins": 3.951749801635742, + "rewards/rejected": -7.818709373474121, "step": 1190 }, { "epoch": 0.3774771940861906, - "grad_norm": 3.1499431133270264, + "grad_norm": 3.37664532661438, "learning_rate": 3.916336079148102e-06, - "logits/chosen": 1.6126463413238525, - "logits/rejected": 1.4319192171096802, - "logps/chosen": -460.66455078125, - "logps/rejected": -844.0821533203125, - "loss": 0.149, - "rewards/accuracies": 0.8812500238418579, - "rewards/chosen": -4.291562080383301, - "rewards/margins": 3.7794106006622314, - "rewards/rejected": -8.070972442626953, + "logits/chosen": 1.838653802871704, + "logits/rejected": 1.687483549118042, + "logps/chosen": -466.60052490234375, + "logps/rejected": -841.2741088867188, + "loss": 0.1382, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -4.350468635559082, + "rewards/margins": 3.6922600269317627, + "rewards/rejected": -8.04272747039795, "step": 1200 }, { "epoch": 0.3774771940861906, - "eval_logits/chosen": 2.0580649375915527, - "eval_logits/rejected": 1.8714509010314941, - "eval_logps/chosen": -451.3316345214844, - "eval_logps/rejected": -860.935546875, - "eval_loss": 0.08171828836202621, - "eval_rewards/accuracies": 0.8843283653259277, - "eval_rewards/chosen": -4.185605049133301, - "eval_rewards/margins": 4.064764022827148, - "eval_rewards/rejected": -8.250368118286133, - "eval_runtime": 215.1027, - "eval_samples_per_second": 99.566, - "eval_steps_per_second": 1.557, + "eval_logits/chosen": 2.2832651138305664, + "eval_logits/rejected": 2.0966413021087646, + "eval_logps/chosen": -464.0286865234375, + "eval_logps/rejected": -871.9454956054688, + "eval_loss": 0.08190137147903442, + "eval_rewards/accuracies": 0.8861940503120422, + "eval_rewards/chosen": -4.312319278717041, + "eval_rewards/margins": 4.047975063323975, + "eval_rewards/rejected": -8.360294342041016, + "eval_runtime": 216.3914, + "eval_samples_per_second": 98.973, + "eval_steps_per_second": 1.548, "step": 1200 }, { "epoch": 0.3806228373702422, - "grad_norm": 2.8907980918884277, + "grad_norm": 2.987619400024414, "learning_rate": 3.893629755308078e-06, - "logits/chosen": 1.040919303894043, - "logits/rejected": 1.0302915573120117, - "logps/chosen": -447.6219787597656, - "logps/rejected": -785.0813598632812, - "loss": 0.135, - "rewards/accuracies": 0.8374999761581421, - "rewards/chosen": -4.135832786560059, - "rewards/margins": 3.3801684379577637, - "rewards/rejected": -7.516000270843506, + "logits/chosen": 1.2712478637695312, + "logits/rejected": 1.2399590015411377, + "logps/chosen": -473.53546142578125, + "logps/rejected": -809.8004150390625, + "loss": 0.1397, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.39472770690918, + "rewards/margins": 3.3684539794921875, + "rewards/rejected": -7.763182163238525, "step": 1210 }, { "epoch": 0.3837684806542938, - "grad_norm": 3.385078191757202, + "grad_norm": 2.556084394454956, "learning_rate": 3.870755393633495e-06, - "logits/chosen": 1.291372299194336, - "logits/rejected": 1.1527048349380493, - "logps/chosen": -461.098388671875, - "logps/rejected": -850.8663940429688, - "loss": 0.1317, - "rewards/accuracies": 0.8187500238418579, - "rewards/chosen": -4.281379699707031, - "rewards/margins": 3.8819034099578857, - "rewards/rejected": -8.163283348083496, + "logits/chosen": 1.4189751148223877, + "logits/rejected": 1.2690774202346802, + "logps/chosen": -449.70166015625, + "logps/rejected": -821.1251831054688, + "loss": 0.1232, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -4.166979789733887, + "rewards/margins": 3.6986324787139893, + "rewards/rejected": -7.8656110763549805, "step": 1220 }, { "epoch": 0.3869141239383454, - "grad_norm": 2.8103888034820557, + "grad_norm": 3.653494358062744, "learning_rate": 3.847715752215725e-06, - "logits/chosen": 1.3119373321533203, - "logits/rejected": 1.25798499584198, - "logps/chosen": -471.1314392089844, - "logps/rejected": -876.8768310546875, - "loss": 0.1191, - "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": -4.381020545959473, - "rewards/margins": 4.039222240447998, - "rewards/rejected": -8.420242309570312, + "logits/chosen": 1.3309882879257202, + "logits/rejected": 1.269213080406189, + "logps/chosen": -459.6454162597656, + "logps/rejected": -866.3443603515625, + "loss": 0.1229, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.266015529632568, + "rewards/margins": 4.048908710479736, + "rewards/rejected": -8.314924240112305, "step": 1230 }, { "epoch": 0.390059767222397, - "grad_norm": 3.6371328830718994, + "grad_norm": 3.7244603633880615, "learning_rate": 3.824513609074853e-06, - "logits/chosen": 1.3880940675735474, - "logits/rejected": 1.2948577404022217, - "logps/chosen": -476.92022705078125, - "logps/rejected": -872.46240234375, - "loss": 0.131, - "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": -4.443149089813232, - "rewards/margins": 3.925715684890747, - "rewards/rejected": -8.368864059448242, + "logits/chosen": 1.4364420175552368, + "logits/rejected": 1.3250478506088257, + "logps/chosen": -494.782470703125, + "logps/rejected": -882.2566528320312, + "loss": 0.1304, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -4.621702194213867, + "rewards/margins": 3.8449578285217285, + "rewards/rejected": -8.466659545898438, "step": 1240 }, { "epoch": 0.3932054105064486, - "grad_norm": 2.8011200428009033, + "grad_norm": 2.3005125522613525, "learning_rate": 3.8011517618247208e-06, - "logits/chosen": 1.4773666858673096, - "logits/rejected": 1.3155330419540405, - "logps/chosen": -496.269287109375, - "logps/rejected": -857.8162841796875, - "loss": 0.1128, - "rewards/accuracies": 0.84375, - "rewards/chosen": -4.630643367767334, - "rewards/margins": 3.5995922088623047, - "rewards/rejected": -8.230236053466797, + "logits/chosen": 1.616355299949646, + "logits/rejected": 1.4450652599334717, + "logps/chosen": -475.3401794433594, + "logps/rejected": -832.5036010742188, + "loss": 0.1201, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -4.421032905578613, + "rewards/margins": 3.5559539794921875, + "rewards/rejected": -7.976986885070801, "step": 1250 }, { "epoch": 0.3963510537905002, - "grad_norm": 3.5925869941711426, + "grad_norm": 3.406710624694824, "learning_rate": 3.777633027335594e-06, - "logits/chosen": 1.2616320848464966, - "logits/rejected": 1.1137524843215942, - "logps/chosen": -487.0738220214844, - "logps/rejected": -863.6187744140625, - "loss": 0.1402, - "rewards/accuracies": 0.8062499761581421, - "rewards/chosen": -4.545956134796143, - "rewards/margins": 3.731048107147217, - "rewards/rejected": -8.277003288269043, + "logits/chosen": 1.4149603843688965, + "logits/rejected": 1.2735809087753296, + "logps/chosen": -464.1859436035156, + "logps/rejected": -824.88818359375, + "loss": 0.1362, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.316610336303711, + "rewards/margins": 3.572819471359253, + "rewards/rejected": -7.889430046081543, "step": 1260 }, { "epoch": 0.39949669707455177, - "grad_norm": 2.241239547729492, + "grad_norm": 2.030447006225586, "learning_rate": 3.7539602413945264e-06, - "logits/chosen": 1.1234735250473022, - "logits/rejected": 0.9519070386886597, - "logps/chosen": -431.80340576171875, - "logps/rejected": -828.6780395507812, - "loss": 0.108, - "rewards/accuracies": 0.875, - "rewards/chosen": -3.9965758323669434, - "rewards/margins": 3.9541754722595215, - "rewards/rejected": -7.950751304626465, + "logits/chosen": 1.3235461711883545, + "logits/rejected": 1.1515737771987915, + "logps/chosen": -474.85174560546875, + "logps/rejected": -877.0484619140625, + "loss": 0.1061, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -4.426663398742676, + "rewards/margins": 4.007308006286621, + "rewards/rejected": -8.433971405029297, "step": 1270 }, { "epoch": 0.40264234035860336, - "grad_norm": 3.996866464614868, + "grad_norm": 2.5130529403686523, "learning_rate": 3.7301362583634255e-06, - "logits/chosen": 1.1398568153381348, - "logits/rejected": 1.0180408954620361, - "logps/chosen": -462.87225341796875, - "logps/rejected": -890.3841552734375, - "loss": 0.1281, - "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": -4.300458908081055, - "rewards/margins": 4.238921165466309, - "rewards/rejected": -8.539380073547363, + "logits/chosen": 1.4249566793441772, + "logits/rejected": 1.328649640083313, + "logps/chosen": -446.62164306640625, + "logps/rejected": -855.87939453125, + "loss": 0.1303, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -4.137589454650879, + "rewards/margins": 4.056654930114746, + "rewards/rejected": -8.194245338439941, "step": 1280 }, { "epoch": 0.40578798364265495, - "grad_norm": 2.932255744934082, + "grad_norm": 2.781508684158325, "learning_rate": 3.7061639508348883e-06, - "logits/chosen": 0.9352201223373413, - "logits/rejected": 0.8519388437271118, - "logps/chosen": -481.8533630371094, - "logps/rejected": -956.1497802734375, - "loss": 0.1091, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": -4.485152721405029, - "rewards/margins": 4.709257125854492, - "rewards/rejected": -9.19441032409668, + "logits/chosen": 1.2699975967407227, + "logits/rejected": 1.1878561973571777, + "logps/chosen": -464.4150390625, + "logps/rejected": -912.4680786132812, + "loss": 0.1063, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.31067419052124, + "rewards/margins": 4.446977138519287, + "rewards/rejected": -8.757651329040527, "step": 1290 }, { "epoch": 0.4089336269267065, - "grad_norm": 4.746089935302734, + "grad_norm": 3.4436986446380615, "learning_rate": 3.6820462092858388e-06, - "logits/chosen": 1.1464670896530151, - "logits/rejected": 0.9618524312973022, - "logps/chosen": -500.5083923339844, - "logps/rejected": -943.3963012695312, - "loss": 0.1143, - "rewards/accuracies": 0.90625, - "rewards/chosen": -4.674715995788574, - "rewards/margins": 4.405007839202881, - "rewards/rejected": -9.079724311828613, + "logits/chosen": 1.3347591161727905, + "logits/rejected": 1.1586949825286865, + "logps/chosen": -493.66229248046875, + "logps/rejected": -909.53076171875, + "loss": 0.1133, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -4.606060028076172, + "rewards/margins": 4.134642124176025, + "rewards/rejected": -8.740701675415039, "step": 1300 }, { "epoch": 0.4089336269267065, - "eval_logits/chosen": 1.9770227670669556, - "eval_logits/rejected": 1.7765424251556396, - "eval_logps/chosen": -457.2140808105469, - "eval_logps/rejected": -897.443115234375, - "eval_loss": 0.0702316164970398, - "eval_rewards/accuracies": 0.8884328603744507, - "eval_rewards/chosen": -4.2444305419921875, - "eval_rewards/margins": 4.371013641357422, - "eval_rewards/rejected": -8.615446090698242, - "eval_runtime": 214.7353, - "eval_samples_per_second": 99.737, - "eval_steps_per_second": 1.56, + "eval_logits/chosen": 2.1043541431427, + "eval_logits/rejected": 1.9082162380218506, + "eval_logps/chosen": -439.5054626464844, + "eval_logps/rejected": -869.0028686523438, + "eval_loss": 0.07139625400304794, + "eval_rewards/accuracies": 0.89552241563797, + "eval_rewards/chosen": -4.067087173461914, + "eval_rewards/margins": 4.263782501220703, + "eval_rewards/rejected": -8.330869674682617, + "eval_runtime": 215.9237, + "eval_samples_per_second": 99.188, + "eval_steps_per_second": 1.551, "step": 1300 }, { "epoch": 0.4120792702107581, - "grad_norm": 1.9351997375488281, + "grad_norm": 1.8959569931030273, "learning_rate": 3.6577859417290036e-06, - "logits/chosen": 1.0681344270706177, - "logits/rejected": 1.032234787940979, - "logps/chosen": -490.95538330078125, - "logps/rejected": -917.3497924804688, - "loss": 0.1097, + "logits/chosen": 1.2066175937652588, + "logits/rejected": 1.1854221820831299, + "logps/chosen": -471.34088134765625, + "logps/rejected": -895.9601440429688, + "loss": 0.1126, "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": -4.577587127685547, - "rewards/margins": 4.240847587585449, - "rewards/rejected": -8.818434715270996, + "rewards/chosen": -4.3812336921691895, + "rewards/margins": 4.223142623901367, + "rewards/rejected": -8.604375839233398, "step": 1310 }, { "epoch": 0.41522491349480967, - "grad_norm": 3.2459261417388916, + "grad_norm": 4.2024030685424805, "learning_rate": 3.633386073362275e-06, - "logits/chosen": 1.3883543014526367, - "logits/rejected": 1.1711828708648682, - "logps/chosen": -445.54901123046875, - "logps/rejected": -872.5389404296875, - "loss": 0.1097, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": -4.128580570220947, - "rewards/margins": 4.237695693969727, - "rewards/rejected": -8.366275787353516, + "logits/chosen": 1.405761480331421, + "logits/rejected": 1.1724328994750977, + "logps/chosen": -487.13116455078125, + "logps/rejected": -914.7552490234375, + "loss": 0.1049, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -4.5441741943359375, + "rewards/margins": 4.243566036224365, + "rewards/rejected": -8.787739753723145, "step": 1320 }, { "epoch": 0.41837055677886126, - "grad_norm": 3.151752471923828, + "grad_norm": 2.757516622543335, "learning_rate": 3.6088495462160108e-06, - "logits/chosen": 1.3057953119277954, - "logits/rejected": 1.2494308948516846, - "logps/chosen": -468.8291931152344, - "logps/rejected": -929.8936767578125, - "loss": 0.1145, - "rewards/accuracies": 0.875, - "rewards/chosen": -4.374839782714844, - "rewards/margins": 4.57546854019165, - "rewards/rejected": -8.950307846069336, + "logits/chosen": 1.2204030752182007, + "logits/rejected": 1.1215746402740479, + "logps/chosen": -494.38232421875, + "logps/rejected": -946.015625, + "loss": 0.107, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -4.630240440368652, + "rewards/margins": 4.480798244476318, + "rewards/rejected": -9.111038208007812, "step": 1330 }, { "epoch": 0.42151620006291285, - "grad_norm": 2.9136953353881836, + "grad_norm": 3.821265459060669, "learning_rate": 3.584179318798287e-06, - "logits/chosen": 1.4748225212097168, - "logits/rejected": 1.2122485637664795, - "logps/chosen": -491.70416259765625, - "logps/rejected": -948.0320434570312, - "loss": 0.1099, - "rewards/accuracies": 0.8812500238418579, - "rewards/chosen": -4.592446804046631, - "rewards/margins": 4.535719871520996, - "rewards/rejected": -9.128166198730469, + "logits/chosen": 1.4361203908920288, + "logits/rejected": 1.1379070281982422, + "logps/chosen": -462.8627014160156, + "logps/rejected": -889.4700927734375, + "loss": 0.1151, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -4.30374002456665, + "rewards/margins": 4.2384233474731445, + "rewards/rejected": -8.542162895202637, "step": 1340 }, { "epoch": 0.42466184334696444, - "grad_norm": 2.980229616165161, + "grad_norm": 2.9233007431030273, "learning_rate": 3.5593783657381832e-06, - "logits/chosen": 1.3782281875610352, - "logits/rejected": 1.2022063732147217, - "logps/chosen": -457.11029052734375, - "logps/rejected": -905.9327392578125, - "loss": 0.1076, - "rewards/accuracies": 0.893750011920929, - "rewards/chosen": -4.2324628829956055, - "rewards/margins": 4.458249568939209, - "rewards/rejected": -8.690712928771973, + "logits/chosen": 1.4705383777618408, + "logits/rejected": 1.309003472328186, + "logps/chosen": -431.55181884765625, + "logps/rejected": -872.4307861328125, + "loss": 0.1165, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.9765896797180176, + "rewards/margins": 4.378879070281982, + "rewards/rejected": -8.35546875, "step": 1350 }, { "epoch": 0.42780748663101603, - "grad_norm": 2.3310577869415283, + "grad_norm": 2.001502752304077, "learning_rate": 3.534449677427106e-06, - "logits/chosen": 1.275436520576477, - "logits/rejected": 1.0545412302017212, - "logps/chosen": -419.954833984375, - "logps/rejected": -886.1671752929688, + "logits/chosen": 1.4111711978912354, + "logits/rejected": 1.179564118385315, + "logps/chosen": -390.43218994140625, + "logps/rejected": -836.7662963867188, "loss": 0.1152, - "rewards/accuracies": 0.90625, - "rewards/chosen": -3.8796966075897217, - "rewards/margins": 4.618420124053955, - "rewards/rejected": -8.498116493225098, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -3.5844473838806152, + "rewards/margins": 4.41940975189209, + "rewards/rejected": -8.003857612609863, "step": 1360 }, { "epoch": 0.4309531299150676, - "grad_norm": 2.4203007221221924, + "grad_norm": 1.7398028373718262, "learning_rate": 3.5093962596582288e-06, - "logits/chosen": 1.4109416007995605, - "logits/rejected": 1.346010446548462, - "logps/chosen": -479.3482360839844, - "logps/rejected": -935.4396362304688, - "loss": 0.1251, - "rewards/accuracies": 0.8687499761581421, - "rewards/chosen": -4.468480587005615, - "rewards/margins": 4.542850971221924, - "rewards/rejected": -9.011331558227539, + "logits/chosen": 1.4407987594604492, + "logits/rejected": 1.372896432876587, + "logps/chosen": -444.43994140625, + "logps/rejected": -889.0983276367188, + "loss": 0.1256, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -4.119224548339844, + "rewards/margins": 4.428411483764648, + "rewards/rejected": -8.547636032104492, "step": 1370 }, { "epoch": 0.4340987731991192, - "grad_norm": 2.9505016803741455, + "grad_norm": 2.6257755756378174, "learning_rate": 3.4842211332640595e-06, - "logits/chosen": 1.4074004888534546, - "logits/rejected": 1.0855344533920288, - "logps/chosen": -525.0509033203125, - "logps/rejected": -1000.765625, - "loss": 0.125, - "rewards/accuracies": 0.893750011920929, - "rewards/chosen": -4.915558815002441, - "rewards/margins": 4.711212158203125, - "rewards/rejected": -9.626770973205566, + "logits/chosen": 1.5364018678665161, + "logits/rejected": 1.21645188331604, + "logps/chosen": -453.30401611328125, + "logps/rejected": -896.1051025390625, + "loss": 0.1219, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -4.197838306427002, + "rewards/margins": 4.3818278312683105, + "rewards/rejected": -8.579666137695312, "step": 1380 }, { "epoch": 0.4372444164831708, - "grad_norm": 3.437758207321167, + "grad_norm": 2.9230072498321533, "learning_rate": 3.4589273337522055e-06, - "logits/chosen": 1.7364375591278076, - "logits/rejected": 1.437404990196228, - "logps/chosen": -464.2837829589844, - "logps/rejected": -936.4107666015625, - "loss": 0.1151, + "logits/chosen": 1.8644187450408936, + "logits/rejected": 1.5944981575012207, + "logps/chosen": -427.46533203125, + "logps/rejected": -885.3250122070312, + "loss": 0.1007, "rewards/accuracies": 0.893750011920929, - "rewards/chosen": -4.3078765869140625, - "rewards/margins": 4.697639465332031, - "rewards/rejected": -9.005515098571777, + "rewards/chosen": -3.9393608570098877, + "rewards/margins": 4.555096626281738, + "rewards/rejected": -8.494457244873047, "step": 1390 }, { "epoch": 0.4403900597672224, - "grad_norm": 3.1491661071777344, + "grad_norm": 2.494887351989746, "learning_rate": 3.433517910939364e-06, - "logits/chosen": 1.6241645812988281, - "logits/rejected": 1.4199696779251099, - "logps/chosen": -478.7039489746094, - "logps/rejected": -865.2511596679688, - "loss": 0.1204, - "rewards/accuracies": 0.856249988079071, - "rewards/chosen": -4.474257469177246, - "rewards/margins": 3.8194992542266846, - "rewards/rejected": -8.293757438659668, + "logits/chosen": 1.6030244827270508, + "logits/rejected": 1.3681625127792358, + "logps/chosen": -540.4693603515625, + "logps/rejected": -944.8883666992188, + "loss": 0.1209, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -5.091638565063477, + "rewards/margins": 3.998020887374878, + "rewards/rejected": -9.089658737182617, "step": 1400 }, { "epoch": 0.4403900597672224, - "eval_logits/chosen": 2.373414993286133, - "eval_logits/rejected": 2.199601411819458, - "eval_logps/chosen": -447.18634033203125, - "eval_logps/rejected": -897.015380859375, - "eval_loss": 0.06420407444238663, - "eval_rewards/accuracies": 0.8966417908668518, - "eval_rewards/chosen": -4.144153118133545, - "eval_rewards/margins": 4.467014789581299, - "eval_rewards/rejected": -8.611167907714844, - "eval_runtime": 214.6217, - "eval_samples_per_second": 99.79, - "eval_steps_per_second": 1.561, + "eval_logits/chosen": 2.2677738666534424, + "eval_logits/rejected": 2.057406187057495, + "eval_logps/chosen": -516.4533081054688, + "eval_logps/rejected": -983.30810546875, + "eval_loss": 0.06340682506561279, + "eval_rewards/accuracies": 0.8932836055755615, + "eval_rewards/chosen": -4.8365654945373535, + "eval_rewards/margins": 4.637356281280518, + "eval_rewards/rejected": -9.473921775817871, + "eval_runtime": 216.4046, + "eval_samples_per_second": 98.967, + "eval_steps_per_second": 1.548, "step": 1400 }, { "epoch": 0.443535703051274, - "grad_norm": 3.9951677322387695, + "grad_norm": 3.8545944690704346, "learning_rate": 3.4079959285835895e-06, - "logits/chosen": 1.426998496055603, - "logits/rejected": 1.4089171886444092, - "logps/chosen": -486.6236267089844, - "logps/rejected": -924.7525634765625, - "loss": 0.116, - "rewards/accuracies": 0.893750011920929, - "rewards/chosen": -4.531942844390869, - "rewards/margins": 4.369163990020752, - "rewards/rejected": -8.901106834411621, + "logits/chosen": 1.2882033586502075, + "logits/rejected": 1.2805908918380737, + "logps/chosen": -486.8324279785156, + "logps/rejected": -901.2437744140625, + "loss": 0.1161, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -4.533527851104736, + "rewards/margins": 4.132518291473389, + "rewards/rejected": -8.666045188903809, "step": 1410 }, { "epoch": 0.4466813463353256, - "grad_norm": 2.6183512210845947, + "grad_norm": 2.2411046028137207, "learning_rate": 3.3823644640148767e-06, - "logits/chosen": 1.2935250997543335, - "logits/rejected": 1.0634428262710571, - "logps/chosen": -477.60626220703125, - "logps/rejected": -1014.9510498046875, - "loss": 0.1088, - "rewards/accuracies": 0.8812500238418579, - "rewards/chosen": -4.449921607971191, - "rewards/margins": 5.347407341003418, - "rewards/rejected": -9.79732894897461, + "logits/chosen": 1.302230715751648, + "logits/rejected": 1.0823272466659546, + "logps/chosen": -426.0066833496094, + "logps/rejected": -929.7628173828125, + "loss": 0.1074, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -3.933643341064453, + "rewards/margins": 5.0114426612854, + "rewards/rejected": -8.945085525512695, "step": 1420 }, { "epoch": 0.44982698961937717, - "grad_norm": 4.819204330444336, + "grad_norm": 4.858677864074707, "learning_rate": 3.356626607764113e-06, - "logits/chosen": 1.4216114282608032, - "logits/rejected": 1.3206603527069092, - "logps/chosen": -439.0201110839844, - "logps/rejected": -846.8821411132812, - "loss": 0.1089, - "rewards/accuracies": 0.8374999761581421, - "rewards/chosen": -4.05989933013916, - "rewards/margins": 4.05983829498291, - "rewards/rejected": -8.11973762512207, + "logits/chosen": 1.5180631875991821, + "logits/rejected": 1.3998795747756958, + "logps/chosen": -523.0291137695312, + "logps/rejected": -951.4177856445312, + "loss": 0.1093, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -4.899743556976318, + "rewards/margins": 4.265069484710693, + "rewards/rejected": -9.164813041687012, "step": 1430 }, { "epoch": 0.45297263290342876, - "grad_norm": 4.423831462860107, + "grad_norm": 3.5026304721832275, "learning_rate": 3.3307854631904315e-06, - "logits/chosen": 1.1363990306854248, - "logits/rejected": 1.0369417667388916, - "logps/chosen": -541.7600708007812, - "logps/rejected": -1006.2171020507812, - "loss": 0.1362, - "rewards/accuracies": 0.8812500238418579, - "rewards/chosen": -5.079786777496338, - "rewards/margins": 4.622787952423096, - "rewards/rejected": -9.702574729919434, + "logits/chosen": 1.148923635482788, + "logits/rejected": 1.0456730127334595, + "logps/chosen": -508.77935791015625, + "logps/rejected": -951.7108154296875, + "loss": 0.1224, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -4.749743461608887, + "rewards/margins": 4.407544136047363, + "rewards/rejected": -9.157288551330566, "step": 1440 }, { "epoch": 0.45611827618748035, - "grad_norm": 1.959601640701294, + "grad_norm": 2.1828246116638184, "learning_rate": 3.3048441461070234e-06, - "logits/chosen": 1.2691389322280884, - "logits/rejected": 1.111587405204773, - "logps/chosen": -476.6605529785156, - "logps/rejected": -884.9557495117188, - "loss": 0.1146, - "rewards/accuracies": 0.856249988079071, - "rewards/chosen": -4.442279815673828, - "rewards/margins": 4.060351848602295, - "rewards/rejected": -8.502632141113281, + "logits/chosen": 1.3276925086975098, + "logits/rejected": 1.2130540609359741, + "logps/chosen": -421.02197265625, + "logps/rejected": -818.87939453125, + "loss": 0.1112, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -3.8857734203338623, + "rewards/margins": 3.9558587074279785, + "rewards/rejected": -7.841631889343262, "step": 1450 }, { "epoch": 0.45926391947153195, - "grad_norm": 2.6528480052948, + "grad_norm": 2.359039783477783, "learning_rate": 3.278805784405451e-06, - "logits/chosen": 1.3994677066802979, - "logits/rejected": 1.1729563474655151, - "logps/chosen": -460.7957458496094, - "logps/rejected": -922.0712890625, - "loss": 0.1055, - "rewards/accuracies": 0.8687499761581421, - "rewards/chosen": -4.265277862548828, - "rewards/margins": 4.591836452484131, - "rewards/rejected": -8.857114791870117, + "logits/chosen": 1.4966380596160889, + "logits/rejected": 1.2815477848052979, + "logps/chosen": -455.6396484375, + "logps/rejected": -913.0255737304688, + "loss": 0.1069, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.213668346405029, + "rewards/margins": 4.552624702453613, + "rewards/rejected": -8.7662935256958, "step": 1460 }, { "epoch": 0.46240956275558354, - "grad_norm": 3.501101493835449, + "grad_norm": 3.10565447807312, "learning_rate": 3.2526735176784897e-06, - "logits/chosen": 1.8661353588104248, - "logits/rejected": 1.6564010381698608, - "logps/chosen": -528.0916748046875, - "logps/rejected": -949.0935668945312, - "loss": 0.1108, + "logits/chosen": 1.8660796880722046, + "logits/rejected": 1.6580556631088257, + "logps/chosen": -490.677001953125, + "logps/rejected": -909.7364501953125, + "loss": 0.1092, "rewards/accuracies": 0.875, - "rewards/chosen": -4.956791400909424, - "rewards/margins": 4.165668964385986, - "rewards/rejected": -9.12246036529541, + "rewards/chosen": -4.582307815551758, + "rewards/margins": 4.146181583404541, + "rewards/rejected": -8.728489875793457, "step": 1470 }, { "epoch": 0.4655552060396351, - "grad_norm": 3.2202489376068115, + "grad_norm": 2.580960750579834, "learning_rate": 3.2264504968415805e-06, - "logits/chosen": 1.4454188346862793, - "logits/rejected": 1.239365816116333, - "logps/chosen": -503.5089416503906, - "logps/rejected": -947.4742431640625, - "loss": 0.1045, - "rewards/accuracies": 0.90625, - "rewards/chosen": -4.705296993255615, - "rewards/margins": 4.416085720062256, - "rewards/rejected": -9.121381759643555, + "logits/chosen": 1.4424006938934326, + "logits/rejected": 1.2268205881118774, + "logps/chosen": -466.53887939453125, + "logps/rejected": -913.9548950195312, + "loss": 0.1048, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -4.335131645202637, + "rewards/margins": 4.450761795043945, + "rewards/rejected": -8.785893440246582, "step": 1480 }, { "epoch": 0.4687008493236867, - "grad_norm": 2.5105979442596436, + "grad_norm": 2.8086788654327393, "learning_rate": 3.2001398837529e-06, - "logits/chosen": 1.4767264127731323, - "logits/rejected": 1.294965147972107, - "logps/chosen": -509.91497802734375, - "logps/rejected": -995.5406494140625, - "loss": 0.0922, - "rewards/accuracies": 0.9125000238418579, - "rewards/chosen": -4.786181449890137, - "rewards/margins": 4.818673610687256, - "rewards/rejected": -9.604853630065918, + "logits/chosen": 1.4587178230285645, + "logits/rejected": 1.2750834226608276, + "logps/chosen": -459.68896484375, + "logps/rejected": -923.9265747070312, + "loss": 0.0928, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -4.283493995666504, + "rewards/margins": 4.605031490325928, + "rewards/rejected": -8.88852596282959, "step": 1490 }, { "epoch": 0.47184649260773825, - "grad_norm": 3.5266494750976562, + "grad_norm": 4.19993257522583, "learning_rate": 3.1737448508321176e-06, - "logits/chosen": 1.5441019535064697, - "logits/rejected": 1.2486565113067627, - "logps/chosen": -518.9967041015625, - "logps/rejected": -971.5304565429688, - "loss": 0.1013, - "rewards/accuracies": 0.9312499761581421, - "rewards/chosen": -4.873408317565918, - "rewards/margins": 4.476175785064697, - "rewards/rejected": -9.349583625793457, + "logits/chosen": 1.593569040298462, + "logits/rejected": 1.276476502418518, + "logps/chosen": -481.2831115722656, + "logps/rejected": -942.7236328125, + "loss": 0.1057, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -4.496027946472168, + "rewards/margins": 4.565060615539551, + "rewards/rejected": -9.061088562011719, "step": 1500 }, { "epoch": 0.47184649260773825, - "eval_logits/chosen": 2.136415958404541, - "eval_logits/rejected": 1.951367974281311, - "eval_logps/chosen": -483.0837707519531, - "eval_logps/rejected": -947.4904174804688, - "eval_loss": 0.058037880808115005, - "eval_rewards/accuracies": 0.8951492309570312, - "eval_rewards/chosen": -4.503126621246338, - "eval_rewards/margins": 4.612789630889893, - "eval_rewards/rejected": -9.11591625213623, - "eval_runtime": 214.8935, - "eval_samples_per_second": 99.663, - "eval_steps_per_second": 1.559, + "eval_logits/chosen": 2.277968406677246, + "eval_logits/rejected": 2.0906763076782227, + "eval_logps/chosen": -451.1487731933594, + "eval_logps/rejected": -921.7240600585938, + "eval_loss": 0.05749654024839401, + "eval_rewards/accuracies": 0.9018656611442566, + "eval_rewards/chosen": -4.1835198402404785, + "eval_rewards/margins": 4.674561023712158, + "eval_rewards/rejected": -8.858080863952637, + "eval_runtime": 216.5654, + "eval_samples_per_second": 98.894, + "eval_steps_per_second": 1.547, "step": 1500 }, { "epoch": 0.47499213589178985, - "grad_norm": 2.5726158618927, + "grad_norm": 3.436180591583252, "learning_rate": 3.1472685806778837e-06, - "logits/chosen": 1.462704062461853, - "logits/rejected": 1.1601091623306274, - "logps/chosen": -491.4873046875, - "logps/rejected": -966.7628784179688, - "loss": 0.1067, - "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": -4.586657524108887, - "rewards/margins": 4.717507362365723, - "rewards/rejected": -9.30416488647461, + "logits/chosen": 1.5782949924468994, + "logits/rejected": 1.247202754020691, + "logps/chosen": -464.5965270996094, + "logps/rejected": -940.6214599609375, + "loss": 0.0983, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -4.317359447479248, + "rewards/margins": 4.72481632232666, + "rewards/rejected": -9.042176246643066, "step": 1510 }, { "epoch": 0.47813777917584144, - "grad_norm": 1.8397867679595947, + "grad_norm": 2.2029030323028564, "learning_rate": 3.1207142656840782e-06, - "logits/chosen": 1.4628472328186035, - "logits/rejected": 1.2875310182571411, - "logps/chosen": -484.8026428222656, - "logps/rejected": -923.8834228515625, - "loss": 0.0886, - "rewards/accuracies": 0.862500011920929, - "rewards/chosen": -4.506746292114258, - "rewards/margins": 4.361083984375, - "rewards/rejected": -8.867830276489258, + "logits/chosen": 1.4659521579742432, + "logits/rejected": 1.255765676498413, + "logps/chosen": -502.91094970703125, + "logps/rejected": -932.7726440429688, + "loss": 0.0947, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -4.687650203704834, + "rewards/margins": 4.2687153816223145, + "rewards/rejected": -8.956365585327148, "step": 1520 }, { "epoch": 0.48128342245989303, - "grad_norm": 3.018728494644165, + "grad_norm": 2.461082935333252, "learning_rate": 3.094085107654891e-06, - "logits/chosen": 1.3588612079620361, - "logits/rejected": 1.0993680953979492, - "logps/chosen": -544.7242431640625, - "logps/rejected": -1018.6455078125, - "loss": 0.1048, - "rewards/accuracies": 0.893750011920929, - "rewards/chosen": -5.132107734680176, - "rewards/margins": 4.696852684020996, - "rewards/rejected": -9.828960418701172, + "logits/chosen": 1.3395097255706787, + "logits/rejected": 1.0628923177719116, + "logps/chosen": -543.1881103515625, + "logps/rejected": -978.9011840820312, + "loss": 0.102, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -5.116451263427734, + "rewards/margins": 4.314597129821777, + "rewards/rejected": -9.431048393249512, "step": 1530 }, { "epoch": 0.4844290657439446, - "grad_norm": 2.7776284217834473, + "grad_norm": 2.2011494636535645, "learning_rate": 3.067384317418761e-06, - "logits/chosen": 1.5641913414001465, - "logits/rejected": 1.4478683471679688, - "logps/chosen": -493.83746337890625, - "logps/rejected": -903.9118041992188, - "loss": 0.116, - "rewards/accuracies": 0.8687499761581421, - "rewards/chosen": -4.606075286865234, - "rewards/margins": 4.075374126434326, - "rewards/rejected": -8.681449890136719, + "logits/chosen": 1.6539561748504639, + "logits/rejected": 1.5385477542877197, + "logps/chosen": -459.3910217285156, + "logps/rejected": -853.07177734375, + "loss": 0.1091, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -4.261325836181641, + "rewards/margins": 3.9113669395446777, + "rewards/rejected": -8.172693252563477, "step": 1540 }, { "epoch": 0.4875747090279962, - "grad_norm": 4.471944808959961, + "grad_norm": 4.575362682342529, "learning_rate": 3.0406151144412277e-06, - "logits/chosen": 1.447131633758545, - "logits/rejected": 1.3758630752563477, - "logps/chosen": -481.29736328125, - "logps/rejected": -913.4244384765625, - "loss": 0.0996, + "logits/chosen": 1.5317912101745605, + "logits/rejected": 1.4458856582641602, + "logps/chosen": -465.4632263183594, + "logps/rejected": -876.3185424804688, + "loss": 0.1009, "rewards/accuracies": 0.9125000238418579, - "rewards/chosen": -4.4858551025390625, - "rewards/margins": 4.2920660972595215, - "rewards/rejected": -8.777920722961426, + "rewards/chosen": -4.327389717102051, + "rewards/margins": 4.079252243041992, + "rewards/rejected": -8.406641960144043, "step": 1550 }, { "epoch": 0.4907203523120478, - "grad_norm": 1.814293622970581, + "grad_norm": 2.7277722358703613, "learning_rate": 3.013780726436743e-06, - "logits/chosen": 1.3209139108657837, - "logits/rejected": 1.036739468574524, - "logps/chosen": -497.64080810546875, - "logps/rejected": -983.0264892578125, - "loss": 0.0891, - "rewards/accuracies": 0.893750011920929, - "rewards/chosen": -4.66217565536499, - "rewards/margins": 4.808836936950684, - "rewards/rejected": -9.471014022827148, + "logits/chosen": 1.4253225326538086, + "logits/rejected": 1.1030299663543701, + "logps/chosen": -470.4295959472656, + "logps/rejected": -924.5455322265625, + "loss": 0.0854, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -4.38973331451416, + "rewards/margins": 4.496077537536621, + "rewards/rejected": -8.885812759399414, "step": 1560 }, { "epoch": 0.4938659955960994, - "grad_norm": 2.357700824737549, + "grad_norm": 2.7925002574920654, "learning_rate": 2.9868843889794867e-06, - "logits/chosen": 1.507373571395874, - "logits/rejected": 1.33405339717865, - "logps/chosen": -556.9375610351562, - "logps/rejected": -1055.0340576171875, - "loss": 0.0958, - "rewards/accuracies": 0.8687499761581421, - "rewards/chosen": -5.234933376312256, - "rewards/margins": 4.970101833343506, - "rewards/rejected": -10.205035209655762, + "logits/chosen": 1.7626025676727295, + "logits/rejected": 1.603830099105835, + "logps/chosen": -491.6067810058594, + "logps/rejected": -946.78076171875, + "loss": 0.0865, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -4.5813679695129395, + "rewards/margins": 4.5411176681518555, + "rewards/rejected": -9.122485160827637, "step": 1570 }, { "epoch": 0.497011638880151, - "grad_norm": 2.6845498085021973, + "grad_norm": 3.2050352096557617, "learning_rate": 2.9599293451132338e-06, - "logits/chosen": 1.3129870891571045, - "logits/rejected": 1.0617408752441406, - "logps/chosen": -516.5895385742188, - "logps/rejected": -1013.9332885742188, - "loss": 0.0764, - "rewards/accuracies": 0.893750011920929, - "rewards/chosen": -4.83015775680542, - "rewards/margins": 4.950501441955566, - "rewards/rejected": -9.780659675598145, + "logits/chosen": 1.5611720085144043, + "logits/rejected": 1.3016364574432373, + "logps/chosen": -469.46661376953125, + "logps/rejected": -943.2091674804688, + "loss": 0.0895, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": -4.3585710525512695, + "rewards/margins": 4.71451997756958, + "rewards/rejected": -9.073091506958008, "step": 1580 }, { "epoch": 0.5001572821642026, - "grad_norm": 3.2871477603912354, + "grad_norm": 2.416383743286133, "learning_rate": 2.9329188449603245e-06, - "logits/chosen": 1.3259484767913818, - "logits/rejected": 1.3023512363433838, - "logps/chosen": -524.0896606445312, - "logps/rejected": -990.5101318359375, - "loss": 0.0905, - "rewards/accuracies": 0.8812500238418579, - "rewards/chosen": -4.907674312591553, - "rewards/margins": 4.64675760269165, - "rewards/rejected": -9.55443286895752, + "logits/chosen": 1.4920631647109985, + "logits/rejected": 1.465451955795288, + "logps/chosen": -469.51873779296875, + "logps/rejected": -898.5816650390625, + "loss": 0.0846, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -4.3615593910217285, + "rewards/margins": 4.273259162902832, + "rewards/rejected": -8.634818077087402, "step": 1590 }, { "epoch": 0.5033029254482542, - "grad_norm": 3.381878137588501, + "grad_norm": 3.518874168395996, "learning_rate": 2.9058561453297783e-06, - "logits/chosen": 1.392225980758667, - "logits/rejected": 1.2445684671401978, - "logps/chosen": -441.95501708984375, - "logps/rejected": -929.5980224609375, - "loss": 0.1011, - "rewards/accuracies": 0.8687499761581421, - "rewards/chosen": -4.106937408447266, - "rewards/margins": 4.853115081787109, - "rewards/rejected": -8.960052490234375, + "logits/chosen": 1.5393750667572021, + "logits/rejected": 1.3930766582489014, + "logps/chosen": -449.69879150390625, + "logps/rejected": -957.0568237304688, + "loss": 0.1057, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -4.184264183044434, + "rewards/margins": 5.050030708312988, + "rewards/rejected": -9.234294891357422, "step": 1600 }, { "epoch": 0.5033029254482542, - "eval_logits/chosen": 2.1102631092071533, - "eval_logits/rejected": 1.9239301681518555, - "eval_logps/chosen": -436.5010681152344, - "eval_logps/rejected": -893.6846313476562, - "eval_loss": 0.05671687051653862, - "eval_rewards/accuracies": 0.9067164063453674, - "eval_rewards/chosen": -4.0373005867004395, - "eval_rewards/margins": 4.540557861328125, - "eval_rewards/rejected": -8.577857971191406, - "eval_runtime": 215.1071, - "eval_samples_per_second": 99.564, - "eval_steps_per_second": 1.557, + "eval_logits/chosen": 2.213601589202881, + "eval_logits/rejected": 2.019822120666504, + "eval_logps/chosen": -453.72308349609375, + "eval_logps/rejected": -928.4155883789062, + "eval_loss": 0.053593434393405914, + "eval_rewards/accuracies": 0.9130597114562988, + "eval_rewards/chosen": -4.209263324737549, + "eval_rewards/margins": 4.715731620788574, + "eval_rewards/rejected": -8.924994468688965, + "eval_runtime": 216.4803, + "eval_samples_per_second": 98.933, + "eval_steps_per_second": 1.547, "step": 1600 }, { "epoch": 0.5064485687323057, - "grad_norm": 3.025057315826416, + "grad_norm": 4.083642482757568, "learning_rate": 2.8787445093246004e-06, - "logits/chosen": 1.3364161252975464, - "logits/rejected": 1.176685094833374, - "logps/chosen": -451.01513671875, - "logps/rejected": -895.0511474609375, - "loss": 0.105, - "rewards/accuracies": 0.8687499761581421, - "rewards/chosen": -4.202619552612305, - "rewards/margins": 4.399237155914307, - "rewards/rejected": -8.60185718536377, + "logits/chosen": 1.4022510051727295, + "logits/rejected": 1.2157288789749146, + "logps/chosen": -457.09552001953125, + "logps/rejected": -900.7742309570312, + "loss": 0.1105, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -4.262914180755615, + "rewards/margins": 4.395590782165527, + "rewards/rejected": -8.6585054397583, "step": 1610 }, { "epoch": 0.5095942120163574, - "grad_norm": 3.637864112854004, + "grad_norm": 3.1871087551116943, "learning_rate": 2.8515872059483326e-06, - "logits/chosen": 1.2735852003097534, - "logits/rejected": 1.112821340560913, - "logps/chosen": -489.1302795410156, - "logps/rejected": -982.361328125, - "loss": 0.1066, - "rewards/accuracies": 0.90625, - "rewards/chosen": -4.53579568862915, - "rewards/margins": 4.906703948974609, - "rewards/rejected": -9.442499160766602, + "logits/chosen": 1.3273845911026, + "logits/rejected": 1.1355845928192139, + "logps/chosen": -478.5135803222656, + "logps/rejected": -951.0009765625, + "loss": 0.1045, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -4.429346561431885, + "rewards/margins": 4.699108600616455, + "rewards/rejected": -9.12845516204834, "step": 1620 }, { "epoch": 0.5127398553004089, - "grad_norm": 1.9579370021820068, + "grad_norm": 1.756052851676941, "learning_rate": 2.8243875097108897e-06, - "logits/chosen": 1.3890975713729858, - "logits/rejected": 1.2870466709136963, - "logps/chosen": -498.91650390625, - "logps/rejected": -1006.8635864257812, - "loss": 0.0956, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": -4.648484230041504, - "rewards/margins": 5.069706916809082, - "rewards/rejected": -9.718191146850586, + "logits/chosen": 1.482096552848816, + "logits/rejected": 1.359783411026001, + "logps/chosen": -482.2904357910156, + "logps/rejected": -976.9202270507812, + "loss": 0.0854, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -4.482015609741211, + "rewards/margins": 4.936542510986328, + "rewards/rejected": -9.418558120727539, "step": 1630 }, { "epoch": 0.5158854985844605, - "grad_norm": 2.1294662952423096, + "grad_norm": 2.7499189376831055, "learning_rate": 2.7971487002337344e-06, - "logits/chosen": 1.551054835319519, - "logits/rejected": 1.4010810852050781, - "logps/chosen": -553.7382202148438, - "logps/rejected": -1012.7103271484375, + "logits/chosen": 1.6604124307632446, + "logits/rejected": 1.5121276378631592, + "logps/chosen": -536.4432373046875, + "logps/rejected": -991.6915893554688, "loss": 0.0835, - "rewards/accuracies": 0.8687499761581421, - "rewards/chosen": -5.215034008026123, - "rewards/margins": 4.574709415435791, - "rewards/rejected": -9.78974437713623, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -5.0419921875, + "rewards/margins": 4.537399768829346, + "rewards/rejected": -9.579391479492188, "step": 1640 }, { "epoch": 0.5190311418685121, - "grad_norm": 2.6101462841033936, + "grad_norm": 2.7009730339050293, "learning_rate": 2.769874061854434e-06, - "logits/chosen": 1.5008209943771362, - "logits/rejected": 1.2978509664535522, - "logps/chosen": -475.5283203125, - "logps/rejected": -943.5218505859375, - "loss": 0.0822, + "logits/chosen": 1.630128264427185, + "logits/rejected": 1.43096923828125, + "logps/chosen": -473.77838134765625, + "logps/rejected": -935.79931640625, + "loss": 0.0789, "rewards/accuracies": 0.893750011920929, - "rewards/chosen": -4.424406051635742, - "rewards/margins": 4.653243064880371, - "rewards/rejected": -9.07765007019043, + "rewards/chosen": -4.406306743621826, + "rewards/margins": 4.593745708465576, + "rewards/rejected": -9.000051498413086, "step": 1650 }, { "epoch": 0.5221767851525637, - "grad_norm": 3.2625982761383057, + "grad_norm": 2.960386037826538, "learning_rate": 2.74256688323065e-06, - "logits/chosen": 1.244457483291626, - "logits/rejected": 1.1406192779541016, - "logps/chosen": -540.4188232421875, - "logps/rejected": -989.83154296875, - "loss": 0.0979, - "rewards/accuracies": 0.862500011920929, - "rewards/chosen": -5.072657108306885, - "rewards/margins": 4.476801872253418, - "rewards/rejected": -9.549459457397461, + "logits/chosen": 1.2850834131240845, + "logits/rejected": 1.1641004085540771, + "logps/chosen": -516.1502685546875, + "logps/rejected": -943.5867309570312, + "loss": 0.0922, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -4.829821586608887, + "rewards/margins": 4.256933689117432, + "rewards/rejected": -9.086755752563477, "step": 1660 }, { "epoch": 0.5253224284366153, - "grad_norm": 2.4055895805358887, + "grad_norm": 3.012941360473633, "learning_rate": 2.7152304569436055e-06, - "logits/chosen": 1.3721317052841187, - "logits/rejected": 1.1766210794448853, - "logps/chosen": -486.52117919921875, - "logps/rejected": -998.5260009765625, - "loss": 0.0737, - "rewards/accuracies": 0.918749988079071, - "rewards/chosen": -4.543233394622803, - "rewards/margins": 5.094179630279541, - "rewards/rejected": -9.637413024902344, + "logits/chosen": 1.3663134574890137, + "logits/rejected": 1.1525830030441284, + "logps/chosen": -484.38641357421875, + "logps/rejected": -991.9713134765625, + "loss": 0.0713, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -4.521730899810791, + "rewards/margins": 5.04987907409668, + "rewards/rejected": -9.571609497070312, "step": 1670 }, { "epoch": 0.5284680717206669, - "grad_norm": 4.099717140197754, + "grad_norm": 3.185410976409912, "learning_rate": 2.6878680791010786e-06, - "logits/chosen": 1.48918879032135, - "logits/rejected": 1.4536645412445068, - "logps/chosen": -565.0792236328125, - "logps/rejected": -1010.3558349609375, - "loss": 0.0941, - "rewards/accuracies": 0.8687499761581421, - "rewards/chosen": -5.319018840789795, - "rewards/margins": 4.435922145843506, - "rewards/rejected": -9.7549409866333, + "logits/chosen": 1.3727288246154785, + "logits/rejected": 1.32237708568573, + "logps/chosen": -528.0745849609375, + "logps/rejected": -955.1348876953125, + "loss": 0.0918, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -4.948831558227539, + "rewards/margins": 4.25364351272583, + "rewards/rejected": -9.202474594116211, "step": 1680 }, { "epoch": 0.5316137150047184, - "grad_norm": 4.480926513671875, + "grad_norm": 4.06403923034668, "learning_rate": 2.6604830489399763e-06, - "logits/chosen": 1.392275333404541, - "logits/rejected": 1.226122260093689, - "logps/chosen": -508.6419982910156, - "logps/rejected": -1001.2962646484375, - "loss": 0.1079, - "rewards/accuracies": 0.8812500238418579, - "rewards/chosen": -4.7513346672058105, - "rewards/margins": 4.9047369956970215, - "rewards/rejected": -9.656072616577148, + "logits/chosen": 1.203892707824707, + "logits/rejected": 1.0114085674285889, + "logps/chosen": -481.3412170410156, + "logps/rejected": -961.1526489257812, + "loss": 0.1006, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.478043556213379, + "rewards/margins": 4.7761616706848145, + "rewards/rejected": -9.254205703735352, "step": 1690 }, { "epoch": 0.5347593582887701, - "grad_norm": 3.0424184799194336, + "grad_norm": 4.09527587890625, "learning_rate": 2.6330786684285203e-06, - "logits/chosen": 1.5600693225860596, - "logits/rejected": 1.2706520557403564, - "logps/chosen": -478.228515625, - "logps/rejected": -1058.522705078125, - "loss": 0.0853, + "logits/chosen": 1.3332659006118774, + "logits/rejected": 1.002687692642212, + "logps/chosen": -484.726318359375, + "logps/rejected": -1045.978271484375, + "loss": 0.0881, "rewards/accuracies": 0.9125000238418579, - "rewards/chosen": -4.449268341064453, - "rewards/margins": 5.78544282913208, - "rewards/rejected": -10.234711647033691, + "rewards/chosen": -4.514087677001953, + "rewards/margins": 5.59476900100708, + "rewards/rejected": -10.108857154846191, "step": 1700 }, { "epoch": 0.5347593582887701, - "eval_logits/chosen": 2.233569383621216, - "eval_logits/rejected": 2.0647757053375244, - "eval_logps/chosen": -463.9637145996094, - "eval_logps/rejected": -965.1707763671875, - "eval_loss": 0.04823598265647888, - "eval_rewards/accuracies": 0.9067164063453674, - "eval_rewards/chosen": -4.31192684173584, - "eval_rewards/margins": 4.9807939529418945, - "eval_rewards/rejected": -9.292719841003418, - "eval_runtime": 214.9777, - "eval_samples_per_second": 99.624, - "eval_steps_per_second": 1.558, + "eval_logits/chosen": 2.0804264545440674, + "eval_logits/rejected": 1.876003384590149, + "eval_logps/chosen": -478.5644226074219, + "eval_logps/rejected": -972.8605346679688, + "eval_loss": 0.0490301214158535, + "eval_rewards/accuracies": 0.9100746512413025, + "eval_rewards/chosen": -4.457676410675049, + "eval_rewards/margins": 4.911769866943359, + "eval_rewards/rejected": -9.369446754455566, + "eval_runtime": 215.8813, + "eval_samples_per_second": 99.207, + "eval_steps_per_second": 1.552, "step": 1700 }, { "epoch": 0.5379050015728216, - "grad_norm": 2.249830484390259, + "grad_norm": 2.8809056282043457, "learning_rate": 2.6056582418681164e-06, - "logits/chosen": 1.4642279148101807, - "logits/rejected": 1.204160451889038, - "logps/chosen": -478.66937255859375, - "logps/rejected": -993.77392578125, - "loss": 0.0807, - "rewards/accuracies": 0.9312499761581421, - "rewards/chosen": -4.444479465484619, - "rewards/margins": 5.1373610496521, - "rewards/rejected": -9.581840515136719, + "logits/chosen": 1.3827991485595703, + "logits/rejected": 1.080010175704956, + "logps/chosen": -460.39697265625, + "logps/rejected": -988.3931884765625, + "loss": 0.083, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -4.2614946365356445, + "rewards/margins": 5.26639986038208, + "rewards/rejected": -9.527894973754883, "step": 1710 }, { "epoch": 0.5410506448568733, - "grad_norm": 3.891451358795166, + "grad_norm": 3.1663739681243896, "learning_rate": 2.5782250754949334e-06, - "logits/chosen": 1.3919012546539307, - "logits/rejected": 1.2075676918029785, - "logps/chosen": -528.3414306640625, - "logps/rejected": -1053.069091796875, - "loss": 0.0831, - "rewards/accuracies": 0.875, - "rewards/chosen": -4.955795764923096, - "rewards/margins": 5.23477029800415, - "rewards/rejected": -10.19056510925293, + "logits/chosen": 1.3737657070159912, + "logits/rejected": 1.1691869497299194, + "logps/chosen": -492.1012268066406, + "logps/rejected": -988.88623046875, + "loss": 0.081, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -4.593070030212402, + "rewards/margins": 4.955358028411865, + "rewards/rejected": -9.548428535461426, "step": 1720 }, { "epoch": 0.5441962881409248, - "grad_norm": 2.48587965965271, + "grad_norm": 3.5924928188323975, "learning_rate": 2.55078247708125e-06, - "logits/chosen": 1.679917573928833, - "logits/rejected": 1.483515977859497, - "logps/chosen": -483.80340576171875, - "logps/rejected": -970.2359619140625, - "loss": 0.0823, - "rewards/accuracies": 0.918749988079071, - "rewards/chosen": -4.530341148376465, - "rewards/margins": 4.841682434082031, - "rewards/rejected": -9.37202262878418, + "logits/chosen": 1.6769235134124756, + "logits/rejected": 1.4500882625579834, + "logps/chosen": -518.3870849609375, + "logps/rejected": -1004.2853393554688, + "loss": 0.081, + "rewards/accuracies": 0.90625, + "rewards/chosen": -4.87607479095459, + "rewards/margins": 4.8361406326293945, + "rewards/rejected": -9.712215423583984, "step": 1730 }, { "epoch": 0.5473419314249764, - "grad_norm": 4.296637058258057, + "grad_norm": 4.5862908363342285, "learning_rate": 2.5233337555366206e-06, - "logits/chosen": 1.3564870357513428, - "logits/rejected": 1.3211233615875244, - "logps/chosen": -497.56915283203125, - "logps/rejected": -997.6647338867188, - "loss": 0.1098, - "rewards/accuracies": 0.893750011920929, - "rewards/chosen": -4.646389007568359, - "rewards/margins": 4.965304851531982, - "rewards/rejected": -9.6116943359375, + "logits/chosen": 1.373237133026123, + "logits/rejected": 1.3128149509429932, + "logps/chosen": -525.4005737304688, + "logps/rejected": -1018.37646484375, + "loss": 0.1082, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -4.924373626708984, + "rewards/margins": 4.894657135009766, + "rewards/rejected": -9.81903076171875, "step": 1740 }, { "epoch": 0.550487574709028, - "grad_norm": 3.6422853469848633, + "grad_norm": 2.2820427417755127, "learning_rate": 2.4958822205089e-06, - "logits/chosen": 1.2528626918792725, - "logits/rejected": 1.1883177757263184, - "logps/chosen": -489.92425537109375, - "logps/rejected": -988.5632934570312, - "loss": 0.0915, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": -4.553713798522949, - "rewards/margins": 4.979607582092285, - "rewards/rejected": -9.533323287963867, + "logits/chosen": 1.3243175745010376, + "logits/rejected": 1.246760606765747, + "logps/chosen": -448.7105407714844, + "logps/rejected": -906.8756103515625, + "loss": 0.0957, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -4.1411871910095215, + "rewards/margins": 4.5750932693481445, + "rewards/rejected": -8.716279983520508, "step": 1750 }, { "epoch": 0.5536332179930796, - "grad_norm": 4.168347358703613, + "grad_norm": 3.8977651596069336, "learning_rate": 2.468431181985179e-06, - "logits/chosen": 1.3860187530517578, - "logits/rejected": 1.1525763273239136, - "logps/chosen": -503.0482482910156, - "logps/rejected": -1023.8341674804688, - "loss": 0.0837, - "rewards/accuracies": 0.8812500238418579, - "rewards/chosen": -4.700275421142578, - "rewards/margins": 5.1787824630737305, - "rewards/rejected": -9.879056930541992, + "logits/chosen": 1.4766485691070557, + "logits/rejected": 1.2319351434707642, + "logps/chosen": -504.727294921875, + "logps/rejected": -1006.2886962890625, + "loss": 0.0752, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -4.717135906219482, + "rewards/margins": 4.985989093780518, + "rewards/rejected": -9.703125953674316, "step": 1760 }, { "epoch": 0.5567788612771312, - "grad_norm": 2.398576498031616, + "grad_norm": 2.3041610717773438, "learning_rate": 2.4409839498926848e-06, - "logits/chosen": 1.508630394935608, - "logits/rejected": 1.2709838151931763, - "logps/chosen": -478.9554138183594, - "logps/rejected": -994.9898681640625, - "loss": 0.1004, - "rewards/accuracies": 0.9312499761581421, - "rewards/chosen": -4.457327842712402, - "rewards/margins": 5.13339900970459, - "rewards/rejected": -9.590726852416992, + "logits/chosen": 1.6048414707183838, + "logits/rejected": 1.3524333238601685, + "logps/chosen": -490.019287109375, + "logps/rejected": -1014.9865112304688, + "loss": 0.0925, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -4.56765079498291, + "rewards/margins": 5.2225165367126465, + "rewards/rejected": -9.790167808532715, "step": 1770 }, { "epoch": 0.5599245045611828, - "grad_norm": 1.8458563089370728, + "grad_norm": 1.7596114873886108, "learning_rate": 2.41354383369968e-06, - "logits/chosen": 1.2807908058166504, - "logits/rejected": 1.176458716392517, - "logps/chosen": -518.6404418945312, - "logps/rejected": -1035.277587890625, - "loss": 0.0897, - "rewards/accuracies": 0.887499988079071, - "rewards/chosen": -4.8516459465026855, - "rewards/margins": 5.121449947357178, - "rewards/rejected": -9.973095893859863, + "logits/chosen": 1.3451988697052002, + "logits/rejected": 1.2149244546890259, + "logps/chosen": -526.4452514648438, + "logps/rejected": -1049.12646484375, + "loss": 0.0762, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -4.929293155670166, + "rewards/margins": 5.181774616241455, + "rewards/rejected": -10.111066818237305, "step": 1780 }, { "epoch": 0.5630701478452343, - "grad_norm": 2.2971677780151367, + "grad_norm": 2.1974737644195557, "learning_rate": 2.3861141420164246e-06, - "logits/chosen": 1.4796741008758545, - "logits/rejected": 1.2651772499084473, - "logps/chosen": -477.72747802734375, - "logps/rejected": -1049.280029296875, - "loss": 0.0764, + "logits/chosen": 1.6341116428375244, + "logits/rejected": 1.4171388149261475, + "logps/chosen": -478.0091247558594, + "logps/rejected": -1021.9977416992188, + "loss": 0.0787, "rewards/accuracies": 0.9125000238418579, - "rewards/chosen": -4.469423294067383, - "rewards/margins": 5.668953895568848, - "rewards/rejected": -10.13837718963623, + "rewards/chosen": -4.472123146057129, + "rewards/margins": 5.393218517303467, + "rewards/rejected": -9.865342140197754, "step": 1790 }, { "epoch": 0.5662157911292859, - "grad_norm": 2.9305918216705322, + "grad_norm": 2.651939630508423, "learning_rate": 2.3586981821962325e-06, - "logits/chosen": 1.2771943807601929, - "logits/rejected": 1.146514654159546, - "logps/chosen": -525.7647705078125, - "logps/rejected": -1003.0315551757812, - "loss": 0.0897, - "rewards/accuracies": 0.84375, - "rewards/chosen": -4.923129558563232, - "rewards/margins": 4.7543439865112305, - "rewards/rejected": -9.677474975585938, + "logits/chosen": 1.4779160022735596, + "logits/rejected": 1.3249865770339966, + "logps/chosen": -514.2837524414062, + "logps/rejected": -986.8079833984375, + "loss": 0.0847, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -4.807987213134766, + "rewards/margins": 4.707097053527832, + "rewards/rejected": -9.515085220336914, "step": 1800 }, { "epoch": 0.5662157911292859, - "eval_logits/chosen": 2.0822219848632812, - "eval_logits/rejected": 1.9037203788757324, - "eval_logps/chosen": -462.9552307128906, - "eval_logps/rejected": -978.6490478515625, - "eval_loss": 0.04494684934616089, - "eval_rewards/accuracies": 0.9100746512413025, - "eval_rewards/chosen": -4.301841735839844, - "eval_rewards/margins": 5.1256632804870605, - "eval_rewards/rejected": -9.427504539489746, - "eval_runtime": 215.1799, - "eval_samples_per_second": 99.531, - "eval_steps_per_second": 1.557, + "eval_logits/chosen": 2.2903590202331543, + "eval_logits/rejected": 2.0999255180358887, + "eval_logps/chosen": -458.1053771972656, + "eval_logps/rejected": -977.0005493164062, + "eval_loss": 0.044130466878414154, + "eval_rewards/accuracies": 0.9130597114562988, + "eval_rewards/chosen": -4.253086566925049, + "eval_rewards/margins": 5.157759666442871, + "eval_rewards/rejected": -9.410846710205078, + "eval_runtime": 216.0337, + "eval_samples_per_second": 99.137, + "eval_steps_per_second": 1.551, "step": 1800 }, { "epoch": 0.5693614344133375, - "grad_norm": 3.8161098957061768, + "grad_norm": 4.547900676727295, "learning_rate": 2.3312992599366922e-06, - "logits/chosen": 1.2443963289260864, - "logits/rejected": 1.1431701183319092, - "logps/chosen": -506.3291015625, - "logps/rejected": -968.5440673828125, - "loss": 0.0944, - "rewards/accuracies": 0.887499988079071, - "rewards/chosen": -4.7178635597229, - "rewards/margins": 4.596076011657715, - "rewards/rejected": -9.313939094543457, + "logits/chosen": 1.3972145318984985, + "logits/rejected": 1.2784581184387207, + "logps/chosen": -507.777587890625, + "logps/rejected": -977.9973754882812, + "loss": 0.0925, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -4.732403755187988, + "rewards/margins": 4.675986289978027, + "rewards/rejected": -9.4083890914917, "step": 1810 }, { "epoch": 0.5725070776973891, - "grad_norm": 2.8469791412353516, + "grad_norm": 3.3379406929016113, "learning_rate": 2.3039206788810772e-06, - "logits/chosen": 1.3020217418670654, - "logits/rejected": 1.1260459423065186, - "logps/chosen": -498.827392578125, - "logps/rejected": -1060.462158203125, - "loss": 0.0784, + "logits/chosen": 1.3549931049346924, + "logits/rejected": 1.1878349781036377, + "logps/chosen": -511.08306884765625, + "logps/rejected": -1061.042236328125, + "loss": 0.0734, "rewards/accuracies": 0.918749988079071, - "rewards/chosen": -4.652395248413086, - "rewards/margins": 5.58463716506958, - "rewards/rejected": -10.237032890319824, + "rewards/chosen": -4.774610996246338, + "rewards/margins": 5.468084812164307, + "rewards/rejected": -10.242693901062012, "step": 1820 }, { "epoch": 0.5756527209814407, - "grad_norm": 2.866737127304077, + "grad_norm": 3.089817523956299, "learning_rate": 2.276565740220006e-06, - "logits/chosen": 1.2295194864273071, - "logits/rejected": 1.1037569046020508, - "logps/chosen": -521.6785888671875, - "logps/rejected": -1030.312255859375, - "loss": 0.0949, + "logits/chosen": 1.3616137504577637, + "logits/rejected": 1.2165549993515015, + "logps/chosen": -518.4862060546875, + "logps/rejected": -1029.1578369140625, + "loss": 0.088, "rewards/accuracies": 0.893750011920929, - "rewards/chosen": -4.875964641571045, - "rewards/margins": 5.085350036621094, - "rewards/rejected": -9.96131420135498, + "rewards/chosen": -4.843867778778076, + "rewards/margins": 5.105698108673096, + "rewards/rejected": -9.949565887451172, "step": 1830 }, { "epoch": 0.5787983642654922, - "grad_norm": 3.850404739379883, + "grad_norm": 3.8508617877960205, "learning_rate": 2.249237742293399e-06, - "logits/chosen": 1.5953633785247803, - "logits/rejected": 1.3363173007965088, - "logps/chosen": -480.15625, - "logps/rejected": -995.4392700195312, - "loss": 0.0825, + "logits/chosen": 1.7054471969604492, + "logits/rejected": 1.4460281133651733, + "logps/chosen": -489.3634338378906, + "logps/rejected": -1021.7463989257812, + "loss": 0.0793, "rewards/accuracies": 0.918749988079071, - "rewards/chosen": -4.456434726715088, - "rewards/margins": 5.125838756561279, - "rewards/rejected": -9.582273483276367, + "rewards/chosen": -4.548079013824463, + "rewards/margins": 5.296806812286377, + "rewards/rejected": -9.844886779785156, "step": 1840 }, { "epoch": 0.5819440075495439, - "grad_norm": 3.528315544128418, + "grad_norm": 3.4135677814483643, "learning_rate": 2.2219399801927818e-06, - "logits/chosen": 1.3022558689117432, - "logits/rejected": 1.3659191131591797, - "logps/chosen": -491.4390563964844, - "logps/rejected": -1003.6394653320312, - "loss": 0.0754, - "rewards/accuracies": 0.918749988079071, - "rewards/chosen": -4.590095043182373, - "rewards/margins": 5.104881286621094, - "rewards/rejected": -9.694976806640625, + "logits/chosen": 1.3625301122665405, + "logits/rejected": 1.4042856693267822, + "logps/chosen": -479.4251403808594, + "logps/rejected": -991.6087036132812, + "loss": 0.0736, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -4.46974515914917, + "rewards/margins": 5.104647636413574, + "rewards/rejected": -9.574393272399902, "step": 1850 }, { "epoch": 0.5850896508335954, - "grad_norm": 1.6058087348937988, + "grad_norm": 1.2604960203170776, "learning_rate": 2.194675745363971e-06, - "logits/chosen": 1.203044056892395, - "logits/rejected": 1.063595175743103, - "logps/chosen": -501.7749938964844, - "logps/rejected": -1017.3069458007812, - "loss": 0.0582, - "rewards/accuracies": 0.893750011920929, - "rewards/chosen": -4.695984840393066, - "rewards/margins": 5.130289077758789, - "rewards/rejected": -9.826274871826172, + "logits/chosen": 1.2001352310180664, + "logits/rejected": 1.0285674333572388, + "logps/chosen": -497.36669921875, + "logps/rejected": -994.6357421875, + "loss": 0.0587, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -4.651559829711914, + "rewards/margins": 4.947619915008545, + "rewards/rejected": -9.599178314208984, "step": 1860 }, { "epoch": 0.5882352941176471, - "grad_norm": 3.549255847930908, + "grad_norm": 1.9666972160339355, "learning_rate": 2.167448325210214e-06, - "logits/chosen": 1.3493187427520752, - "logits/rejected": 1.2300829887390137, - "logps/chosen": -551.4163208007812, - "logps/rejected": -1054.571533203125, - "loss": 0.0885, - "rewards/accuracies": 0.893750011920929, - "rewards/chosen": -5.191308498382568, - "rewards/margins": 5.013763904571533, - "rewards/rejected": -10.205072402954102, + "logits/chosen": 1.406222939491272, + "logits/rejected": 1.2748348712921143, + "logps/chosen": -512.0443115234375, + "logps/rejected": -981.5230712890625, + "loss": 0.0841, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -4.797163486480713, + "rewards/margins": 4.677144527435303, + "rewards/rejected": -9.474308967590332, "step": 1870 }, { "epoch": 0.5913809374016986, - "grad_norm": 3.082181930541992, + "grad_norm": 2.399761915206909, "learning_rate": 2.140261002695804e-06, - "logits/chosen": 1.4283784627914429, - "logits/rejected": 1.312314748764038, - "logps/chosen": -510.80908203125, - "logps/rejected": -1003.2232666015625, - "loss": 0.0661, - "rewards/accuracies": 0.893750011920929, - "rewards/chosen": -4.792733669281006, - "rewards/margins": 4.895735740661621, - "rewards/rejected": -9.688468933105469, + "logits/chosen": 1.6316009759902954, + "logits/rejected": 1.5087294578552246, + "logps/chosen": -472.5790100097656, + "logps/rejected": -953.4318237304688, + "loss": 0.0648, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -4.410122394561768, + "rewards/margins": 4.779942512512207, + "rewards/rejected": -9.190065383911133, "step": 1880 }, { "epoch": 0.5945265806857503, - "grad_norm": 3.06274676322937, + "grad_norm": 4.341762065887451, "learning_rate": 2.1131170559502328e-06, - "logits/chosen": 1.4798619747161865, - "logits/rejected": 1.2996398210525513, - "logps/chosen": -499.54010009765625, - "logps/rejected": -1004.2786865234375, - "loss": 0.0885, - "rewards/accuracies": 0.925000011920929, - "rewards/chosen": -4.648800373077393, - "rewards/margins": 5.034187316894531, - "rewards/rejected": -9.682988166809082, + "logits/chosen": 1.6851389408111572, + "logits/rejected": 1.484548807144165, + "logps/chosen": -491.853271484375, + "logps/rejected": -999.4378051757812, + "loss": 0.0846, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -4.5716047286987305, + "rewards/margins": 5.062628269195557, + "rewards/rejected": -9.634233474731445, "step": 1890 }, { "epoch": 0.5976722239698018, - "grad_norm": 2.7769219875335693, + "grad_norm": 3.187537431716919, "learning_rate": 2.0860197578729306e-06, - "logits/chosen": 1.4442288875579834, - "logits/rejected": 1.2996704578399658, - "logps/chosen": -501.84149169921875, - "logps/rejected": -1037.2816162109375, - "loss": 0.0717, - "rewards/accuracies": 0.90625, - "rewards/chosen": -4.711190700531006, - "rewards/margins": 5.312428951263428, - "rewards/rejected": -10.023618698120117, + "logits/chosen": 1.5915769338607788, + "logits/rejected": 1.439206838607788, + "logps/chosen": -475.848876953125, + "logps/rejected": -1013.7513427734375, + "loss": 0.0713, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -4.451183795928955, + "rewards/margins": 5.336948871612549, + "rewards/rejected": -9.788132667541504, "step": 1900 }, { "epoch": 0.5976722239698018, - "eval_logits/chosen": 2.174854278564453, - "eval_logits/rejected": 2.000274181365967, - "eval_logps/chosen": -476.6779479980469, - "eval_logps/rejected": -1019.844482421875, - "eval_loss": 0.040205687284469604, - "eval_rewards/accuracies": 0.9111940264701843, - "eval_rewards/chosen": -4.4390692710876465, - "eval_rewards/margins": 5.400389194488525, - "eval_rewards/rejected": -9.839457511901855, - "eval_runtime": 215.3019, - "eval_samples_per_second": 99.474, - "eval_steps_per_second": 1.556, + "eval_logits/chosen": 2.286052942276001, + "eval_logits/rejected": 2.0887067317962646, + "eval_logps/chosen": -473.8065185546875, + "eval_logps/rejected": -1001.3447875976562, + "eval_loss": 0.04112754389643669, + "eval_rewards/accuracies": 0.9167910218238831, + "eval_rewards/chosen": -4.410098075866699, + "eval_rewards/margins": 5.244190692901611, + "eval_rewards/rejected": -9.654288291931152, + "eval_runtime": 216.0279, + "eval_samples_per_second": 99.14, + "eval_steps_per_second": 1.551, "step": 1900 }, { "epoch": 0.6008178672538534, - "grad_norm": 2.0829498767852783, + "grad_norm": 1.875487208366394, "learning_rate": 2.058972375738635e-06, - "logits/chosen": 1.4589773416519165, - "logits/rejected": 1.2779289484024048, - "logps/chosen": -474.3269958496094, - "logps/rejected": -990.4747924804688, - "loss": 0.0856, - "rewards/accuracies": 0.887499988079071, - "rewards/chosen": -4.426661491394043, - "rewards/margins": 5.121007442474365, - "rewards/rejected": -9.547670364379883, + "logits/chosen": 1.528555154800415, + "logits/rejected": 1.322067379951477, + "logps/chosen": -459.99029541015625, + "logps/rejected": -968.6251831054688, + "loss": 0.0775, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -4.283050537109375, + "rewards/margins": 5.045687198638916, + "rewards/rejected": -9.328737258911133, "step": 1910 }, { "epoch": 0.603963510537905, - "grad_norm": 2.426499605178833, + "grad_norm": 2.8932604789733887, "learning_rate": 2.031978170803433e-06, - "logits/chosen": 1.4309927225112915, - "logits/rejected": 1.1588290929794312, - "logps/chosen": -473.109130859375, - "logps/rejected": -1036.1728515625, - "loss": 0.0589, - "rewards/accuracies": 0.9437500238418579, - "rewards/chosen": -4.395280361175537, - "rewards/margins": 5.615933418273926, - "rewards/rejected": -10.011213302612305, + "logits/chosen": 1.578962802886963, + "logits/rejected": 1.275937795639038, + "logps/chosen": -466.869873046875, + "logps/rejected": -997.1063232421875, + "loss": 0.0638, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -4.332566261291504, + "rewards/margins": 5.287723541259766, + "rewards/rejected": -9.62028980255127, "step": 1920 }, { "epoch": 0.6071091538219566, - "grad_norm": 4.0052947998046875, + "grad_norm": 3.8268966674804688, "learning_rate": 2.0050403979115372e-06, - "logits/chosen": 1.4139277935028076, - "logits/rejected": 1.32688307762146, - "logps/chosen": -508.6914978027344, - "logps/rejected": -1031.7740478515625, - "loss": 0.0887, - "rewards/accuracies": 0.90625, - "rewards/chosen": -4.769944667816162, - "rewards/margins": 5.1961283683776855, - "rewards/rejected": -9.966073036193848, + "logits/chosen": 1.6024353504180908, + "logits/rejected": 1.4667185544967651, + "logps/chosen": -519.0797119140625, + "logps/rejected": -1030.8382568359375, + "loss": 0.0863, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -4.873663902282715, + "rewards/margins": 5.082743167877197, + "rewards/rejected": -9.956406593322754, "step": 1930 }, { "epoch": 0.6102547971060082, - "grad_norm": 2.5407373905181885, + "grad_norm": 1.616674542427063, "learning_rate": 1.978162305102828e-06, - "logits/chosen": 1.255937099456787, - "logits/rejected": 1.1275227069854736, - "logps/chosen": -508.431884765625, - "logps/rejected": -1019.8643798828125, - "loss": 0.0771, - "rewards/accuracies": 0.8812500238418579, - "rewards/chosen": -4.759140491485596, - "rewards/margins": 5.087557792663574, - "rewards/rejected": -9.846698760986328, + "logits/chosen": 1.3861491680145264, + "logits/rejected": 1.261344313621521, + "logps/chosen": -475.9474182128906, + "logps/rejected": -974.275390625, + "loss": 0.0714, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -4.4340996742248535, + "rewards/margins": 4.956498146057129, + "rewards/rejected": -9.390597343444824, "step": 1940 }, { "epoch": 0.6134004403900598, - "grad_norm": 1.74916410446167, + "grad_norm": 2.2611825466156006, "learning_rate": 1.9513471332212218e-06, - "logits/chosen": 1.3682215213775635, - "logits/rejected": 1.1232274770736694, - "logps/chosen": -526.3414306640625, - "logps/rejected": -1046.4849853515625, - "loss": 0.0651, - "rewards/accuracies": 0.918749988079071, - "rewards/chosen": -4.9356513023376465, - "rewards/margins": 5.17080020904541, - "rewards/rejected": -10.106451988220215, + "logits/chosen": 1.4959056377410889, + "logits/rejected": 1.239583969116211, + "logps/chosen": -499.00299072265625, + "logps/rejected": -1015.1644287109375, + "loss": 0.0668, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -4.661846160888672, + "rewards/margins": 5.131340503692627, + "rewards/rejected": -9.793185234069824, "step": 1950 }, { "epoch": 0.6165460836741113, - "grad_norm": 3.125661611557007, + "grad_norm": 2.701421022415161, "learning_rate": 1.9245981155239003e-06, - "logits/chosen": 1.0954793691635132, - "logits/rejected": 0.8896375894546509, - "logps/chosen": -468.3290100097656, - "logps/rejected": -1034.557861328125, - "loss": 0.0803, - "rewards/accuracies": 0.9437500238418579, - "rewards/chosen": -4.36580753326416, - "rewards/margins": 5.616608619689941, - "rewards/rejected": -9.982416152954102, + "logits/chosen": 1.2852675914764404, + "logits/rejected": 1.0676230192184448, + "logps/chosen": -492.07159423828125, + "logps/rejected": -1062.6197509765625, + "loss": 0.0787, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -4.603121757507324, + "rewards/margins": 5.659279823303223, + "rewards/rejected": -10.262402534484863, "step": 1960 }, { "epoch": 0.619691726958163, - "grad_norm": 1.5080751180648804, + "grad_norm": 1.3949973583221436, "learning_rate": 1.8979184772914626e-06, - "logits/chosen": 1.0934078693389893, - "logits/rejected": 1.0470936298370361, - "logps/chosen": -469.5868225097656, - "logps/rejected": -1005.32763671875, - "loss": 0.0781, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": -4.377876281738281, - "rewards/margins": 5.329991340637207, - "rewards/rejected": -9.707868576049805, + "logits/chosen": 1.3694791793823242, + "logits/rejected": 1.3277966976165771, + "logps/chosen": -443.76300048828125, + "logps/rejected": -968.4646606445312, + "loss": 0.0792, + "rewards/accuracies": 0.9375, + "rewards/chosen": -4.119488716125488, + "rewards/margins": 5.219561576843262, + "rewards/rejected": -9.339049339294434, "step": 1970 }, { "epoch": 0.6228373702422145, - "grad_norm": 2.835935592651367, + "grad_norm": 2.097567081451416, "learning_rate": 1.8713114354390302e-06, - "logits/chosen": 1.0925285816192627, - "logits/rejected": 0.8882206678390503, - "logps/chosen": -486.85260009765625, - "logps/rejected": -1009.4669799804688, - "loss": 0.0776, + "logits/chosen": 1.3356621265411377, + "logits/rejected": 1.1214429140090942, + "logps/chosen": -444.4170837402344, + "logps/rejected": -960.7073974609375, + "loss": 0.0787, "rewards/accuracies": 0.90625, - "rewards/chosen": -4.53892183303833, - "rewards/margins": 5.20424222946167, - "rewards/rejected": -9.7431640625, + "rewards/chosen": -4.113885402679443, + "rewards/margins": 5.141117095947266, + "rewards/rejected": -9.255002975463867, "step": 1980 }, { "epoch": 0.6259830135262662, - "grad_norm": 3.386470317840576, + "grad_norm": 3.7821717262268066, "learning_rate": 1.8447801981283692e-06, - "logits/chosen": 1.1058335304260254, - "logits/rejected": 0.9235653877258301, - "logps/chosen": -508.955810546875, - "logps/rejected": -1028.66162109375, - "loss": 0.0806, - "rewards/accuracies": 0.90625, - "rewards/chosen": -4.748799800872803, - "rewards/margins": 5.186470031738281, - "rewards/rejected": -9.935269355773926, + "logits/chosen": 1.2892686128616333, + "logits/rejected": 1.1204421520233154, + "logps/chosen": -492.14544677734375, + "logps/rejected": -1007.2326049804688, + "loss": 0.0763, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -4.580545425415039, + "rewards/margins": 5.139875411987305, + "rewards/rejected": -9.720420837402344, "step": 1990 }, { "epoch": 0.6291286568103177, - "grad_norm": 2.4048423767089844, + "grad_norm": 2.3394644260406494, "learning_rate": 1.818327964381063e-06, - "logits/chosen": 1.3770169019699097, - "logits/rejected": 1.08171808719635, - "logps/chosen": -558.8704833984375, - "logps/rejected": -1182.05517578125, - "loss": 0.0487, - "rewards/accuracies": 0.9375, - "rewards/chosen": -5.268151760101318, - "rewards/margins": 6.191213607788086, - "rewards/rejected": -11.459364891052246, + "logits/chosen": 1.5386545658111572, + "logits/rejected": 1.2143795490264893, + "logps/chosen": -512.078125, + "logps/rejected": -1119.013916015625, + "loss": 0.0553, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": -4.799826622009277, + "rewards/margins": 6.028892993927002, + "rewards/rejected": -10.828720092773438, "step": 2000 }, { "epoch": 0.6291286568103177, - "eval_logits/chosen": 2.1422410011291504, - "eval_logits/rejected": 1.9354875087738037, - "eval_logps/chosen": -580.0486450195312, - "eval_logps/rejected": -1167.69677734375, - "eval_loss": 0.03678546100854874, - "eval_rewards/accuracies": 0.907835841178894, - "eval_rewards/chosen": -5.472775936126709, - "eval_rewards/margins": 5.845205307006836, - "eval_rewards/rejected": -11.317980766296387, - "eval_runtime": 215.2983, - "eval_samples_per_second": 99.476, - "eval_steps_per_second": 1.556, + "eval_logits/chosen": 2.2607874870300293, + "eval_logits/rejected": 2.046884775161743, + "eval_logps/chosen": -529.6686401367188, + "eval_logps/rejected": -1093.740234375, + "eval_loss": 0.037754353135824203, + "eval_rewards/accuracies": 0.9123134613037109, + "eval_rewards/chosen": -4.9687180519104, + "eval_rewards/margins": 5.609524250030518, + "eval_rewards/rejected": -10.578243255615234, + "eval_runtime": 216.3701, + "eval_samples_per_second": 98.983, + "eval_steps_per_second": 1.548, "step": 2000 }, { "epoch": 0.6322743000943692, - "grad_norm": 4.648438453674316, + "grad_norm": 4.657315731048584, "learning_rate": 1.7919579236927873e-06, - "logits/chosen": 1.4500977993011475, - "logits/rejected": 1.4014136791229248, - "logps/chosen": -587.9722290039062, - "logps/rejected": -1136.3968505859375, - "loss": 0.0804, + "logits/chosen": 1.5727989673614502, + "logits/rejected": 1.5137349367141724, + "logps/chosen": -547.173828125, + "logps/rejected": -1079.36181640625, + "loss": 0.0835, "rewards/accuracies": 0.90625, - "rewards/chosen": -5.550187110900879, - "rewards/margins": 5.465216636657715, - "rewards/rejected": -11.015403747558594, + "rewards/chosen": -5.141914367675781, + "rewards/margins": 5.302892684936523, + "rewards/rejected": -10.444807052612305, "step": 2010 }, { "epoch": 0.6354199433784209, - "grad_norm": 3.089294672012329, + "grad_norm": 3.0012001991271973, "learning_rate": 1.7656732556487349e-06, - "logits/chosen": 1.5375685691833496, - "logits/rejected": 1.2935359477996826, - "logps/chosen": -569.8836059570312, - "logps/rejected": -1086.203125, - "loss": 0.0853, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": -5.359100818634033, - "rewards/margins": 5.148070335388184, - "rewards/rejected": -10.507171630859375, + "logits/chosen": 1.6588795185089111, + "logits/rejected": 1.4154224395751953, + "logps/chosen": -508.59136962890625, + "logps/rejected": -1022.4967651367188, + "loss": 0.0759, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -4.74569845199585, + "rewards/margins": 5.124057292938232, + "rewards/rejected": -9.869755744934082, "step": 2020 }, { "epoch": 0.6385655866624724, - "grad_norm": 1.5491887331008911, + "grad_norm": 2.0503461360931396, "learning_rate": 1.7394771295402357e-06, - "logits/chosen": 1.234811544418335, - "logits/rejected": 1.1473228931427002, - "logps/chosen": -460.8042907714844, - "logps/rejected": -970.8651123046875, - "loss": 0.0637, - "rewards/accuracies": 0.90625, - "rewards/chosen": -4.264889717102051, - "rewards/margins": 5.0752058029174805, - "rewards/rejected": -9.340095520019531, + "logits/chosen": 1.388684868812561, + "logits/rejected": 1.3237318992614746, + "logps/chosen": -420.70574951171875, + "logps/rejected": -906.8675537109375, + "loss": 0.0695, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -3.863591432571411, + "rewards/margins": 4.836082458496094, + "rewards/rejected": -8.699673652648926, "step": 2030 }, { "epoch": 0.6417112299465241, - "grad_norm": 1.6754862070083618, + "grad_norm": 1.5963932275772095, "learning_rate": 1.713372703982616e-06, - "logits/chosen": 1.405088186264038, - "logits/rejected": 1.1589716672897339, - "logps/chosen": -482.781494140625, - "logps/rejected": -1041.592041015625, - "loss": 0.0658, + "logits/chosen": 1.6106688976287842, + "logits/rejected": 1.378296136856079, + "logps/chosen": -454.54437255859375, + "logps/rejected": -1000.5890502929688, + "loss": 0.0672, "rewards/accuracies": 0.90625, - "rewards/chosen": -4.50002384185791, - "rewards/margins": 5.562270164489746, - "rewards/rejected": -10.06229305267334, + "rewards/chosen": -4.217381477355957, + "rewards/margins": 5.434484958648682, + "rewards/rejected": -9.65186595916748, "step": 2040 }, { "epoch": 0.6448568732305756, - "grad_norm": 2.908865451812744, + "grad_norm": 2.611083745956421, "learning_rate": 1.6873631265343482e-06, - "logits/chosen": 1.2295814752578735, - "logits/rejected": 1.182570219039917, - "logps/chosen": -506.02899169921875, - "logps/rejected": -1007.8961181640625, - "loss": 0.0886, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": -4.720425605773926, - "rewards/margins": 5.0006914138793945, - "rewards/rejected": -9.72111701965332, + "logits/chosen": 1.5027741193771362, + "logits/rejected": 1.4583556652069092, + "logps/chosen": -517.8651123046875, + "logps/rejected": -1013.5433349609375, + "loss": 0.0746, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -4.838639736175537, + "rewards/margins": 4.938787460327148, + "rewards/rejected": -9.777427673339844, "step": 2050 }, { "epoch": 0.6480025165146273, - "grad_norm": 2.9250617027282715, + "grad_norm": 3.0739071369171143, "learning_rate": 1.6614515333175301e-06, - "logits/chosen": 1.2245099544525146, - "logits/rejected": 1.1675890684127808, - "logps/chosen": -485.50726318359375, - "logps/rejected": -1001.1533203125, - "loss": 0.0791, + "logits/chosen": 1.4629807472229004, + "logits/rejected": 1.3686448335647583, + "logps/chosen": -518.0137939453125, + "logps/rejected": -1042.031494140625, + "loss": 0.0836, "rewards/accuracies": 0.9125000238418579, - "rewards/chosen": -4.507925033569336, - "rewards/margins": 5.16204309463501, - "rewards/rejected": -9.669966697692871, + "rewards/chosen": -4.832829475402832, + "rewards/margins": 5.245598793029785, + "rewards/rejected": -10.0784273147583, "step": 2060 }, { "epoch": 0.6511481597986788, - "grad_norm": 1.8200067281723022, + "grad_norm": 2.08311128616333, "learning_rate": 1.6356410486397465e-06, - "logits/chosen": 1.4917027950286865, - "logits/rejected": 1.1800551414489746, - "logps/chosen": -441.74755859375, - "logps/rejected": -1085.5457763671875, - "loss": 0.0871, - "rewards/accuracies": 0.925000011920929, - "rewards/chosen": -4.092404365539551, - "rewards/margins": 6.399931907653809, - "rewards/rejected": -10.492338180541992, + "logits/chosen": 1.7262866497039795, + "logits/rejected": 1.4022530317306519, + "logps/chosen": -440.1412658691406, + "logps/rejected": -1064.7659912109375, + "loss": 0.0969, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": -4.076186180114746, + "rewards/margins": 6.208160400390625, + "rewards/rejected": -10.284345626831055, "step": 2070 }, { "epoch": 0.6542938030827304, - "grad_norm": 3.119096517562866, + "grad_norm": 2.8899753093719482, "learning_rate": 1.6099347846173515e-06, - "logits/chosen": 1.573908805847168, - "logits/rejected": 1.341700792312622, - "logps/chosen": -462.58782958984375, - "logps/rejected": -1023.86328125, - "loss": 0.0774, - "rewards/accuracies": 0.9375, - "rewards/chosen": -4.3063201904296875, - "rewards/margins": 5.589145660400391, - "rewards/rejected": -9.895464897155762, + "logits/chosen": 1.777707815170288, + "logits/rejected": 1.5722458362579346, + "logps/chosen": -450.6910095214844, + "logps/rejected": -995.6285400390625, + "loss": 0.066, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": -4.187132835388184, + "rewards/margins": 5.425961494445801, + "rewards/rejected": -9.613094329833984, "step": 2080 }, { "epoch": 0.657439446366782, - "grad_norm": 3.447984218597412, + "grad_norm": 4.856938362121582, "learning_rate": 1.5843358408002263e-06, - "logits/chosen": 1.4889028072357178, - "logits/rejected": 1.2970954179763794, - "logps/chosen": -526.8050537109375, - "logps/rejected": -1041.9156494140625, - "loss": 0.0806, - "rewards/accuracies": 0.893750011920929, - "rewards/chosen": -4.942746162414551, - "rewards/margins": 5.119975566864014, - "rewards/rejected": -10.062723159790039, + "logits/chosen": 1.6731908321380615, + "logits/rejected": 1.480979323387146, + "logps/chosen": -527.1090087890625, + "logps/rejected": -1042.3140869140625, + "loss": 0.0809, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -4.945659637451172, + "rewards/margins": 5.120595932006836, + "rewards/rejected": -10.066255569458008, "step": 2090 }, { "epoch": 0.6605850896508336, - "grad_norm": 2.026029109954834, + "grad_norm": 2.9849703311920166, "learning_rate": 1.5588473037980448e-06, - "logits/chosen": 1.607642412185669, - "logits/rejected": 1.3940250873565674, - "logps/chosen": -501.61932373046875, - "logps/rejected": -1060.6180419921875, - "loss": 0.0683, - "rewards/accuracies": 0.918749988079071, - "rewards/chosen": -4.699268817901611, - "rewards/margins": 5.546324253082275, - "rewards/rejected": -10.24559211730957, + "logits/chosen": 1.7262375354766846, + "logits/rejected": 1.497453212738037, + "logps/chosen": -525.8902587890625, + "logps/rejected": -1072.6090087890625, + "loss": 0.0668, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": -4.941816806793213, + "rewards/margins": 5.42348051071167, + "rewards/rejected": -10.365297317504883, "step": 2100 }, { "epoch": 0.6605850896508336, - "eval_logits/chosen": 2.205811023712158, - "eval_logits/rejected": 2.02057147026062, - "eval_logps/chosen": -500.12677001953125, - "eval_logps/rejected": -1064.2464599609375, - "eval_loss": 0.03558611497282982, + "eval_logits/chosen": 2.3367607593536377, + "eval_logits/rejected": 2.1354455947875977, + "eval_logps/chosen": -507.6487731933594, + "eval_logps/rejected": -1068.1822509765625, + "eval_loss": 0.0362231507897377, "eval_rewards/accuracies": 0.9190298318862915, - "eval_rewards/chosen": -4.673556804656982, - "eval_rewards/margins": 5.609921455383301, - "eval_rewards/rejected": -10.283478736877441, - "eval_runtime": 214.4369, - "eval_samples_per_second": 99.876, - "eval_steps_per_second": 1.562, + "eval_rewards/chosen": -4.7485198974609375, + "eval_rewards/margins": 5.574143886566162, + "eval_rewards/rejected": -10.322663307189941, + "eval_runtime": 216.5807, + "eval_samples_per_second": 98.887, + "eval_steps_per_second": 1.547, "step": 2100 }, { "epoch": 0.6637307329348852, - "grad_norm": 4.1672210693359375, + "grad_norm": 1.8077006340026855, "learning_rate": 1.5334722469081071e-06, - "logits/chosen": 1.6063897609710693, - "logits/rejected": 1.4115921258926392, - "logps/chosen": -531.2935791015625, - "logps/rejected": -1066.810302734375, - "loss": 0.08, - "rewards/accuracies": 0.8812500238418579, - "rewards/chosen": -4.991600036621094, - "rewards/margins": 5.322088241577148, - "rewards/rejected": -10.313688278198242, + "logits/chosen": 1.7123782634735107, + "logits/rejected": 1.5008488893508911, + "logps/chosen": -503.697265625, + "logps/rejected": -1033.975341796875, + "loss": 0.0767, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -4.715047359466553, + "rewards/margins": 5.269798278808594, + "rewards/rejected": -9.984844207763672, "step": 2110 }, { "epoch": 0.6668763762189368, - "grad_norm": 2.3815789222717285, + "grad_norm": 3.5828003883361816, "learning_rate": 1.508213729744773e-06, - "logits/chosen": 1.546438217163086, - "logits/rejected": 1.2946199178695679, - "logps/chosen": -535.8787231445312, - "logps/rejected": -1102.216064453125, - "loss": 0.0778, - "rewards/accuracies": 0.9375, - "rewards/chosen": -5.021573066711426, - "rewards/margins": 5.643277645111084, - "rewards/rejected": -10.664851188659668, + "logits/chosen": 1.653031587600708, + "logits/rejected": 1.3827488422393799, + "logps/chosen": -505.1139221191406, + "logps/rejected": -1056.74951171875, + "loss": 0.0823, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": -4.713560581207275, + "rewards/margins": 5.496426582336426, + "rewards/rejected": -10.209986686706543, "step": 2120 }, { "epoch": 0.6700220195029883, - "grad_norm": 2.239928960800171, + "grad_norm": 2.5747244358062744, "learning_rate": 1.483074797870547e-06, - "logits/chosen": 1.2132775783538818, - "logits/rejected": 1.0260388851165771, - "logps/chosen": -521.3165893554688, - "logps/rejected": -1100.88037109375, - "loss": 0.0682, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": -4.873823642730713, - "rewards/margins": 5.776439189910889, - "rewards/rejected": -10.650262832641602, + "logits/chosen": 1.361879587173462, + "logits/rejected": 1.1789066791534424, + "logps/chosen": -501.65203857421875, + "logps/rejected": -1063.2545166015625, + "loss": 0.0678, + "rewards/accuracies": 0.90625, + "rewards/chosen": -4.677037715911865, + "rewards/margins": 5.596593856811523, + "rewards/rejected": -10.273633003234863, "step": 2130 }, { "epoch": 0.67316766278704, - "grad_norm": 4.122522354125977, + "grad_norm": 4.103507041931152, "learning_rate": 1.4580584824288585e-06, - "logits/chosen": 1.3960988521575928, - "logits/rejected": 1.2553833723068237, - "logps/chosen": -499.489990234375, - "logps/rejected": -1097.9930419921875, - "loss": 0.0649, - "rewards/accuracies": 0.9312499761581421, - "rewards/chosen": -4.652958393096924, - "rewards/margins": 5.967487335205078, - "rewards/rejected": -10.620445251464844, + "logits/chosen": 1.5563806295394897, + "logits/rejected": 1.3796488046646118, + "logps/chosen": -461.9947814941406, + "logps/rejected": -1061.818603515625, + "loss": 0.0681, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -4.2779340744018555, + "rewards/margins": 5.980292797088623, + "rewards/rejected": -10.258227348327637, "step": 2140 }, { "epoch": 0.6763133060710915, - "grad_norm": 3.405120372772217, + "grad_norm": 3.899409770965576, "learning_rate": 1.4331677997785786e-06, - "logits/chosen": 1.1937223672866821, - "logits/rejected": 1.0959510803222656, - "logps/chosen": -533.7369384765625, - "logps/rejected": -1047.7427978515625, - "loss": 0.0817, - "rewards/accuracies": 0.90625, - "rewards/chosen": -5.0067901611328125, - "rewards/margins": 5.136543273925781, - "rewards/rejected": -10.143333435058594, + "logits/chosen": 1.3618905544281006, + "logits/rejected": 1.26822829246521, + "logps/chosen": -495.64288330078125, + "logps/rejected": -1018.1676025390625, + "loss": 0.0746, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -4.625652313232422, + "rewards/margins": 5.221669673919678, + "rewards/rejected": -9.847322463989258, "step": 2150 }, { "epoch": 0.6794589493551432, - "grad_norm": 3.386723279953003, + "grad_norm": 2.615987539291382, "learning_rate": 1.4084057511303212e-06, - "logits/chosen": 1.2363038063049316, - "logits/rejected": 1.009194254875183, - "logps/chosen": -475.3011779785156, - "logps/rejected": -1005.6472778320312, - "loss": 0.077, - "rewards/accuracies": 0.90625, - "rewards/chosen": -4.424704551696777, - "rewards/margins": 5.272775650024414, - "rewards/rejected": -9.697481155395508, + "logits/chosen": 1.3500601053237915, + "logits/rejected": 1.1144622564315796, + "logps/chosen": -491.9159240722656, + "logps/rejected": -1018.6404418945312, + "loss": 0.078, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -4.590536594390869, + "rewards/margins": 5.236490726470947, + "rewards/rejected": -9.827028274536133, "step": 2160 }, { "epoch": 0.6826045926391947, - "grad_norm": 1.0529998540878296, + "grad_norm": 2.499993085861206, "learning_rate": 1.383775322184569e-06, - "logits/chosen": 1.3098864555358887, - "logits/rejected": 1.1490665674209595, - "logps/chosen": -485.99853515625, - "logps/rejected": -971.4075927734375, - "loss": 0.0715, - "rewards/accuracies": 0.893750011920929, - "rewards/chosen": -4.52325439453125, - "rewards/margins": 4.847430229187012, - "rewards/rejected": -9.370684623718262, + "logits/chosen": 1.3666261434555054, + "logits/rejected": 1.204685926437378, + "logps/chosen": -495.419677734375, + "logps/rejected": -962.2130737304688, + "loss": 0.0757, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -4.617236614227295, + "rewards/margins": 4.661618232727051, + "rewards/rejected": -9.278854370117188, "step": 2170 }, { "epoch": 0.6857502359232464, - "grad_norm": 3.592379093170166, + "grad_norm": 4.112603187561035, "learning_rate": 1.3592794827716726e-06, - "logits/chosen": 1.4308536052703857, - "logits/rejected": 1.1930121183395386, - "logps/chosen": -522.5880126953125, - "logps/rejected": -1014.5653076171875, - "loss": 0.0782, - "rewards/accuracies": 0.8812500238418579, - "rewards/chosen": -4.89089298248291, - "rewards/margins": 4.9056267738342285, - "rewards/rejected": -9.79651927947998, + "logits/chosen": 1.4919835329055786, + "logits/rejected": 1.2371511459350586, + "logps/chosen": -509.6441955566406, + "logps/rejected": -1004.6423950195312, + "loss": 0.0766, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -4.761294364929199, + "rewards/margins": 4.93589973449707, + "rewards/rejected": -9.697192192077637, "step": 2180 }, { "epoch": 0.6888958792072979, - "grad_norm": 3.138232946395874, + "grad_norm": 2.9656105041503906, "learning_rate": 1.33492118649376e-06, - "logits/chosen": 1.2991418838500977, - "logits/rejected": 1.0811710357666016, - "logps/chosen": -516.8934936523438, - "logps/rejected": -1030.9951171875, - "loss": 0.0764, - "rewards/accuracies": 0.893750011920929, - "rewards/chosen": -4.8288469314575195, - "rewards/margins": 5.133552551269531, - "rewards/rejected": -9.96239948272705, + "logits/chosen": 1.4110748767852783, + "logits/rejected": 1.158238172531128, + "logps/chosen": -515.8118286132812, + "logps/rejected": -1045.42431640625, + "loss": 0.0695, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -4.818012237548828, + "rewards/margins": 5.288434028625488, + "rewards/rejected": -10.106447219848633, "step": 2190 }, { "epoch": 0.6920415224913494, - "grad_norm": 1.4153366088867188, + "grad_norm": 1.8345441818237305, "learning_rate": 1.310703370368605e-06, - "logits/chosen": 1.3389747142791748, - "logits/rejected": 1.2751328945159912, - "logps/chosen": -496.916015625, - "logps/rejected": -1069.6923828125, - "loss": 0.0514, + "logits/chosen": 1.3728022575378418, + "logits/rejected": 1.2915928363800049, + "logps/chosen": -491.84503173828125, + "logps/rejected": -1084.5633544921875, + "loss": 0.0528, "rewards/accuracies": 0.9125000238418579, - "rewards/chosen": -4.62496280670166, - "rewards/margins": 5.712726593017578, - "rewards/rejected": -10.337689399719238, + "rewards/chosen": -4.574017524719238, + "rewards/margins": 5.912226676940918, + "rewards/rejected": -10.486245155334473, "step": 2200 }, { "epoch": 0.6920415224913494, - "eval_logits/chosen": 2.127232551574707, - "eval_logits/rejected": 1.9361519813537598, - "eval_logps/chosen": -493.0187072753906, - "eval_logps/rejected": -1058.18115234375, - "eval_loss": 0.03407713398337364, - "eval_rewards/accuracies": 0.920895516872406, - "eval_rewards/chosen": -4.602477073669434, - "eval_rewards/margins": 5.620346546173096, - "eval_rewards/rejected": -10.222823143005371, - "eval_runtime": 214.8052, - "eval_samples_per_second": 99.704, - "eval_steps_per_second": 1.56, + "eval_logits/chosen": 2.159437417984009, + "eval_logits/rejected": 1.9571863412857056, + "eval_logps/chosen": -500.4605407714844, + "eval_logps/rejected": -1057.6173095703125, + "eval_loss": 0.03563934564590454, + "eval_rewards/accuracies": 0.9175373315811157, + "eval_rewards/chosen": -4.676637649536133, + "eval_rewards/margins": 5.540375232696533, + "eval_rewards/rejected": -10.217013359069824, + "eval_runtime": 216.1143, + "eval_samples_per_second": 99.1, + "eval_steps_per_second": 1.55, "step": 2200 }, { "epoch": 0.6951871657754011, - "grad_norm": 2.530432939529419, + "grad_norm": 2.712627649307251, "learning_rate": 1.28662895447549e-06, - "logits/chosen": 1.5015747547149658, - "logits/rejected": 1.3566521406173706, - "logps/chosen": -486.2564392089844, - "logps/rejected": -1018.2689208984375, - "loss": 0.069, - "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": -4.528347969055176, - "rewards/margins": 5.29823112487793, - "rewards/rejected": -9.826578140258789, + "logits/chosen": 1.4879462718963623, + "logits/rejected": 1.3425077199935913, + "logps/chosen": -491.4945373535156, + "logps/rejected": -1018.0159912109375, + "loss": 0.0729, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -4.580411911010742, + "rewards/margins": 5.243422031402588, + "rewards/rejected": -9.823833465576172, "step": 2210 }, { "epoch": 0.6983328090594526, - "grad_norm": 2.539612293243408, + "grad_norm": 1.8771477937698364, "learning_rate": 1.2627008416031234e-06, - "logits/chosen": 1.268513560295105, - "logits/rejected": 1.1765294075012207, - "logps/chosen": -509.4791564941406, - "logps/rejected": -1032.7933349609375, - "loss": 0.0587, - "rewards/accuracies": 0.90625, - "rewards/chosen": -4.755896091461182, - "rewards/margins": 5.238858699798584, - "rewards/rejected": -9.994754791259766, + "logits/chosen": 1.2122427225112915, + "logits/rejected": 1.1126110553741455, + "logps/chosen": -515.5506591796875, + "logps/rejected": -1040.5745849609375, + "loss": 0.0591, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -4.816196441650391, + "rewards/margins": 5.2558979988098145, + "rewards/rejected": -10.072093963623047, "step": 2220 }, { "epoch": 0.7014784523435043, - "grad_norm": 2.578758716583252, + "grad_norm": 3.229480743408203, "learning_rate": 1.2389219168996275e-06, - "logits/chosen": 1.171112298965454, - "logits/rejected": 0.9224494099617004, - "logps/chosen": -523.6820068359375, - "logps/rejected": -1086.1273193359375, - "loss": 0.058, + "logits/chosen": 1.109668254852295, + "logits/rejected": 0.8329025506973267, + "logps/chosen": -496.52435302734375, + "logps/rejected": -1035.8121337890625, + "loss": 0.0648, "rewards/accuracies": 0.925000011920929, - "rewards/chosen": -4.909883975982666, - "rewards/margins": 5.608704566955566, - "rewards/rejected": -10.51858901977539, + "rewards/chosen": -4.638185024261475, + "rewards/margins": 5.377082347869873, + "rewards/rejected": -10.015268325805664, "step": 2230 }, { "epoch": 0.7046240956275558, - "grad_norm": 3.774636745452881, + "grad_norm": 2.889782667160034, "learning_rate": 1.2152950475246621e-06, - "logits/chosen": 1.2603710889816284, - "logits/rejected": 1.075377106666565, - "logps/chosen": -540.7611083984375, - "logps/rejected": -1098.763427734375, - "loss": 0.0714, - "rewards/accuracies": 0.893750011920929, - "rewards/chosen": -5.084263801574707, - "rewards/margins": 5.547011375427246, - "rewards/rejected": -10.631277084350586, + "logits/chosen": 1.315767526626587, + "logits/rejected": 1.145149827003479, + "logps/chosen": -474.19769287109375, + "logps/rejected": -1005.68701171875, + "loss": 0.0715, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -4.41811990737915, + "rewards/margins": 5.282149314880371, + "rewards/rejected": -9.70026969909668, "step": 2240 }, { "epoch": 0.7077697389116074, - "grad_norm": 4.932303428649902, + "grad_norm": 4.062774658203125, "learning_rate": 1.191823082303715e-06, - "logits/chosen": 1.3602168560028076, - "logits/rejected": 1.2510731220245361, - "logps/chosen": -512.5650634765625, - "logps/rejected": -1079.0947265625, - "loss": 0.0691, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": -4.783736228942871, - "rewards/margins": 5.657622337341309, - "rewards/rejected": -10.44135856628418, + "logits/chosen": 1.4250564575195312, + "logits/rejected": 1.3247438669204712, + "logps/chosen": -444.0696716308594, + "logps/rejected": -981.8221435546875, + "loss": 0.0712, + "rewards/accuracies": 0.90625, + "rewards/chosen": -4.09854793548584, + "rewards/margins": 5.370059013366699, + "rewards/rejected": -9.468606948852539, "step": 2250 }, { "epoch": 0.710915382195659, - "grad_norm": 3.490607500076294, + "grad_norm": 4.499232292175293, "learning_rate": 1.1685088513846022e-06, - "logits/chosen": 1.2267606258392334, - "logits/rejected": 1.1843498945236206, - "logps/chosen": -507.78106689453125, - "logps/rejected": -1016.5057373046875, - "loss": 0.0721, + "logits/chosen": 1.277414321899414, + "logits/rejected": 1.2257994413375854, + "logps/chosen": -500.9100646972656, + "logps/rejected": -1016.2658081054688, + "loss": 0.069, "rewards/accuracies": 0.8687499761581421, - "rewards/chosen": -4.733147144317627, - "rewards/margins": 5.068860054016113, - "rewards/rejected": -9.802007675170898, + "rewards/chosen": -4.664144515991211, + "rewards/margins": 5.134991645812988, + "rewards/rejected": -9.7991361618042, "step": 2260 }, { "epoch": 0.7140610254797106, - "grad_norm": 3.0085694789886475, + "grad_norm": 3.174175262451172, "learning_rate": 1.1453551658962216e-06, - "logits/chosen": 1.3816957473754883, - "logits/rejected": 1.13775634765625, - "logps/chosen": -510.34344482421875, - "logps/rejected": -1073.6854248046875, - "loss": 0.0572, - "rewards/accuracies": 0.893750011920929, - "rewards/chosen": -4.783577919006348, - "rewards/margins": 5.602017402648926, - "rewards/rejected": -10.385595321655273, + "logits/chosen": 1.4755274057388306, + "logits/rejected": 1.213168740272522, + "logps/chosen": -507.1438903808594, + "logps/rejected": -1052.708984375, + "loss": 0.0527, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -4.751412391662598, + "rewards/margins": 5.42425012588501, + "rewards/rejected": -10.175663948059082, "step": 2270 }, { "epoch": 0.7172066687637622, - "grad_norm": 1.4133647680282593, + "grad_norm": 2.1683666706085205, "learning_rate": 1.1223648176095992e-06, - "logits/chosen": 0.987162709236145, - "logits/rejected": 0.9267969131469727, - "logps/chosen": -526.0758666992188, - "logps/rejected": -1094.768798828125, - "loss": 0.0554, - "rewards/accuracies": 0.925000011920929, - "rewards/chosen": -4.929390907287598, - "rewards/margins": 5.669344902038574, - "rewards/rejected": -10.598735809326172, + "logits/chosen": 1.091498613357544, + "logits/rejected": 1.0004950761795044, + "logps/chosen": -503.81689453125, + "logps/rejected": -1073.2960205078125, + "loss": 0.0599, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": -4.70639181137085, + "rewards/margins": 5.677199840545654, + "rewards/rejected": -10.38359260559082, "step": 2280 }, { "epoch": 0.7203523120478138, - "grad_norm": 2.9754834175109863, + "grad_norm": 2.7001304626464844, "learning_rate": 1.0995405786012687e-06, - "logits/chosen": 1.2644927501678467, - "logits/rejected": 1.032185435295105, - "logps/chosen": -539.8465576171875, - "logps/rejected": -1119.389892578125, - "loss": 0.0728, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": -5.067166805267334, - "rewards/margins": 5.777620315551758, - "rewards/rejected": -10.84478759765625, + "logits/chosen": 1.4330816268920898, + "logits/rejected": 1.1723562479019165, + "logps/chosen": -503.40789794921875, + "logps/rejected": -1042.260986328125, + "loss": 0.0602, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -4.702376365661621, + "rewards/margins": 5.370804309844971, + "rewards/rejected": -10.07318115234375, "step": 2290 }, { "epoch": 0.7234979553318653, - "grad_norm": 3.943408489227295, + "grad_norm": 2.845198392868042, "learning_rate": 1.0768852009190275e-06, - "logits/chosen": 1.2131414413452148, - "logits/rejected": 1.1425590515136719, - "logps/chosen": -527.9764404296875, - "logps/rejected": -1111.0107421875, - "loss": 0.0623, - "rewards/accuracies": 0.887499988079071, - "rewards/chosen": -4.94549036026001, - "rewards/margins": 5.832481384277344, - "rewards/rejected": -10.777971267700195, + "logits/chosen": 1.3931114673614502, + "logits/rejected": 1.3433005809783936, + "logps/chosen": -490.6017150878906, + "logps/rejected": -1066.7279052734375, + "loss": 0.0596, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -4.5713019371032715, + "rewards/margins": 5.7634968757629395, + "rewards/rejected": -10.334797859191895, "step": 2300 }, { "epoch": 0.7234979553318653, - "eval_logits/chosen": 2.0326879024505615, - "eval_logits/rejected": 1.8240412473678589, - "eval_logps/chosen": -526.7490844726562, - "eval_logps/rejected": -1106.5096435546875, - "eval_loss": 0.032569337636232376, - "eval_rewards/accuracies": 0.9212686419487, - "eval_rewards/chosen": -4.939780235290527, - "eval_rewards/margins": 5.766328811645508, - "eval_rewards/rejected": -10.706109046936035, - "eval_runtime": 215.216, - "eval_samples_per_second": 99.514, - "eval_steps_per_second": 1.557, + "eval_logits/chosen": 2.211730480194092, + "eval_logits/rejected": 2.0040743350982666, + "eval_logps/chosen": -494.5928955078125, + "eval_logps/rejected": -1057.1298828125, + "eval_loss": 0.033993348479270935, + "eval_rewards/accuracies": 0.9235074520111084, + "eval_rewards/chosen": -4.617961406707764, + "eval_rewards/margins": 5.594176769256592, + "eval_rewards/rejected": -10.212138175964355, + "eval_runtime": 216.3884, + "eval_samples_per_second": 98.975, + "eval_steps_per_second": 1.548, "step": 2300 }, { "epoch": 0.726643598615917, - "grad_norm": 1.9171903133392334, + "grad_norm": 1.401442289352417, "learning_rate": 1.0544014162501065e-06, - "logits/chosen": 1.2516294717788696, - "logits/rejected": 1.073210597038269, - "logps/chosen": -539.9628295898438, - "logps/rejected": -1080.7843017578125, - "loss": 0.0617, - "rewards/accuracies": 0.90625, - "rewards/chosen": -5.059046745300293, - "rewards/margins": 5.38003396987915, - "rewards/rejected": -10.439081192016602, + "logits/chosen": 1.4220385551452637, + "logits/rejected": 1.2480995655059814, + "logps/chosen": -494.851318359375, + "logps/rejected": -1024.6070556640625, + "loss": 0.0659, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -4.607827186584473, + "rewards/margins": 5.269275665283203, + "rewards/rejected": -9.877102851867676, "step": 2310 }, { "epoch": 0.7297892418999685, - "grad_norm": 3.3843212127685547, + "grad_norm": 2.1367287635803223, "learning_rate": 1.0320919355917951e-06, - "logits/chosen": 1.2604172229766846, - "logits/rejected": 1.1014560461044312, - "logps/chosen": -511.25421142578125, - "logps/rejected": -1042.747314453125, - "loss": 0.0764, - "rewards/accuracies": 0.8812500238418579, - "rewards/chosen": -4.782188415527344, - "rewards/margins": 5.282374382019043, - "rewards/rejected": -10.06456184387207, + "logits/chosen": 1.483291506767273, + "logits/rejected": 1.3624471426010132, + "logps/chosen": -475.54974365234375, + "logps/rejected": -980.1355590820312, + "loss": 0.0703, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.424760341644287, + "rewards/margins": 5.013537406921387, + "rewards/rejected": -9.438297271728516, "step": 2320 }, { "epoch": 0.7329348851840202, - "grad_norm": 4.020650386810303, + "grad_norm": 3.646291732788086, "learning_rate": 1.0099594489245593e-06, - "logits/chosen": 1.5181647539138794, - "logits/rejected": 1.3572300672531128, - "logps/chosen": -520.1868896484375, - "logps/rejected": -1071.7554931640625, - "loss": 0.0737, - "rewards/accuracies": 0.90625, - "rewards/chosen": -4.876954078674316, - "rewards/margins": 5.484519004821777, - "rewards/rejected": -10.361473083496094, + "logits/chosen": 1.7658379077911377, + "logits/rejected": 1.5886876583099365, + "logps/chosen": -502.6063537597656, + "logps/rejected": -1023.0255737304688, + "loss": 0.0757, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -4.701047420501709, + "rewards/margins": 5.172977924346924, + "rewards/rejected": -9.874025344848633, "step": 2330 }, { "epoch": 0.7360805284680717, - "grad_norm": 3.158212184906006, + "grad_norm": 2.939910888671875, "learning_rate": 9.880066248876977e-07, - "logits/chosen": 1.3198108673095703, - "logits/rejected": 1.1537165641784668, - "logps/chosen": -531.109375, - "logps/rejected": -1087.0430908203125, - "loss": 0.0556, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -4.994508743286133, - "rewards/margins": 5.531711578369141, - "rewards/rejected": -10.526220321655273, + "logits/chosen": 1.554218053817749, + "logits/rejected": 1.3716495037078857, + "logps/chosen": -507.82977294921875, + "logps/rejected": -1061.218505859375, + "loss": 0.0576, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -4.761552333831787, + "rewards/margins": 5.506254196166992, + "rewards/rejected": -10.267806053161621, "step": 2340 }, { "epoch": 0.7392261717521234, - "grad_norm": 2.2422516345977783, + "grad_norm": 1.9353808164596558, "learning_rate": 9.662361104575688e-07, - "logits/chosen": 1.4852499961853027, - "logits/rejected": 1.3730026483535767, - "logps/chosen": -504.0821228027344, - "logps/rejected": -1052.568603515625, - "loss": 0.0707, - "rewards/accuracies": 0.918749988079071, - "rewards/chosen": -4.705484390258789, - "rewards/margins": 5.476336479187012, - "rewards/rejected": -10.1818208694458, + "logits/chosen": 1.6388695240020752, + "logits/rejected": 1.5070264339447021, + "logps/chosen": -507.5098571777344, + "logps/rejected": -1031.0897216796875, + "loss": 0.0679, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -4.739305019378662, + "rewards/margins": 5.227367401123047, + "rewards/rejected": -9.966673851013184, "step": 2350 }, { "epoch": 0.7423718150361749, - "grad_norm": 2.419581174850464, + "grad_norm": 2.6814684867858887, "learning_rate": 9.446505306284276e-07, - "logits/chosen": 1.228144884109497, - "logits/rejected": 1.058624029159546, - "logps/chosen": -529.3380126953125, - "logps/rejected": -1101.996337890625, - "loss": 0.0779, + "logits/chosen": 1.3904848098754883, + "logits/rejected": 1.2182855606079102, + "logps/chosen": -494.0530700683594, + "logps/rejected": -1040.4232177734375, + "loss": 0.0705, "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -4.964212894439697, - "rewards/margins": 5.716494083404541, - "rewards/rejected": -10.680707931518555, + "rewards/chosen": -4.611108303070068, + "rewards/margins": 5.45358943939209, + "rewards/rejected": -10.064697265625, "step": 2360 }, { "epoch": 0.7455174583202265, - "grad_norm": 1.8092293739318848, + "grad_norm": 2.1408207416534424, "learning_rate": 9.232524880959173e-07, - "logits/chosen": 1.497022032737732, - "logits/rejected": 1.2356172800064087, - "logps/chosen": -563.4901123046875, - "logps/rejected": -1064.003173828125, - "loss": 0.0721, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": -5.283173561096191, - "rewards/margins": 5.0053606033325195, - "rewards/rejected": -10.288534164428711, + "logits/chosen": 1.6318490505218506, + "logits/rejected": 1.3836729526519775, + "logps/chosen": -533.8260498046875, + "logps/rejected": -1014.0780029296875, + "loss": 0.0694, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -4.986173152923584, + "rewards/margins": 4.802958011627197, + "rewards/rejected": -9.789131164550781, "step": 2370 }, { "epoch": 0.7486631016042781, - "grad_norm": 2.928718090057373, + "grad_norm": 1.7775107622146606, "learning_rate": 9.02044562943247e-07, - "logits/chosen": 1.3512821197509766, - "logits/rejected": 1.1803138256072998, - "logps/chosen": -544.3297119140625, - "logps/rejected": -1103.0115966796875, - "loss": 0.0649, - "rewards/accuracies": 0.918749988079071, - "rewards/chosen": -5.104948043823242, - "rewards/margins": 5.569328784942627, - "rewards/rejected": -10.674277305603027, + "logits/chosen": 1.561553716659546, + "logits/rejected": 1.3704016208648682, + "logps/chosen": -532.5505981445312, + "logps/rejected": -1075.1712646484375, + "loss": 0.0614, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -4.986807346343994, + "rewards/margins": 5.408608436584473, + "rewards/rejected": -10.395415306091309, "step": 2380 }, { "epoch": 0.7518087448883297, - "grad_norm": 5.057467937469482, + "grad_norm": 3.8989508152008057, "learning_rate": 8.810293123300956e-07, - "logits/chosen": 1.2008953094482422, - "logits/rejected": 1.0849316120147705, - "logps/chosen": -523.2420654296875, - "logps/rejected": -1096.0482177734375, - "loss": 0.0727, - "rewards/accuracies": 0.9125000238418579, - "rewards/chosen": -4.887139320373535, - "rewards/margins": 5.702400207519531, - "rewards/rejected": -10.589539527893066, + "logits/chosen": 1.4359630346298218, + "logits/rejected": 1.3191871643066406, + "logps/chosen": -510.1795959472656, + "logps/rejected": -1077.068359375, + "loss": 0.0715, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -4.7562456130981445, + "rewards/margins": 5.643407344818115, + "rewards/rejected": -10.399652481079102, "step": 2390 }, { "epoch": 0.7549543881723813, - "grad_norm": 2.2187750339508057, + "grad_norm": 1.785208821296692, "learning_rate": 8.602092701842821e-07, - "logits/chosen": 1.3591426610946655, - "logits/rejected": 1.168972134590149, - "logps/chosen": -502.78167724609375, - "logps/rejected": -1061.560546875, - "loss": 0.0693, - "rewards/accuracies": 0.9312499761581421, - "rewards/chosen": -4.693524360656738, - "rewards/margins": 5.574719429016113, - "rewards/rejected": -10.268243789672852, + "logits/chosen": 1.6097145080566406, + "logits/rejected": 1.4199837446212769, + "logps/chosen": -485.87200927734375, + "logps/rejected": -1022.1697998046875, + "loss": 0.063, + "rewards/accuracies": 0.90625, + "rewards/chosen": -4.5243449211120605, + "rewards/margins": 5.349452018737793, + "rewards/rejected": -9.873797416687012, "step": 2400 }, { "epoch": 0.7549543881723813, - "eval_logits/chosen": 2.0582876205444336, - "eval_logits/rejected": 1.8580067157745361, - "eval_logps/chosen": -513.0094604492188, - "eval_logps/rejected": -1098.9998779296875, - "eval_loss": 0.03128579631447792, - "eval_rewards/accuracies": 0.9231343269348145, - "eval_rewards/chosen": -4.802384376525879, - "eval_rewards/margins": 5.828627109527588, - "eval_rewards/rejected": -10.631011962890625, - "eval_runtime": 214.497, - "eval_samples_per_second": 99.848, - "eval_steps_per_second": 1.562, + "eval_logits/chosen": 2.3488142490386963, + "eval_logits/rejected": 2.149275541305542, + "eval_logps/chosen": -486.36529541015625, + "eval_logps/rejected": -1054.6712646484375, + "eval_loss": 0.03283367305994034, + "eval_rewards/accuracies": 0.9257462620735168, + "eval_rewards/chosen": -4.5356855392456055, + "eval_rewards/margins": 5.651867866516113, + "eval_rewards/rejected": -10.187552452087402, + "eval_runtime": 216.3869, + "eval_samples_per_second": 98.976, + "eval_steps_per_second": 1.548, "step": 2400 }, { "epoch": 0.7581000314564328, - "grad_norm": 2.5894553661346436, + "grad_norm": 2.3619818687438965, "learning_rate": 8.395869468962337e-07, - "logits/chosen": 1.2950252294540405, - "logits/rejected": 1.0707935094833374, - "logps/chosen": -528.3794555664062, - "logps/rejected": -1150.237548828125, - "loss": 0.0622, - "rewards/accuracies": 0.9437500238418579, - "rewards/chosen": -4.965851306915283, - "rewards/margins": 6.168177604675293, - "rewards/rejected": -11.134028434753418, + "logits/chosen": 1.6147444248199463, + "logits/rejected": 1.395455241203308, + "logps/chosen": -495.06939697265625, + "logps/rejected": -1095.788330078125, + "loss": 0.0587, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -4.632855415344238, + "rewards/margins": 5.956536293029785, + "rewards/rejected": -10.589391708374023, "step": 2410 }, { "epoch": 0.7612456747404844, - "grad_norm": 1.3667354583740234, + "grad_norm": 1.6136853694915771, "learning_rate": 8.191648290162957e-07, - "logits/chosen": 1.440059781074524, - "logits/rejected": 1.308544635772705, - "logps/chosen": -504.36151123046875, - "logps/rejected": -1091.216552734375, - "loss": 0.0615, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -4.736184597015381, - "rewards/margins": 5.828288555145264, - "rewards/rejected": -10.564473152160645, + "logits/chosen": 1.7205461263656616, + "logits/rejected": 1.6001455783843994, + "logps/chosen": -488.3507385253906, + "logps/rejected": -1074.3258056640625, + "loss": 0.0606, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": -4.576030731201172, + "rewards/margins": 5.819344520568848, + "rewards/rejected": -10.39537525177002, "step": 2420 }, { "epoch": 0.764391318024536, - "grad_norm": 3.833573579788208, + "grad_norm": 4.33929443359375, "learning_rate": 7.989453789549131e-07, - "logits/chosen": 1.3757915496826172, - "logits/rejected": 1.328730583190918, - "logps/chosen": -516.4542236328125, - "logps/rejected": -1068.0494384765625, - "loss": 0.083, - "rewards/accuracies": 0.9312499761581421, - "rewards/chosen": -4.81516170501709, - "rewards/margins": 5.511561393737793, - "rewards/rejected": -10.326723098754883, + "logits/chosen": 1.5811232328414917, + "logits/rejected": 1.5288398265838623, + "logps/chosen": -530.9954223632812, + "logps/rejected": -1079.800537109375, + "loss": 0.0758, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -4.9601640701293945, + "rewards/margins": 5.483851432800293, + "rewards/rejected": -10.444014549255371, "step": 2430 }, { "epoch": 0.7675369613085876, - "grad_norm": 1.203484296798706, + "grad_norm": 1.9559452533721924, "learning_rate": 7.789310346857243e-07, - "logits/chosen": 1.573310136795044, - "logits/rejected": 1.4590044021606445, - "logps/chosen": -496.33001708984375, - "logps/rejected": -956.9088134765625, - "loss": 0.0658, - "rewards/accuracies": 0.90625, - "rewards/chosen": -4.645215034484863, - "rewards/margins": 4.573522090911865, - "rewards/rejected": -9.21873664855957, + "logits/chosen": 1.7974998950958252, + "logits/rejected": 1.6878284215927124, + "logps/chosen": -516.4877319335938, + "logps/rejected": -997.9284057617188, + "loss": 0.0624, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -4.846479415893555, + "rewards/margins": 4.782151222229004, + "rewards/rejected": -9.628629684448242, "step": 2440 }, { "epoch": 0.7706826045926392, - "grad_norm": 3.1549971103668213, + "grad_norm": 2.785346031188965, "learning_rate": 7.591242094515983e-07, - "logits/chosen": 1.3946812152862549, - "logits/rejected": 1.2821216583251953, - "logps/chosen": -507.58221435546875, - "logps/rejected": -976.2921752929688, - "loss": 0.0655, - "rewards/accuracies": 0.8812500238418579, - "rewards/chosen": -4.728119850158691, - "rewards/margins": 4.666321754455566, - "rewards/rejected": -9.394440650939941, + "logits/chosen": 1.5722239017486572, + "logits/rejected": 1.4374796152114868, + "logps/chosen": -528.6546630859375, + "logps/rejected": -994.2796630859375, + "loss": 0.0662, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -4.938473701477051, + "rewards/margins": 4.6353349685668945, + "rewards/rejected": -9.573808670043945, "step": 2450 }, { "epoch": 0.7738282478766908, - "grad_norm": 2.5392165184020996, + "grad_norm": 2.5824944972991943, "learning_rate": 7.395272914736604e-07, - "logits/chosen": 1.22986900806427, - "logits/rejected": 1.043039083480835, - "logps/chosen": -506.740478515625, - "logps/rejected": -1101.1920166015625, - "loss": 0.0592, + "logits/chosen": 1.4237116575241089, + "logits/rejected": 1.2312453985214233, + "logps/chosen": -511.4111328125, + "logps/rejected": -1094.482177734375, + "loss": 0.0636, "rewards/accuracies": 0.9375, - "rewards/chosen": -4.742389678955078, - "rewards/margins": 5.9132208824157715, - "rewards/rejected": -10.655611038208008, + "rewards/chosen": -4.7887115478515625, + "rewards/margins": 5.799224376678467, + "rewards/rejected": -10.587934494018555, "step": 2460 }, { "epoch": 0.7769738911607423, - "grad_norm": 3.4939658641815186, + "grad_norm": 3.5525786876678467, "learning_rate": 7.201426436633289e-07, - "logits/chosen": 1.1813576221466064, - "logits/rejected": 1.0866069793701172, - "logps/chosen": -519.2979125976562, - "logps/rejected": -1120.2783203125, - "loss": 0.0617, + "logits/chosen": 1.383684754371643, + "logits/rejected": 1.2983518838882446, + "logps/chosen": -495.83428955078125, + "logps/rejected": -1077.6273193359375, + "loss": 0.0627, "rewards/accuracies": 0.9437500238418579, - "rewards/chosen": -4.859357833862305, - "rewards/margins": 5.996047019958496, - "rewards/rejected": -10.8554048538208, + "rewards/chosen": -4.624549865722656, + "rewards/margins": 5.804167747497559, + "rewards/rejected": -10.428717613220215, "step": 2470 }, { "epoch": 0.780119534444794, - "grad_norm": 2.185563325881958, + "grad_norm": 1.6180325746536255, "learning_rate": 7.009726033374045e-07, - "logits/chosen": 1.399010419845581, - "logits/rejected": 1.1917028427124023, - "logps/chosen": -552.6378173828125, - "logps/rejected": -1109.709716796875, - "loss": 0.0602, - "rewards/accuracies": 0.893750011920929, - "rewards/chosen": -5.179744243621826, - "rewards/margins": 5.563017845153809, - "rewards/rejected": -10.742761611938477, + "logits/chosen": 1.5975977182388306, + "logits/rejected": 1.3688011169433594, + "logps/chosen": -537.0506591796875, + "logps/rejected": -1072.892333984375, + "loss": 0.0647, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -5.023386478424072, + "rewards/margins": 5.3509063720703125, + "rewards/rejected": -10.374292373657227, "step": 2480 }, { "epoch": 0.7832651777288455, - "grad_norm": 3.3408279418945312, + "grad_norm": 2.176626443862915, "learning_rate": 6.820194819362477e-07, - "logits/chosen": 1.303276777267456, - "logits/rejected": 1.1878306865692139, - "logps/chosen": -515.8385009765625, - "logps/rejected": -1112.105224609375, - "loss": 0.0577, - "rewards/accuracies": 0.9375, - "rewards/chosen": -4.832343101501465, - "rewards/margins": 5.937832832336426, - "rewards/rejected": -10.770174980163574, + "logits/chosen": 1.496756672859192, + "logits/rejected": 1.3709968328475952, + "logps/chosen": -487.99371337890625, + "logps/rejected": -1085.553466796875, + "loss": 0.0616, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -4.553643226623535, + "rewards/margins": 5.951053619384766, + "rewards/rejected": -10.5046968460083, "step": 2490 }, { "epoch": 0.7864108210128972, - "grad_norm": 2.98215651512146, + "grad_norm": 2.5447304248809814, "learning_rate": 6.632855647450784e-07, - "logits/chosen": 1.2269493341445923, - "logits/rejected": 1.1103036403656006, - "logps/chosen": -512.5769653320312, - "logps/rejected": -1073.9197998046875, - "loss": 0.0543, - "rewards/accuracies": 0.893750011920929, - "rewards/chosen": -4.790786266326904, - "rewards/margins": 5.6085734367370605, - "rewards/rejected": -10.399358749389648, + "logits/chosen": 1.41390860080719, + "logits/rejected": 1.2979408502578735, + "logps/chosen": -501.66461181640625, + "logps/rejected": -1036.599365234375, + "loss": 0.0558, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -4.6812262535095215, + "rewards/margins": 5.344993591308594, + "rewards/rejected": -10.026220321655273, "step": 2500 }, { "epoch": 0.7864108210128972, - "eval_logits/chosen": 2.147142171859741, - "eval_logits/rejected": 1.9534480571746826, - "eval_logps/chosen": -514.0873413085938, - "eval_logps/rejected": -1109.419921875, - "eval_loss": 0.03031795844435692, - "eval_rewards/accuracies": 0.9227612018585205, - "eval_rewards/chosen": -4.8131632804870605, - "eval_rewards/margins": 5.922050476074219, - "eval_rewards/rejected": -10.735214233398438, - "eval_runtime": 214.5706, - "eval_samples_per_second": 99.813, - "eval_steps_per_second": 1.561, + "eval_logits/chosen": 2.3275063037872314, + "eval_logits/rejected": 2.120762586593628, + "eval_logps/chosen": -504.3434753417969, + "eval_logps/rejected": -1092.718505859375, + "eval_loss": 0.031053731217980385, + "eval_rewards/accuracies": 0.9261193871498108, + "eval_rewards/chosen": -4.71546745300293, + "eval_rewards/margins": 5.852557182312012, + "eval_rewards/rejected": -10.568025588989258, + "eval_runtime": 216.5885, + "eval_samples_per_second": 98.883, + "eval_steps_per_second": 1.547, "step": 2500 }, { "epoch": 0.7895564642969487, - "grad_norm": 3.696200132369995, + "grad_norm": 3.3239963054656982, "learning_rate": 6.447731106184183e-07, - "logits/chosen": 1.3392512798309326, - "logits/rejected": 1.1863142251968384, - "logps/chosen": -556.5394287109375, - "logps/rejected": -1091.25537109375, - "loss": 0.0509, - "rewards/accuracies": 0.9125000238418579, - "rewards/chosen": -5.229084014892578, - "rewards/margins": 5.346131324768066, - "rewards/rejected": -10.575216293334961, + "logits/chosen": 1.494217872619629, + "logits/rejected": 1.3212921619415283, + "logps/chosen": -518.6863403320312, + "logps/rejected": -1040.155517578125, + "loss": 0.0492, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": -4.850244522094727, + "rewards/margins": 5.21368932723999, + "rewards/rejected": -10.063933372497559, "step": 2510 }, { "epoch": 0.7927021075810003, - "grad_norm": 3.2612533569335938, + "grad_norm": 1.9487500190734863, "learning_rate": 6.264843517077355e-07, - "logits/chosen": 1.5268758535385132, - "logits/rejected": 1.4993855953216553, - "logps/chosen": -488.5655212402344, - "logps/rejected": -1056.704345703125, - "loss": 0.0712, - "rewards/accuracies": 0.925000011920929, - "rewards/chosen": -4.5665812492370605, - "rewards/margins": 5.6401848793029785, - "rewards/rejected": -10.206766128540039, + "logits/chosen": 1.693529486656189, + "logits/rejected": 1.6498600244522095, + "logps/chosen": -487.17913818359375, + "logps/rejected": -1024.142822265625, + "loss": 0.0597, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -4.552428245544434, + "rewards/margins": 5.328381538391113, + "rewards/rejected": -9.880809783935547, "step": 2520 }, { "epoch": 0.7958477508650519, - "grad_norm": 1.6281901597976685, + "grad_norm": 2.9930226802825928, "learning_rate": 6.084214931922988e-07, - "logits/chosen": 1.544272780418396, - "logits/rejected": 1.394097924232483, - "logps/chosen": -487.400146484375, - "logps/rejected": -1142.82470703125, - "loss": 0.0544, + "logits/chosen": 1.6899865865707397, + "logits/rejected": 1.5415594577789307, + "logps/chosen": -488.88018798828125, + "logps/rejected": -1136.771484375, + "loss": 0.0621, "rewards/accuracies": 0.9375, - "rewards/chosen": -4.553741455078125, - "rewards/margins": 6.4883294105529785, - "rewards/rejected": -11.042070388793945, + "rewards/chosen": -4.568345546722412, + "rewards/margins": 6.412973880767822, + "rewards/rejected": -10.981319427490234, "step": 2530 }, { "epoch": 0.7989933941491035, - "grad_norm": 1.7609894275665283, + "grad_norm": 2.1499834060668945, "learning_rate": 5.905867130132858e-07, - "logits/chosen": 1.3884477615356445, - "logits/rejected": 1.174084186553955, - "logps/chosen": -524.9644775390625, - "logps/rejected": -1094.303466796875, - "loss": 0.0616, + "logits/chosen": 1.594299554824829, + "logits/rejected": 1.3576323986053467, + "logps/chosen": -532.1160278320312, + "logps/rejected": -1078.0628662109375, + "loss": 0.0652, "rewards/accuracies": 0.8812500238418579, - "rewards/chosen": -4.912240028381348, - "rewards/margins": 5.668644428253174, - "rewards/rejected": -10.58088493347168, + "rewards/chosen": -4.983504295349121, + "rewards/margins": 5.434745788574219, + "rewards/rejected": -10.41825008392334, "step": 2540 }, { "epoch": 0.8021390374331551, - "grad_norm": 1.901530385017395, + "grad_norm": 1.3630064725875854, "learning_rate": 5.729821616111777e-07, - "logits/chosen": 1.2051628828048706, - "logits/rejected": 1.0570333003997803, - "logps/chosen": -522.4967651367188, - "logps/rejected": -1080.4366455078125, - "loss": 0.0678, - "rewards/accuracies": 0.9312499761581421, - "rewards/chosen": -4.89337158203125, - "rewards/margins": 5.575320243835449, - "rewards/rejected": -10.4686918258667, + "logits/chosen": 1.3776555061340332, + "logits/rejected": 1.2333744764328003, + "logps/chosen": -500.73834228515625, + "logps/rejected": -1054.62841796875, + "loss": 0.0573, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": -4.675589561462402, + "rewards/margins": 5.534988880157471, + "rewards/rejected": -10.210577011108398, "step": 2550 }, { "epoch": 0.8052846807172067, - "grad_norm": 2.6274139881134033, + "grad_norm": 2.679340124130249, "learning_rate": 5.556099616664678e-07, - "logits/chosen": 1.457098126411438, - "logits/rejected": 1.270530343055725, - "logps/chosen": -524.5520629882812, - "logps/rejected": -1138.426025390625, - "loss": 0.0612, - "rewards/accuracies": 0.90625, - "rewards/chosen": -4.922528266906738, - "rewards/margins": 6.103795051574707, - "rewards/rejected": -11.026323318481445, + "logits/chosen": 1.6116464138031006, + "logits/rejected": 1.4320402145385742, + "logps/chosen": -498.9954528808594, + "logps/rejected": -1082.236328125, + "loss": 0.0653, + "rewards/accuracies": 0.9375, + "rewards/chosen": -4.666613578796387, + "rewards/margins": 5.797518253326416, + "rewards/rejected": -10.464132308959961, "step": 2560 }, { "epoch": 0.8084303240012582, - "grad_norm": 3.4322991371154785, + "grad_norm": 2.050906181335449, "learning_rate": 5.384722078437163e-07, - "logits/chosen": 1.168784499168396, - "logits/rejected": 0.9237147569656372, - "logps/chosen": -503.53948974609375, - "logps/rejected": -1102.985107421875, - "loss": 0.0587, + "logits/chosen": 1.3231595754623413, + "logits/rejected": 1.0914764404296875, + "logps/chosen": -481.60302734375, + "logps/rejected": -1077.0623779296875, + "loss": 0.0558, "rewards/accuracies": 0.9437500238418579, - "rewards/chosen": -4.709792137145996, - "rewards/margins": 5.9735107421875, - "rewards/rejected": -10.68330192565918, + "rewards/chosen": -4.489951133728027, + "rewards/margins": 5.934098720550537, + "rewards/rejected": -10.424050331115723, "step": 2570 }, { "epoch": 0.8115759672853099, - "grad_norm": 2.3039779663085938, + "grad_norm": 2.0678343772888184, "learning_rate": 5.215709665389884e-07, - "logits/chosen": 1.5015685558319092, - "logits/rejected": 1.1338298320770264, - "logps/chosen": -495.9610900878906, - "logps/rejected": -1086.0147705078125, - "loss": 0.0628, - "rewards/accuracies": 0.918749988079071, - "rewards/chosen": -4.62796688079834, - "rewards/margins": 5.865346431732178, - "rewards/rejected": -10.493314743041992, + "logits/chosen": 1.6653324365615845, + "logits/rejected": 1.2603800296783447, + "logps/chosen": -480.2110290527344, + "logps/rejected": -1065.371337890625, + "loss": 0.0603, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -4.470128536224365, + "rewards/margins": 5.816575527191162, + "rewards/rejected": -10.286703109741211, "step": 2580 }, { "epoch": 0.8147216105693614, - "grad_norm": 3.216778039932251, + "grad_norm": 4.421222686767578, "learning_rate": 5.049082756306933e-07, - "logits/chosen": 1.1515804529190063, - "logits/rejected": 1.0844125747680664, - "logps/chosen": -494.03338623046875, - "logps/rejected": -1051.1014404296875, - "loss": 0.0659, - "rewards/accuracies": 0.9125000238418579, - "rewards/chosen": -4.619012832641602, - "rewards/margins": 5.5344648361206055, - "rewards/rejected": -10.153477668762207, + "logits/chosen": 1.2575275897979736, + "logits/rejected": 1.1939196586608887, + "logps/chosen": -500.1796875, + "logps/rejected": -1039.970703125, + "loss": 0.0656, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -4.680001258850098, + "rewards/margins": 5.361957550048828, + "rewards/rejected": -10.04195785522461, "step": 2590 }, { "epoch": 0.817867253853413, - "grad_norm": 1.8903234004974365, + "grad_norm": 2.2859609127044678, "learning_rate": 4.884861442338703e-07, - "logits/chosen": 1.355169415473938, - "logits/rejected": 1.2924784421920776, - "logps/chosen": -503.74908447265625, - "logps/rejected": -1109.2928466796875, - "loss": 0.0555, - "rewards/accuracies": 0.9125000238418579, - "rewards/chosen": -4.697711944580078, - "rewards/margins": 6.013389587402344, - "rewards/rejected": -10.711101531982422, + "logits/chosen": 1.4497255086898804, + "logits/rejected": 1.3707365989685059, + "logps/chosen": -507.47845458984375, + "logps/rejected": -1101.427734375, + "loss": 0.0552, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -4.734853744506836, + "rewards/margins": 5.897332191467285, + "rewards/rejected": -10.632184982299805, "step": 2600 }, { "epoch": 0.817867253853413, - "eval_logits/chosen": 2.135688543319702, - "eval_logits/rejected": 1.939805507659912, - "eval_logps/chosen": -505.2809753417969, - "eval_logps/rejected": -1092.1619873046875, - "eval_loss": 0.0300795566290617, - "eval_rewards/accuracies": 0.9261193871498108, - "eval_rewards/chosen": -4.725099086761475, - "eval_rewards/margins": 5.837533473968506, - "eval_rewards/rejected": -10.56263256072998, - "eval_runtime": 215.0481, - "eval_samples_per_second": 99.592, - "eval_steps_per_second": 1.558, + "eval_logits/chosen": 2.259222984313965, + "eval_logits/rejected": 2.054378032684326, + "eval_logps/chosen": -498.5399169921875, + "eval_logps/rejected": -1072.4942626953125, + "eval_loss": 0.031242508441209793, + "eval_rewards/accuracies": 0.9253731369972229, + "eval_rewards/chosen": -4.657431125640869, + "eval_rewards/margins": 5.708352088928223, + "eval_rewards/rejected": -10.36578369140625, + "eval_runtime": 216.4474, + "eval_samples_per_second": 98.948, + "eval_steps_per_second": 1.548, "step": 2600 }, { "epoch": 0.8210128971374646, - "grad_norm": 1.426253318786621, + "grad_norm": 3.060898542404175, "learning_rate": 4.7230655245793286e-07, - "logits/chosen": 1.430915117263794, - "logits/rejected": 1.2984659671783447, - "logps/chosen": -506.103271484375, - "logps/rejected": -1104.470458984375, - "loss": 0.0576, - "rewards/accuracies": 0.875, - "rewards/chosen": -4.7343974113464355, - "rewards/margins": 5.964964866638184, - "rewards/rejected": -10.699361801147461, + "logits/chosen": 1.5630236864089966, + "logits/rejected": 1.4133121967315674, + "logps/chosen": -493.82373046875, + "logps/rejected": -1063.783203125, + "loss": 0.0572, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -4.611185550689697, + "rewards/margins": 5.6810383796691895, + "rewards/rejected": -10.292223930358887, "step": 2610 }, { "epoch": 0.8241585404215162, - "grad_norm": 4.4854736328125, + "grad_norm": 5.270165920257568, "learning_rate": 4.563714511679201e-07, - "logits/chosen": 1.1772897243499756, - "logits/rejected": 0.9327267408370972, - "logps/chosen": -552.3887939453125, - "logps/rejected": -1113.9022216796875, - "loss": 0.0646, - "rewards/accuracies": 0.84375, - "rewards/chosen": -5.1871819496154785, - "rewards/margins": 5.609848976135254, - "rewards/rejected": -10.797030448913574, + "logits/chosen": 1.2811131477355957, + "logits/rejected": 1.0420016050338745, + "logps/chosen": -521.6427612304688, + "logps/rejected": -1078.5167236328125, + "loss": 0.0636, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -4.879427909851074, + "rewards/margins": 5.563521385192871, + "rewards/rejected": -10.442949295043945, "step": 2620 }, { "epoch": 0.8273041837055678, - "grad_norm": 2.612297296524048, + "grad_norm": 1.9632947444915771, "learning_rate": 4.4068276174926624e-07, - "logits/chosen": 1.4450725317001343, - "logits/rejected": 1.308194875717163, - "logps/chosen": -527.3743896484375, - "logps/rejected": -1082.718994140625, - "loss": 0.0691, + "logits/chosen": 1.5258196592330933, + "logits/rejected": 1.4023991823196411, + "logps/chosen": -515.6566162109375, + "logps/rejected": -1040.244384765625, + "loss": 0.0692, "rewards/accuracies": 0.918749988079071, - "rewards/chosen": -4.931950569152832, - "rewards/margins": 5.522625923156738, - "rewards/rejected": -10.45457649230957, + "rewards/chosen": -4.814579010009766, + "rewards/margins": 5.215153694152832, + "rewards/rejected": -10.029733657836914, "step": 2630 }, { "epoch": 0.8304498269896193, - "grad_norm": 2.8371591567993164, + "grad_norm": 1.8762720823287964, "learning_rate": 4.25242375876132e-07, - "logits/chosen": 1.6938962936401367, - "logits/rejected": 1.4360706806182861, - "logps/chosen": -514.8795166015625, - "logps/rejected": -1132.747802734375, - "loss": 0.055, - "rewards/accuracies": 0.90625, - "rewards/chosen": -4.795286655426025, - "rewards/margins": 6.1563005447387695, - "rewards/rejected": -10.95158576965332, + "logits/chosen": 1.8149995803833008, + "logits/rejected": 1.559918999671936, + "logps/chosen": -512.7207641601562, + "logps/rejected": -1104.062744140625, + "loss": 0.0588, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -4.773519039154053, + "rewards/margins": 5.890836715698242, + "rewards/rejected": -10.66435718536377, "step": 2640 }, { "epoch": 0.833595470273671, - "grad_norm": 2.8870389461517334, + "grad_norm": 2.6767687797546387, "learning_rate": 4.1005215528331254e-07, - "logits/chosen": 1.656299352645874, - "logits/rejected": 1.4690709114074707, - "logps/chosen": -529.0469360351562, - "logps/rejected": -1109.736083984375, - "loss": 0.0657, - "rewards/accuracies": 0.9312499761581421, - "rewards/chosen": -4.954121112823486, - "rewards/margins": 5.782765865325928, - "rewards/rejected": -10.736886978149414, + "logits/chosen": 1.7480008602142334, + "logits/rejected": 1.5533208847045898, + "logps/chosen": -519.8841552734375, + "logps/rejected": -1083.5428466796875, + "loss": 0.0641, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -4.862178802490234, + "rewards/margins": 5.612443923950195, + "rewards/rejected": -10.474621772766113, "step": 2650 }, { "epoch": 0.8367411135577225, - "grad_norm": 1.9303276538848877, + "grad_norm": 1.4044471979141235, "learning_rate": 3.9511393154175795e-07, - "logits/chosen": 1.6134449243545532, - "logits/rejected": 1.3297247886657715, - "logps/chosen": -525.5284423828125, - "logps/rejected": -1118.3677978515625, - "loss": 0.0512, - "rewards/accuracies": 0.9125000238418579, - "rewards/chosen": -4.909844398498535, - "rewards/margins": 5.91357421875, - "rewards/rejected": -10.823419570922852, + "logits/chosen": 1.734551191329956, + "logits/rejected": 1.43448805809021, + "logps/chosen": -519.4636840820312, + "logps/rejected": -1096.210205078125, + "loss": 0.0504, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -4.848996162414551, + "rewards/margins": 5.752721309661865, + "rewards/rejected": -10.601716995239258, "step": 2660 }, { "epoch": 0.8398867568417742, - "grad_norm": 3.3604700565338135, + "grad_norm": 2.7965497970581055, "learning_rate": 3.8042950583773054e-07, - "logits/chosen": 1.5226154327392578, - "logits/rejected": 1.2554523944854736, - "logps/chosen": -510.0711975097656, - "logps/rejected": -1055.9796142578125, - "loss": 0.0662, - "rewards/accuracies": 0.918749988079071, - "rewards/chosen": -4.7616868019104, - "rewards/margins": 5.4259772300720215, - "rewards/rejected": -10.187662124633789, + "logits/chosen": 1.638389229774475, + "logits/rejected": 1.3333652019500732, + "logps/chosen": -516.922607421875, + "logps/rejected": -1048.8199462890625, + "loss": 0.0616, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -4.830017566680908, + "rewards/margins": 5.2858991622924805, + "rewards/rejected": -10.115918159484863, "step": 2670 }, { "epoch": 0.8430324001258257, - "grad_norm": 2.3669848442077637, + "grad_norm": 2.85134220123291, "learning_rate": 3.660006487556245e-07, - "logits/chosen": 1.499861478805542, - "logits/rejected": 1.3318068981170654, - "logps/chosen": -489.2085876464844, - "logps/rejected": -1086.85546875, - "loss": 0.0561, - "rewards/accuracies": 0.925000011920929, - "rewards/chosen": -4.581131935119629, - "rewards/margins": 5.949007987976074, - "rewards/rejected": -10.530141830444336, + "logits/chosen": 1.612595796585083, + "logits/rejected": 1.4408342838287354, + "logps/chosen": -476.4307556152344, + "logps/rejected": -1083.2095947265625, + "loss": 0.0506, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -4.453230857849121, + "rewards/margins": 6.040000915527344, + "rewards/rejected": -10.493230819702148, "step": 2680 }, { "epoch": 0.8461780434098773, - "grad_norm": 1.5873863697052002, + "grad_norm": 1.8909926414489746, "learning_rate": 3.5182910006447775e-07, - "logits/chosen": 1.3309674263000488, - "logits/rejected": 1.1546790599822998, - "logps/chosen": -509.28302001953125, - "logps/rejected": -1080.261962890625, - "loss": 0.0488, - "rewards/accuracies": 0.9375, - "rewards/chosen": -4.774194717407227, - "rewards/margins": 5.679512023925781, - "rewards/rejected": -10.453706741333008, + "logits/chosen": 1.4525648355484009, + "logits/rejected": 1.2461488246917725, + "logps/chosen": -502.4288635253906, + "logps/rejected": -1073.686767578125, + "loss": 0.0545, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": -4.705319404602051, + "rewards/margins": 5.682326793670654, + "rewards/rejected": -10.38764762878418, "step": 2690 }, { "epoch": 0.8493236866939289, - "grad_norm": 2.22021484375, + "grad_norm": 2.196668863296509, "learning_rate": 3.3791656850819975e-07, - "logits/chosen": 1.4363747835159302, - "logits/rejected": 1.231518030166626, - "logps/chosen": -512.7792358398438, - "logps/rejected": -1080.0721435546875, - "loss": 0.0646, - "rewards/accuracies": 0.893750011920929, - "rewards/chosen": -4.79265022277832, - "rewards/margins": 5.652270793914795, - "rewards/rejected": -10.444921493530273, + "logits/chosen": 1.5441794395446777, + "logits/rejected": 1.3334633111953735, + "logps/chosen": -505.63177490234375, + "logps/rejected": -1055.5289306640625, + "loss": 0.066, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -4.721020221710205, + "rewards/margins": 5.478200912475586, + "rewards/rejected": -10.19922161102295, "step": 2700 }, { "epoch": 0.8493236866939289, - "eval_logits/chosen": 2.194664716720581, - "eval_logits/rejected": 2.000288724899292, - "eval_logps/chosen": -502.0693664550781, - "eval_logps/rejected": -1098.9693603515625, - "eval_loss": 0.029430242255330086, - "eval_rewards/accuracies": 0.9261193871498108, - "eval_rewards/chosen": -4.692983627319336, - "eval_rewards/margins": 5.937723636627197, - "eval_rewards/rejected": -10.630707740783691, - "eval_runtime": 215.131, - "eval_samples_per_second": 99.553, - "eval_steps_per_second": 1.557, + "eval_logits/chosen": 2.296788215637207, + "eval_logits/rejected": 2.0913844108581543, + "eval_logps/chosen": -497.8610534667969, + "eval_logps/rejected": -1083.573974609375, + "eval_loss": 0.03053486905992031, + "eval_rewards/accuracies": 0.9287313222885132, + "eval_rewards/chosen": -4.650643348693848, + "eval_rewards/margins": 5.825936317443848, + "eval_rewards/rejected": -10.476579666137695, + "eval_runtime": 216.4192, + "eval_samples_per_second": 98.961, + "eval_steps_per_second": 1.548, "step": 2700 }, { "epoch": 0.8524693299779805, - "grad_norm": 3.6423380374908447, + "grad_norm": 2.1117446422576904, "learning_rate": 3.2426473159953455e-07, - "logits/chosen": 1.4169188737869263, - "logits/rejected": 1.2496435642242432, - "logps/chosen": -513.0960693359375, - "logps/rejected": -1108.312255859375, - "loss": 0.062, + "logits/chosen": 1.5046918392181396, + "logits/rejected": 1.335351586341858, + "logps/chosen": -514.4112548828125, + "logps/rejected": -1087.9561767578125, + "loss": 0.0614, "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": -4.786744117736816, - "rewards/margins": 5.93210506439209, - "rewards/rejected": -10.71884822845459, + "rewards/chosen": -4.799644470214844, + "rewards/margins": 5.715089321136475, + "rewards/rejected": -10.514734268188477, "step": 2710 }, { "epoch": 0.8556149732620321, - "grad_norm": 3.5630664825439453, + "grad_norm": 2.354701519012451, "learning_rate": 3.108752354177963e-07, - "logits/chosen": 1.552750825881958, - "logits/rejected": 1.3104605674743652, - "logps/chosen": -508.5628967285156, - "logps/rejected": -1070.4361572265625, - "loss": 0.0549, - "rewards/accuracies": 0.893750011920929, - "rewards/chosen": -4.747432231903076, - "rewards/margins": 5.604626178741455, - "rewards/rejected": -10.352057456970215, + "logits/chosen": 1.6507476568222046, + "logits/rejected": 1.3912304639816284, + "logps/chosen": -515.09326171875, + "logps/rejected": -1063.3089599609375, + "loss": 0.0575, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -4.812552452087402, + "rewards/margins": 5.467874050140381, + "rewards/rejected": -10.280427932739258, "step": 2720 }, { "epoch": 0.8587606165460837, - "grad_norm": 4.022861957550049, + "grad_norm": 3.7214064598083496, "learning_rate": 2.9774969441039247e-07, - "logits/chosen": 1.208437204360962, - "logits/rejected": 1.0336554050445557, - "logps/chosen": -515.5606689453125, - "logps/rejected": -1061.59375, - "loss": 0.07, - "rewards/accuracies": 0.918749988079071, - "rewards/chosen": -4.836429595947266, - "rewards/margins": 5.419116020202637, - "rewards/rejected": -10.255544662475586, + "logits/chosen": 1.3230955600738525, + "logits/rejected": 1.1357682943344116, + "logps/chosen": -505.62921142578125, + "logps/rejected": -1043.6890869140625, + "loss": 0.0689, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -4.736936092376709, + "rewards/margins": 5.339118003845215, + "rewards/rejected": -10.076053619384766, "step": 2730 }, { "epoch": 0.8619062598301352, - "grad_norm": 2.5656957626342773, + "grad_norm": 2.939478874206543, "learning_rate": 2.848896911981575e-07, - "logits/chosen": 1.585086703300476, - "logits/rejected": 1.302536964416504, - "logps/chosen": -475.01495361328125, - "logps/rejected": -1078.8958740234375, - "loss": 0.067, + "logits/chosen": 1.6969833374023438, + "logits/rejected": 1.3960387706756592, + "logps/chosen": -471.97393798828125, + "logps/rejected": -1061.218505859375, + "loss": 0.072, "rewards/accuracies": 0.90625, - "rewards/chosen": -4.43586540222168, - "rewards/margins": 6.01693058013916, - "rewards/rejected": -10.452796936035156, + "rewards/chosen": -4.4052605628967285, + "rewards/margins": 5.870527267456055, + "rewards/rejected": -10.275788307189941, "step": 2740 }, { "epoch": 0.8650519031141869, - "grad_norm": 1.7859119176864624, + "grad_norm": 1.170915961265564, "learning_rate": 2.722967763845316e-07, - "logits/chosen": 1.4026825428009033, - "logits/rejected": 1.267093539237976, - "logps/chosen": -501.826171875, - "logps/rejected": -1080.474365234375, - "loss": 0.048, - "rewards/accuracies": 0.9125000238418579, - "rewards/chosen": -4.700644016265869, - "rewards/margins": 5.758360862731934, - "rewards/rejected": -10.459004402160645, + "logits/chosen": 1.487412691116333, + "logits/rejected": 1.3554164171218872, + "logps/chosen": -488.6513671875, + "logps/rejected": -1064.09912109375, + "loss": 0.0444, + "rewards/accuracies": 0.9375, + "rewards/chosen": -4.568634986877441, + "rewards/margins": 5.726897239685059, + "rewards/rejected": -10.2955322265625, "step": 2750 }, { "epoch": 0.8681975463982384, - "grad_norm": 2.9971580505371094, + "grad_norm": 3.7928836345672607, "learning_rate": 2.5997246836859335e-07, - "logits/chosen": 1.5809880495071411, - "logits/rejected": 1.286370038986206, - "logps/chosen": -566.8330688476562, - "logps/rejected": -1177.3800048828125, - "loss": 0.0768, - "rewards/accuracies": 0.9375, - "rewards/chosen": -5.326024532318115, - "rewards/margins": 6.073489189147949, - "rewards/rejected": -11.399515151977539, + "logits/chosen": 1.6719999313354492, + "logits/rejected": 1.3611785173416138, + "logps/chosen": -537.3580322265625, + "logps/rejected": -1134.085205078125, + "loss": 0.0707, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -5.030951976776123, + "rewards/margins": 5.935522556304932, + "rewards/rejected": -10.966474533081055, "step": 2760 }, { "epoch": 0.8713431896822901, - "grad_norm": 3.9024531841278076, + "grad_norm": 4.211234092712402, "learning_rate": 2.479182531619778e-07, - "logits/chosen": 1.4867026805877686, - "logits/rejected": 1.2765355110168457, - "logps/chosen": -556.2525634765625, - "logps/rejected": -1071.0406494140625, - "loss": 0.0591, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": -5.220080375671387, - "rewards/margins": 5.128342628479004, - "rewards/rejected": -10.348422050476074, + "logits/chosen": 1.5656145811080933, + "logits/rejected": 1.356136679649353, + "logps/chosen": -540.0008544921875, + "logps/rejected": -1038.9478759765625, + "loss": 0.0582, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -5.057316780090332, + "rewards/margins": 4.969836235046387, + "rewards/rejected": -10.027152061462402, "step": 2770 }, { "epoch": 0.8744888329663416, - "grad_norm": 3.937291383743286, + "grad_norm": 3.4313180446624756, "learning_rate": 2.3613558420969988e-07, - "logits/chosen": 1.4205322265625, - "logits/rejected": 1.195991039276123, - "logps/chosen": -532.1976318359375, - "logps/rejected": -1121.0986328125, - "loss": 0.0611, - "rewards/accuracies": 0.981249988079071, - "rewards/chosen": -4.998320579528809, - "rewards/margins": 5.846045017242432, - "rewards/rejected": -10.844365119934082, + "logits/chosen": 1.5074255466461182, + "logits/rejected": 1.2785370349884033, + "logps/chosen": -519.2490844726562, + "logps/rejected": -1096.0401611328125, + "loss": 0.055, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -4.868134021759033, + "rewards/margins": 5.725502967834473, + "rewards/rejected": -10.593636512756348, "step": 2780 }, { "epoch": 0.8776344762503933, - "grad_norm": 3.083104133605957, + "grad_norm": 3.3754262924194336, "learning_rate": 2.2462588221490445e-07, - "logits/chosen": 1.5729047060012817, - "logits/rejected": 1.2844527959823608, - "logps/chosen": -530.4925537109375, - "logps/rejected": -1121.37939453125, - "loss": 0.0561, + "logits/chosen": 1.7104518413543701, + "logits/rejected": 1.401397466659546, + "logps/chosen": -519.754150390625, + "logps/rejected": -1093.9193115234375, + "loss": 0.0621, "rewards/accuracies": 0.918749988079071, - "rewards/chosen": -4.973997592926025, - "rewards/margins": 5.868443489074707, - "rewards/rejected": -10.842439651489258, + "rewards/chosen": -4.866362571716309, + "rewards/margins": 5.7011823654174805, + "rewards/rejected": -10.567545890808105, "step": 2790 }, { "epoch": 0.8807801195344448, - "grad_norm": 2.435751438140869, + "grad_norm": 2.195533275604248, "learning_rate": 2.1339053496756413e-07, - "logits/chosen": 1.7648521661758423, - "logits/rejected": 1.4746003150939941, - "logps/chosen": -520.1046142578125, - "logps/rejected": -1067.98486328125, - "loss": 0.0546, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": -4.869875907897949, - "rewards/margins": 5.440675258636475, - "rewards/rejected": -10.310551643371582, + "logits/chosen": 1.8926490545272827, + "logits/rejected": 1.602657675743103, + "logps/chosen": -496.7962951660156, + "logps/rejected": -1042.2293701171875, + "loss": 0.0568, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": -4.636801719665527, + "rewards/margins": 5.415767192840576, + "rewards/rejected": -10.052568435668945, "step": 2800 }, { "epoch": 0.8807801195344448, - "eval_logits/chosen": 2.1606998443603516, - "eval_logits/rejected": 1.9595870971679688, - "eval_logps/chosen": -513.6257934570312, - "eval_logps/rejected": -1117.5887451171875, - "eval_loss": 0.02870255894958973, - "eval_rewards/accuracies": 0.925000011920929, - "eval_rewards/chosen": -4.808547019958496, - "eval_rewards/margins": 6.008353233337402, - "eval_rewards/rejected": -10.816901206970215, - "eval_runtime": 215.3283, - "eval_samples_per_second": 99.462, - "eval_steps_per_second": 1.556, + "eval_logits/chosen": 2.3025965690612793, + "eval_logits/rejected": 2.095693349838257, + "eval_logps/chosen": -497.02655029296875, + "eval_logps/rejected": -1082.205078125, + "eval_loss": 0.030157454311847687, + "eval_rewards/accuracies": 0.9302238821983337, + "eval_rewards/chosen": -4.642297744750977, + "eval_rewards/margins": 5.820593357086182, + "eval_rewards/rejected": -10.462892532348633, + "eval_runtime": 216.4848, + "eval_samples_per_second": 98.931, + "eval_steps_per_second": 1.547, "step": 2800 }, { "epoch": 0.8839257628184963, - "grad_norm": 4.853071212768555, + "grad_norm": 2.889522075653076, "learning_rate": 2.0243089717714465e-07, - "logits/chosen": 1.6696542501449585, - "logits/rejected": 1.3343422412872314, - "logps/chosen": -513.935546875, - "logps/rejected": -1139.2392578125, - "loss": 0.0607, - "rewards/accuracies": 0.918749988079071, - "rewards/chosen": -4.819055080413818, - "rewards/margins": 6.217933654785156, - "rewards/rejected": -11.036989212036133, + "logits/chosen": 1.807885766029358, + "logits/rejected": 1.4938302040100098, + "logps/chosen": -484.04443359375, + "logps/rejected": -1089.961181640625, + "loss": 0.0552, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -4.520063877105713, + "rewards/margins": 6.023884296417236, + "rewards/rejected": -10.543947219848633, "step": 2810 }, { "epoch": 0.887071406102548, - "grad_norm": 3.8749752044677734, + "grad_norm": 3.5756685733795166, "learning_rate": 1.9174829030926157e-07, - "logits/chosen": 1.3861050605773926, - "logits/rejected": 1.3259289264678955, - "logps/chosen": -569.0123901367188, - "logps/rejected": -1085.4583740234375, - "loss": 0.0498, + "logits/chosen": 1.5275074243545532, + "logits/rejected": 1.4837501049041748, + "logps/chosen": -543.8445434570312, + "logps/rejected": -1033.0228271484375, + "loss": 0.048, "rewards/accuracies": 0.918749988079071, - "rewards/chosen": -5.3545613288879395, - "rewards/margins": 5.1358819007873535, - "rewards/rejected": -10.490442276000977, + "rewards/chosen": -5.1024169921875, + "rewards/margins": 4.863530158996582, + "rewards/rejected": -9.965947151184082, "step": 2820 }, { "epoch": 0.8902170493865995, - "grad_norm": 1.685403823852539, + "grad_norm": 1.6179348230361938, "learning_rate": 1.8134400242634214e-07, - "logits/chosen": 1.3578906059265137, - "logits/rejected": 1.2923089265823364, - "logps/chosen": -559.1904296875, - "logps/rejected": -1093.1944580078125, - "loss": 0.066, + "logits/chosen": 1.504921793937683, + "logits/rejected": 1.4333007335662842, + "logps/chosen": -552.3648681640625, + "logps/rejected": -1058.1396484375, + "loss": 0.0736, "rewards/accuracies": 0.887499988079071, - "rewards/chosen": -5.261031150817871, - "rewards/margins": 5.333484649658203, - "rewards/rejected": -10.59451675415039, + "rewards/chosen": -5.192325592041016, + "rewards/margins": 5.051383972167969, + "rewards/rejected": -10.243708610534668, "step": 2830 }, { "epoch": 0.8933626926706512, - "grad_norm": 3.112783670425415, + "grad_norm": 2.699253797531128, "learning_rate": 1.7121928803231714e-07, - "logits/chosen": 1.6115779876708984, - "logits/rejected": 1.3464069366455078, - "logps/chosen": -504.38385009765625, - "logps/rejected": -1135.041259765625, - "loss": 0.054, - "rewards/accuracies": 0.9312499761581421, - "rewards/chosen": -4.714786052703857, - "rewards/margins": 6.264063835144043, - "rewards/rejected": -10.978850364685059, + "logits/chosen": 1.7584819793701172, + "logits/rejected": 1.4663532972335815, + "logps/chosen": -492.44598388671875, + "logps/rejected": -1074.4373779296875, + "loss": 0.0559, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -4.594998836517334, + "rewards/margins": 5.777581214904785, + "rewards/rejected": -10.372581481933594, "step": 2840 }, { "epoch": 0.8965083359547027, - "grad_norm": 1.6297857761383057, + "grad_norm": 1.8945022821426392, "learning_rate": 1.613753679213581e-07, - "logits/chosen": 1.4668216705322266, - "logits/rejected": 1.3703067302703857, - "logps/chosen": -546.5037841796875, - "logps/rejected": -1063.0972900390625, - "loss": 0.0533, + "logits/chosen": 1.57563054561615, + "logits/rejected": 1.482763409614563, + "logps/chosen": -539.2989501953125, + "logps/rejected": -1020.1134033203125, + "loss": 0.0574, "rewards/accuracies": 0.918749988079071, - "rewards/chosen": -5.121490478515625, - "rewards/margins": 5.16286563873291, - "rewards/rejected": -10.284356117248535, + "rewards/chosen": -5.049017906188965, + "rewards/margins": 4.8051300048828125, + "rewards/rejected": -9.854147911071777, "step": 2850 }, { "epoch": 0.8996539792387543, - "grad_norm": 0.9355494379997253, + "grad_norm": 1.4570543766021729, "learning_rate": 1.5181342903067803e-07, - "logits/chosen": 1.5894020795822144, - "logits/rejected": 1.4288842678070068, - "logps/chosen": -475.189453125, - "logps/rejected": -1045.166259765625, - "loss": 0.0595, - "rewards/accuracies": 0.9125000238418579, - "rewards/chosen": -4.432465553283691, - "rewards/margins": 5.66586446762085, - "rewards/rejected": -10.098329544067383, + "logits/chosen": 1.693933129310608, + "logits/rejected": 1.528136968612671, + "logps/chosen": -460.54095458984375, + "logps/rejected": -1023.5963745117188, + "loss": 0.0594, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -4.285604953765869, + "rewards/margins": 5.596875190734863, + "rewards/rejected": -9.882479667663574, "step": 2860 }, { "epoch": 0.9027996225228059, - "grad_norm": 3.7816286087036133, + "grad_norm": 3.3452985286712646, "learning_rate": 1.4253462429741877e-07, - "logits/chosen": 1.426163673400879, - "logits/rejected": 1.2298691272735596, - "logps/chosen": -501.76739501953125, - "logps/rejected": -1122.6923828125, - "loss": 0.0571, - "rewards/accuracies": 0.893750011920929, - "rewards/chosen": -4.693778038024902, - "rewards/margins": 6.179869651794434, - "rewards/rejected": -10.873647689819336, + "logits/chosen": 1.5829732418060303, + "logits/rejected": 1.3568916320800781, + "logps/chosen": -491.80145263671875, + "logps/rejected": -1098.6715087890625, + "loss": 0.0513, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -4.593815803527832, + "rewards/margins": 6.039064407348633, + "rewards/rejected": -10.632879257202148, "step": 2870 }, { "epoch": 0.9059452658068575, - "grad_norm": 3.62336802482605, + "grad_norm": 4.80794620513916, "learning_rate": 1.335400725196309e-07, - "logits/chosen": 1.4734015464782715, - "logits/rejected": 1.3798444271087646, - "logps/chosen": -544.997314453125, - "logps/rejected": -1142.9996337890625, - "loss": 0.0782, - "rewards/accuracies": 0.9312499761581421, - "rewards/chosen": -5.112950325012207, - "rewards/margins": 5.9460601806640625, - "rewards/rejected": -11.05901050567627, + "logits/chosen": 1.5622551441192627, + "logits/rejected": 1.4623697996139526, + "logps/chosen": -530.6185302734375, + "logps/rejected": -1116.3253173828125, + "loss": 0.0756, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -4.968831539154053, + "rewards/margins": 5.823412895202637, + "rewards/rejected": -10.792244911193848, "step": 2880 }, { "epoch": 0.9090909090909091, - "grad_norm": 3.0453686714172363, + "grad_norm": 2.9681992530822754, "learning_rate": 1.2483085822137752e-07, - "logits/chosen": 1.3678343296051025, - "logits/rejected": 1.3175466060638428, - "logps/chosen": -511.88134765625, - "logps/rejected": -1121.2176513671875, - "loss": 0.0579, - "rewards/accuracies": 0.9437500238418579, - "rewards/chosen": -4.770578861236572, - "rewards/margins": 6.093011379241943, - "rewards/rejected": -10.863590240478516, + "logits/chosen": 1.4609724283218384, + "logits/rejected": 1.4121100902557373, + "logps/chosen": -513.2243041992188, + "logps/rejected": -1081.070068359375, + "loss": 0.0569, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -4.783768653869629, + "rewards/margins": 5.678202152252197, + "rewards/rejected": -10.4619722366333, "step": 2890 }, { "epoch": 0.9122365523749607, - "grad_norm": 2.422604560852051, + "grad_norm": 2.235750913619995, "learning_rate": 1.16408031521964e-07, - "logits/chosen": 1.3921037912368774, - "logits/rejected": 1.2343624830245972, - "logps/chosen": -517.9652099609375, - "logps/rejected": -1084.047119140625, - "loss": 0.0702, - "rewards/accuracies": 0.887499988079071, - "rewards/chosen": -4.85400390625, - "rewards/margins": 5.625793933868408, - "rewards/rejected": -10.479796409606934, + "logits/chosen": 1.5177078247070312, + "logits/rejected": 1.3542033433914185, + "logps/chosen": -497.5135192871094, + "logps/rejected": -1059.788818359375, + "loss": 0.0602, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -4.6491899490356445, + "rewards/margins": 5.587696075439453, + "rewards/rejected": -10.236886024475098, "step": 2900 }, { "epoch": 0.9122365523749607, - "eval_logits/chosen": 2.1647133827209473, - "eval_logits/rejected": 1.969600796699524, - "eval_logps/chosen": -502.4717712402344, - "eval_logps/rejected": -1104.9371337890625, - "eval_loss": 0.028775138780474663, - "eval_rewards/accuracies": 0.9242537021636963, - "eval_rewards/chosen": -4.697007656097412, - "eval_rewards/margins": 5.993377208709717, - "eval_rewards/rejected": -10.690384864807129, - "eval_runtime": 215.1474, - "eval_samples_per_second": 99.546, - "eval_steps_per_second": 1.557, + "eval_logits/chosen": 2.2910573482513428, + "eval_logits/rejected": 2.086066961288452, + "eval_logps/chosen": -495.3988952636719, + "eval_logps/rejected": -1081.995849609375, + "eval_loss": 0.029906345531344414, + "eval_rewards/accuracies": 0.9298507571220398, + "eval_rewards/chosen": -4.626021385192871, + "eval_rewards/margins": 5.83477783203125, + "eval_rewards/rejected": -10.460798263549805, + "eval_runtime": 216.6391, + "eval_samples_per_second": 98.86, + "eval_steps_per_second": 1.546, "step": 2900 }, { "epoch": 0.9153821956590122, - "grad_norm": 2.0521950721740723, + "grad_norm": 3.275084972381592, "learning_rate": 1.0827260800932132e-07, - "logits/chosen": 1.2516539096832275, - "logits/rejected": 1.1419090032577515, - "logps/chosen": -541.690185546875, - "logps/rejected": -1094.5318603515625, - "loss": 0.0505, - "rewards/accuracies": 0.893750011920929, - "rewards/chosen": -5.085227012634277, - "rewards/margins": 5.53399658203125, - "rewards/rejected": -10.619223594665527, + "logits/chosen": 1.3776516914367676, + "logits/rejected": 1.2710825204849243, + "logps/chosen": -530.8807373046875, + "logps/rejected": -1062.57763671875, + "loss": 0.0566, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -4.976878643035889, + "rewards/margins": 5.322489261627197, + "rewards/rejected": -10.299367904663086, "step": 2910 }, { "epoch": 0.9185278389430639, - "grad_norm": 1.5800129175186157, + "grad_norm": 1.3798928260803223, "learning_rate": 1.0042556861754981e-07, - "logits/chosen": 1.3491766452789307, - "logits/rejected": 1.2649667263031006, - "logps/chosen": -520.3859252929688, - "logps/rejected": -1101.0565185546875, - "loss": 0.0559, - "rewards/accuracies": 0.9375, - "rewards/chosen": -4.897294998168945, - "rewards/margins": 5.766488075256348, - "rewards/rejected": -10.663783073425293, + "logits/chosen": 1.4723575115203857, + "logits/rejected": 1.3601932525634766, + "logps/chosen": -520.2330322265625, + "logps/rejected": -1072.7567138671875, + "loss": 0.0463, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -4.895315647125244, + "rewards/margins": 5.485459327697754, + "rewards/rejected": -10.380775451660156, "step": 2920 }, { "epoch": 0.9216734822271154, - "grad_norm": 4.585078239440918, + "grad_norm": 3.591219425201416, "learning_rate": 9.286785950864297e-08, - "logits/chosen": 1.4685800075531006, - "logits/rejected": 1.3016550540924072, - "logps/chosen": -514.4509887695312, - "logps/rejected": -1111.2257080078125, - "loss": 0.0676, - "rewards/accuracies": 0.90625, - "rewards/chosen": -4.8128156661987305, - "rewards/margins": 5.959506034851074, - "rewards/rejected": -10.772321701049805, + "logits/chosen": 1.5991261005401611, + "logits/rejected": 1.4333770275115967, + "logps/chosen": -497.31903076171875, + "logps/rejected": -1074.6182861328125, + "loss": 0.0597, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": -4.641297817230225, + "rewards/margins": 5.764665126800537, + "rewards/rejected": -10.405962944030762, "step": 2930 }, { "epoch": 0.9248191255111671, - "grad_norm": 2.5994062423706055, + "grad_norm": 1.934587836265564, "learning_rate": 8.560039195840226e-08, - "logits/chosen": 1.4919440746307373, - "logits/rejected": 1.1589787006378174, - "logps/chosen": -523.2716674804688, - "logps/rejected": -1082.82275390625, - "loss": 0.0663, - "rewards/accuracies": 0.887499988079071, - "rewards/chosen": -4.89348840713501, - "rewards/margins": 5.579774379730225, - "rewards/rejected": -10.473262786865234, + "logits/chosen": 1.61993408203125, + "logits/rejected": 1.257753610610962, + "logps/chosen": -511.5843811035156, + "logps/rejected": -1043.6748046875, + "loss": 0.0618, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -4.776283264160156, + "rewards/margins": 5.305155277252197, + "rewards/rejected": -10.081439018249512, "step": 2940 }, { "epoch": 0.9279647687952186, - "grad_norm": 1.1528620719909668, + "grad_norm": 1.1722760200500488, "learning_rate": 7.86240422465609e-08, - "logits/chosen": 1.3870362043380737, - "logits/rejected": 1.323654294013977, - "logps/chosen": -536.1802978515625, - "logps/rejected": -1112.9925537109375, - "loss": 0.0643, - "rewards/accuracies": 0.918749988079071, - "rewards/chosen": -5.029669761657715, - "rewards/margins": 5.733022212982178, - "rewards/rejected": -10.762693405151367, + "logits/chosen": 1.4890453815460205, + "logits/rejected": 1.4258759021759033, + "logps/chosen": -507.92279052734375, + "logps/rejected": -1072.976318359375, + "loss": 0.061, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -4.746781826019287, + "rewards/margins": 5.61569881439209, + "rewards/rejected": -10.362482070922852, "step": 2950 }, { "epoch": 0.9311104120792703, - "grad_norm": 2.2496023178100586, + "grad_norm": 2.363994598388672, "learning_rate": 7.193965155112475e-08, - "logits/chosen": 1.3429162502288818, - "logits/rejected": 1.1649806499481201, - "logps/chosen": -537.1536254882812, - "logps/rejected": -1076.180908203125, - "loss": 0.0602, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": -5.0520339012146, - "rewards/margins": 5.354105472564697, - "rewards/rejected": -10.406140327453613, + "logits/chosen": 1.4762569665908813, + "logits/rejected": 1.2930829524993896, + "logps/chosen": -531.8678588867188, + "logps/rejected": -1051.727783203125, + "loss": 0.063, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -4.999019622802734, + "rewards/margins": 5.162339687347412, + "rewards/rejected": -10.161359786987305, "step": 2960 }, { "epoch": 0.9342560553633218, - "grad_norm": 1.6158769130706787, + "grad_norm": 2.2596049308776855, "learning_rate": 6.554802584694791e-08, - "logits/chosen": 1.508937954902649, - "logits/rejected": 1.3928695917129517, - "logps/chosen": -454.51019287109375, - "logps/rejected": -1097.5311279296875, - "loss": 0.0531, - "rewards/accuracies": 0.9437500238418579, - "rewards/chosen": -4.223744869232178, - "rewards/margins": 6.399988651275635, - "rewards/rejected": -10.623734474182129, + "logits/chosen": 1.6314208507537842, + "logits/rejected": 1.5024511814117432, + "logps/chosen": -449.2713317871094, + "logps/rejected": -1065.9703369140625, + "loss": 0.0606, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": -4.171161651611328, + "rewards/margins": 6.13657808303833, + "rewards/rejected": -10.3077392578125, "step": 2970 }, { "epoch": 0.9374016986473734, - "grad_norm": 4.760510444641113, + "grad_norm": 3.5321264266967773, "learning_rate": 5.9449935808549576e-08, - "logits/chosen": 1.424422025680542, - "logits/rejected": 1.188518762588501, - "logps/chosen": -521.4863891601562, - "logps/rejected": -1131.7774658203125, - "loss": 0.0607, - "rewards/accuracies": 0.9375, - "rewards/chosen": -4.8670854568481445, - "rewards/margins": 6.077136039733887, - "rewards/rejected": -10.944222450256348, + "logits/chosen": 1.5491445064544678, + "logits/rejected": 1.3002042770385742, + "logps/chosen": -510.8706970214844, + "logps/rejected": -1102.7340087890625, + "loss": 0.0614, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": -4.760753154754639, + "rewards/margins": 5.892817974090576, + "rewards/rejected": -10.653572082519531, "step": 2980 }, { "epoch": 0.940547341931425, - "grad_norm": 1.8301982879638672, + "grad_norm": 1.3359121084213257, "learning_rate": 5.3646116717191723e-08, - "logits/chosen": 1.7392746210098267, - "logits/rejected": 1.4524823427200317, - "logps/chosen": -491.2704162597656, - "logps/rejected": -1077.2618408203125, - "loss": 0.0505, - "rewards/accuracies": 0.9437500238418579, - "rewards/chosen": -4.597735404968262, - "rewards/margins": 5.812699794769287, - "rewards/rejected": -10.41043472290039, + "logits/chosen": 1.860054612159729, + "logits/rejected": 1.563511610031128, + "logps/chosen": -489.28070068359375, + "logps/rejected": -1061.9932861328125, + "loss": 0.0525, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -4.577395439147949, + "rewards/margins": 5.680055618286133, + "rewards/rejected": -10.257450103759766, "step": 2990 }, { "epoch": 0.9436929852154765, - "grad_norm": 1.969045877456665, + "grad_norm": 2.009251832962036, "learning_rate": 4.813726837222116e-08, - "logits/chosen": 1.5177466869354248, - "logits/rejected": 1.5681421756744385, - "logps/chosen": -530.2708129882812, - "logps/rejected": -1052.5723876953125, - "loss": 0.0623, - "rewards/accuracies": 0.875, - "rewards/chosen": -4.96749210357666, - "rewards/margins": 5.2094526290893555, - "rewards/rejected": -10.176944732666016, + "logits/chosen": 1.6448522806167603, + "logits/rejected": 1.6830604076385498, + "logps/chosen": -513.0660400390625, + "logps/rejected": -1033.781494140625, + "loss": 0.0634, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -4.795180320739746, + "rewards/margins": 5.193611145019531, + "rewards/rejected": -9.988792419433594, "step": 3000 }, { "epoch": 0.9436929852154765, - "eval_logits/chosen": 2.143676996231079, - "eval_logits/rejected": 1.9440104961395264, - "eval_logps/chosen": -503.750732421875, - "eval_logps/rejected": -1103.3302001953125, - "eval_loss": 0.0286338459700346, - "eval_rewards/accuracies": 0.9268656969070435, - "eval_rewards/chosen": -4.709796905517578, - "eval_rewards/margins": 5.964517116546631, - "eval_rewards/rejected": -10.674314498901367, - "eval_runtime": 215.2202, - "eval_samples_per_second": 99.512, - "eval_steps_per_second": 1.557, + "eval_logits/chosen": 2.2738683223724365, + "eval_logits/rejected": 2.065464735031128, + "eval_logps/chosen": -497.3409423828125, + "eval_logps/rejected": -1084.345458984375, + "eval_loss": 0.029775429517030716, + "eval_rewards/accuracies": 0.9313432574272156, + "eval_rewards/chosen": -4.645442008972168, + "eval_rewards/margins": 5.838852405548096, + "eval_rewards/rejected": -10.484294891357422, + "eval_runtime": 216.5306, + "eval_samples_per_second": 98.91, + "eval_steps_per_second": 1.547, "step": 3000 }, { "epoch": 0.9468386284995282, - "grad_norm": 4.614864826202393, + "grad_norm": 3.1543209552764893, "learning_rate": 4.292405500669061e-08, - "logits/chosen": 1.4414303302764893, - "logits/rejected": 1.157649278640747, - "logps/chosen": -522.2523193359375, - "logps/rejected": -1145.5069580078125, - "loss": 0.0761, + "logits/chosen": 1.5579684972763062, + "logits/rejected": 1.275383710861206, + "logps/chosen": -507.2293395996094, + "logps/rejected": -1104.159912109375, + "loss": 0.0674, "rewards/accuracies": 0.925000011920929, - "rewards/chosen": -4.891526699066162, - "rewards/margins": 6.2130513191223145, - "rewards/rejected": -11.104578018188477, + "rewards/chosen": -4.741163730621338, + "rewards/margins": 5.949568748474121, + "rewards/rejected": -10.6907320022583, "step": 3010 }, { "epoch": 0.9499842717835797, - "grad_norm": 2.128798246383667, + "grad_norm": 2.6585705280303955, "learning_rate": 3.8007105207268355e-08, - "logits/chosen": 1.5095900297164917, - "logits/rejected": 1.2341318130493164, - "logps/chosen": -525.769775390625, - "logps/rejected": -1126.2525634765625, - "loss": 0.059, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -4.912005424499512, - "rewards/margins": 5.982682228088379, - "rewards/rejected": -10.894686698913574, + "logits/chosen": 1.6319191455841064, + "logits/rejected": 1.3634922504425049, + "logps/chosen": -508.10174560546875, + "logps/rejected": -1092.864501953125, + "loss": 0.0605, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": -4.735014915466309, + "rewards/margins": 5.825577735900879, + "rewards/rejected": -10.560592651367188, "step": 3020 }, { "epoch": 0.9531299150676313, - "grad_norm": 3.470418930053711, + "grad_norm": 2.6183907985687256, "learning_rate": 3.338701183844689e-08, - "logits/chosen": 1.2871098518371582, - "logits/rejected": 1.203657865524292, - "logps/chosen": -469.0284118652344, - "logps/rejected": -1091.468994140625, - "loss": 0.0596, + "logits/chosen": 1.4023044109344482, + "logits/rejected": 1.3029506206512451, + "logps/chosen": -472.97216796875, + "logps/rejected": -1090.064697265625, + "loss": 0.0564, "rewards/accuracies": 0.9375, - "rewards/chosen": -4.372101783752441, - "rewards/margins": 6.188841819763184, - "rewards/rejected": -10.560943603515625, + "rewards/chosen": -4.41116189956665, + "rewards/margins": 6.135444164276123, + "rewards/rejected": -10.546606063842773, "step": 3030 }, { "epoch": 0.9562755583516829, - "grad_norm": 2.421786308288574, + "grad_norm": 2.2757019996643066, "learning_rate": 2.9064331971056515e-08, - "logits/chosen": 1.432936668395996, - "logits/rejected": 1.4242966175079346, - "logps/chosen": -510.3072204589844, - "logps/rejected": -1054.8065185546875, - "loss": 0.0593, - "rewards/accuracies": 0.9125000238418579, - "rewards/chosen": -4.7712721824646, - "rewards/margins": 5.450275421142578, - "rewards/rejected": -10.221548080444336, + "logits/chosen": 1.5462950468063354, + "logits/rejected": 1.5350978374481201, + "logps/chosen": -504.5541076660156, + "logps/rejected": -1036.603759765625, + "loss": 0.0627, + "rewards/accuracies": 0.90625, + "rewards/chosen": -4.7133469581604, + "rewards/margins": 5.325845718383789, + "rewards/rejected": -10.039191246032715, "step": 3040 }, { "epoch": 0.9594212016357345, - "grad_norm": 3.0846569538116455, + "grad_norm": 1.553912878036499, "learning_rate": 2.503958681509683e-08, - "logits/chosen": 1.566232681274414, - "logits/rejected": 1.3942068815231323, - "logps/chosen": -524.6204833984375, - "logps/rejected": -1089.9156494140625, - "loss": 0.0568, + "logits/chosen": 1.6635456085205078, + "logits/rejected": 1.4628212451934814, + "logps/chosen": -515.1585083007812, + "logps/rejected": -1074.81103515625, + "loss": 0.0487, "rewards/accuracies": 0.9125000238418579, - "rewards/chosen": -4.901736736297607, - "rewards/margins": 5.649755954742432, - "rewards/rejected": -10.551492691040039, + "rewards/chosen": -4.806754112243652, + "rewards/margins": 5.593099594116211, + "rewards/rejected": -10.399853706359863, "step": 3050 }, { "epoch": 0.9625668449197861, - "grad_norm": 4.23090934753418, + "grad_norm": 3.9352211952209473, "learning_rate": 2.1313261656891737e-08, - "logits/chosen": 1.4874424934387207, - "logits/rejected": 1.4000155925750732, - "logps/chosen": -557.3499145507812, - "logps/rejected": -1110.4896240234375, - "loss": 0.0655, - "rewards/accuracies": 0.90625, - "rewards/chosen": -5.248296737670898, - "rewards/margins": 5.522107124328613, - "rewards/rejected": -10.770402908325195, + "logits/chosen": 1.6192388534545898, + "logits/rejected": 1.528223991394043, + "logps/chosen": -551.916748046875, + "logps/rejected": -1081.7154541015625, + "loss": 0.0645, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -5.1936187744140625, + "rewards/margins": 5.288642883300781, + "rewards/rejected": -10.48226261138916, "step": 3060 }, { "epoch": 0.9657124882038377, - "grad_norm": 1.998992681503296, + "grad_norm": 2.4550108909606934, "learning_rate": 1.788580580057514e-08, - "logits/chosen": 1.43699049949646, - "logits/rejected": 1.2418830394744873, - "logps/chosen": -558.634765625, - "logps/rejected": -1058.3809814453125, - "loss": 0.0662, - "rewards/accuracies": 0.893750011920929, - "rewards/chosen": -5.254110336303711, - "rewards/margins": 4.97817850112915, - "rewards/rejected": -10.23228931427002, + "logits/chosen": 1.5395796298980713, + "logits/rejected": 1.3465406894683838, + "logps/chosen": -556.5447387695312, + "logps/rejected": -1052.707275390625, + "loss": 0.0695, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -5.232929229736328, + "rewards/margins": 4.9423723220825195, + "rewards/rejected": -10.175302505493164, "step": 3070 }, { "epoch": 0.9688581314878892, - "grad_norm": 3.4238898754119873, + "grad_norm": 3.4560182094573975, "learning_rate": 1.4757632513916764e-08, - "logits/chosen": 1.4360311031341553, - "logits/rejected": 1.2258044481277466, - "logps/chosen": -472.9986267089844, - "logps/rejected": -1088.400634765625, - "loss": 0.0615, - "rewards/accuracies": 0.918749988079071, - "rewards/chosen": -4.3953752517700195, - "rewards/margins": 6.134710788726807, - "rewards/rejected": -10.530085563659668, + "logits/chosen": 1.5373961925506592, + "logits/rejected": 1.3364956378936768, + "logps/chosen": -464.5713806152344, + "logps/rejected": -1062.9267578125, + "loss": 0.0592, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -4.311025619506836, + "rewards/margins": 5.964101791381836, + "rewards/rejected": -10.275126457214355, "step": 3080 }, { "epoch": 0.9720037747719409, - "grad_norm": 2.3061940670013428, + "grad_norm": 2.193678140640259, "learning_rate": 1.1929118978490361e-08, - "logits/chosen": 1.4735639095306396, - "logits/rejected": 1.3414032459259033, - "logps/chosen": -499.18719482421875, - "logps/rejected": -1091.6337890625, - "loss": 0.0589, + "logits/chosen": 1.5750865936279297, + "logits/rejected": 1.4524461030960083, + "logps/chosen": -484.18890380859375, + "logps/rejected": -1047.558349609375, + "loss": 0.0547, "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -4.663874626159668, - "rewards/margins": 5.900238990783691, - "rewards/rejected": -10.564112663269043, + "rewards/chosen": -4.513777732849121, + "rewards/margins": 5.6092610359191895, + "rewards/rejected": -10.123039245605469, "step": 3090 }, { "epoch": 0.9751494180559924, - "grad_norm": 3.2079808712005615, + "grad_norm": 3.110525608062744, "learning_rate": 9.400606244196753e-09, - "logits/chosen": 1.5242373943328857, - "logits/rejected": 1.161348581314087, - "logps/chosen": -513.7073364257812, - "logps/rejected": -1092.349365234375, - "loss": 0.0593, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -4.811236381530762, - "rewards/margins": 5.742733955383301, - "rewards/rejected": -10.553971290588379, + "logits/chosen": 1.6422191858291626, + "logits/rejected": 1.2251650094985962, + "logps/chosen": -517.6271362304688, + "logps/rejected": -1077.4622802734375, + "loss": 0.0602, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -4.850308895111084, + "rewards/margins": 5.554625034332275, + "rewards/rejected": -10.404932975769043, "step": 3100 }, { "epoch": 0.9751494180559924, - "eval_logits/chosen": 2.146430015563965, - "eval_logits/rejected": 1.9468775987625122, - "eval_logps/chosen": -502.61627197265625, - "eval_logps/rejected": -1101.212158203125, - "eval_loss": 0.02869391068816185, - "eval_rewards/accuracies": 0.9276119470596313, - "eval_rewards/chosen": -4.698452949523926, - "eval_rewards/margins": 5.95468282699585, - "eval_rewards/rejected": -10.653135299682617, - "eval_runtime": 215.2746, - "eval_samples_per_second": 99.487, - "eval_steps_per_second": 1.556, + "eval_logits/chosen": 2.2623343467712402, + "eval_logits/rejected": 2.0536904335021973, + "eval_logps/chosen": -495.68597412109375, + "eval_logps/rejected": -1079.9603271484375, + "eval_loss": 0.029893433675169945, + "eval_rewards/accuracies": 0.9302238821983337, + "eval_rewards/chosen": -4.628891468048096, + "eval_rewards/margins": 5.811552047729492, + "eval_rewards/rejected": -10.440443992614746, + "eval_runtime": 216.5239, + "eval_samples_per_second": 98.913, + "eval_steps_per_second": 1.547, "step": 3100 }, { "epoch": 0.9782950613400441, - "grad_norm": 3.9912333488464355, + "grad_norm": 3.7997848987579346, "learning_rate": 7.172399188140611e-09, - "logits/chosen": 1.643225908279419, - "logits/rejected": 1.293428659439087, - "logps/chosen": -533.8253784179688, - "logps/rejected": -1149.5816650390625, - "loss": 0.0602, - "rewards/accuracies": 0.90625, - "rewards/chosen": -5.007025718688965, - "rewards/margins": 6.1268744468688965, - "rewards/rejected": -11.13390064239502, + "logits/chosen": 1.757672667503357, + "logits/rejected": 1.3846279382705688, + "logps/chosen": -524.0980834960938, + "logps/rejected": -1114.28271484375, + "loss": 0.0582, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -4.909502983093262, + "rewards/margins": 5.87128210067749, + "rewards/rejected": -10.78078556060791, "step": 3110 }, { "epoch": 0.9814407046240956, - "grad_norm": 1.301156997680664, + "grad_norm": 1.7551467418670654, "learning_rate": 5.244766477869034e-09, - "logits/chosen": 1.4164081811904907, - "logits/rejected": 1.1432323455810547, - "logps/chosen": -517.7205200195312, - "logps/rejected": -1175.121826171875, - "loss": 0.0563, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -4.851759910583496, - "rewards/margins": 6.547132968902588, - "rewards/rejected": -11.398893356323242, + "logits/chosen": 1.533881664276123, + "logits/rejected": 1.2340834140777588, + "logps/chosen": -509.0604553222656, + "logps/rejected": -1127.41943359375, + "loss": 0.0557, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -4.765114784240723, + "rewards/margins": 6.15649938583374, + "rewards/rejected": -10.921614646911621, "step": 3120 }, { "epoch": 0.9845863479081473, - "grad_norm": 3.761977195739746, + "grad_norm": 1.941530704498291, "learning_rate": 3.617940538978848e-09, - "logits/chosen": 1.3819186687469482, - "logits/rejected": 1.1908156871795654, - "logps/chosen": -517.9227294921875, - "logps/rejected": -1027.6142578125, - "loss": 0.0452, - "rewards/accuracies": 0.8812500238418579, - "rewards/chosen": -4.846681594848633, - "rewards/margins": 5.071451187133789, - "rewards/rejected": -9.918134689331055, + "logits/chosen": 1.4696052074432373, + "logits/rejected": 1.2560389041900635, + "logps/chosen": -507.57843017578125, + "logps/rejected": -1006.82470703125, + "loss": 0.0475, + "rewards/accuracies": 0.90625, + "rewards/chosen": -4.7429070472717285, + "rewards/margins": 4.967124938964844, + "rewards/rejected": -9.71003246307373, "step": 3130 }, { "epoch": 0.9877319911921988, - "grad_norm": 2.8583881855010986, + "grad_norm": 2.3873629570007324, "learning_rate": 2.2921175270890217e-09, - "logits/chosen": 1.5854167938232422, - "logits/rejected": 1.247761845588684, - "logps/chosen": -521.75, - "logps/rejected": -1083.730224609375, - "loss": 0.0674, - "rewards/accuracies": 0.90625, - "rewards/chosen": -4.886361122131348, - "rewards/margins": 5.578226089477539, - "rewards/rejected": -10.464587211608887, + "logits/chosen": 1.7028379440307617, + "logits/rejected": 1.3064117431640625, + "logps/chosen": -509.66363525390625, + "logps/rejected": -1081.079833984375, + "loss": 0.0633, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -4.765096664428711, + "rewards/margins": 5.672849655151367, + "rewards/rejected": -10.437946319580078, "step": 3140 }, { "epoch": 0.9908776344762504, - "grad_norm": 1.9092971086502075, + "grad_norm": 2.30230450630188, "learning_rate": 1.2674573041909776e-09, - "logits/chosen": 1.2872904539108276, - "logits/rejected": 1.259684443473816, - "logps/chosen": -520.8709106445312, - "logps/rejected": -1122.4615478515625, - "loss": 0.0591, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": -4.865385055541992, - "rewards/margins": 6.030797958374023, - "rewards/rejected": -10.896183967590332, + "logits/chosen": 1.3582854270935059, + "logits/rejected": 1.3528727293014526, + "logps/chosen": -506.40081787109375, + "logps/rejected": -1082.99560546875, + "loss": 0.0522, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -4.720505714416504, + "rewards/margins": 5.780804634094238, + "rewards/rejected": -10.501310348510742, "step": 3150 }, { "epoch": 0.994023277760302, - "grad_norm": 4.959075927734375, + "grad_norm": 3.560915231704712, "learning_rate": 5.440834193726208e-10, - "logits/chosen": 1.6272627115249634, - "logits/rejected": 1.4106671810150146, - "logps/chosen": -505.4927673339844, - "logps/rejected": -1114.03173828125, - "loss": 0.0573, - "rewards/accuracies": 0.925000011920929, - "rewards/chosen": -4.746209144592285, - "rewards/margins": 6.043259620666504, - "rewards/rejected": -10.789468765258789, + "logits/chosen": 1.7087326049804688, + "logits/rejected": 1.488204002380371, + "logps/chosen": -502.759033203125, + "logps/rejected": -1079.620849609375, + "loss": 0.0549, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": -4.718785285949707, + "rewards/margins": 5.726217269897461, + "rewards/rejected": -10.445002555847168, "step": 3160 }, { "epoch": 0.9971689210443536, - "grad_norm": 3.2813379764556885, + "grad_norm": 3.5719094276428223, "learning_rate": 1.2208309392081064e-10, - "logits/chosen": 1.2191925048828125, - "logits/rejected": 1.0304396152496338, - "logps/chosen": -555.1168212890625, - "logps/rejected": -1115.899169921875, - "loss": 0.0633, + "logits/chosen": 1.3343207836151123, + "logits/rejected": 1.0999490022659302, + "logps/chosen": -544.6361083984375, + "logps/rejected": -1093.305419921875, + "loss": 0.0604, "rewards/accuracies": 0.90625, - "rewards/chosen": -5.212588310241699, - "rewards/margins": 5.577348709106445, - "rewards/rejected": -10.789937973022461, + "rewards/chosen": -5.107422828674316, + "rewards/margins": 5.456589698791504, + "rewards/rejected": -10.56401252746582, "step": 3170 }, { "epoch": 1.0, "step": 3179, "total_flos": 0.0, - "train_loss": 0.19036180805619987, - "train_runtime": 15997.0818, - "train_samples_per_second": 25.436, - "train_steps_per_second": 0.199 + "train_loss": 0.18858218038370866, + "train_runtime": 16104.4918, + "train_samples_per_second": 25.267, + "train_steps_per_second": 0.197 } ], "logging_steps": 10,