{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9984, "eval_steps": 500, "global_step": 312, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 1.5625e-07, "logits/chosen": 0.37485986948013306, "logits/rejected": 0.6487500071525574, "logps/chosen": -1078.384765625, "logps/rejected": -1101.77490234375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.03, "learning_rate": 1.5625e-06, "logits/chosen": 0.47351381182670593, "logits/rejected": 0.5273572206497192, "logps/chosen": -1056.42822265625, "logps/rejected": -1169.265625, "loss": 0.6932, "rewards/accuracies": 0.3958333432674408, "rewards/chosen": -0.00091694132424891, "rewards/margins": -0.00018613642896525562, "rewards/rejected": -0.0007308049243874848, "step": 10 }, { "epoch": 0.06, "learning_rate": 3.125e-06, "logits/chosen": 0.42252635955810547, "logits/rejected": 0.49473732709884644, "logps/chosen": -1147.17236328125, "logps/rejected": -1265.768798828125, "loss": 0.6914, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -0.022750383242964745, "rewards/margins": 0.0030057504773139954, "rewards/rejected": -0.02575613185763359, "step": 20 }, { "epoch": 0.1, "learning_rate": 4.6875000000000004e-06, "logits/chosen": 0.5097017884254456, "logits/rejected": 0.5685985088348389, "logps/chosen": -1142.3890380859375, "logps/rejected": -1274.76171875, "loss": 0.6878, "rewards/accuracies": 0.5, "rewards/chosen": -0.09474565088748932, "rewards/margins": 0.013504189439117908, "rewards/rejected": -0.10824984312057495, "step": 30 }, { "epoch": 0.13, "learning_rate": 4.989935734988098e-06, "logits/chosen": 0.6262896656990051, "logits/rejected": 0.5369861125946045, "logps/chosen": -1379.107177734375, "logps/rejected": -1572.9727783203125, "loss": 0.679, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.25016888976097107, "rewards/margins": 0.05141867324709892, "rewards/rejected": -0.3015875816345215, "step": 40 }, { "epoch": 0.16, "learning_rate": 4.949188496058089e-06, "logits/chosen": 0.595242977142334, "logits/rejected": 0.6554633378982544, "logps/chosen": -1302.7606201171875, "logps/rejected": -1508.60107421875, "loss": 0.6637, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.35188037157058716, "rewards/margins": 0.07706869393587112, "rewards/rejected": -0.4289490282535553, "step": 50 }, { "epoch": 0.19, "learning_rate": 4.8776412907378845e-06, "logits/chosen": 0.6328147053718567, "logits/rejected": 0.695043683052063, "logps/chosen": -1562.052978515625, "logps/rejected": -1646.878662109375, "loss": 0.6789, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.4670296609401703, "rewards/margins": 0.025544878095388412, "rewards/rejected": -0.492574542760849, "step": 60 }, { "epoch": 0.22, "learning_rate": 4.7761938666470405e-06, "logits/chosen": 0.5762341618537903, "logits/rejected": 0.6952670812606812, "logps/chosen": -1321.309814453125, "logps/rejected": -1596.348876953125, "loss": 0.6667, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.27966535091400146, "rewards/margins": 0.1273583322763443, "rewards/rejected": -0.40702366828918457, "step": 70 }, { "epoch": 0.26, "learning_rate": 4.646121984004666e-06, "logits/chosen": 0.5692261457443237, "logits/rejected": 0.8441296815872192, "logps/chosen": -1434.124755859375, "logps/rejected": -1714.9644775390625, "loss": 0.6653, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.38832995295524597, "rewards/margins": 0.12836144864559174, "rewards/rejected": -0.5166913866996765, "step": 80 }, { "epoch": 0.29, "learning_rate": 4.4890613722044526e-06, "logits/chosen": 0.5933000445365906, "logits/rejected": 0.7363389730453491, "logps/chosen": -1363.330810546875, "logps/rejected": -1636.5830078125, "loss": 0.6614, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.36261868476867676, "rewards/margins": 0.11317511647939682, "rewards/rejected": -0.47579383850097656, "step": 90 }, { "epoch": 0.32, "learning_rate": 4.3069871595684795e-06, "logits/chosen": 0.6000683903694153, "logits/rejected": 0.7250877618789673, "logps/chosen": -1501.1116943359375, "logps/rejected": -1688.3638916015625, "loss": 0.6729, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.42007994651794434, "rewards/margins": 0.07729745656251907, "rewards/rejected": -0.49737733602523804, "step": 100 }, { "epoch": 0.35, "learning_rate": 4.102189034962561e-06, "logits/chosen": 0.6664993166923523, "logits/rejected": 0.8522599935531616, "logps/chosen": -1442.220703125, "logps/rejected": -1732.263671875, "loss": 0.6579, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.3821481168270111, "rewards/margins": 0.13758106529712677, "rewards/rejected": -0.5197292566299438, "step": 110 }, { "epoch": 0.38, "learning_rate": 3.8772424536302565e-06, "logits/chosen": 0.7149074077606201, "logits/rejected": 0.7848154902458191, "logps/chosen": -1421.830322265625, "logps/rejected": -1694.971923828125, "loss": 0.6525, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.38273271918296814, "rewards/margins": 0.12498722970485687, "rewards/rejected": -0.5077199935913086, "step": 120 }, { "epoch": 0.42, "learning_rate": 3.634976249348867e-06, "logits/chosen": 0.709136962890625, "logits/rejected": 0.7118825912475586, "logps/chosen": -1674.842041015625, "logps/rejected": -1917.5904541015625, "loss": 0.6612, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.5186460614204407, "rewards/margins": 0.10890078544616699, "rewards/rejected": -0.6275469064712524, "step": 130 }, { "epoch": 0.45, "learning_rate": 3.3784370602033572e-06, "logits/chosen": 0.6097074747085571, "logits/rejected": 0.8623794317245483, "logps/chosen": -1559.768798828125, "logps/rejected": -1758.3902587890625, "loss": 0.6673, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.446970134973526, "rewards/margins": 0.08759806305170059, "rewards/rejected": -0.534568190574646, "step": 140 }, { "epoch": 0.48, "learning_rate": 3.1108510153447352e-06, "logits/chosen": 0.567806601524353, "logits/rejected": 0.8320453763008118, "logps/chosen": -1542.0999755859375, "logps/rejected": -1677.0787353515625, "loss": 0.6708, "rewards/accuracies": 0.46875, "rewards/chosen": -0.41464248299598694, "rewards/margins": 0.05952323600649834, "rewards/rejected": -0.47416573762893677, "step": 150 }, { "epoch": 0.51, "learning_rate": 2.835583164544139e-06, "logits/chosen": 0.7906177639961243, "logits/rejected": 0.7841562628746033, "logps/chosen": -1429.5552978515625, "logps/rejected": -1619.2171630859375, "loss": 0.6578, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.3786751627922058, "rewards/margins": 0.09149602800607681, "rewards/rejected": -0.47017115354537964, "step": 160 }, { "epoch": 0.54, "learning_rate": 2.556095160739513e-06, "logits/chosen": 0.7081605195999146, "logits/rejected": 0.6871576905250549, "logps/chosen": -1445.623779296875, "logps/rejected": -1650.127197265625, "loss": 0.6577, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.41454702615737915, "rewards/margins": 0.10776009410619736, "rewards/rejected": -0.5223071575164795, "step": 170 }, { "epoch": 0.58, "learning_rate": 2.2759017277414165e-06, "logits/chosen": 0.7904581427574158, "logits/rejected": 0.7038453817367554, "logps/chosen": -1671.363525390625, "logps/rejected": -1868.4300537109375, "loss": 0.6579, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.5421901941299438, "rewards/margins": 0.10026909410953522, "rewards/rejected": -0.6424592733383179, "step": 180 }, { "epoch": 0.61, "learning_rate": 1.9985264605418185e-06, "logits/chosen": 0.6540366411209106, "logits/rejected": 0.7285584807395935, "logps/chosen": -1518.58935546875, "logps/rejected": -1726.029052734375, "loss": 0.652, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.4512515962123871, "rewards/margins": 0.09267839789390564, "rewards/rejected": -0.5439299941062927, "step": 190 }, { "epoch": 0.64, "learning_rate": 1.7274575140626318e-06, "logits/chosen": 0.879226565361023, "logits/rejected": 0.8555147051811218, "logps/chosen": -1627.2894287109375, "logps/rejected": -1928.3841552734375, "loss": 0.6602, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.5301727056503296, "rewards/margins": 0.161948561668396, "rewards/rejected": -0.6921212673187256, "step": 200 }, { "epoch": 0.67, "learning_rate": 1.466103737583699e-06, "logits/chosen": 0.6861797571182251, "logits/rejected": 0.9016023874282837, "logps/chosen": -1539.524658203125, "logps/rejected": -1874.627685546875, "loss": 0.659, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.4866175651550293, "rewards/margins": 0.17361479997634888, "rewards/rejected": -0.6602323651313782, "step": 210 }, { "epoch": 0.7, "learning_rate": 1.217751806485235e-06, "logits/chosen": 0.7002454996109009, "logits/rejected": 0.8032233119010925, "logps/chosen": -1692.8636474609375, "logps/rejected": -1974.6669921875, "loss": 0.6663, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.5267156958580017, "rewards/margins": 0.14205826818943024, "rewards/rejected": -0.6687740087509155, "step": 220 }, { "epoch": 0.74, "learning_rate": 9.855248903979505e-07, "logits/chosen": 0.7965744733810425, "logits/rejected": 0.7885487079620361, "logps/chosen": -1604.53466796875, "logps/rejected": -1794.8245849609375, "loss": 0.6672, "rewards/accuracies": 0.5625, "rewards/chosen": -0.48480549454689026, "rewards/margins": 0.0899248868227005, "rewards/rejected": -0.574730396270752, "step": 230 }, { "epoch": 0.77, "learning_rate": 7.723433775328385e-07, "logits/chosen": 0.7397829294204712, "logits/rejected": 0.8248909711837769, "logps/chosen": -1530.4615478515625, "logps/rejected": -1826.6988525390625, "loss": 0.654, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.41139334440231323, "rewards/margins": 0.1389993578195572, "rewards/rejected": -0.550392746925354, "step": 240 }, { "epoch": 0.8, "learning_rate": 5.808881491049723e-07, "logits/chosen": 0.6129003763198853, "logits/rejected": 0.8366864919662476, "logps/chosen": -1451.37890625, "logps/rejected": -1729.635986328125, "loss": 0.6728, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.40951260924339294, "rewards/margins": 0.1274668127298355, "rewards/rejected": -0.5369793772697449, "step": 250 }, { "epoch": 0.83, "learning_rate": 4.1356686569674344e-07, "logits/chosen": 0.7178203463554382, "logits/rejected": 0.8371836543083191, "logps/chosen": -1524.3497314453125, "logps/rejected": -1778.346923828125, "loss": 0.6495, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.4218766689300537, "rewards/margins": 0.12465916574001312, "rewards/rejected": -0.546535849571228, "step": 260 }, { "epoch": 0.86, "learning_rate": 2.7248368952908055e-07, "logits/chosen": 0.6586390733718872, "logits/rejected": 0.8500372767448425, "logps/chosen": -1523.057373046875, "logps/rejected": -1753.3544921875, "loss": 0.6641, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.43091854453086853, "rewards/margins": 0.11140650510787964, "rewards/rejected": -0.5423250198364258, "step": 270 }, { "epoch": 0.9, "learning_rate": 1.59412823400657e-07, "logits/chosen": 0.7076243162155151, "logits/rejected": 0.7846710681915283, "logps/chosen": -1433.0615234375, "logps/rejected": -1673.491455078125, "loss": 0.6713, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -0.39409512281417847, "rewards/margins": 0.11671394109725952, "rewards/rejected": -0.510809063911438, "step": 280 }, { "epoch": 0.93, "learning_rate": 7.577619905828281e-08, "logits/chosen": 0.6640155911445618, "logits/rejected": 0.7537108659744263, "logps/chosen": -1413.724853515625, "logps/rejected": -1650.0406494140625, "loss": 0.6651, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.38546374440193176, "rewards/margins": 0.11493394523859024, "rewards/rejected": -0.5003976821899414, "step": 290 }, { "epoch": 0.96, "learning_rate": 2.262559558016325e-08, "logits/chosen": 0.635712742805481, "logits/rejected": 0.8228713274002075, "logps/chosen": -1463.5419921875, "logps/rejected": -1637.050048828125, "loss": 0.6578, "rewards/accuracies": 0.59375, "rewards/chosen": -0.41111892461776733, "rewards/margins": 0.08340780436992645, "rewards/rejected": -0.4945267140865326, "step": 300 }, { "epoch": 0.99, "learning_rate": 6.294126437336734e-10, "logits/chosen": 0.6655277013778687, "logits/rejected": 0.759280800819397, "logps/chosen": -1619.28857421875, "logps/rejected": -1727.4703369140625, "loss": 0.6663, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.4628829061985016, "rewards/margins": 0.05492577701807022, "rewards/rejected": -0.517808735370636, "step": 310 }, { "epoch": 1.0, "step": 312, "total_flos": 0.0, "train_loss": 0.666122229435505, "train_runtime": 4198.8516, "train_samples_per_second": 4.763, "train_steps_per_second": 0.074 } ], "logging_steps": 10, "max_steps": 312, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }