{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 50, "global_step": 436, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.022935779816513763, "grad_norm": 4.5521214161319365, "learning_rate": 1.1363636363636363e-07, "logits/chosen": -2.658930778503418, "logits/rejected": -2.6127209663391113, "logps/chosen": -310.2911682128906, "logps/rejected": -241.6273651123047, "loss": 0.6931, "rewards/accuracies": 0.3812499940395355, "rewards/chosen": -0.0002677589072845876, "rewards/margins": -0.0003533684357535094, "rewards/rejected": 8.560952846892178e-05, "step": 10 }, { "epoch": 0.045871559633027525, "grad_norm": 5.31294051195239, "learning_rate": 2.2727272727272726e-07, "logits/chosen": -2.691143751144409, "logits/rejected": -2.6153030395507812, "logps/chosen": -293.57037353515625, "logps/rejected": -265.63885498046875, "loss": 0.6926, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": 0.00123660231474787, "rewards/margins": 0.00207112031057477, "rewards/rejected": -0.0008345181122422218, "step": 20 }, { "epoch": 0.06880733944954129, "grad_norm": 4.285586645859575, "learning_rate": 3.4090909090909085e-07, "logits/chosen": -2.6962637901306152, "logits/rejected": -2.628978729248047, "logps/chosen": -277.83209228515625, "logps/rejected": -297.1653137207031, "loss": 0.6901, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 0.004098366480320692, "rewards/margins": 0.00956634059548378, "rewards/rejected": -0.0054679736495018005, "step": 30 }, { "epoch": 0.09174311926605505, "grad_norm": 5.111796420680735, "learning_rate": 4.545454545454545e-07, "logits/chosen": -2.6144208908081055, "logits/rejected": -2.543553352355957, "logps/chosen": -284.1578674316406, "logps/rejected": -259.81475830078125, "loss": 0.6826, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.03460276871919632, "rewards/margins": 0.0436362698674202, "rewards/rejected": -0.009033503010869026, "step": 40 }, { "epoch": 0.11467889908256881, "grad_norm": 4.717770708541697, "learning_rate": 4.997110275491701e-07, "logits/chosen": -2.591963291168213, "logits/rejected": -2.5082640647888184, "logps/chosen": -285.80047607421875, "logps/rejected": -247.5881805419922, "loss": 0.674, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.011667141690850258, "rewards/margins": 0.055136788636446, "rewards/rejected": -0.06680391728878021, "step": 50 }, { "epoch": 0.11467889908256881, "eval_logits/chosen": -2.6023333072662354, "eval_logits/rejected": -2.5038623809814453, "eval_logps/chosen": -288.564208984375, "eval_logps/rejected": -260.0650329589844, "eval_loss": 0.6643866300582886, "eval_rewards/accuracies": 0.6724137663841248, "eval_rewards/chosen": -0.04558267444372177, "eval_rewards/margins": 0.09859247505664825, "eval_rewards/rejected": -0.14417517185211182, "eval_runtime": 91.2705, "eval_samples_per_second": 19.919, "eval_steps_per_second": 0.318, "step": 50 }, { "epoch": 0.13761467889908258, "grad_norm": 6.057042762669634, "learning_rate": 4.979475034558115e-07, "logits/chosen": -2.584840774536133, "logits/rejected": -2.5118155479431152, "logps/chosen": -291.8733215332031, "logps/rejected": -281.57476806640625, "loss": 0.6519, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.11285074800252914, "rewards/margins": 0.18535511195659637, "rewards/rejected": -0.2982058525085449, "step": 60 }, { "epoch": 0.16055045871559634, "grad_norm": 12.297420969455343, "learning_rate": 4.945923025551788e-07, "logits/chosen": -2.4776272773742676, "logits/rejected": -2.4097015857696533, "logps/chosen": -299.2547912597656, "logps/rejected": -271.7998046875, "loss": 0.6509, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.30904772877693176, "rewards/margins": 0.21535193920135498, "rewards/rejected": -0.5243996381759644, "step": 70 }, { "epoch": 0.1834862385321101, "grad_norm": 8.87671354702832, "learning_rate": 4.896669632591651e-07, "logits/chosen": -2.565410852432251, "logits/rejected": -2.4615464210510254, "logps/chosen": -310.5340270996094, "logps/rejected": -324.5120544433594, "loss": 0.637, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.40506935119628906, "rewards/margins": 0.2631128430366516, "rewards/rejected": -0.6681821942329407, "step": 80 }, { "epoch": 0.20642201834862386, "grad_norm": 13.125498153678837, "learning_rate": 4.832031033425662e-07, "logits/chosen": -1.73556649684906, "logits/rejected": -1.575148105621338, "logps/chosen": -354.5548400878906, "logps/rejected": -374.177978515625, "loss": 0.6117, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.5037232041358948, "rewards/margins": 0.48656755685806274, "rewards/rejected": -0.9902908205986023, "step": 90 }, { "epoch": 0.22935779816513763, "grad_norm": 13.953968120873695, "learning_rate": 4.752422169756047e-07, "logits/chosen": -0.5573834180831909, "logits/rejected": -0.19994211196899414, "logps/chosen": -354.9537048339844, "logps/rejected": -379.1808166503906, "loss": 0.5874, "rewards/accuracies": 0.6875, "rewards/chosen": -0.7568774819374084, "rewards/margins": 0.5189547538757324, "rewards/rejected": -1.2758322954177856, "step": 100 }, { "epoch": 0.22935779816513763, "eval_logits/chosen": -0.22263701260089874, "eval_logits/rejected": 0.3007841110229492, "eval_logps/chosen": -382.2076416015625, "eval_logps/rejected": -402.1482238769531, "eval_loss": 0.5920498371124268, "eval_rewards/accuracies": 0.681034505367279, "eval_rewards/chosen": -0.9820166230201721, "eval_rewards/margins": 0.5829907059669495, "eval_rewards/rejected": -1.5650073289871216, "eval_runtime": 91.5375, "eval_samples_per_second": 19.861, "eval_steps_per_second": 0.317, "step": 100 }, { "epoch": 0.25229357798165136, "grad_norm": 23.915856678856265, "learning_rate": 4.658354083558188e-07, "logits/chosen": -0.40947216749191284, "logits/rejected": 0.038003671914339066, "logps/chosen": -379.87451171875, "logps/rejected": -438.7657165527344, "loss": 0.5763, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.826363742351532, "rewards/margins": 0.7436057329177856, "rewards/rejected": -1.5699694156646729, "step": 110 }, { "epoch": 0.27522935779816515, "grad_norm": 14.304332284980681, "learning_rate": 4.550430636492389e-07, "logits/chosen": -0.18220779299736023, "logits/rejected": 0.7561357021331787, "logps/chosen": -424.419677734375, "logps/rejected": -439.09320068359375, "loss": 0.5937, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.1400790214538574, "rewards/margins": 0.6504500508308411, "rewards/rejected": -1.7905290126800537, "step": 120 }, { "epoch": 0.2981651376146789, "grad_norm": 15.490141656078896, "learning_rate": 4.429344633468004e-07, "logits/chosen": 0.37701982259750366, "logits/rejected": 1.372661828994751, "logps/chosen": -391.7457580566406, "logps/rejected": -451.080810546875, "loss": 0.5829, "rewards/accuracies": 0.71875, "rewards/chosen": -1.0409055948257446, "rewards/margins": 0.9019233584403992, "rewards/rejected": -1.942828893661499, "step": 130 }, { "epoch": 0.3211009174311927, "grad_norm": 15.271366347742205, "learning_rate": 4.2958733752443187e-07, "logits/chosen": -0.13376641273498535, "logits/rejected": 1.0010212659835815, "logps/chosen": -383.65301513671875, "logps/rejected": -419.6103515625, "loss": 0.5623, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.9957970380783081, "rewards/margins": 0.7610489130020142, "rewards/rejected": -1.7568460702896118, "step": 140 }, { "epoch": 0.3440366972477064, "grad_norm": 22.33400136982847, "learning_rate": 4.150873668617898e-07, "logits/chosen": 0.09550192952156067, "logits/rejected": 1.2170594930648804, "logps/chosen": -387.51849365234375, "logps/rejected": -434.84136962890625, "loss": 0.5612, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.9680271148681641, "rewards/margins": 0.772575318813324, "rewards/rejected": -1.7406024932861328, "step": 150 }, { "epoch": 0.3440366972477064, "eval_logits/chosen": 1.4310089349746704, "eval_logits/rejected": 2.314030885696411, "eval_logps/chosen": -430.7732238769531, "eval_logps/rejected": -482.2997741699219, "eval_loss": 0.5694876909255981, "eval_rewards/accuracies": 0.6896551847457886, "eval_rewards/chosen": -1.4676721096038818, "eval_rewards/margins": 0.8988507390022278, "eval_rewards/rejected": -2.366523027420044, "eval_runtime": 91.6244, "eval_samples_per_second": 19.842, "eval_steps_per_second": 0.317, "step": 150 }, { "epoch": 0.3669724770642202, "grad_norm": 23.657529687976723, "learning_rate": 3.9952763262280397e-07, "logits/chosen": 1.280305027961731, "logits/rejected": 2.0924124717712402, "logps/chosen": -457.1259765625, "logps/rejected": -502.7013244628906, "loss": 0.5762, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.6072089672088623, "rewards/margins": 0.868707001209259, "rewards/rejected": -2.4759163856506348, "step": 160 }, { "epoch": 0.38990825688073394, "grad_norm": 20.186960313509775, "learning_rate": 3.8300801912883414e-07, "logits/chosen": -0.41927653551101685, "logits/rejected": 0.5089612007141113, "logps/chosen": -360.0221252441406, "logps/rejected": -393.9572448730469, "loss": 0.5498, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.9409689903259277, "rewards/margins": 0.7842710614204407, "rewards/rejected": -1.7252399921417236, "step": 170 }, { "epoch": 0.41284403669724773, "grad_norm": 19.134949732610515, "learning_rate": 3.6563457256020884e-07, "logits/chosen": -0.13538230955600739, "logits/rejected": 0.8745505213737488, "logps/chosen": -365.5416259765625, "logps/rejected": -449.64080810546875, "loss": 0.5571, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.0540037155151367, "rewards/margins": 1.0069358348846436, "rewards/rejected": -2.060939311981201, "step": 180 }, { "epoch": 0.43577981651376146, "grad_norm": 18.989463695568457, "learning_rate": 3.475188202022617e-07, "logits/chosen": 0.555556058883667, "logits/rejected": 1.7799046039581299, "logps/chosen": -379.05133056640625, "logps/rejected": -495.5801696777344, "loss": 0.5566, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.2038367986679077, "rewards/margins": 1.1251088380813599, "rewards/rejected": -2.3289456367492676, "step": 190 }, { "epoch": 0.45871559633027525, "grad_norm": 20.721540306309656, "learning_rate": 3.287770545059052e-07, "logits/chosen": 0.1556408703327179, "logits/rejected": 1.0401029586791992, "logps/chosen": -403.26068115234375, "logps/rejected": -452.82684326171875, "loss": 0.5427, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.3739937543869019, "rewards/margins": 0.7915691137313843, "rewards/rejected": -2.1655631065368652, "step": 200 }, { "epoch": 0.45871559633027525, "eval_logits/chosen": -0.3630250096321106, "eval_logits/rejected": 0.9222813248634338, "eval_logps/chosen": -418.6947021484375, "eval_logps/rejected": -471.8922119140625, "eval_loss": 0.5523015260696411, "eval_rewards/accuracies": 0.7241379022598267, "eval_rewards/chosen": -1.346887469291687, "eval_rewards/margins": 0.9155599474906921, "eval_rewards/rejected": -2.2624475955963135, "eval_runtime": 92.3698, "eval_samples_per_second": 19.682, "eval_steps_per_second": 0.314, "step": 200 }, { "epoch": 0.481651376146789, "grad_norm": 17.428734986299386, "learning_rate": 3.0952958655864954e-07, "logits/chosen": 0.16718950867652893, "logits/rejected": 1.1019595861434937, "logps/chosen": -430.6678771972656, "logps/rejected": -517.311767578125, "loss": 0.5458, "rewards/accuracies": 0.71875, "rewards/chosen": -1.5387192964553833, "rewards/margins": 0.787868857383728, "rewards/rejected": -2.3265881538391113, "step": 210 }, { "epoch": 0.5045871559633027, "grad_norm": 18.835466832309084, "learning_rate": 2.898999737583448e-07, "logits/chosen": -0.19043000042438507, "logits/rejected": 1.0126752853393555, "logps/chosen": -419.5379943847656, "logps/rejected": -484.6988830566406, "loss": 0.5528, "rewards/accuracies": 0.6875, "rewards/chosen": -1.511856198310852, "rewards/margins": 0.7948068976402283, "rewards/rejected": -2.3066630363464355, "step": 220 }, { "epoch": 0.5275229357798165, "grad_norm": 23.197020188064432, "learning_rate": 2.7001422664752333e-07, "logits/chosen": -0.30360710620880127, "logits/rejected": 1.0719916820526123, "logps/chosen": -386.96087646484375, "logps/rejected": -477.74029541015625, "loss": 0.5482, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.1279244422912598, "rewards/margins": 1.1136093139648438, "rewards/rejected": -2.2415337562561035, "step": 230 }, { "epoch": 0.5504587155963303, "grad_norm": 23.485163773762974, "learning_rate": 2.5e-07, "logits/chosen": 0.9384336471557617, "logits/rejected": 1.5441758632659912, "logps/chosen": -413.4400329589844, "logps/rejected": -505.8965759277344, "loss": 0.5778, "rewards/accuracies": 0.6875, "rewards/chosen": -1.4366745948791504, "rewards/margins": 0.9647194147109985, "rewards/rejected": -2.4013938903808594, "step": 240 }, { "epoch": 0.573394495412844, "grad_norm": 18.02833446289089, "learning_rate": 2.2998577335247667e-07, "logits/chosen": 0.6526413559913635, "logits/rejected": 1.6761877536773682, "logps/chosen": -408.7395324707031, "logps/rejected": -484.34417724609375, "loss": 0.5474, "rewards/accuracies": 0.6875, "rewards/chosen": -1.438388705253601, "rewards/margins": 0.9599205851554871, "rewards/rejected": -2.3983092308044434, "step": 250 }, { "epoch": 0.573394495412844, "eval_logits/chosen": -0.44025513529777527, "eval_logits/rejected": 0.9070881605148315, "eval_logps/chosen": -393.58612060546875, "eval_logps/rejected": -449.3501281738281, "eval_loss": 0.5429869890213013, "eval_rewards/accuracies": 0.6896551847457886, "eval_rewards/chosen": -1.0958014726638794, "eval_rewards/margins": 0.9412251114845276, "eval_rewards/rejected": -2.0370266437530518, "eval_runtime": 91.0397, "eval_samples_per_second": 19.969, "eval_steps_per_second": 0.319, "step": 250 }, { "epoch": 0.5963302752293578, "grad_norm": 21.769039546192015, "learning_rate": 2.1010002624165524e-07, "logits/chosen": 0.1161709800362587, "logits/rejected": 1.337697982788086, "logps/chosen": -419.63995361328125, "logps/rejected": -492.03363037109375, "loss": 0.5494, "rewards/accuracies": 0.8125, "rewards/chosen": -1.1713824272155762, "rewards/margins": 1.1571643352508545, "rewards/rejected": -2.3285470008850098, "step": 260 }, { "epoch": 0.6192660550458715, "grad_norm": 19.115539205351443, "learning_rate": 1.9047041344135043e-07, "logits/chosen": 1.2381702661514282, "logits/rejected": 2.2026009559631348, "logps/chosen": -462.31072998046875, "logps/rejected": -520.6209716796875, "loss": 0.5714, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.8605678081512451, "rewards/margins": 0.8712261319160461, "rewards/rejected": -2.7317938804626465, "step": 270 }, { "epoch": 0.6422018348623854, "grad_norm": 20.348758953970556, "learning_rate": 1.7122294549409482e-07, "logits/chosen": 1.1374260187149048, "logits/rejected": 2.329713821411133, "logps/chosen": -465.6629333496094, "logps/rejected": -563.9349365234375, "loss": 0.543, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.8833774328231812, "rewards/margins": 1.0653350353240967, "rewards/rejected": -2.948712110519409, "step": 280 }, { "epoch": 0.6651376146788991, "grad_norm": 18.788678367954812, "learning_rate": 1.524811797977383e-07, "logits/chosen": -0.17822352051734924, "logits/rejected": 0.844377338886261, "logps/chosen": -437.96435546875, "logps/rejected": -507.1295471191406, "loss": 0.542, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.4787204265594482, "rewards/margins": 0.8510695695877075, "rewards/rejected": -2.329789876937866, "step": 290 }, { "epoch": 0.6880733944954128, "grad_norm": 22.907322461677648, "learning_rate": 1.3436542743979125e-07, "logits/chosen": 0.011055344715714455, "logits/rejected": 1.3963500261306763, "logps/chosen": -432.23321533203125, "logps/rejected": -509.43939208984375, "loss": 0.5556, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.4701194763183594, "rewards/margins": 1.124694585800171, "rewards/rejected": -2.5948143005371094, "step": 300 }, { "epoch": 0.6880733944954128, "eval_logits/chosen": -0.1993160843849182, "eval_logits/rejected": 1.1949646472930908, "eval_logps/chosen": -423.5918884277344, "eval_logps/rejected": -484.2666320800781, "eval_loss": 0.5403606295585632, "eval_rewards/accuracies": 0.7198275923728943, "eval_rewards/chosen": -1.3958592414855957, "eval_rewards/margins": 0.9903314709663391, "eval_rewards/rejected": -2.38619065284729, "eval_runtime": 90.6243, "eval_samples_per_second": 20.061, "eval_steps_per_second": 0.32, "step": 300 }, { "epoch": 0.7110091743119266, "grad_norm": 27.111269521337316, "learning_rate": 1.1699198087116588e-07, "logits/chosen": 0.3549458086490631, "logits/rejected": 1.506676435470581, "logps/chosen": -407.77020263671875, "logps/rejected": -494.737548828125, "loss": 0.5603, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.536117434501648, "rewards/margins": 0.9887636303901672, "rewards/rejected": -2.524880886077881, "step": 310 }, { "epoch": 0.7339449541284404, "grad_norm": 20.93281634880908, "learning_rate": 1.00472367377196e-07, "logits/chosen": -0.10006646066904068, "logits/rejected": 1.6550134420394897, "logps/chosen": -442.34283447265625, "logps/rejected": -508.07061767578125, "loss": 0.5395, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.3644827604293823, "rewards/margins": 1.2163000106811523, "rewards/rejected": -2.580782651901245, "step": 320 }, { "epoch": 0.7568807339449541, "grad_norm": 22.09617953181825, "learning_rate": 8.49126331382102e-08, "logits/chosen": 0.12130077183246613, "logits/rejected": 1.2904590368270874, "logps/chosen": -437.9993591308594, "logps/rejected": -525.1572875976562, "loss": 0.5394, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.592714548110962, "rewards/margins": 0.9397724270820618, "rewards/rejected": -2.5324866771698, "step": 330 }, { "epoch": 0.7798165137614679, "grad_norm": 21.35703312189058, "learning_rate": 7.041266247556812e-08, "logits/chosen": 0.1835191398859024, "logits/rejected": 1.8147304058074951, "logps/chosen": -403.4627380371094, "logps/rejected": -524.2403564453125, "loss": 0.5415, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.4125099182128906, "rewards/margins": 1.2409336566925049, "rewards/rejected": -2.6534433364868164, "step": 340 }, { "epoch": 0.8027522935779816, "grad_norm": 21.04805960149964, "learning_rate": 5.706553665319955e-08, "logits/chosen": 0.23725076019763947, "logits/rejected": 2.2347147464752197, "logps/chosen": -436.37939453125, "logps/rejected": -537.32373046875, "loss": 0.5373, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.4519966840744019, "rewards/margins": 1.4744818210601807, "rewards/rejected": -2.926478624343872, "step": 350 }, { "epoch": 0.8027522935779816, "eval_logits/chosen": 0.23955857753753662, "eval_logits/rejected": 1.7159334421157837, "eval_logps/chosen": -439.8386535644531, "eval_logps/rejected": -505.62298583984375, "eval_loss": 0.5416247844696045, "eval_rewards/accuracies": 0.7284482717514038, "eval_rewards/chosen": -1.5583269596099854, "eval_rewards/margins": 1.041427493095398, "eval_rewards/rejected": -2.599754810333252, "eval_runtime": 92.2289, "eval_samples_per_second": 19.712, "eval_steps_per_second": 0.314, "step": 350 }, { "epoch": 0.8256880733944955, "grad_norm": 28.15381228175136, "learning_rate": 4.4956936350761005e-08, "logits/chosen": 0.37452951073646545, "logits/rejected": 1.507530689239502, "logps/chosen": -437.071044921875, "logps/rejected": -533.8405151367188, "loss": 0.535, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.5732779502868652, "rewards/margins": 1.106673002243042, "rewards/rejected": -2.679950714111328, "step": 360 }, { "epoch": 0.8486238532110092, "grad_norm": 17.904935938367142, "learning_rate": 3.416459164418123e-08, "logits/chosen": -0.39112910628318787, "logits/rejected": 1.349570631980896, "logps/chosen": -471.1695861816406, "logps/rejected": -532.53271484375, "loss": 0.5285, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.3933590650558472, "rewards/margins": 1.198838472366333, "rewards/rejected": -2.5921976566314697, "step": 370 }, { "epoch": 0.8715596330275229, "grad_norm": 21.395515100278608, "learning_rate": 2.475778302439524e-08, "logits/chosen": -0.16580775380134583, "logits/rejected": 1.5711965560913086, "logps/chosen": -442.42633056640625, "logps/rejected": -471.59808349609375, "loss": 0.5345, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.4915008544921875, "rewards/margins": 0.9731476902961731, "rewards/rejected": -2.464648723602295, "step": 380 }, { "epoch": 0.8944954128440367, "grad_norm": 18.765474571368344, "learning_rate": 1.6796896657433805e-08, "logits/chosen": -0.5060194730758667, "logits/rejected": 1.3254286050796509, "logps/chosen": -439.9669494628906, "logps/rejected": -542.6121826171875, "loss": 0.5238, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.3199481964111328, "rewards/margins": 1.575811743736267, "rewards/rejected": -2.8957600593566895, "step": 390 }, { "epoch": 0.9174311926605505, "grad_norm": 22.250966254695054, "learning_rate": 1.0333036740834855e-08, "logits/chosen": 0.07749566435813904, "logits/rejected": 1.2516423463821411, "logps/chosen": -449.790283203125, "logps/rejected": -540.7836303710938, "loss": 0.5405, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.5793349742889404, "rewards/margins": 0.9962536692619324, "rewards/rejected": -2.5755887031555176, "step": 400 }, { "epoch": 0.9174311926605505, "eval_logits/chosen": -0.2526824176311493, "eval_logits/rejected": 1.2749311923980713, "eval_logps/chosen": -423.50390625, "eval_logps/rejected": -490.2825927734375, "eval_loss": 0.5382627844810486, "eval_rewards/accuracies": 0.7241379022598267, "eval_rewards/chosen": -1.3949792385101318, "eval_rewards/margins": 1.0513713359832764, "eval_rewards/rejected": -2.446350574493408, "eval_runtime": 90.812, "eval_samples_per_second": 20.019, "eval_steps_per_second": 0.319, "step": 400 }, { "epoch": 0.9403669724770642, "grad_norm": 20.214960371040412, "learning_rate": 5.4076974448211685e-09, "logits/chosen": 0.10587388277053833, "logits/rejected": 1.2455976009368896, "logps/chosen": -443.05621337890625, "logps/rejected": -499.88531494140625, "loss": 0.551, "rewards/accuracies": 0.71875, "rewards/chosen": -1.5565433502197266, "rewards/margins": 0.8829916715621948, "rewards/rejected": -2.4395346641540527, "step": 410 }, { "epoch": 0.963302752293578, "grad_norm": 24.19047240295495, "learning_rate": 2.052496544188487e-09, "logits/chosen": -0.0691867247223854, "logits/rejected": 1.6679766178131104, "logps/chosen": -455.8050231933594, "logps/rejected": -492.05401611328125, "loss": 0.5468, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.5099737644195557, "rewards/margins": 1.0747706890106201, "rewards/rejected": -2.584744691848755, "step": 420 }, { "epoch": 0.9862385321100917, "grad_norm": 16.600958628723166, "learning_rate": 2.889724508297886e-10, "logits/chosen": 0.0887439101934433, "logits/rejected": 1.3473814725875854, "logps/chosen": -406.8893737792969, "logps/rejected": -502.24786376953125, "loss": 0.5367, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.4497406482696533, "rewards/margins": 1.044012188911438, "rewards/rejected": -2.4937527179718018, "step": 430 }, { "epoch": 1.0, "step": 436, "total_flos": 0.0, "train_loss": 0.5759701597581216, "train_runtime": 11541.3114, "train_samples_per_second": 4.831, "train_steps_per_second": 0.038 } ], "logging_steps": 10, "max_steps": 436, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }