{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9807355516637478, "eval_steps": 18, "global_step": 35, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.028021015761821366, "grad_norm": 97.3614384660342, "learning_rate": 8e-08, "logits/chosen": -7.731139183044434, "logits/rejected": -8.25313663482666, "logps/chosen": -1.6605920791625977, "logps/rejected": -1.683160424232483, "loss": 6.8307, "rewards/accuracies": 0.5, "rewards/chosen": -16.60592269897461, "rewards/margins": 0.22568130493164062, "rewards/rejected": -16.83160400390625, "sft_loss": 0.04547927528619766, "step": 1 }, { "epoch": 0.05604203152364273, "grad_norm": 121.91139131956491, "learning_rate": 2.6e-07, "logits/chosen": -7.893815040588379, "logits/rejected": -8.019620895385742, "logps/chosen": -1.952430248260498, "logps/rejected": -1.8113142251968384, "loss": 7.3627, "rewards/accuracies": 0.5, "rewards/chosen": -19.52429962158203, "rewards/margins": -1.4111591577529907, "rewards/rejected": -18.113142013549805, "sft_loss": 0.00891563668847084, "step": 2 }, { "epoch": 0.0840630472854641, "grad_norm": 115.99875317435591, "learning_rate": 4.4e-07, "logits/chosen": -9.035008430480957, "logits/rejected": -8.950678825378418, "logps/chosen": -1.4869132041931152, "logps/rejected": -1.50464928150177, "loss": 7.025, "rewards/accuracies": 0.6875, "rewards/chosen": -14.869132995605469, "rewards/margins": 0.17736005783081055, "rewards/rejected": -15.046493530273438, "sft_loss": 0.010355046950280666, "step": 3 }, { "epoch": 0.11208406304728546, "grad_norm": 143.98542912632848, "learning_rate": 6.2e-07, "logits/chosen": -9.924211502075195, "logits/rejected": -9.45657730102539, "logps/chosen": -1.4292300939559937, "logps/rejected": -1.2033114433288574, "loss": 6.4674, "rewards/accuracies": 0.3125, "rewards/chosen": -14.292301177978516, "rewards/margins": -2.2591874599456787, "rewards/rejected": -12.033114433288574, "sft_loss": 0.006642716005444527, "step": 4 }, { "epoch": 0.14010507880910683, "grad_norm": 105.0416363604681, "learning_rate": 7.981529564210822e-07, "logits/chosen": -8.538932800292969, "logits/rejected": -8.046061515808105, "logps/chosen": -1.5071882009506226, "logps/rejected": -1.675721526145935, "loss": 7.1364, "rewards/accuracies": 0.625, "rewards/chosen": -15.071882247924805, "rewards/margins": 1.6853327751159668, "rewards/rejected": -16.757217407226562, "sft_loss": 0.01169000007212162, "step": 5 }, { "epoch": 0.1681260945709282, "grad_norm": 136.46572113440772, "learning_rate": 7.926307788508979e-07, "logits/chosen": -8.856929779052734, "logits/rejected": -9.344861030578613, "logps/chosen": -1.6353546380996704, "logps/rejected": -1.7302504777908325, "loss": 5.6445, "rewards/accuracies": 0.6875, "rewards/chosen": -16.353546142578125, "rewards/margins": 0.9489572644233704, "rewards/rejected": -17.30250358581543, "sft_loss": 0.03519538417458534, "step": 6 }, { "epoch": 0.19614711033274956, "grad_norm": 125.56078536529542, "learning_rate": 7.834901323040175e-07, "logits/chosen": -7.622992515563965, "logits/rejected": -7.627020835876465, "logps/chosen": -2.047703981399536, "logps/rejected": -1.6984105110168457, "loss": 6.5965, "rewards/accuracies": 0.375, "rewards/chosen": -20.477039337158203, "rewards/margins": -3.4929349422454834, "rewards/rejected": -16.98410415649414, "sft_loss": 0.006214356515556574, "step": 7 }, { "epoch": 0.22416812609457093, "grad_norm": 135.07237608120852, "learning_rate": 7.70824812183283e-07, "logits/chosen": -8.93583869934082, "logits/rejected": -9.021809577941895, "logps/chosen": -1.3730394840240479, "logps/rejected": -1.4458472728729248, "loss": 6.3454, "rewards/accuracies": 0.6875, "rewards/chosen": -13.730398178100586, "rewards/margins": 0.7280769944190979, "rewards/rejected": -14.458473205566406, "sft_loss": 0.03004990890622139, "step": 8 }, { "epoch": 0.2521891418563923, "grad_norm": 239.98627324152338, "learning_rate": 7.547647818120495e-07, "logits/chosen": -9.406291961669922, "logits/rejected": -9.9324951171875, "logps/chosen": -1.5991909503936768, "logps/rejected": -1.6271830797195435, "loss": 5.2256, "rewards/accuracies": 0.5, "rewards/chosen": -15.991909980773926, "rewards/margins": 0.2799214720726013, "rewards/rejected": -16.271831512451172, "sft_loss": 0.03231532499194145, "step": 9 }, { "epoch": 0.28021015761821366, "grad_norm": 234.53437032749468, "learning_rate": 7.354748388346194e-07, "logits/chosen": -7.813473701477051, "logits/rejected": -9.137899398803711, "logps/chosen": -1.8943036794662476, "logps/rejected": -1.9154584407806396, "loss": 4.8325, "rewards/accuracies": 0.5625, "rewards/chosen": -18.943037033081055, "rewards/margins": 0.2115485668182373, "rewards/rejected": -19.154584884643555, "sft_loss": 0.008024048060178757, "step": 10 }, { "epoch": 0.30823117338003503, "grad_norm": 287.5499688149926, "learning_rate": 7.131529241694047e-07, "logits/chosen": -10.093656539916992, "logits/rejected": -10.823583602905273, "logps/chosen": -1.676328182220459, "logps/rejected": -2.193615198135376, "loss": 4.8686, "rewards/accuracies": 0.875, "rewards/chosen": -16.763280868530273, "rewards/margins": 5.17287015914917, "rewards/rejected": -21.9361515045166, "sft_loss": 0.018158258870244026, "step": 11 }, { "epoch": 0.3362521891418564, "grad_norm": 279.35544138821984, "learning_rate": 6.880280908672471e-07, "logits/chosen": -7.598231792449951, "logits/rejected": -8.863749504089355, "logps/chosen": -1.8558087348937988, "logps/rejected": -2.048665761947632, "loss": 4.1241, "rewards/accuracies": 0.625, "rewards/chosen": -18.558086395263672, "rewards/margins": 1.9285707473754883, "rewards/rejected": -20.486658096313477, "sft_loss": 0.003531986614689231, "step": 12 }, { "epoch": 0.36427320490367776, "grad_norm": 308.20678092603185, "learning_rate": 6.603581537171586e-07, "logits/chosen": -8.397397994995117, "logits/rejected": -10.13599681854248, "logps/chosen": -1.745999813079834, "logps/rejected": -1.8406281471252441, "loss": 3.8904, "rewards/accuracies": 0.75, "rewards/chosen": -17.459999084472656, "rewards/margins": 0.9462810754776001, "rewards/rejected": -18.406280517578125, "sft_loss": 0.010710272938013077, "step": 13 }, { "epoch": 0.3922942206654991, "grad_norm": 88.53513750352447, "learning_rate": 6.304270437177064e-07, "logits/chosen": -11.391769409179688, "logits/rejected": -13.656466484069824, "logps/chosen": -1.820195198059082, "logps/rejected": -2.2572662830352783, "loss": 2.6976, "rewards/accuracies": 0.75, "rewards/chosen": -18.201950073242188, "rewards/margins": 4.370712757110596, "rewards/rejected": -22.572664260864258, "sft_loss": 0.006363618653267622, "step": 14 }, { "epoch": 0.4203152364273205, "grad_norm": 82.22128437782617, "learning_rate": 5.985418945607484e-07, "logits/chosen": -12.035834312438965, "logits/rejected": -14.527205467224121, "logps/chosen": -2.3111374378204346, "logps/rejected": -3.1396684646606445, "loss": 1.2788, "rewards/accuracies": 0.9375, "rewards/chosen": -23.111371994018555, "rewards/margins": 8.285309791564941, "rewards/rejected": -31.396682739257812, "sft_loss": 0.007131902035325766, "step": 15 }, { "epoch": 0.44833625218914186, "grad_norm": 120.40204277011782, "learning_rate": 5.650298910241353e-07, "logits/chosen": -12.784805297851562, "logits/rejected": -15.177325248718262, "logps/chosen": -2.0324227809906006, "logps/rejected": -3.119992971420288, "loss": 1.4116, "rewards/accuracies": 0.875, "rewards/chosen": -20.324228286743164, "rewards/margins": 10.875699996948242, "rewards/rejected": -31.199928283691406, "sft_loss": 0.005817623808979988, "step": 16 }, { "epoch": 0.4763572679509632, "grad_norm": 97.73770625062221, "learning_rate": 5.302349116131393e-07, "logits/chosen": -15.652244567871094, "logits/rejected": -17.80880355834961, "logps/chosen": -1.9218964576721191, "logps/rejected": -2.738666296005249, "loss": 1.6013, "rewards/accuracies": 0.9375, "rewards/chosen": -19.218965530395508, "rewards/margins": 8.167696952819824, "rewards/rejected": -27.386659622192383, "sft_loss": 0.004993550945073366, "step": 17 }, { "epoch": 0.5043782837127846, "grad_norm": 189.96208267597956, "learning_rate": 4.945139999016476e-07, "logits/chosen": -15.17531681060791, "logits/rejected": -15.677058219909668, "logps/chosen": -1.9880082607269287, "logps/rejected": -3.052130699157715, "loss": 1.8319, "rewards/accuracies": 0.875, "rewards/chosen": -19.880081176757812, "rewards/margins": 10.641225814819336, "rewards/rejected": -30.52130889892578, "sft_loss": 0.008743491023778915, "step": 18 }, { "epoch": 0.532399299474606, "grad_norm": 114.52318633727846, "learning_rate": 4.5823370078193663e-07, "logits/chosen": -10.547327995300293, "logits/rejected": -14.194029808044434, "logps/chosen": -2.2875614166259766, "logps/rejected": -3.8111658096313477, "loss": 1.7772, "rewards/accuracies": 0.8125, "rewards/chosen": -22.875612258911133, "rewards/margins": 15.236043930053711, "rewards/rejected": -38.111656188964844, "sft_loss": 0.01470925658941269, "step": 19 }, { "epoch": 0.5604203152364273, "grad_norm": 82.97272038448429, "learning_rate": 4.217662992180634e-07, "logits/chosen": -10.6708345413208, "logits/rejected": -15.503955841064453, "logps/chosen": -2.2511441707611084, "logps/rejected": -3.763054132461548, "loss": 1.6103, "rewards/accuracies": 0.8125, "rewards/chosen": -22.511442184448242, "rewards/margins": 15.119099617004395, "rewards/rejected": -37.63053894042969, "sft_loss": 0.012447498738765717, "step": 20 }, { "epoch": 0.5884413309982487, "grad_norm": 93.13289791788793, "learning_rate": 3.8548600009835237e-07, "logits/chosen": -11.672554016113281, "logits/rejected": -16.79704475402832, "logps/chosen": -2.7983806133270264, "logps/rejected": -4.703005790710449, "loss": 1.4708, "rewards/accuracies": 0.9375, "rewards/chosen": -27.983802795410156, "rewards/margins": 19.046255111694336, "rewards/rejected": -47.030059814453125, "sft_loss": 0.0047310409136116505, "step": 21 }, { "epoch": 0.6164623467600701, "grad_norm": 97.54389148924957, "learning_rate": 3.4976508838686066e-07, "logits/chosen": -16.980070114135742, "logits/rejected": -18.32730484008789, "logps/chosen": -2.150972843170166, "logps/rejected": -3.424234390258789, "loss": 1.1655, "rewards/accuracies": 1.0, "rewards/chosen": -21.509729385375977, "rewards/margins": 12.732614517211914, "rewards/rejected": -34.242340087890625, "sft_loss": 0.007569438312202692, "step": 22 }, { "epoch": 0.6444833625218914, "grad_norm": 73.41731467567557, "learning_rate": 3.149701089758648e-07, "logits/chosen": -11.438061714172363, "logits/rejected": -14.675212860107422, "logps/chosen": -2.3481569290161133, "logps/rejected": -4.2203450202941895, "loss": 1.2242, "rewards/accuracies": 0.9375, "rewards/chosen": -23.481565475463867, "rewards/margins": 18.721879959106445, "rewards/rejected": -42.20344543457031, "sft_loss": 0.010319937951862812, "step": 23 }, { "epoch": 0.6725043782837128, "grad_norm": 96.48356219878691, "learning_rate": 2.8145810543925163e-07, "logits/chosen": -11.793488502502441, "logits/rejected": -15.576847076416016, "logps/chosen": -2.3053696155548096, "logps/rejected": -4.343653202056885, "loss": 1.4752, "rewards/accuracies": 0.9375, "rewards/chosen": -23.053693771362305, "rewards/margins": 20.382837295532227, "rewards/rejected": -43.4365348815918, "sft_loss": 0.024834871292114258, "step": 24 }, { "epoch": 0.7005253940455342, "grad_norm": 122.92321660024119, "learning_rate": 2.495729562822935e-07, "logits/chosen": -16.064531326293945, "logits/rejected": -17.884010314941406, "logps/chosen": -2.647796869277954, "logps/rejected": -3.9600863456726074, "loss": 1.4732, "rewards/accuracies": 1.0, "rewards/chosen": -26.477968215942383, "rewards/margins": 13.12289810180664, "rewards/rejected": -39.60086441040039, "sft_loss": 0.021936513483524323, "step": 25 }, { "epoch": 0.7285464098073555, "grad_norm": 80.26268266585235, "learning_rate": 2.196418462828415e-07, "logits/chosen": -12.343572616577148, "logits/rejected": -15.574173927307129, "logps/chosen": -2.2388336658477783, "logps/rejected": -4.106793403625488, "loss": 1.2593, "rewards/accuracies": 0.8125, "rewards/chosen": -22.388338088989258, "rewards/margins": 18.679595947265625, "rewards/rejected": -41.06793212890625, "sft_loss": 0.013025043532252312, "step": 26 }, { "epoch": 0.7565674255691769, "grad_norm": 66.28989889413502, "learning_rate": 1.9197190913275294e-07, "logits/chosen": -12.330286026000977, "logits/rejected": -15.901168823242188, "logps/chosen": -2.5048580169677734, "logps/rejected": -4.5386857986450195, "loss": 1.1459, "rewards/accuracies": 0.8125, "rewards/chosen": -25.048580169677734, "rewards/margins": 20.338275909423828, "rewards/rejected": -45.38685607910156, "sft_loss": 0.006859698798507452, "step": 27 }, { "epoch": 0.7845884413309983, "grad_norm": 81.14927404545568, "learning_rate": 1.6684707583059529e-07, "logits/chosen": -17.190406799316406, "logits/rejected": -19.05614471435547, "logps/chosen": -2.768648147583008, "logps/rejected": -4.382925033569336, "loss": 1.3002, "rewards/accuracies": 0.9375, "rewards/chosen": -27.68647575378418, "rewards/margins": 16.142770767211914, "rewards/rejected": -43.82925033569336, "sft_loss": 0.005794988479465246, "step": 28 }, { "epoch": 0.8126094570928196, "grad_norm": 70.98888001094448, "learning_rate": 1.4452516116538054e-07, "logits/chosen": -10.294852256774902, "logits/rejected": -15.053112030029297, "logps/chosen": -2.106778383255005, "logps/rejected": -4.66357421875, "loss": 0.8862, "rewards/accuracies": 0.9375, "rewards/chosen": -21.067781448364258, "rewards/margins": 25.567956924438477, "rewards/rejected": -46.6357421875, "sft_loss": 0.008147615939378738, "step": 29 }, { "epoch": 0.840630472854641, "grad_norm": 99.24795263935427, "learning_rate": 1.2523521818795044e-07, "logits/chosen": -9.799162864685059, "logits/rejected": -15.37686538696289, "logps/chosen": -2.270378828048706, "logps/rejected": -5.39860725402832, "loss": 1.3672, "rewards/accuracies": 1.0, "rewards/chosen": -22.70379066467285, "rewards/margins": 31.282283782958984, "rewards/rejected": -53.98607635498047, "sft_loss": 0.005661052651703358, "step": 30 }, { "epoch": 0.8686514886164624, "grad_norm": 105.42160708641147, "learning_rate": 1.0917518781671699e-07, "logits/chosen": -10.193641662597656, "logits/rejected": -13.275125503540039, "logps/chosen": -2.434311866760254, "logps/rejected": -3.9474875926971436, "loss": 1.3672, "rewards/accuracies": 0.875, "rewards/chosen": -24.343116760253906, "rewards/margins": 15.131752967834473, "rewards/rejected": -39.47487258911133, "sft_loss": 0.01922934129834175, "step": 31 }, { "epoch": 0.8966725043782837, "grad_norm": 75.46584751909548, "learning_rate": 9.650986769598242e-08, "logits/chosen": -9.355328559875488, "logits/rejected": -16.710058212280273, "logps/chosen": -2.3611741065979004, "logps/rejected": -5.737414360046387, "loss": 1.6485, "rewards/accuracies": 0.9375, "rewards/chosen": -23.61174201965332, "rewards/margins": 33.76239776611328, "rewards/rejected": -57.3741455078125, "sft_loss": 0.010620678775012493, "step": 32 }, { "epoch": 0.9246935201401051, "grad_norm": 84.00269490747272, "learning_rate": 8.736922114910199e-08, "logits/chosen": -10.082009315490723, "logits/rejected": -14.500289916992188, "logps/chosen": -2.227128267288208, "logps/rejected": -5.038478374481201, "loss": 1.3242, "rewards/accuracies": 0.9375, "rewards/chosen": -22.271284103393555, "rewards/margins": 28.113502502441406, "rewards/rejected": -50.384788513183594, "sft_loss": 0.009218152612447739, "step": 33 }, { "epoch": 0.9527145359019265, "grad_norm": 75.42419711071338, "learning_rate": 8.184704357891779e-08, "logits/chosen": -10.325528144836426, "logits/rejected": -16.278217315673828, "logps/chosen": -2.2617452144622803, "logps/rejected": -4.84274435043335, "loss": 0.8932, "rewards/accuracies": 0.9375, "rewards/chosen": -22.617454528808594, "rewards/margins": 25.809988021850586, "rewards/rejected": -48.42743682861328, "sft_loss": 0.014543527737259865, "step": 34 }, { "epoch": 0.9807355516637478, "grad_norm": 108.8748972501278, "learning_rate": 8e-08, "logits/chosen": -12.176219940185547, "logits/rejected": -16.480701446533203, "logps/chosen": -2.5212955474853516, "logps/rejected": -4.665997505187988, "loss": 1.4444, "rewards/accuracies": 0.9375, "rewards/chosen": -25.21295166015625, "rewards/margins": 21.447019577026367, "rewards/rejected": -46.65997314453125, "sft_loss": 0.04498640075325966, "step": 35 }, { "epoch": 0.9807355516637478, "eval_logits/chosen": -12.004097938537598, "eval_logits/rejected": -17.047502517700195, "eval_logps/chosen": -2.168222427368164, "eval_logps/rejected": -4.787535667419434, "eval_loss": 1.0162526369094849, "eval_rewards/accuracies": 0.9166666865348816, "eval_rewards/chosen": -21.682226181030273, "eval_rewards/margins": 26.193130493164062, "eval_rewards/rejected": -47.875362396240234, "eval_runtime": 9.3123, "eval_samples_per_second": 10.094, "eval_sft_loss": 0.01844729855656624, "eval_steps_per_second": 1.289, "step": 35 }, { "before_init_mem_cpu": 3802071040, "before_init_mem_gpu": 22016, "epoch": 0.9807355516637478, "init_mem_cpu_alloc_delta": 364544, "init_mem_cpu_peaked_delta": 0, "init_mem_gpu_alloc_delta": 0, "init_mem_gpu_peaked_delta": 0, "step": 35, "total_flos": 39867492466688.0, "train_loss": 3.085822834287371, "train_mem_cpu_alloc_delta": 5213659136, "train_mem_cpu_peaked_delta": 22737326080, "train_mem_gpu_alloc_delta": 16267848704, "train_mem_gpu_peaked_delta": 36029468160, "train_runtime": 1628.7465, "train_samples_per_second": 2.805, "train_steps_per_second": 0.021 } ], "logging_steps": 1.0, "max_steps": 35, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 18, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 39867492466688.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }