{ "best_metric": null, "best_model_checkpoint": null, "episode": 17920, "epoch": 0.24579258507413554, "eval_steps": 200.0, "global_step": 350, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "episode": 256, "epoch": 0.003511322643916222, "eps": 6, "loss/policy_avg": -0.07090990990400314, "loss/value_avg": 0.0, "lr": 3e-06, "objective/entropy": 49.42120361328125, "objective/kl": 0.006465356796979904, "objective/non_score_reward": -0.000646535714622587, "objective/rlhf_reward": -1.1137903928756714, "objective/scores": -1.109375, "policy/approxkl_avg": 27.096786499023438, "policy/clipfrac_avg": 0.732421875, "policy/entropy_avg": 0.92181396484375, "step": 5, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 12, "val/ratio": 1.0399832725524902, "val/ratio_var": 0.010045886039733887 }, { "episode": 512, "epoch": 0.007022645287832444, "eps": 6, "loss/policy_avg": -0.06497187167406082, "loss/value_avg": 0.0, "lr": 2.9923273657289e-06, "objective/entropy": 48.286014556884766, "objective/kl": 0.8119473457336426, "objective/non_score_reward": -0.08119472861289978, "objective/rlhf_reward": -1.266162633895874, "objective/scores": -1.1875, "policy/approxkl_avg": 18.666072845458984, "policy/clipfrac_avg": 0.7314453125, "policy/entropy_avg": 0.912261962890625, "step": 10, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 16, "val/ratio": 1.020957112312317, "val/ratio_var": 0.00411860179156065 }, { "episode": 768, "epoch": 0.010533967931748666, "eps": 6, "loss/policy_avg": -0.0872286781668663, "loss/value_avg": 0.0, "lr": 2.9846547314578008e-06, "objective/entropy": 49.34376525878906, "objective/kl": 1.9591996669769287, "objective/non_score_reward": -0.1959199756383896, "objective/rlhf_reward": -1.2858657836914062, "objective/scores": -1.09375, "policy/approxkl_avg": 20.772502899169922, "policy/clipfrac_avg": 0.73828125, "policy/entropy_avg": 0.927978515625, "step": 15, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 12, "val/ratio": 1.0191609859466553, "val/ratio_var": 0.00307083735242486 }, { "episode": 1024, "epoch": 0.014045290575664887, "eps": 6, "loss/policy_avg": -0.07566041499376297, "loss/value_avg": 0.0, "lr": 2.9769820971867007e-06, "objective/entropy": 53.13662338256836, "objective/kl": 2.4811532497406006, "objective/non_score_reward": -0.24811533093452454, "objective/rlhf_reward": -1.2548893690109253, "objective/scores": -1.0078125, "policy/approxkl_avg": 20.665164947509766, "policy/clipfrac_avg": 0.7314453125, "policy/entropy_avg": 0.989776611328125, "step": 20, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 11, "val/ratio": 1.011010766029358, "val/ratio_var": 0.004201602190732956 }, { "episode": 1280, "epoch": 0.01755661321958111, "eps": 6, "loss/policy_avg": -0.08593496680259705, "loss/value_avg": 0.0, "lr": 2.9693094629156014e-06, "objective/entropy": 53.72633743286133, "objective/kl": 3.3111624717712402, "objective/non_score_reward": -0.3311161994934082, "objective/rlhf_reward": -1.339456558227539, "objective/scores": -1.0078125, "policy/approxkl_avg": 25.559288024902344, "policy/clipfrac_avg": 0.7353515625, "policy/entropy_avg": 0.997894287109375, "step": 25, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 13, "val/ratio": 1.0134021043777466, "val/ratio_var": 0.0019979747012257576 }, { "episode": 1536, "epoch": 0.021067935863497332, "eps": 6, "loss/policy_avg": -0.09734417498111725, "loss/value_avg": 0.0, "lr": 2.9616368286445014e-06, "objective/entropy": 51.259735107421875, "objective/kl": 5.089182376861572, "objective/non_score_reward": -0.5089181661605835, "objective/rlhf_reward": -1.2202520370483398, "objective/scores": -0.7109375, "policy/approxkl_avg": 29.841636657714844, "policy/clipfrac_avg": 0.736328125, "policy/entropy_avg": 0.960479736328125, "step": 30, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 26, "val/ratio": 1.0178756713867188, "val/ratio_var": 0.009866585955023766 }, { "episode": 1792, "epoch": 0.024579258507413555, "eps": 6, "loss/policy_avg": -0.06831618398427963, "loss/value_avg": 0.0, "lr": 2.9539641943734013e-06, "objective/entropy": 40.643272399902344, "objective/kl": 6.974010944366455, "objective/non_score_reward": -0.6974011063575745, "objective/rlhf_reward": -1.2684605121612549, "objective/scores": -0.5703125, "policy/approxkl_avg": 35.33942413330078, "policy/clipfrac_avg": 0.6982421875, "policy/entropy_avg": 0.7505035400390625, "step": 35, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 16, "val/ratio": 1.00449800491333, "val/ratio_var": 0.0022142010275274515 }, { "episode": 2048, "epoch": 0.028090581151329775, "eps": 6, "loss/policy_avg": -0.04068079590797424, "loss/value_avg": 0.0, "lr": 2.946291560102302e-06, "objective/entropy": 23.142562866210938, "objective/kl": 8.180486679077148, "objective/non_score_reward": -0.8180487155914307, "objective/rlhf_reward": -1.0729957818984985, "objective/scores": -0.255859375, "policy/approxkl_avg": 23.68307876586914, "policy/clipfrac_avg": 0.5859375, "policy/entropy_avg": 0.4361400604248047, "step": 40, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.0077030658721924, "val/ratio_var": 0.0024766812566667795 }, { "episode": 2304, "epoch": 0.031601903795246, "eps": 6, "loss/policy_avg": -0.07307010889053345, "loss/value_avg": 0.0, "lr": 2.938618925831202e-06, "objective/entropy": 19.376842498779297, "objective/kl": 8.770210266113281, "objective/non_score_reward": -0.8770210146903992, "objective/rlhf_reward": -1.0002652406692505, "objective/scores": -0.12353515625, "policy/approxkl_avg": 31.00873565673828, "policy/clipfrac_avg": 0.5302734375, "policy/entropy_avg": 0.33237457275390625, "step": 45, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 20, "val/ratio": 0.996111273765564, "val/ratio_var": 0.001100091845728457 }, { "episode": 2560, "epoch": 0.03511322643916222, "eps": 6, "loss/policy_avg": -0.04584116116166115, "loss/value_avg": 0.0, "lr": 2.9309462915601027e-06, "objective/entropy": 11.984097480773926, "objective/kl": 8.4966402053833, "objective/non_score_reward": -0.849664032459259, "objective/rlhf_reward": -0.8017911911010742, "objective/scores": 0.0478515625, "policy/approxkl_avg": 22.561037063598633, "policy/clipfrac_avg": 0.451171875, "policy/entropy_avg": 0.19393539428710938, "step": 50, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 20, "val/ratio": 0.9952375888824463, "val/ratio_var": 0.000761833623982966 }, { "episode": 2816, "epoch": 0.03862454908307844, "eps": 5, "loss/policy_avg": -0.029720915481448174, "loss/value_avg": 0.0, "lr": 2.9232736572890026e-06, "objective/entropy": 4.9489898681640625, "objective/kl": 8.733837127685547, "objective/non_score_reward": -0.8733837604522705, "objective/rlhf_reward": -0.7492713928222656, "objective/scores": 0.1240234375, "policy/approxkl_avg": 16.253189086914062, "policy/clipfrac_avg": 0.341796875, "policy/entropy_avg": 0.07728099822998047, "step": 55, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 18, "val/ratio": 0.9972053170204163, "val/ratio_var": 0.00032430028659291565 }, { "episode": 3072, "epoch": 0.042135871726994664, "eps": 5, "loss/policy_avg": -0.01298562902957201, "loss/value_avg": 0.0, "lr": 2.9156010230179026e-06, "objective/entropy": 1.3101667165756226, "objective/kl": 8.699792861938477, "objective/non_score_reward": -0.8699792623519897, "objective/rlhf_reward": -0.5752952098846436, "objective/scores": 0.294921875, "policy/approxkl_avg": 2.27925968170166, "policy/clipfrac_avg": 0.236328125, "policy/entropy_avg": 0.02513742446899414, "step": 60, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 20, "val/ratio": 1.0017118453979492, "val/ratio_var": 0.00016639505338389426 }, { "episode": 3328, "epoch": 0.04564719437091089, "eps": 5, "loss/policy_avg": -0.02618303708732128, "loss/value_avg": 0.0, "lr": 2.9079283887468033e-06, "objective/entropy": 2.3685269355773926, "objective/kl": 9.208517074584961, "objective/non_score_reward": -0.9208516478538513, "objective/rlhf_reward": -0.5182289481163025, "objective/scores": 0.40234375, "policy/approxkl_avg": 2.6189699172973633, "policy/clipfrac_avg": 0.310546875, "policy/entropy_avg": 0.04020071029663086, "step": 65, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 20, "val/ratio": 1.003983497619629, "val/ratio_var": 0.0009448421187698841 }, { "episode": 3584, "epoch": 0.04915851701482711, "eps": 5, "loss/policy_avg": -0.02327096462249756, "loss/value_avg": 0.0, "lr": 2.9002557544757032e-06, "objective/entropy": 2.0416018962860107, "objective/kl": 9.701976776123047, "objective/non_score_reward": -0.9701976776123047, "objective/rlhf_reward": -0.49486449360847473, "objective/scores": 0.474609375, "policy/approxkl_avg": 1.271956443786621, "policy/clipfrac_avg": 0.2734375, "policy/entropy_avg": 0.041253089904785156, "step": 70, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 16, "val/ratio": 1.0039558410644531, "val/ratio_var": 0.00041477559716440737 }, { "episode": 3840, "epoch": 0.052669839658743334, "eps": 5, "loss/policy_avg": -0.033096276223659515, "loss/value_avg": 0.0, "lr": 2.892583120204604e-06, "objective/entropy": 2.7795495986938477, "objective/kl": 10.028523445129395, "objective/non_score_reward": -1.0028523206710815, "objective/rlhf_reward": -0.46555712819099426, "objective/scores": 0.5390625, "policy/approxkl_avg": 3.055203676223755, "policy/clipfrac_avg": 0.3427734375, "policy/entropy_avg": 0.053270816802978516, "step": 75, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 23, "val/ratio": 1.0012407302856445, "val/ratio_var": 0.00011274257121840492 }, { "episode": 4096, "epoch": 0.05618116230265955, "eps": 5, "loss/policy_avg": -0.01961323618888855, "loss/value_avg": 0.0, "lr": 2.884910485933504e-06, "objective/entropy": 2.5525641441345215, "objective/kl": 10.111019134521484, "objective/non_score_reward": -1.0111019611358643, "objective/rlhf_reward": -0.510233461856842, "objective/scores": 0.5, "policy/approxkl_avg": 1.331697940826416, "policy/clipfrac_avg": 0.2861328125, "policy/entropy_avg": 0.048857688903808594, "step": 80, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 25, "val/ratio": 1.011049509048462, "val/ratio_var": 0.004252108279615641 }, { "episode": 4352, "epoch": 0.05969248494657577, "eps": 5, "loss/policy_avg": -0.009127877652645111, "loss/value_avg": 0.0, "lr": 2.877237851662404e-06, "objective/entropy": 3.016789674758911, "objective/kl": 11.257818222045898, "objective/non_score_reward": -1.125781774520874, "objective/rlhf_reward": -0.4276960492134094, "objective/scores": 0.69921875, "policy/approxkl_avg": 1.4772686958312988, "policy/clipfrac_avg": 0.35546875, "policy/entropy_avg": 0.053719520568847656, "step": 85, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 6, "val/ratio": 1.0042904615402222, "val/ratio_var": 0.0008556774700991809 }, { "episode": 4608, "epoch": 0.063203807590492, "eps": 5, "loss/policy_avg": -0.025049656629562378, "loss/value_avg": 0.0, "lr": 2.8695652173913046e-06, "objective/entropy": 2.5907459259033203, "objective/kl": 10.457273483276367, "objective/non_score_reward": -1.0457274913787842, "objective/rlhf_reward": -0.3816419839859009, "objective/scores": 0.6640625, "policy/approxkl_avg": 2.3460922241210938, "policy/clipfrac_avg": 0.322265625, "policy/entropy_avg": 0.04626178741455078, "step": 90, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 11, "val/ratio": 1.0003862380981445, "val/ratio_var": 7.93520302977413e-05 }, { "episode": 4864, "epoch": 0.06671513023440821, "eps": 5, "loss/policy_avg": -0.01828361675143242, "loss/value_avg": 0.0, "lr": 2.8618925831202045e-06, "objective/entropy": 2.397810220718384, "objective/kl": 10.732559204101562, "objective/non_score_reward": -1.073256015777588, "objective/rlhf_reward": -0.35966813564300537, "objective/scores": 0.71484375, "policy/approxkl_avg": 1.1093428134918213, "policy/clipfrac_avg": 0.32421875, "policy/entropy_avg": 0.041881561279296875, "step": 95, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 15, "val/ratio": 1.0054664611816406, "val/ratio_var": 0.0017973663052543998 }, { "episode": 5120, "epoch": 0.07022645287832444, "eps": 5, "loss/policy_avg": -0.04088423401117325, "loss/value_avg": 0.0, "lr": 2.8542199488491053e-06, "objective/entropy": 2.343449592590332, "objective/kl": 11.780994415283203, "objective/non_score_reward": -1.1780993938446045, "objective/rlhf_reward": -0.4628324806690216, "objective/scores": 0.71484375, "policy/approxkl_avg": 0.894420325756073, "policy/clipfrac_avg": 0.46875, "policy/entropy_avg": 0.04486083984375, "step": 100, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 11, "val/ratio": 1.0009559392929077, "val/ratio_var": 4.804596756002866e-05 }, { "episode": 5376, "epoch": 0.07373777552224066, "eps": 5, "loss/policy_avg": -0.020697183907032013, "loss/value_avg": 0.0, "lr": 2.846547314578005e-06, "objective/entropy": 1.9023351669311523, "objective/kl": 10.29288101196289, "objective/non_score_reward": -1.0292882919311523, "objective/rlhf_reward": -0.29047834873199463, "objective/scores": 0.73828125, "policy/approxkl_avg": 0.9143690466880798, "policy/clipfrac_avg": 0.373046875, "policy/entropy_avg": 0.028568267822265625, "step": 105, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 10, "val/ratio": 1.000715732574463, "val/ratio_var": 4.201457340968773e-05 }, { "episode": 5632, "epoch": 0.07724909816615688, "eps": 5, "loss/policy_avg": -0.012633640319108963, "loss/value_avg": 0.0, "lr": 2.8388746803069055e-06, "objective/entropy": 1.3839142322540283, "objective/kl": 10.57151985168457, "objective/non_score_reward": -1.0571520328521729, "objective/rlhf_reward": -0.2935946583747864, "objective/scores": 0.765625, "policy/approxkl_avg": 0.6525547504425049, "policy/clipfrac_avg": 0.2646484375, "policy/entropy_avg": 0.0345916748046875, "step": 110, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 10, "val/ratio": 0.9999199509620667, "val/ratio_var": 2.6978697860613465e-05 }, { "episode": 5888, "epoch": 0.0807604208100731, "eps": 5, "loss/policy_avg": -0.026668714359402657, "loss/value_avg": 0.0, "lr": 2.831202046035806e-06, "objective/entropy": 2.17741322517395, "objective/kl": 11.39688491821289, "objective/non_score_reward": -1.139688491821289, "objective/rlhf_reward": -0.3027456998825073, "objective/scores": 0.8359375, "policy/approxkl_avg": 8.829752922058105, "policy/clipfrac_avg": 0.35546875, "policy/entropy_avg": 0.034277915954589844, "step": 115, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.0012441873550415, "val/ratio_var": 9.009366476675496e-05 }, { "episode": 6144, "epoch": 0.08427174345398933, "eps": 5, "loss/policy_avg": -0.011602860875427723, "loss/value_avg": 0.0, "lr": 2.823529411764706e-06, "objective/entropy": 1.418602466583252, "objective/kl": 10.246469497680664, "objective/non_score_reward": -1.0246469974517822, "objective/rlhf_reward": -0.22599510848522186, "objective/scores": 0.796875, "policy/approxkl_avg": 0.31790149211883545, "policy/clipfrac_avg": 0.2314453125, "policy/entropy_avg": 0.028847694396972656, "step": 120, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 9, "val/ratio": 1.0009679794311523, "val/ratio_var": 3.900106457876973e-05 }, { "episode": 6400, "epoch": 0.08778306609790555, "eps": 5, "loss/policy_avg": -0.0157505851238966, "loss/value_avg": 0.0, "lr": 2.8158567774936066e-06, "objective/entropy": 1.936393141746521, "objective/kl": 10.550077438354492, "objective/non_score_reward": -1.0550076961517334, "objective/rlhf_reward": -0.252943217754364, "objective/scores": 0.80078125, "policy/approxkl_avg": 6.545133113861084, "policy/clipfrac_avg": 0.341796875, "policy/entropy_avg": 0.039971351623535156, "step": 125, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 12, "val/ratio": 1.0001187324523926, "val/ratio_var": 0.00011527155584190041 }, { "episode": 6656, "epoch": 0.09129438874182177, "eps": 5, "loss/policy_avg": -0.00908716581761837, "loss/value_avg": 0.0, "lr": 2.8081841432225065e-06, "objective/entropy": 1.9167767763137817, "objective/kl": 10.831771850585938, "objective/non_score_reward": -1.0831772089004517, "objective/rlhf_reward": -0.24270595610141754, "objective/scores": 0.83984375, "policy/approxkl_avg": 13.507976531982422, "policy/clipfrac_avg": 0.25, "policy/entropy_avg": 0.034499168395996094, "step": 130, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.0004911422729492, "val/ratio_var": 0.00018595268193166703 }, { "episode": 6912, "epoch": 0.094805711385738, "eps": 5, "loss/policy_avg": -0.017197387292981148, "loss/value_avg": 0.0, "lr": 2.800511508951407e-06, "objective/entropy": 1.7237651348114014, "objective/kl": 11.095592498779297, "objective/non_score_reward": -1.1095592975616455, "objective/rlhf_reward": -0.21057555079460144, "objective/scores": 0.8984375, "policy/approxkl_avg": 2.7560040950775146, "policy/clipfrac_avg": 0.2841796875, "policy/entropy_avg": 0.032952308654785156, "step": 135, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 2, "val/ratio": 0.9994020462036133, "val/ratio_var": 3.074964843108319e-05 }, { "episode": 7168, "epoch": 0.09831703402965422, "eps": 5, "loss/policy_avg": -0.012010859325528145, "loss/value_avg": 0.0, "lr": 2.792838874680307e-06, "objective/entropy": 1.5862581729888916, "objective/kl": 10.674396514892578, "objective/non_score_reward": -1.0674396753311157, "objective/rlhf_reward": -0.14433012902736664, "objective/scores": 0.921875, "policy/approxkl_avg": 1.1186727285385132, "policy/clipfrac_avg": 0.2783203125, "policy/entropy_avg": 0.0295562744140625, "step": 140, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 13, "val/ratio": 1.0007727146148682, "val/ratio_var": 4.557183274300769e-05 }, { "episode": 7424, "epoch": 0.10182835667357044, "eps": 5, "loss/policy_avg": -0.013728385791182518, "loss/value_avg": 0.0, "lr": 2.785166240409207e-06, "objective/entropy": 1.5388869047164917, "objective/kl": 10.359582901000977, "objective/non_score_reward": -1.035958170890808, "objective/rlhf_reward": -0.14511710405349731, "objective/scores": 0.890625, "policy/approxkl_avg": 0.5204602479934692, "policy/clipfrac_avg": 0.283203125, "policy/entropy_avg": 0.028924942016601562, "step": 145, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 14, "val/ratio": 1.056097149848938, "val/ratio_var": 0.13372056186199188 }, { "episode": 7680, "epoch": 0.10533967931748667, "eps": 5, "loss/policy_avg": -0.014945434406399727, "loss/value_avg": 0.0, "lr": 2.7774936061381074e-06, "objective/entropy": 2.0769755840301514, "objective/kl": 11.147063255310059, "objective/non_score_reward": -1.11470627784729, "objective/rlhf_reward": -0.08940108120441437, "objective/scores": 1.0234375, "policy/approxkl_avg": 0.5961493253707886, "policy/clipfrac_avg": 0.3681640625, "policy/entropy_avg": 0.037804603576660156, "step": 150, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 13, "val/ratio": 1.0033739805221558, "val/ratio_var": 0.00030022990540601313 }, { "episode": 7936, "epoch": 0.10885100196140288, "eps": 5, "loss/policy_avg": -0.02276831492781639, "loss/value_avg": 0.0, "lr": 2.7698209718670078e-06, "objective/entropy": 2.1412830352783203, "objective/kl": 11.697949409484863, "objective/non_score_reward": -1.169795036315918, "objective/rlhf_reward": -0.13582009077072144, "objective/scores": 1.03125, "policy/approxkl_avg": 0.7155288457870483, "policy/clipfrac_avg": 0.3193359375, "policy/entropy_avg": 0.037835121154785156, "step": 155, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 13, "val/ratio": 1.0014090538024902, "val/ratio_var": 5.2470270020421594e-05 }, { "episode": 8192, "epoch": 0.1123623246053191, "eps": 5, "loss/policy_avg": -0.013076605275273323, "loss/value_avg": 0.0, "lr": 2.762148337595908e-06, "objective/entropy": 1.634714126586914, "objective/kl": 11.629154205322266, "objective/non_score_reward": -1.1629154682159424, "objective/rlhf_reward": -0.28488799929618835, "objective/scores": 0.87890625, "policy/approxkl_avg": 0.4181188941001892, "policy/clipfrac_avg": 0.3037109375, "policy/entropy_avg": 0.029273509979248047, "step": 160, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 15, "val/ratio": 1.0008339881896973, "val/ratio_var": 1.4662801731901709e-05 }, { "episode": 8448, "epoch": 0.11587364724923532, "eps": 5, "loss/policy_avg": -0.01651182770729065, "loss/value_avg": 0.0, "lr": 2.7544757033248085e-06, "objective/entropy": 1.9540742635726929, "objective/kl": 11.4830322265625, "objective/non_score_reward": -1.1483032703399658, "objective/rlhf_reward": -0.05983233451843262, "objective/scores": 1.0859375, "policy/approxkl_avg": 18.791297912597656, "policy/clipfrac_avg": 0.2880859375, "policy/entropy_avg": 0.03601264953613281, "step": 165, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 16, "val/ratio": 1.0220942497253418, "val/ratio_var": 0.02208283357322216 }, { "episode": 8704, "epoch": 0.11938496989315155, "eps": 5, "loss/policy_avg": -0.013821810483932495, "loss/value_avg": 0.0, "lr": 2.7468030690537084e-06, "objective/entropy": 1.6243339776992798, "objective/kl": 11.435280799865723, "objective/non_score_reward": -1.1435281038284302, "objective/rlhf_reward": -0.12443088740110397, "objective/scores": 1.015625, "policy/approxkl_avg": 0.29013216495513916, "policy/clipfrac_avg": 0.28125, "policy/entropy_avg": 0.03498649597167969, "step": 170, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 15, "val/ratio": 1.0027971267700195, "val/ratio_var": 0.0002298366161994636 }, { "episode": 8960, "epoch": 0.12289629253706777, "eps": 5, "loss/policy_avg": -0.011003649793565273, "loss/value_avg": 0.0, "lr": 2.7391304347826087e-06, "objective/entropy": 2.000375986099243, "objective/kl": 11.78514575958252, "objective/non_score_reward": -1.1785145998001099, "objective/rlhf_reward": -0.2609584331512451, "objective/scores": 0.91796875, "policy/approxkl_avg": 0.8603074550628662, "policy/clipfrac_avg": 0.2998046875, "policy/entropy_avg": 0.034775733947753906, "step": 175, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 19, "val/ratio": 1.0012288093566895, "val/ratio_var": 3.532394111971371e-05 }, { "episode": 9216, "epoch": 0.126407615180984, "eps": 5, "loss/policy_avg": -0.010885423980653286, "loss/value_avg": 0.0, "lr": 2.731457800511509e-06, "objective/entropy": 1.5240473747253418, "objective/kl": 12.420597076416016, "objective/non_score_reward": -1.2420598268508911, "objective/rlhf_reward": -0.16641265153884888, "objective/scores": 1.078125, "policy/approxkl_avg": 0.46217110753059387, "policy/clipfrac_avg": 0.2783203125, "policy/entropy_avg": 0.029424667358398438, "step": 180, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 20, "val/ratio": 1.0007582902908325, "val/ratio_var": 2.4759892767178826e-05 }, { "episode": 9472, "epoch": 0.12991893782490022, "eps": 5, "loss/policy_avg": -0.01097183395177126, "loss/value_avg": 0.0, "lr": 2.7237851662404094e-06, "objective/entropy": 1.6292238235473633, "objective/kl": 12.73173713684082, "objective/non_score_reward": -1.2731736898422241, "objective/rlhf_reward": -0.10916168242692947, "objective/scores": 1.1640625, "policy/approxkl_avg": 0.5525862574577332, "policy/clipfrac_avg": 0.310546875, "policy/entropy_avg": 0.031815528869628906, "step": 185, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 22, "val/ratio": 1.0027148723602295, "val/ratio_var": 0.00016600274830125272 }, { "episode": 9728, "epoch": 0.13343026046881643, "eps": 5, "loss/policy_avg": -0.010572239756584167, "loss/value_avg": 0.0, "lr": 2.7161125319693097e-06, "objective/entropy": 2.028618335723877, "objective/kl": 12.439943313598633, "objective/non_score_reward": -1.2439942359924316, "objective/rlhf_reward": -0.06748821586370468, "objective/scores": 1.171875, "policy/approxkl_avg": 0.4930054843425751, "policy/clipfrac_avg": 0.2841796875, "policy/entropy_avg": 0.03688812255859375, "step": 190, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 22, "val/ratio": 1.001340627670288, "val/ratio_var": 4.4035481550963596e-05 }, { "episode": 9984, "epoch": 0.13694158311273266, "eps": 5, "loss/policy_avg": -0.019254155457019806, "loss/value_avg": 0.0, "lr": 2.7084398976982097e-06, "objective/entropy": 2.295351266860962, "objective/kl": 13.32223892211914, "objective/non_score_reward": -1.332223892211914, "objective/rlhf_reward": -0.1836824268102646, "objective/scores": 1.1484375, "policy/approxkl_avg": 3.1426281929016113, "policy/clipfrac_avg": 0.3251953125, "policy/entropy_avg": 0.03939247131347656, "step": 195, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 17, "val/ratio": 1.0032271146774292, "val/ratio_var": 0.00019827872165478766 }, { "episode": 10240, "epoch": 0.14045290575664887, "eps": 5, "loss/policy_avg": -0.018122296780347824, "loss/value_avg": 0.0, "lr": 2.70076726342711e-06, "objective/entropy": 2.345075845718384, "objective/kl": 12.536066055297852, "objective/non_score_reward": -1.2536065578460693, "objective/rlhf_reward": -0.056986674666404724, "objective/scores": 1.1953125, "policy/approxkl_avg": 27.5201473236084, "policy/clipfrac_avg": 0.3046875, "policy/entropy_avg": 0.04156017303466797, "step": 200, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 20, "val/ratio": 0.9993807077407837, "val/ratio_var": 0.00011275127326371148 }, { "episode": 10496, "epoch": 0.1439642284005651, "eps": 5, "loss/policy_avg": -0.019295353442430496, "loss/value_avg": 0.0, "lr": 2.6930946291560103e-06, "objective/entropy": 2.091012477874756, "objective/kl": 12.746508598327637, "objective/non_score_reward": -1.2746508121490479, "objective/rlhf_reward": -0.09065462648868561, "objective/scores": 1.1875, "policy/approxkl_avg": 0.5554059743881226, "policy/clipfrac_avg": 0.2998046875, "policy/entropy_avg": 0.03620719909667969, "step": 205, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 17, "val/ratio": 1.001387119293213, "val/ratio_var": 3.4958156902575865e-05 }, { "episode": 10752, "epoch": 0.14747555104448132, "eps": 5, "loss/policy_avg": -0.010203800164163113, "loss/value_avg": 0.0, "lr": 2.6854219948849107e-06, "objective/entropy": 2.1808600425720215, "objective/kl": 12.404802322387695, "objective/non_score_reward": -1.2404803037643433, "objective/rlhf_reward": -0.059675075113773346, "objective/scores": 1.1796875, "policy/approxkl_avg": 0.5876989364624023, "policy/clipfrac_avg": 0.27734375, "policy/entropy_avg": 0.041385650634765625, "step": 210, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 14, "val/ratio": 1.0035128593444824, "val/ratio_var": 0.0004931605653837323 }, { "episode": 11008, "epoch": 0.15098687368839755, "eps": 5, "loss/policy_avg": -0.018955286592245102, "loss/value_avg": 0.0, "lr": 2.677749360613811e-06, "objective/entropy": 1.968322992324829, "objective/kl": 13.322561264038086, "objective/non_score_reward": -1.3322560787200928, "objective/rlhf_reward": -0.0670965313911438, "objective/scores": 1.265625, "policy/approxkl_avg": 0.39782679080963135, "policy/clipfrac_avg": 0.373046875, "policy/entropy_avg": 0.03279399871826172, "step": 215, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 14, "val/ratio": 1.0017969608306885, "val/ratio_var": 6.843745359219611e-05 }, { "episode": 11264, "epoch": 0.15449819633231376, "eps": 5, "loss/policy_avg": -0.014947709627449512, "loss/value_avg": 0.0, "lr": 2.670076726342711e-06, "objective/entropy": 1.7985560894012451, "objective/kl": 12.856376647949219, "objective/non_score_reward": -1.285637617111206, "objective/rlhf_reward": -0.03251491114497185, "objective/scores": 1.25, "policy/approxkl_avg": 0.4516296982765198, "policy/clipfrac_avg": 0.3671875, "policy/entropy_avg": 0.031859397888183594, "step": 220, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 13, "val/ratio": 1.0009992122650146, "val/ratio_var": 4.1194460209226236e-05 }, { "episode": 11520, "epoch": 0.15800951897623, "eps": 5, "loss/policy_avg": -0.019819077104330063, "loss/value_avg": 0.0, "lr": 2.6624040920716113e-06, "objective/entropy": 1.5284242630004883, "objective/kl": 14.283391952514648, "objective/non_score_reward": -1.4283392429351807, "objective/rlhf_reward": -0.014965277165174484, "objective/scores": 1.4140625, "policy/approxkl_avg": 1.5518393516540527, "policy/clipfrac_avg": 0.2744140625, "policy/entropy_avg": 0.026048660278320312, "step": 225, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 8, "val/ratio": 1.0007925033569336, "val/ratio_var": 3.721785105881281e-05 }, { "episode": 11776, "epoch": 0.1615208416201462, "eps": 5, "loss/policy_avg": -0.015632648020982742, "loss/value_avg": 0.0, "lr": 2.6547314578005116e-06, "objective/entropy": 1.5101430416107178, "objective/kl": 13.435927391052246, "objective/non_score_reward": -1.3435927629470825, "objective/rlhf_reward": 0.017792798578739166, "objective/scores": 1.359375, "policy/approxkl_avg": 0.22922199964523315, "policy/clipfrac_avg": 0.271484375, "policy/entropy_avg": 0.025536060333251953, "step": 230, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 11, "val/ratio": 1.0355898141860962, "val/ratio_var": 0.09009659290313721 }, { "episode": 12032, "epoch": 0.16503216426406245, "eps": 5, "loss/policy_avg": -0.014460040256381035, "loss/value_avg": 0.0, "lr": 2.647058823529412e-06, "objective/entropy": 1.412046194076538, "objective/kl": 13.981653213500977, "objective/non_score_reward": -1.3981653451919556, "objective/rlhf_reward": -0.19434592127799988, "objective/scores": 1.203125, "policy/approxkl_avg": 0.48358476161956787, "policy/clipfrac_avg": 0.287109375, "policy/entropy_avg": 0.027116775512695312, "step": 235, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 19, "val/ratio": 1.0002431869506836, "val/ratio_var": 1.317531587119447e-05 }, { "episode": 12288, "epoch": 0.16854348690797866, "eps": 5, "loss/policy_avg": -0.0144148338586092, "loss/value_avg": 0.0, "lr": 2.6393861892583123e-06, "objective/entropy": 1.5728825330734253, "objective/kl": 13.091099739074707, "objective/non_score_reward": -1.3091099262237549, "objective/rlhf_reward": -0.12438549101352692, "objective/scores": 1.1875, "policy/approxkl_avg": 0.5084937810897827, "policy/clipfrac_avg": 0.2587890625, "policy/entropy_avg": 0.028881072998046875, "step": 240, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 14, "val/ratio": 1.004534125328064, "val/ratio_var": 0.001037073670886457 }, { "episode": 12544, "epoch": 0.17205480955189487, "eps": 5, "loss/policy_avg": -0.02535724639892578, "loss/value_avg": 0.0, "lr": 2.6317135549872122e-06, "objective/entropy": 1.6895666122436523, "objective/kl": 13.036446571350098, "objective/non_score_reward": -1.3036446571350098, "objective/rlhf_reward": -0.08131173253059387, "objective/scores": 1.21875, "policy/approxkl_avg": 1.397173285484314, "policy/clipfrac_avg": 0.296875, "policy/entropy_avg": 0.025877952575683594, "step": 245, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 11, "val/ratio": 0.9993641972541809, "val/ratio_var": 3.554957584128715e-05 }, { "episode": 12800, "epoch": 0.1755661321958111, "eps": 5, "loss/policy_avg": -0.013989413157105446, "loss/value_avg": 0.0, "lr": 2.6240409207161126e-06, "objective/entropy": 1.4321318864822388, "objective/kl": 13.751260757446289, "objective/non_score_reward": -1.3751261234283447, "objective/rlhf_reward": 0.024946460500359535, "objective/scores": 1.3984375, "policy/approxkl_avg": 0.3265579044818878, "policy/clipfrac_avg": 0.3095703125, "policy/entropy_avg": 0.02507495880126953, "step": 250, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 14, "val/ratio": 1.0016669034957886, "val/ratio_var": 3.951354665332474e-05 }, { "episode": 13056, "epoch": 0.1790774548397273, "eps": 5, "loss/policy_avg": -0.01614242233335972, "loss/value_avg": 0.0, "lr": 2.616368286445013e-06, "objective/entropy": 1.2477443218231201, "objective/kl": 14.385757446289062, "objective/non_score_reward": -1.4385757446289062, "objective/rlhf_reward": -0.048571567982435226, "objective/scores": 1.390625, "policy/approxkl_avg": 0.38643181324005127, "policy/clipfrac_avg": 0.33203125, "policy/entropy_avg": 0.02417755126953125, "step": 255, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 19, "val/ratio": 1.0020815134048462, "val/ratio_var": 0.00012469623470678926 }, { "episode": 13312, "epoch": 0.18258877748364355, "eps": 5, "loss/policy_avg": -0.013632966205477715, "loss/value_avg": 0.0, "lr": 2.6086956521739132e-06, "objective/entropy": 1.5228471755981445, "objective/kl": 14.862211227416992, "objective/non_score_reward": -1.486221194267273, "objective/rlhf_reward": -0.07545565813779831, "objective/scores": 1.40625, "policy/approxkl_avg": 2.011383056640625, "policy/clipfrac_avg": 0.3359375, "policy/entropy_avg": 0.027433395385742188, "step": 260, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 17, "val/ratio": 0.9998003244400024, "val/ratio_var": 2.738163857429754e-05 }, { "episode": 13568, "epoch": 0.18610010012755976, "eps": 5, "loss/policy_avg": -0.020372817292809486, "loss/value_avg": 0.0, "lr": 2.6010230179028136e-06, "objective/entropy": 1.633180856704712, "objective/kl": 14.094629287719727, "objective/non_score_reward": -1.4094629287719727, "objective/rlhf_reward": -0.015713702887296677, "objective/scores": 1.390625, "policy/approxkl_avg": 0.47778478264808655, "policy/clipfrac_avg": 0.3701171875, "policy/entropy_avg": 0.02643442153930664, "step": 265, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 14, "val/ratio": 1.0079278945922852, "val/ratio_var": 0.0023215855471789837 }, { "episode": 13824, "epoch": 0.189611422771476, "eps": 5, "loss/policy_avg": -0.01413625106215477, "loss/value_avg": 0.0, "lr": 2.5933503836317135e-06, "objective/entropy": 1.2899070978164673, "objective/kl": 14.59975528717041, "objective/non_score_reward": -1.4599756002426147, "objective/rlhf_reward": -0.09675531834363937, "objective/scores": 1.359375, "policy/approxkl_avg": 0.4568091630935669, "policy/clipfrac_avg": 0.3076171875, "policy/entropy_avg": 0.02667713165283203, "step": 270, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 19, "val/ratio": 1.003631830215454, "val/ratio_var": 0.00041694415267556906 }, { "episode": 14080, "epoch": 0.1931227454153922, "eps": 5, "loss/policy_avg": -0.018304049968719482, "loss/value_avg": 0.0, "lr": 2.585677749360614e-06, "objective/entropy": 1.3464603424072266, "objective/kl": 14.915502548217773, "objective/non_score_reward": -1.491550326347351, "objective/rlhf_reward": -0.013601185753941536, "objective/scores": 1.4765625, "policy/approxkl_avg": 0.5154660940170288, "policy/clipfrac_avg": 0.3251953125, "policy/entropy_avg": 0.024927139282226562, "step": 275, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 20, "val/ratio": 1.0014865398406982, "val/ratio_var": 8.263294148491696e-05 }, { "episode": 14336, "epoch": 0.19663406805930844, "eps": 5, "loss/policy_avg": -0.009162629023194313, "loss/value_avg": 0.0, "lr": 2.578005115089514e-06, "objective/entropy": 1.3251242637634277, "objective/kl": 14.600137710571289, "objective/non_score_reward": -1.460013747215271, "objective/rlhf_reward": -0.10571230947971344, "objective/scores": 1.3515625, "policy/approxkl_avg": 0.4187917411327362, "policy/clipfrac_avg": 0.3046875, "policy/entropy_avg": 0.023657798767089844, "step": 280, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 16, "val/ratio": 1.008554458618164, "val/ratio_var": 0.001467635971494019 }, { "episode": 14592, "epoch": 0.20014539070322465, "eps": 5, "loss/policy_avg": -0.01609072834253311, "loss/value_avg": 0.0, "lr": 2.5703324808184145e-06, "objective/entropy": 1.3078004121780396, "objective/kl": 14.999523162841797, "objective/non_score_reward": -1.4999523162841797, "objective/rlhf_reward": -0.15238332748413086, "objective/scores": 1.3515625, "policy/approxkl_avg": 0.3968128561973572, "policy/clipfrac_avg": 0.36328125, "policy/entropy_avg": 0.023943424224853516, "step": 285, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 12, "val/ratio": 1.0019521713256836, "val/ratio_var": 8.228437945945188e-05 }, { "episode": 14848, "epoch": 0.2036567133471409, "eps": 5, "loss/policy_avg": -0.014186807908117771, "loss/value_avg": 0.0, "lr": 2.562659846547315e-06, "objective/entropy": 1.2583755254745483, "objective/kl": 15.623100280761719, "objective/non_score_reward": -1.5623100996017456, "objective/rlhf_reward": -0.09625323116779327, "objective/scores": 1.46875, "policy/approxkl_avg": 0.5678977370262146, "policy/clipfrac_avg": 0.3076171875, "policy/entropy_avg": 0.024990558624267578, "step": 290, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 13, "val/ratio": 1.0014848709106445, "val/ratio_var": 4.219371476210654e-05 }, { "episode": 15104, "epoch": 0.2071680359910571, "eps": 5, "loss/policy_avg": -0.013804701156914234, "loss/value_avg": 0.0, "lr": 2.5549872122762148e-06, "objective/entropy": 1.568720817565918, "objective/kl": 14.687668800354004, "objective/non_score_reward": -1.4687669277191162, "objective/rlhf_reward": -0.17009752988815308, "objective/scores": 1.296875, "policy/approxkl_avg": 0.3046334981918335, "policy/clipfrac_avg": 0.26953125, "policy/entropy_avg": 0.027862548828125, "step": 295, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 16, "val/ratio": 1.0005717277526855, "val/ratio_var": 1.1324932529532816e-05 }, { "episode": 15360, "epoch": 0.21067935863497333, "eps": 5, "loss/policy_avg": -0.018133502453565598, "loss/value_avg": 0.0, "lr": 2.547314578005115e-06, "objective/entropy": 1.2987349033355713, "objective/kl": 13.89183235168457, "objective/non_score_reward": -1.3891831636428833, "objective/rlhf_reward": -0.12500587105751038, "objective/scores": 1.265625, "policy/approxkl_avg": 0.31936001777648926, "policy/clipfrac_avg": 0.33984375, "policy/entropy_avg": 0.02469015121459961, "step": 300, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 18, "val/ratio": 1.0020601749420166, "val/ratio_var": 3.1293042411562055e-05 }, { "episode": 15616, "epoch": 0.21419068127888954, "eps": 5, "loss/policy_avg": -0.01710616797208786, "loss/value_avg": 0.0, "lr": 2.5396419437340155e-06, "objective/entropy": 1.4288297891616821, "objective/kl": 14.952780723571777, "objective/non_score_reward": -1.4952781200408936, "objective/rlhf_reward": -0.15792769193649292, "objective/scores": 1.3359375, "policy/approxkl_avg": 0.6461950540542603, "policy/clipfrac_avg": 0.3125, "policy/entropy_avg": 0.02637958526611328, "step": 305, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 19, "val/ratio": 1.0013606548309326, "val/ratio_var": 4.203971548122354e-05 }, { "episode": 15872, "epoch": 0.21770200392280575, "eps": 5, "loss/policy_avg": -0.016128187999129295, "loss/value_avg": 0.0, "lr": 2.531969309462916e-06, "objective/entropy": 1.3288850784301758, "objective/kl": 15.583921432495117, "objective/non_score_reward": -1.55839204788208, "objective/rlhf_reward": -0.07665687799453735, "objective/scores": 1.484375, "policy/approxkl_avg": 0.3284182548522949, "policy/clipfrac_avg": 0.328125, "policy/entropy_avg": 0.02404165267944336, "step": 310, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 16, "val/ratio": 1.0064442157745361, "val/ratio_var": 0.0015344778075814247 }, { "episode": 16128, "epoch": 0.221213326566722, "eps": 5, "loss/policy_avg": -0.015278931707143784, "loss/value_avg": 0.0, "lr": 2.524296675191816e-06, "objective/entropy": 1.3236112594604492, "objective/kl": 14.773448944091797, "objective/non_score_reward": -1.4773449897766113, "objective/rlhf_reward": -0.08708612620830536, "objective/scores": 1.390625, "policy/approxkl_avg": 0.2980467975139618, "policy/clipfrac_avg": 0.34765625, "policy/entropy_avg": 0.02494335174560547, "step": 315, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 7, "val/ratio": 1.0008249282836914, "val/ratio_var": 1.927867742779199e-05 }, { "episode": 16384, "epoch": 0.2247246492106382, "eps": 5, "loss/policy_avg": -0.020951703190803528, "loss/value_avg": 0.0, "lr": 2.516624040920716e-06, "objective/entropy": 1.2670817375183105, "objective/kl": 14.8348970413208, "objective/non_score_reward": -1.483489751815796, "objective/rlhf_reward": 0.03382519632577896, "objective/scores": 1.515625, "policy/approxkl_avg": 1.0973663330078125, "policy/clipfrac_avg": 0.361328125, "policy/entropy_avg": 0.02091073989868164, "step": 320, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 15, "val/ratio": 1.003138542175293, "val/ratio_var": 0.0001205340595333837 }, { "episode": 16640, "epoch": 0.22823597185455444, "eps": 5, "loss/policy_avg": -0.006676271557807922, "loss/value_avg": 0.0, "lr": 2.5089514066496164e-06, "objective/entropy": 1.191056728363037, "objective/kl": 16.46404457092285, "objective/non_score_reward": -1.646404504776001, "objective/rlhf_reward": -0.14990828931331635, "objective/scores": 1.5, "policy/approxkl_avg": 0.31475046277046204, "policy/clipfrac_avg": 0.2578125, "policy/entropy_avg": 0.021608352661132812, "step": 325, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 25, "val/ratio": 1.000575065612793, "val/ratio_var": 1.735856494633481e-05 }, { "episode": 16896, "epoch": 0.23174729449847065, "eps": 5, "loss/policy_avg": -0.02127978205680847, "loss/value_avg": 0.0, "lr": 2.5012787723785167e-06, "objective/entropy": 1.490570068359375, "objective/kl": 16.22044563293457, "objective/non_score_reward": -1.622044563293457, "objective/rlhf_reward": -0.015120631083846092, "objective/scores": 1.609375, "policy/approxkl_avg": 2.5871665477752686, "policy/clipfrac_avg": 0.35546875, "policy/entropy_avg": 0.027353286743164062, "step": 330, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 25, "val/ratio": 1.003063440322876, "val/ratio_var": 6.509448576252908e-05 }, { "episode": 17152, "epoch": 0.23525861714238688, "eps": 5, "loss/policy_avg": -0.013108542189002037, "loss/value_avg": 0.0, "lr": 2.493606138107417e-06, "objective/entropy": 1.2718842029571533, "objective/kl": 16.047882080078125, "objective/non_score_reward": -1.604788064956665, "objective/rlhf_reward": -0.12415145337581635, "objective/scores": 1.484375, "policy/approxkl_avg": 0.2758824825286865, "policy/clipfrac_avg": 0.287109375, "policy/entropy_avg": 0.024268627166748047, "step": 335, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 16, "val/ratio": 1.0036962032318115, "val/ratio_var": 0.0003905180492438376 }, { "episode": 17408, "epoch": 0.2387699397863031, "eps": 5, "loss/policy_avg": -0.014837839640676975, "loss/value_avg": 0.0, "lr": 2.4859335038363174e-06, "objective/entropy": 1.3406567573547363, "objective/kl": 16.428348541259766, "objective/non_score_reward": -1.642835021018982, "objective/rlhf_reward": 0.018702151253819466, "objective/scores": 1.6640625, "policy/approxkl_avg": 0.192110076546669, "policy/clipfrac_avg": 0.3388671875, "policy/entropy_avg": 0.025295734405517578, "step": 340, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 19, "val/ratio": 1.0008959770202637, "val/ratio_var": 1.205760781886056e-05 }, { "episode": 17664, "epoch": 0.24228126243021933, "eps": 5, "loss/policy_avg": -0.01899782381951809, "loss/value_avg": 0.0, "lr": 2.4782608695652173e-06, "objective/entropy": 1.5880205631256104, "objective/kl": 15.775943756103516, "objective/non_score_reward": -1.577594518661499, "objective/rlhf_reward": -0.08363974094390869, "objective/scores": 1.4921875, "policy/approxkl_avg": 0.9254180192947388, "policy/clipfrac_avg": 0.3251953125, "policy/entropy_avg": 0.024907588958740234, "step": 345, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 15, "val/ratio": 1.000749111175537, "val/ratio_var": 2.8957187168998644e-05 }, { "episode": 17920, "epoch": 0.24579258507413554, "eps": 5, "loss/policy_avg": -0.014669202268123627, "loss/value_avg": 0.0, "lr": 2.4705882352941177e-06, "objective/entropy": 1.2656543254852295, "objective/kl": 16.034730911254883, "objective/non_score_reward": -1.60347318649292, "objective/rlhf_reward": -0.011744961142539978, "objective/scores": 1.59375, "policy/approxkl_avg": 1.004683017730713, "policy/clipfrac_avg": 0.2646484375, "policy/entropy_avg": 0.023741722106933594, "step": 350, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 19, "val/ratio": 1.0009608268737793, "val/ratio_var": 4.649764014175162e-05 } ], "logging_steps": 100, "max_steps": 391, "num_input_tokens_seen": 0, "num_train_epochs": 1.3716104077797742, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0, "train_batch_size": null, "trial_name": null, "trial_params": null }