falcon-7b-instruct-rlhf-ckpt-350 / trainer_state.json
jiagaoxiang's picture
Upload folder using huggingface_hub
67d0111 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"episode": 17920,
"epoch": 0.24579258507413554,
"eval_steps": 200.0,
"global_step": 350,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"episode": 256,
"epoch": 0.003511322643916222,
"eps": 6,
"loss/policy_avg": -0.07090990990400314,
"loss/value_avg": 0.0,
"lr": 3e-06,
"objective/entropy": 49.42120361328125,
"objective/kl": 0.006465356796979904,
"objective/non_score_reward": -0.000646535714622587,
"objective/rlhf_reward": -1.1137903928756714,
"objective/scores": -1.109375,
"policy/approxkl_avg": 27.096786499023438,
"policy/clipfrac_avg": 0.732421875,
"policy/entropy_avg": 0.92181396484375,
"step": 5,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 12,
"val/ratio": 1.0399832725524902,
"val/ratio_var": 0.010045886039733887
},
{
"episode": 512,
"epoch": 0.007022645287832444,
"eps": 6,
"loss/policy_avg": -0.06497187167406082,
"loss/value_avg": 0.0,
"lr": 2.9923273657289e-06,
"objective/entropy": 48.286014556884766,
"objective/kl": 0.8119473457336426,
"objective/non_score_reward": -0.08119472861289978,
"objective/rlhf_reward": -1.266162633895874,
"objective/scores": -1.1875,
"policy/approxkl_avg": 18.666072845458984,
"policy/clipfrac_avg": 0.7314453125,
"policy/entropy_avg": 0.912261962890625,
"step": 10,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 16,
"val/ratio": 1.020957112312317,
"val/ratio_var": 0.00411860179156065
},
{
"episode": 768,
"epoch": 0.010533967931748666,
"eps": 6,
"loss/policy_avg": -0.0872286781668663,
"loss/value_avg": 0.0,
"lr": 2.9846547314578008e-06,
"objective/entropy": 49.34376525878906,
"objective/kl": 1.9591996669769287,
"objective/non_score_reward": -0.1959199756383896,
"objective/rlhf_reward": -1.2858657836914062,
"objective/scores": -1.09375,
"policy/approxkl_avg": 20.772502899169922,
"policy/clipfrac_avg": 0.73828125,
"policy/entropy_avg": 0.927978515625,
"step": 15,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 12,
"val/ratio": 1.0191609859466553,
"val/ratio_var": 0.00307083735242486
},
{
"episode": 1024,
"epoch": 0.014045290575664887,
"eps": 6,
"loss/policy_avg": -0.07566041499376297,
"loss/value_avg": 0.0,
"lr": 2.9769820971867007e-06,
"objective/entropy": 53.13662338256836,
"objective/kl": 2.4811532497406006,
"objective/non_score_reward": -0.24811533093452454,
"objective/rlhf_reward": -1.2548893690109253,
"objective/scores": -1.0078125,
"policy/approxkl_avg": 20.665164947509766,
"policy/clipfrac_avg": 0.7314453125,
"policy/entropy_avg": 0.989776611328125,
"step": 20,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 11,
"val/ratio": 1.011010766029358,
"val/ratio_var": 0.004201602190732956
},
{
"episode": 1280,
"epoch": 0.01755661321958111,
"eps": 6,
"loss/policy_avg": -0.08593496680259705,
"loss/value_avg": 0.0,
"lr": 2.9693094629156014e-06,
"objective/entropy": 53.72633743286133,
"objective/kl": 3.3111624717712402,
"objective/non_score_reward": -0.3311161994934082,
"objective/rlhf_reward": -1.339456558227539,
"objective/scores": -1.0078125,
"policy/approxkl_avg": 25.559288024902344,
"policy/clipfrac_avg": 0.7353515625,
"policy/entropy_avg": 0.997894287109375,
"step": 25,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 13,
"val/ratio": 1.0134021043777466,
"val/ratio_var": 0.0019979747012257576
},
{
"episode": 1536,
"epoch": 0.021067935863497332,
"eps": 6,
"loss/policy_avg": -0.09734417498111725,
"loss/value_avg": 0.0,
"lr": 2.9616368286445014e-06,
"objective/entropy": 51.259735107421875,
"objective/kl": 5.089182376861572,
"objective/non_score_reward": -0.5089181661605835,
"objective/rlhf_reward": -1.2202520370483398,
"objective/scores": -0.7109375,
"policy/approxkl_avg": 29.841636657714844,
"policy/clipfrac_avg": 0.736328125,
"policy/entropy_avg": 0.960479736328125,
"step": 30,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 26,
"val/ratio": 1.0178756713867188,
"val/ratio_var": 0.009866585955023766
},
{
"episode": 1792,
"epoch": 0.024579258507413555,
"eps": 6,
"loss/policy_avg": -0.06831618398427963,
"loss/value_avg": 0.0,
"lr": 2.9539641943734013e-06,
"objective/entropy": 40.643272399902344,
"objective/kl": 6.974010944366455,
"objective/non_score_reward": -0.6974011063575745,
"objective/rlhf_reward": -1.2684605121612549,
"objective/scores": -0.5703125,
"policy/approxkl_avg": 35.33942413330078,
"policy/clipfrac_avg": 0.6982421875,
"policy/entropy_avg": 0.7505035400390625,
"step": 35,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 16,
"val/ratio": 1.00449800491333,
"val/ratio_var": 0.0022142010275274515
},
{
"episode": 2048,
"epoch": 0.028090581151329775,
"eps": 6,
"loss/policy_avg": -0.04068079590797424,
"loss/value_avg": 0.0,
"lr": 2.946291560102302e-06,
"objective/entropy": 23.142562866210938,
"objective/kl": 8.180486679077148,
"objective/non_score_reward": -0.8180487155914307,
"objective/rlhf_reward": -1.0729957818984985,
"objective/scores": -0.255859375,
"policy/approxkl_avg": 23.68307876586914,
"policy/clipfrac_avg": 0.5859375,
"policy/entropy_avg": 0.4361400604248047,
"step": 40,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 8,
"val/ratio": 1.0077030658721924,
"val/ratio_var": 0.0024766812566667795
},
{
"episode": 2304,
"epoch": 0.031601903795246,
"eps": 6,
"loss/policy_avg": -0.07307010889053345,
"loss/value_avg": 0.0,
"lr": 2.938618925831202e-06,
"objective/entropy": 19.376842498779297,
"objective/kl": 8.770210266113281,
"objective/non_score_reward": -0.8770210146903992,
"objective/rlhf_reward": -1.0002652406692505,
"objective/scores": -0.12353515625,
"policy/approxkl_avg": 31.00873565673828,
"policy/clipfrac_avg": 0.5302734375,
"policy/entropy_avg": 0.33237457275390625,
"step": 45,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 20,
"val/ratio": 0.996111273765564,
"val/ratio_var": 0.001100091845728457
},
{
"episode": 2560,
"epoch": 0.03511322643916222,
"eps": 6,
"loss/policy_avg": -0.04584116116166115,
"loss/value_avg": 0.0,
"lr": 2.9309462915601027e-06,
"objective/entropy": 11.984097480773926,
"objective/kl": 8.4966402053833,
"objective/non_score_reward": -0.849664032459259,
"objective/rlhf_reward": -0.8017911911010742,
"objective/scores": 0.0478515625,
"policy/approxkl_avg": 22.561037063598633,
"policy/clipfrac_avg": 0.451171875,
"policy/entropy_avg": 0.19393539428710938,
"step": 50,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 20,
"val/ratio": 0.9952375888824463,
"val/ratio_var": 0.000761833623982966
},
{
"episode": 2816,
"epoch": 0.03862454908307844,
"eps": 5,
"loss/policy_avg": -0.029720915481448174,
"loss/value_avg": 0.0,
"lr": 2.9232736572890026e-06,
"objective/entropy": 4.9489898681640625,
"objective/kl": 8.733837127685547,
"objective/non_score_reward": -0.8733837604522705,
"objective/rlhf_reward": -0.7492713928222656,
"objective/scores": 0.1240234375,
"policy/approxkl_avg": 16.253189086914062,
"policy/clipfrac_avg": 0.341796875,
"policy/entropy_avg": 0.07728099822998047,
"step": 55,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 18,
"val/ratio": 0.9972053170204163,
"val/ratio_var": 0.00032430028659291565
},
{
"episode": 3072,
"epoch": 0.042135871726994664,
"eps": 5,
"loss/policy_avg": -0.01298562902957201,
"loss/value_avg": 0.0,
"lr": 2.9156010230179026e-06,
"objective/entropy": 1.3101667165756226,
"objective/kl": 8.699792861938477,
"objective/non_score_reward": -0.8699792623519897,
"objective/rlhf_reward": -0.5752952098846436,
"objective/scores": 0.294921875,
"policy/approxkl_avg": 2.27925968170166,
"policy/clipfrac_avg": 0.236328125,
"policy/entropy_avg": 0.02513742446899414,
"step": 60,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 20,
"val/ratio": 1.0017118453979492,
"val/ratio_var": 0.00016639505338389426
},
{
"episode": 3328,
"epoch": 0.04564719437091089,
"eps": 5,
"loss/policy_avg": -0.02618303708732128,
"loss/value_avg": 0.0,
"lr": 2.9079283887468033e-06,
"objective/entropy": 2.3685269355773926,
"objective/kl": 9.208517074584961,
"objective/non_score_reward": -0.9208516478538513,
"objective/rlhf_reward": -0.5182289481163025,
"objective/scores": 0.40234375,
"policy/approxkl_avg": 2.6189699172973633,
"policy/clipfrac_avg": 0.310546875,
"policy/entropy_avg": 0.04020071029663086,
"step": 65,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 20,
"val/ratio": 1.003983497619629,
"val/ratio_var": 0.0009448421187698841
},
{
"episode": 3584,
"epoch": 0.04915851701482711,
"eps": 5,
"loss/policy_avg": -0.02327096462249756,
"loss/value_avg": 0.0,
"lr": 2.9002557544757032e-06,
"objective/entropy": 2.0416018962860107,
"objective/kl": 9.701976776123047,
"objective/non_score_reward": -0.9701976776123047,
"objective/rlhf_reward": -0.49486449360847473,
"objective/scores": 0.474609375,
"policy/approxkl_avg": 1.271956443786621,
"policy/clipfrac_avg": 0.2734375,
"policy/entropy_avg": 0.041253089904785156,
"step": 70,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 16,
"val/ratio": 1.0039558410644531,
"val/ratio_var": 0.00041477559716440737
},
{
"episode": 3840,
"epoch": 0.052669839658743334,
"eps": 5,
"loss/policy_avg": -0.033096276223659515,
"loss/value_avg": 0.0,
"lr": 2.892583120204604e-06,
"objective/entropy": 2.7795495986938477,
"objective/kl": 10.028523445129395,
"objective/non_score_reward": -1.0028523206710815,
"objective/rlhf_reward": -0.46555712819099426,
"objective/scores": 0.5390625,
"policy/approxkl_avg": 3.055203676223755,
"policy/clipfrac_avg": 0.3427734375,
"policy/entropy_avg": 0.053270816802978516,
"step": 75,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 23,
"val/ratio": 1.0012407302856445,
"val/ratio_var": 0.00011274257121840492
},
{
"episode": 4096,
"epoch": 0.05618116230265955,
"eps": 5,
"loss/policy_avg": -0.01961323618888855,
"loss/value_avg": 0.0,
"lr": 2.884910485933504e-06,
"objective/entropy": 2.5525641441345215,
"objective/kl": 10.111019134521484,
"objective/non_score_reward": -1.0111019611358643,
"objective/rlhf_reward": -0.510233461856842,
"objective/scores": 0.5,
"policy/approxkl_avg": 1.331697940826416,
"policy/clipfrac_avg": 0.2861328125,
"policy/entropy_avg": 0.048857688903808594,
"step": 80,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 25,
"val/ratio": 1.011049509048462,
"val/ratio_var": 0.004252108279615641
},
{
"episode": 4352,
"epoch": 0.05969248494657577,
"eps": 5,
"loss/policy_avg": -0.009127877652645111,
"loss/value_avg": 0.0,
"lr": 2.877237851662404e-06,
"objective/entropy": 3.016789674758911,
"objective/kl": 11.257818222045898,
"objective/non_score_reward": -1.125781774520874,
"objective/rlhf_reward": -0.4276960492134094,
"objective/scores": 0.69921875,
"policy/approxkl_avg": 1.4772686958312988,
"policy/clipfrac_avg": 0.35546875,
"policy/entropy_avg": 0.053719520568847656,
"step": 85,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 6,
"val/ratio": 1.0042904615402222,
"val/ratio_var": 0.0008556774700991809
},
{
"episode": 4608,
"epoch": 0.063203807590492,
"eps": 5,
"loss/policy_avg": -0.025049656629562378,
"loss/value_avg": 0.0,
"lr": 2.8695652173913046e-06,
"objective/entropy": 2.5907459259033203,
"objective/kl": 10.457273483276367,
"objective/non_score_reward": -1.0457274913787842,
"objective/rlhf_reward": -0.3816419839859009,
"objective/scores": 0.6640625,
"policy/approxkl_avg": 2.3460922241210938,
"policy/clipfrac_avg": 0.322265625,
"policy/entropy_avg": 0.04626178741455078,
"step": 90,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 11,
"val/ratio": 1.0003862380981445,
"val/ratio_var": 7.93520302977413e-05
},
{
"episode": 4864,
"epoch": 0.06671513023440821,
"eps": 5,
"loss/policy_avg": -0.01828361675143242,
"loss/value_avg": 0.0,
"lr": 2.8618925831202045e-06,
"objective/entropy": 2.397810220718384,
"objective/kl": 10.732559204101562,
"objective/non_score_reward": -1.073256015777588,
"objective/rlhf_reward": -0.35966813564300537,
"objective/scores": 0.71484375,
"policy/approxkl_avg": 1.1093428134918213,
"policy/clipfrac_avg": 0.32421875,
"policy/entropy_avg": 0.041881561279296875,
"step": 95,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 15,
"val/ratio": 1.0054664611816406,
"val/ratio_var": 0.0017973663052543998
},
{
"episode": 5120,
"epoch": 0.07022645287832444,
"eps": 5,
"loss/policy_avg": -0.04088423401117325,
"loss/value_avg": 0.0,
"lr": 2.8542199488491053e-06,
"objective/entropy": 2.343449592590332,
"objective/kl": 11.780994415283203,
"objective/non_score_reward": -1.1780993938446045,
"objective/rlhf_reward": -0.4628324806690216,
"objective/scores": 0.71484375,
"policy/approxkl_avg": 0.894420325756073,
"policy/clipfrac_avg": 0.46875,
"policy/entropy_avg": 0.04486083984375,
"step": 100,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 11,
"val/ratio": 1.0009559392929077,
"val/ratio_var": 4.804596756002866e-05
},
{
"episode": 5376,
"epoch": 0.07373777552224066,
"eps": 5,
"loss/policy_avg": -0.020697183907032013,
"loss/value_avg": 0.0,
"lr": 2.846547314578005e-06,
"objective/entropy": 1.9023351669311523,
"objective/kl": 10.29288101196289,
"objective/non_score_reward": -1.0292882919311523,
"objective/rlhf_reward": -0.29047834873199463,
"objective/scores": 0.73828125,
"policy/approxkl_avg": 0.9143690466880798,
"policy/clipfrac_avg": 0.373046875,
"policy/entropy_avg": 0.028568267822265625,
"step": 105,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 10,
"val/ratio": 1.000715732574463,
"val/ratio_var": 4.201457340968773e-05
},
{
"episode": 5632,
"epoch": 0.07724909816615688,
"eps": 5,
"loss/policy_avg": -0.012633640319108963,
"loss/value_avg": 0.0,
"lr": 2.8388746803069055e-06,
"objective/entropy": 1.3839142322540283,
"objective/kl": 10.57151985168457,
"objective/non_score_reward": -1.0571520328521729,
"objective/rlhf_reward": -0.2935946583747864,
"objective/scores": 0.765625,
"policy/approxkl_avg": 0.6525547504425049,
"policy/clipfrac_avg": 0.2646484375,
"policy/entropy_avg": 0.0345916748046875,
"step": 110,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 10,
"val/ratio": 0.9999199509620667,
"val/ratio_var": 2.6978697860613465e-05
},
{
"episode": 5888,
"epoch": 0.0807604208100731,
"eps": 5,
"loss/policy_avg": -0.026668714359402657,
"loss/value_avg": 0.0,
"lr": 2.831202046035806e-06,
"objective/entropy": 2.17741322517395,
"objective/kl": 11.39688491821289,
"objective/non_score_reward": -1.139688491821289,
"objective/rlhf_reward": -0.3027456998825073,
"objective/scores": 0.8359375,
"policy/approxkl_avg": 8.829752922058105,
"policy/clipfrac_avg": 0.35546875,
"policy/entropy_avg": 0.034277915954589844,
"step": 115,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 8,
"val/ratio": 1.0012441873550415,
"val/ratio_var": 9.009366476675496e-05
},
{
"episode": 6144,
"epoch": 0.08427174345398933,
"eps": 5,
"loss/policy_avg": -0.011602860875427723,
"loss/value_avg": 0.0,
"lr": 2.823529411764706e-06,
"objective/entropy": 1.418602466583252,
"objective/kl": 10.246469497680664,
"objective/non_score_reward": -1.0246469974517822,
"objective/rlhf_reward": -0.22599510848522186,
"objective/scores": 0.796875,
"policy/approxkl_avg": 0.31790149211883545,
"policy/clipfrac_avg": 0.2314453125,
"policy/entropy_avg": 0.028847694396972656,
"step": 120,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 9,
"val/ratio": 1.0009679794311523,
"val/ratio_var": 3.900106457876973e-05
},
{
"episode": 6400,
"epoch": 0.08778306609790555,
"eps": 5,
"loss/policy_avg": -0.0157505851238966,
"loss/value_avg": 0.0,
"lr": 2.8158567774936066e-06,
"objective/entropy": 1.936393141746521,
"objective/kl": 10.550077438354492,
"objective/non_score_reward": -1.0550076961517334,
"objective/rlhf_reward": -0.252943217754364,
"objective/scores": 0.80078125,
"policy/approxkl_avg": 6.545133113861084,
"policy/clipfrac_avg": 0.341796875,
"policy/entropy_avg": 0.039971351623535156,
"step": 125,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 12,
"val/ratio": 1.0001187324523926,
"val/ratio_var": 0.00011527155584190041
},
{
"episode": 6656,
"epoch": 0.09129438874182177,
"eps": 5,
"loss/policy_avg": -0.00908716581761837,
"loss/value_avg": 0.0,
"lr": 2.8081841432225065e-06,
"objective/entropy": 1.9167767763137817,
"objective/kl": 10.831771850585938,
"objective/non_score_reward": -1.0831772089004517,
"objective/rlhf_reward": -0.24270595610141754,
"objective/scores": 0.83984375,
"policy/approxkl_avg": 13.507976531982422,
"policy/clipfrac_avg": 0.25,
"policy/entropy_avg": 0.034499168395996094,
"step": 130,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 7,
"val/ratio": 1.0004911422729492,
"val/ratio_var": 0.00018595268193166703
},
{
"episode": 6912,
"epoch": 0.094805711385738,
"eps": 5,
"loss/policy_avg": -0.017197387292981148,
"loss/value_avg": 0.0,
"lr": 2.800511508951407e-06,
"objective/entropy": 1.7237651348114014,
"objective/kl": 11.095592498779297,
"objective/non_score_reward": -1.1095592975616455,
"objective/rlhf_reward": -0.21057555079460144,
"objective/scores": 0.8984375,
"policy/approxkl_avg": 2.7560040950775146,
"policy/clipfrac_avg": 0.2841796875,
"policy/entropy_avg": 0.032952308654785156,
"step": 135,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 2,
"val/ratio": 0.9994020462036133,
"val/ratio_var": 3.074964843108319e-05
},
{
"episode": 7168,
"epoch": 0.09831703402965422,
"eps": 5,
"loss/policy_avg": -0.012010859325528145,
"loss/value_avg": 0.0,
"lr": 2.792838874680307e-06,
"objective/entropy": 1.5862581729888916,
"objective/kl": 10.674396514892578,
"objective/non_score_reward": -1.0674396753311157,
"objective/rlhf_reward": -0.14433012902736664,
"objective/scores": 0.921875,
"policy/approxkl_avg": 1.1186727285385132,
"policy/clipfrac_avg": 0.2783203125,
"policy/entropy_avg": 0.0295562744140625,
"step": 140,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 13,
"val/ratio": 1.0007727146148682,
"val/ratio_var": 4.557183274300769e-05
},
{
"episode": 7424,
"epoch": 0.10182835667357044,
"eps": 5,
"loss/policy_avg": -0.013728385791182518,
"loss/value_avg": 0.0,
"lr": 2.785166240409207e-06,
"objective/entropy": 1.5388869047164917,
"objective/kl": 10.359582901000977,
"objective/non_score_reward": -1.035958170890808,
"objective/rlhf_reward": -0.14511710405349731,
"objective/scores": 0.890625,
"policy/approxkl_avg": 0.5204602479934692,
"policy/clipfrac_avg": 0.283203125,
"policy/entropy_avg": 0.028924942016601562,
"step": 145,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 14,
"val/ratio": 1.056097149848938,
"val/ratio_var": 0.13372056186199188
},
{
"episode": 7680,
"epoch": 0.10533967931748667,
"eps": 5,
"loss/policy_avg": -0.014945434406399727,
"loss/value_avg": 0.0,
"lr": 2.7774936061381074e-06,
"objective/entropy": 2.0769755840301514,
"objective/kl": 11.147063255310059,
"objective/non_score_reward": -1.11470627784729,
"objective/rlhf_reward": -0.08940108120441437,
"objective/scores": 1.0234375,
"policy/approxkl_avg": 0.5961493253707886,
"policy/clipfrac_avg": 0.3681640625,
"policy/entropy_avg": 0.037804603576660156,
"step": 150,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 13,
"val/ratio": 1.0033739805221558,
"val/ratio_var": 0.00030022990540601313
},
{
"episode": 7936,
"epoch": 0.10885100196140288,
"eps": 5,
"loss/policy_avg": -0.02276831492781639,
"loss/value_avg": 0.0,
"lr": 2.7698209718670078e-06,
"objective/entropy": 2.1412830352783203,
"objective/kl": 11.697949409484863,
"objective/non_score_reward": -1.169795036315918,
"objective/rlhf_reward": -0.13582009077072144,
"objective/scores": 1.03125,
"policy/approxkl_avg": 0.7155288457870483,
"policy/clipfrac_avg": 0.3193359375,
"policy/entropy_avg": 0.037835121154785156,
"step": 155,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 13,
"val/ratio": 1.0014090538024902,
"val/ratio_var": 5.2470270020421594e-05
},
{
"episode": 8192,
"epoch": 0.1123623246053191,
"eps": 5,
"loss/policy_avg": -0.013076605275273323,
"loss/value_avg": 0.0,
"lr": 2.762148337595908e-06,
"objective/entropy": 1.634714126586914,
"objective/kl": 11.629154205322266,
"objective/non_score_reward": -1.1629154682159424,
"objective/rlhf_reward": -0.28488799929618835,
"objective/scores": 0.87890625,
"policy/approxkl_avg": 0.4181188941001892,
"policy/clipfrac_avg": 0.3037109375,
"policy/entropy_avg": 0.029273509979248047,
"step": 160,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 15,
"val/ratio": 1.0008339881896973,
"val/ratio_var": 1.4662801731901709e-05
},
{
"episode": 8448,
"epoch": 0.11587364724923532,
"eps": 5,
"loss/policy_avg": -0.01651182770729065,
"loss/value_avg": 0.0,
"lr": 2.7544757033248085e-06,
"objective/entropy": 1.9540742635726929,
"objective/kl": 11.4830322265625,
"objective/non_score_reward": -1.1483032703399658,
"objective/rlhf_reward": -0.05983233451843262,
"objective/scores": 1.0859375,
"policy/approxkl_avg": 18.791297912597656,
"policy/clipfrac_avg": 0.2880859375,
"policy/entropy_avg": 0.03601264953613281,
"step": 165,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 16,
"val/ratio": 1.0220942497253418,
"val/ratio_var": 0.02208283357322216
},
{
"episode": 8704,
"epoch": 0.11938496989315155,
"eps": 5,
"loss/policy_avg": -0.013821810483932495,
"loss/value_avg": 0.0,
"lr": 2.7468030690537084e-06,
"objective/entropy": 1.6243339776992798,
"objective/kl": 11.435280799865723,
"objective/non_score_reward": -1.1435281038284302,
"objective/rlhf_reward": -0.12443088740110397,
"objective/scores": 1.015625,
"policy/approxkl_avg": 0.29013216495513916,
"policy/clipfrac_avg": 0.28125,
"policy/entropy_avg": 0.03498649597167969,
"step": 170,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 15,
"val/ratio": 1.0027971267700195,
"val/ratio_var": 0.0002298366161994636
},
{
"episode": 8960,
"epoch": 0.12289629253706777,
"eps": 5,
"loss/policy_avg": -0.011003649793565273,
"loss/value_avg": 0.0,
"lr": 2.7391304347826087e-06,
"objective/entropy": 2.000375986099243,
"objective/kl": 11.78514575958252,
"objective/non_score_reward": -1.1785145998001099,
"objective/rlhf_reward": -0.2609584331512451,
"objective/scores": 0.91796875,
"policy/approxkl_avg": 0.8603074550628662,
"policy/clipfrac_avg": 0.2998046875,
"policy/entropy_avg": 0.034775733947753906,
"step": 175,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 19,
"val/ratio": 1.0012288093566895,
"val/ratio_var": 3.532394111971371e-05
},
{
"episode": 9216,
"epoch": 0.126407615180984,
"eps": 5,
"loss/policy_avg": -0.010885423980653286,
"loss/value_avg": 0.0,
"lr": 2.731457800511509e-06,
"objective/entropy": 1.5240473747253418,
"objective/kl": 12.420597076416016,
"objective/non_score_reward": -1.2420598268508911,
"objective/rlhf_reward": -0.16641265153884888,
"objective/scores": 1.078125,
"policy/approxkl_avg": 0.46217110753059387,
"policy/clipfrac_avg": 0.2783203125,
"policy/entropy_avg": 0.029424667358398438,
"step": 180,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 20,
"val/ratio": 1.0007582902908325,
"val/ratio_var": 2.4759892767178826e-05
},
{
"episode": 9472,
"epoch": 0.12991893782490022,
"eps": 5,
"loss/policy_avg": -0.01097183395177126,
"loss/value_avg": 0.0,
"lr": 2.7237851662404094e-06,
"objective/entropy": 1.6292238235473633,
"objective/kl": 12.73173713684082,
"objective/non_score_reward": -1.2731736898422241,
"objective/rlhf_reward": -0.10916168242692947,
"objective/scores": 1.1640625,
"policy/approxkl_avg": 0.5525862574577332,
"policy/clipfrac_avg": 0.310546875,
"policy/entropy_avg": 0.031815528869628906,
"step": 185,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 22,
"val/ratio": 1.0027148723602295,
"val/ratio_var": 0.00016600274830125272
},
{
"episode": 9728,
"epoch": 0.13343026046881643,
"eps": 5,
"loss/policy_avg": -0.010572239756584167,
"loss/value_avg": 0.0,
"lr": 2.7161125319693097e-06,
"objective/entropy": 2.028618335723877,
"objective/kl": 12.439943313598633,
"objective/non_score_reward": -1.2439942359924316,
"objective/rlhf_reward": -0.06748821586370468,
"objective/scores": 1.171875,
"policy/approxkl_avg": 0.4930054843425751,
"policy/clipfrac_avg": 0.2841796875,
"policy/entropy_avg": 0.03688812255859375,
"step": 190,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 22,
"val/ratio": 1.001340627670288,
"val/ratio_var": 4.4035481550963596e-05
},
{
"episode": 9984,
"epoch": 0.13694158311273266,
"eps": 5,
"loss/policy_avg": -0.019254155457019806,
"loss/value_avg": 0.0,
"lr": 2.7084398976982097e-06,
"objective/entropy": 2.295351266860962,
"objective/kl": 13.32223892211914,
"objective/non_score_reward": -1.332223892211914,
"objective/rlhf_reward": -0.1836824268102646,
"objective/scores": 1.1484375,
"policy/approxkl_avg": 3.1426281929016113,
"policy/clipfrac_avg": 0.3251953125,
"policy/entropy_avg": 0.03939247131347656,
"step": 195,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 17,
"val/ratio": 1.0032271146774292,
"val/ratio_var": 0.00019827872165478766
},
{
"episode": 10240,
"epoch": 0.14045290575664887,
"eps": 5,
"loss/policy_avg": -0.018122296780347824,
"loss/value_avg": 0.0,
"lr": 2.70076726342711e-06,
"objective/entropy": 2.345075845718384,
"objective/kl": 12.536066055297852,
"objective/non_score_reward": -1.2536065578460693,
"objective/rlhf_reward": -0.056986674666404724,
"objective/scores": 1.1953125,
"policy/approxkl_avg": 27.5201473236084,
"policy/clipfrac_avg": 0.3046875,
"policy/entropy_avg": 0.04156017303466797,
"step": 200,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 20,
"val/ratio": 0.9993807077407837,
"val/ratio_var": 0.00011275127326371148
},
{
"episode": 10496,
"epoch": 0.1439642284005651,
"eps": 5,
"loss/policy_avg": -0.019295353442430496,
"loss/value_avg": 0.0,
"lr": 2.6930946291560103e-06,
"objective/entropy": 2.091012477874756,
"objective/kl": 12.746508598327637,
"objective/non_score_reward": -1.2746508121490479,
"objective/rlhf_reward": -0.09065462648868561,
"objective/scores": 1.1875,
"policy/approxkl_avg": 0.5554059743881226,
"policy/clipfrac_avg": 0.2998046875,
"policy/entropy_avg": 0.03620719909667969,
"step": 205,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 17,
"val/ratio": 1.001387119293213,
"val/ratio_var": 3.4958156902575865e-05
},
{
"episode": 10752,
"epoch": 0.14747555104448132,
"eps": 5,
"loss/policy_avg": -0.010203800164163113,
"loss/value_avg": 0.0,
"lr": 2.6854219948849107e-06,
"objective/entropy": 2.1808600425720215,
"objective/kl": 12.404802322387695,
"objective/non_score_reward": -1.2404803037643433,
"objective/rlhf_reward": -0.059675075113773346,
"objective/scores": 1.1796875,
"policy/approxkl_avg": 0.5876989364624023,
"policy/clipfrac_avg": 0.27734375,
"policy/entropy_avg": 0.041385650634765625,
"step": 210,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 14,
"val/ratio": 1.0035128593444824,
"val/ratio_var": 0.0004931605653837323
},
{
"episode": 11008,
"epoch": 0.15098687368839755,
"eps": 5,
"loss/policy_avg": -0.018955286592245102,
"loss/value_avg": 0.0,
"lr": 2.677749360613811e-06,
"objective/entropy": 1.968322992324829,
"objective/kl": 13.322561264038086,
"objective/non_score_reward": -1.3322560787200928,
"objective/rlhf_reward": -0.0670965313911438,
"objective/scores": 1.265625,
"policy/approxkl_avg": 0.39782679080963135,
"policy/clipfrac_avg": 0.373046875,
"policy/entropy_avg": 0.03279399871826172,
"step": 215,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 14,
"val/ratio": 1.0017969608306885,
"val/ratio_var": 6.843745359219611e-05
},
{
"episode": 11264,
"epoch": 0.15449819633231376,
"eps": 5,
"loss/policy_avg": -0.014947709627449512,
"loss/value_avg": 0.0,
"lr": 2.670076726342711e-06,
"objective/entropy": 1.7985560894012451,
"objective/kl": 12.856376647949219,
"objective/non_score_reward": -1.285637617111206,
"objective/rlhf_reward": -0.03251491114497185,
"objective/scores": 1.25,
"policy/approxkl_avg": 0.4516296982765198,
"policy/clipfrac_avg": 0.3671875,
"policy/entropy_avg": 0.031859397888183594,
"step": 220,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 13,
"val/ratio": 1.0009992122650146,
"val/ratio_var": 4.1194460209226236e-05
},
{
"episode": 11520,
"epoch": 0.15800951897623,
"eps": 5,
"loss/policy_avg": -0.019819077104330063,
"loss/value_avg": 0.0,
"lr": 2.6624040920716113e-06,
"objective/entropy": 1.5284242630004883,
"objective/kl": 14.283391952514648,
"objective/non_score_reward": -1.4283392429351807,
"objective/rlhf_reward": -0.014965277165174484,
"objective/scores": 1.4140625,
"policy/approxkl_avg": 1.5518393516540527,
"policy/clipfrac_avg": 0.2744140625,
"policy/entropy_avg": 0.026048660278320312,
"step": 225,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 8,
"val/ratio": 1.0007925033569336,
"val/ratio_var": 3.721785105881281e-05
},
{
"episode": 11776,
"epoch": 0.1615208416201462,
"eps": 5,
"loss/policy_avg": -0.015632648020982742,
"loss/value_avg": 0.0,
"lr": 2.6547314578005116e-06,
"objective/entropy": 1.5101430416107178,
"objective/kl": 13.435927391052246,
"objective/non_score_reward": -1.3435927629470825,
"objective/rlhf_reward": 0.017792798578739166,
"objective/scores": 1.359375,
"policy/approxkl_avg": 0.22922199964523315,
"policy/clipfrac_avg": 0.271484375,
"policy/entropy_avg": 0.025536060333251953,
"step": 230,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 11,
"val/ratio": 1.0355898141860962,
"val/ratio_var": 0.09009659290313721
},
{
"episode": 12032,
"epoch": 0.16503216426406245,
"eps": 5,
"loss/policy_avg": -0.014460040256381035,
"loss/value_avg": 0.0,
"lr": 2.647058823529412e-06,
"objective/entropy": 1.412046194076538,
"objective/kl": 13.981653213500977,
"objective/non_score_reward": -1.3981653451919556,
"objective/rlhf_reward": -0.19434592127799988,
"objective/scores": 1.203125,
"policy/approxkl_avg": 0.48358476161956787,
"policy/clipfrac_avg": 0.287109375,
"policy/entropy_avg": 0.027116775512695312,
"step": 235,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 19,
"val/ratio": 1.0002431869506836,
"val/ratio_var": 1.317531587119447e-05
},
{
"episode": 12288,
"epoch": 0.16854348690797866,
"eps": 5,
"loss/policy_avg": -0.0144148338586092,
"loss/value_avg": 0.0,
"lr": 2.6393861892583123e-06,
"objective/entropy": 1.5728825330734253,
"objective/kl": 13.091099739074707,
"objective/non_score_reward": -1.3091099262237549,
"objective/rlhf_reward": -0.12438549101352692,
"objective/scores": 1.1875,
"policy/approxkl_avg": 0.5084937810897827,
"policy/clipfrac_avg": 0.2587890625,
"policy/entropy_avg": 0.028881072998046875,
"step": 240,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 14,
"val/ratio": 1.004534125328064,
"val/ratio_var": 0.001037073670886457
},
{
"episode": 12544,
"epoch": 0.17205480955189487,
"eps": 5,
"loss/policy_avg": -0.02535724639892578,
"loss/value_avg": 0.0,
"lr": 2.6317135549872122e-06,
"objective/entropy": 1.6895666122436523,
"objective/kl": 13.036446571350098,
"objective/non_score_reward": -1.3036446571350098,
"objective/rlhf_reward": -0.08131173253059387,
"objective/scores": 1.21875,
"policy/approxkl_avg": 1.397173285484314,
"policy/clipfrac_avg": 0.296875,
"policy/entropy_avg": 0.025877952575683594,
"step": 245,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 11,
"val/ratio": 0.9993641972541809,
"val/ratio_var": 3.554957584128715e-05
},
{
"episode": 12800,
"epoch": 0.1755661321958111,
"eps": 5,
"loss/policy_avg": -0.013989413157105446,
"loss/value_avg": 0.0,
"lr": 2.6240409207161126e-06,
"objective/entropy": 1.4321318864822388,
"objective/kl": 13.751260757446289,
"objective/non_score_reward": -1.3751261234283447,
"objective/rlhf_reward": 0.024946460500359535,
"objective/scores": 1.3984375,
"policy/approxkl_avg": 0.3265579044818878,
"policy/clipfrac_avg": 0.3095703125,
"policy/entropy_avg": 0.02507495880126953,
"step": 250,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 14,
"val/ratio": 1.0016669034957886,
"val/ratio_var": 3.951354665332474e-05
},
{
"episode": 13056,
"epoch": 0.1790774548397273,
"eps": 5,
"loss/policy_avg": -0.01614242233335972,
"loss/value_avg": 0.0,
"lr": 2.616368286445013e-06,
"objective/entropy": 1.2477443218231201,
"objective/kl": 14.385757446289062,
"objective/non_score_reward": -1.4385757446289062,
"objective/rlhf_reward": -0.048571567982435226,
"objective/scores": 1.390625,
"policy/approxkl_avg": 0.38643181324005127,
"policy/clipfrac_avg": 0.33203125,
"policy/entropy_avg": 0.02417755126953125,
"step": 255,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 19,
"val/ratio": 1.0020815134048462,
"val/ratio_var": 0.00012469623470678926
},
{
"episode": 13312,
"epoch": 0.18258877748364355,
"eps": 5,
"loss/policy_avg": -0.013632966205477715,
"loss/value_avg": 0.0,
"lr": 2.6086956521739132e-06,
"objective/entropy": 1.5228471755981445,
"objective/kl": 14.862211227416992,
"objective/non_score_reward": -1.486221194267273,
"objective/rlhf_reward": -0.07545565813779831,
"objective/scores": 1.40625,
"policy/approxkl_avg": 2.011383056640625,
"policy/clipfrac_avg": 0.3359375,
"policy/entropy_avg": 0.027433395385742188,
"step": 260,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 17,
"val/ratio": 0.9998003244400024,
"val/ratio_var": 2.738163857429754e-05
},
{
"episode": 13568,
"epoch": 0.18610010012755976,
"eps": 5,
"loss/policy_avg": -0.020372817292809486,
"loss/value_avg": 0.0,
"lr": 2.6010230179028136e-06,
"objective/entropy": 1.633180856704712,
"objective/kl": 14.094629287719727,
"objective/non_score_reward": -1.4094629287719727,
"objective/rlhf_reward": -0.015713702887296677,
"objective/scores": 1.390625,
"policy/approxkl_avg": 0.47778478264808655,
"policy/clipfrac_avg": 0.3701171875,
"policy/entropy_avg": 0.02643442153930664,
"step": 265,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 14,
"val/ratio": 1.0079278945922852,
"val/ratio_var": 0.0023215855471789837
},
{
"episode": 13824,
"epoch": 0.189611422771476,
"eps": 5,
"loss/policy_avg": -0.01413625106215477,
"loss/value_avg": 0.0,
"lr": 2.5933503836317135e-06,
"objective/entropy": 1.2899070978164673,
"objective/kl": 14.59975528717041,
"objective/non_score_reward": -1.4599756002426147,
"objective/rlhf_reward": -0.09675531834363937,
"objective/scores": 1.359375,
"policy/approxkl_avg": 0.4568091630935669,
"policy/clipfrac_avg": 0.3076171875,
"policy/entropy_avg": 0.02667713165283203,
"step": 270,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 19,
"val/ratio": 1.003631830215454,
"val/ratio_var": 0.00041694415267556906
},
{
"episode": 14080,
"epoch": 0.1931227454153922,
"eps": 5,
"loss/policy_avg": -0.018304049968719482,
"loss/value_avg": 0.0,
"lr": 2.585677749360614e-06,
"objective/entropy": 1.3464603424072266,
"objective/kl": 14.915502548217773,
"objective/non_score_reward": -1.491550326347351,
"objective/rlhf_reward": -0.013601185753941536,
"objective/scores": 1.4765625,
"policy/approxkl_avg": 0.5154660940170288,
"policy/clipfrac_avg": 0.3251953125,
"policy/entropy_avg": 0.024927139282226562,
"step": 275,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 20,
"val/ratio": 1.0014865398406982,
"val/ratio_var": 8.263294148491696e-05
},
{
"episode": 14336,
"epoch": 0.19663406805930844,
"eps": 5,
"loss/policy_avg": -0.009162629023194313,
"loss/value_avg": 0.0,
"lr": 2.578005115089514e-06,
"objective/entropy": 1.3251242637634277,
"objective/kl": 14.600137710571289,
"objective/non_score_reward": -1.460013747215271,
"objective/rlhf_reward": -0.10571230947971344,
"objective/scores": 1.3515625,
"policy/approxkl_avg": 0.4187917411327362,
"policy/clipfrac_avg": 0.3046875,
"policy/entropy_avg": 0.023657798767089844,
"step": 280,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 16,
"val/ratio": 1.008554458618164,
"val/ratio_var": 0.001467635971494019
},
{
"episode": 14592,
"epoch": 0.20014539070322465,
"eps": 5,
"loss/policy_avg": -0.01609072834253311,
"loss/value_avg": 0.0,
"lr": 2.5703324808184145e-06,
"objective/entropy": 1.3078004121780396,
"objective/kl": 14.999523162841797,
"objective/non_score_reward": -1.4999523162841797,
"objective/rlhf_reward": -0.15238332748413086,
"objective/scores": 1.3515625,
"policy/approxkl_avg": 0.3968128561973572,
"policy/clipfrac_avg": 0.36328125,
"policy/entropy_avg": 0.023943424224853516,
"step": 285,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 12,
"val/ratio": 1.0019521713256836,
"val/ratio_var": 8.228437945945188e-05
},
{
"episode": 14848,
"epoch": 0.2036567133471409,
"eps": 5,
"loss/policy_avg": -0.014186807908117771,
"loss/value_avg": 0.0,
"lr": 2.562659846547315e-06,
"objective/entropy": 1.2583755254745483,
"objective/kl": 15.623100280761719,
"objective/non_score_reward": -1.5623100996017456,
"objective/rlhf_reward": -0.09625323116779327,
"objective/scores": 1.46875,
"policy/approxkl_avg": 0.5678977370262146,
"policy/clipfrac_avg": 0.3076171875,
"policy/entropy_avg": 0.024990558624267578,
"step": 290,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 13,
"val/ratio": 1.0014848709106445,
"val/ratio_var": 4.219371476210654e-05
},
{
"episode": 15104,
"epoch": 0.2071680359910571,
"eps": 5,
"loss/policy_avg": -0.013804701156914234,
"loss/value_avg": 0.0,
"lr": 2.5549872122762148e-06,
"objective/entropy": 1.568720817565918,
"objective/kl": 14.687668800354004,
"objective/non_score_reward": -1.4687669277191162,
"objective/rlhf_reward": -0.17009752988815308,
"objective/scores": 1.296875,
"policy/approxkl_avg": 0.3046334981918335,
"policy/clipfrac_avg": 0.26953125,
"policy/entropy_avg": 0.027862548828125,
"step": 295,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 16,
"val/ratio": 1.0005717277526855,
"val/ratio_var": 1.1324932529532816e-05
},
{
"episode": 15360,
"epoch": 0.21067935863497333,
"eps": 5,
"loss/policy_avg": -0.018133502453565598,
"loss/value_avg": 0.0,
"lr": 2.547314578005115e-06,
"objective/entropy": 1.2987349033355713,
"objective/kl": 13.89183235168457,
"objective/non_score_reward": -1.3891831636428833,
"objective/rlhf_reward": -0.12500587105751038,
"objective/scores": 1.265625,
"policy/approxkl_avg": 0.31936001777648926,
"policy/clipfrac_avg": 0.33984375,
"policy/entropy_avg": 0.02469015121459961,
"step": 300,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 18,
"val/ratio": 1.0020601749420166,
"val/ratio_var": 3.1293042411562055e-05
},
{
"episode": 15616,
"epoch": 0.21419068127888954,
"eps": 5,
"loss/policy_avg": -0.01710616797208786,
"loss/value_avg": 0.0,
"lr": 2.5396419437340155e-06,
"objective/entropy": 1.4288297891616821,
"objective/kl": 14.952780723571777,
"objective/non_score_reward": -1.4952781200408936,
"objective/rlhf_reward": -0.15792769193649292,
"objective/scores": 1.3359375,
"policy/approxkl_avg": 0.6461950540542603,
"policy/clipfrac_avg": 0.3125,
"policy/entropy_avg": 0.02637958526611328,
"step": 305,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 19,
"val/ratio": 1.0013606548309326,
"val/ratio_var": 4.203971548122354e-05
},
{
"episode": 15872,
"epoch": 0.21770200392280575,
"eps": 5,
"loss/policy_avg": -0.016128187999129295,
"loss/value_avg": 0.0,
"lr": 2.531969309462916e-06,
"objective/entropy": 1.3288850784301758,
"objective/kl": 15.583921432495117,
"objective/non_score_reward": -1.55839204788208,
"objective/rlhf_reward": -0.07665687799453735,
"objective/scores": 1.484375,
"policy/approxkl_avg": 0.3284182548522949,
"policy/clipfrac_avg": 0.328125,
"policy/entropy_avg": 0.02404165267944336,
"step": 310,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 16,
"val/ratio": 1.0064442157745361,
"val/ratio_var": 0.0015344778075814247
},
{
"episode": 16128,
"epoch": 0.221213326566722,
"eps": 5,
"loss/policy_avg": -0.015278931707143784,
"loss/value_avg": 0.0,
"lr": 2.524296675191816e-06,
"objective/entropy": 1.3236112594604492,
"objective/kl": 14.773448944091797,
"objective/non_score_reward": -1.4773449897766113,
"objective/rlhf_reward": -0.08708612620830536,
"objective/scores": 1.390625,
"policy/approxkl_avg": 0.2980467975139618,
"policy/clipfrac_avg": 0.34765625,
"policy/entropy_avg": 0.02494335174560547,
"step": 315,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 7,
"val/ratio": 1.0008249282836914,
"val/ratio_var": 1.927867742779199e-05
},
{
"episode": 16384,
"epoch": 0.2247246492106382,
"eps": 5,
"loss/policy_avg": -0.020951703190803528,
"loss/value_avg": 0.0,
"lr": 2.516624040920716e-06,
"objective/entropy": 1.2670817375183105,
"objective/kl": 14.8348970413208,
"objective/non_score_reward": -1.483489751815796,
"objective/rlhf_reward": 0.03382519632577896,
"objective/scores": 1.515625,
"policy/approxkl_avg": 1.0973663330078125,
"policy/clipfrac_avg": 0.361328125,
"policy/entropy_avg": 0.02091073989868164,
"step": 320,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 15,
"val/ratio": 1.003138542175293,
"val/ratio_var": 0.0001205340595333837
},
{
"episode": 16640,
"epoch": 0.22823597185455444,
"eps": 5,
"loss/policy_avg": -0.006676271557807922,
"loss/value_avg": 0.0,
"lr": 2.5089514066496164e-06,
"objective/entropy": 1.191056728363037,
"objective/kl": 16.46404457092285,
"objective/non_score_reward": -1.646404504776001,
"objective/rlhf_reward": -0.14990828931331635,
"objective/scores": 1.5,
"policy/approxkl_avg": 0.31475046277046204,
"policy/clipfrac_avg": 0.2578125,
"policy/entropy_avg": 0.021608352661132812,
"step": 325,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 25,
"val/ratio": 1.000575065612793,
"val/ratio_var": 1.735856494633481e-05
},
{
"episode": 16896,
"epoch": 0.23174729449847065,
"eps": 5,
"loss/policy_avg": -0.02127978205680847,
"loss/value_avg": 0.0,
"lr": 2.5012787723785167e-06,
"objective/entropy": 1.490570068359375,
"objective/kl": 16.22044563293457,
"objective/non_score_reward": -1.622044563293457,
"objective/rlhf_reward": -0.015120631083846092,
"objective/scores": 1.609375,
"policy/approxkl_avg": 2.5871665477752686,
"policy/clipfrac_avg": 0.35546875,
"policy/entropy_avg": 0.027353286743164062,
"step": 330,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 25,
"val/ratio": 1.003063440322876,
"val/ratio_var": 6.509448576252908e-05
},
{
"episode": 17152,
"epoch": 0.23525861714238688,
"eps": 5,
"loss/policy_avg": -0.013108542189002037,
"loss/value_avg": 0.0,
"lr": 2.493606138107417e-06,
"objective/entropy": 1.2718842029571533,
"objective/kl": 16.047882080078125,
"objective/non_score_reward": -1.604788064956665,
"objective/rlhf_reward": -0.12415145337581635,
"objective/scores": 1.484375,
"policy/approxkl_avg": 0.2758824825286865,
"policy/clipfrac_avg": 0.287109375,
"policy/entropy_avg": 0.024268627166748047,
"step": 335,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 16,
"val/ratio": 1.0036962032318115,
"val/ratio_var": 0.0003905180492438376
},
{
"episode": 17408,
"epoch": 0.2387699397863031,
"eps": 5,
"loss/policy_avg": -0.014837839640676975,
"loss/value_avg": 0.0,
"lr": 2.4859335038363174e-06,
"objective/entropy": 1.3406567573547363,
"objective/kl": 16.428348541259766,
"objective/non_score_reward": -1.642835021018982,
"objective/rlhf_reward": 0.018702151253819466,
"objective/scores": 1.6640625,
"policy/approxkl_avg": 0.192110076546669,
"policy/clipfrac_avg": 0.3388671875,
"policy/entropy_avg": 0.025295734405517578,
"step": 340,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 19,
"val/ratio": 1.0008959770202637,
"val/ratio_var": 1.205760781886056e-05
},
{
"episode": 17664,
"epoch": 0.24228126243021933,
"eps": 5,
"loss/policy_avg": -0.01899782381951809,
"loss/value_avg": 0.0,
"lr": 2.4782608695652173e-06,
"objective/entropy": 1.5880205631256104,
"objective/kl": 15.775943756103516,
"objective/non_score_reward": -1.577594518661499,
"objective/rlhf_reward": -0.08363974094390869,
"objective/scores": 1.4921875,
"policy/approxkl_avg": 0.9254180192947388,
"policy/clipfrac_avg": 0.3251953125,
"policy/entropy_avg": 0.024907588958740234,
"step": 345,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 15,
"val/ratio": 1.000749111175537,
"val/ratio_var": 2.8957187168998644e-05
},
{
"episode": 17920,
"epoch": 0.24579258507413554,
"eps": 5,
"loss/policy_avg": -0.014669202268123627,
"loss/value_avg": 0.0,
"lr": 2.4705882352941177e-06,
"objective/entropy": 1.2656543254852295,
"objective/kl": 16.034730911254883,
"objective/non_score_reward": -1.60347318649292,
"objective/rlhf_reward": -0.011744961142539978,
"objective/scores": 1.59375,
"policy/approxkl_avg": 1.004683017730713,
"policy/clipfrac_avg": 0.2646484375,
"policy/entropy_avg": 0.023741722106933594,
"step": 350,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 19,
"val/ratio": 1.0009608268737793,
"val/ratio_var": 4.649764014175162e-05
}
],
"logging_steps": 100,
"max_steps": 391,
"num_input_tokens_seen": 0,
"num_train_epochs": 1.3716104077797742,
"save_steps": 50,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0,
"train_batch_size": null,
"trial_name": null,
"trial_params": null
}