|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"episode": 17920, |
|
"epoch": 0.24579258507413554, |
|
"eval_steps": 200.0, |
|
"global_step": 350, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"episode": 256, |
|
"epoch": 0.003511322643916222, |
|
"eps": 6, |
|
"loss/policy_avg": -0.07090990990400314, |
|
"loss/value_avg": 0.0, |
|
"lr": 3e-06, |
|
"objective/entropy": 49.42120361328125, |
|
"objective/kl": 0.006465356796979904, |
|
"objective/non_score_reward": -0.000646535714622587, |
|
"objective/rlhf_reward": -1.1137903928756714, |
|
"objective/scores": -1.109375, |
|
"policy/approxkl_avg": 27.096786499023438, |
|
"policy/clipfrac_avg": 0.732421875, |
|
"policy/entropy_avg": 0.92181396484375, |
|
"step": 5, |
|
"val/clipfrac_avg": 0.0, |
|
"val/num_eos_tokens": 12, |
|
"val/ratio": 1.0399832725524902, |
|
"val/ratio_var": 0.010045886039733887 |
|
}, |
|
{ |
|
"episode": 512, |
|
"epoch": 0.007022645287832444, |
|
"eps": 6, |
|
"loss/policy_avg": -0.06497187167406082, |
|
"loss/value_avg": 0.0, |
|
"lr": 2.9923273657289e-06, |
|
"objective/entropy": 48.286014556884766, |
|
"objective/kl": 0.8119473457336426, |
|
"objective/non_score_reward": -0.08119472861289978, |
|
"objective/rlhf_reward": -1.266162633895874, |
|
"objective/scores": -1.1875, |
|
"policy/approxkl_avg": 18.666072845458984, |
|
"policy/clipfrac_avg": 0.7314453125, |
|
"policy/entropy_avg": 0.912261962890625, |
|
"step": 10, |
|
"val/clipfrac_avg": 0.0, |
|
"val/num_eos_tokens": 16, |
|
"val/ratio": 1.020957112312317, |
|
"val/ratio_var": 0.00411860179156065 |
|
}, |
|
{ |
|
"episode": 768, |
|
"epoch": 0.010533967931748666, |
|
"eps": 6, |
|
"loss/policy_avg": -0.0872286781668663, |
|
"loss/value_avg": 0.0, |
|
"lr": 2.9846547314578008e-06, |
|
"objective/entropy": 49.34376525878906, |
|
"objective/kl": 1.9591996669769287, |
|
"objective/non_score_reward": -0.1959199756383896, |
|
"objective/rlhf_reward": -1.2858657836914062, |
|
"objective/scores": -1.09375, |
|
"policy/approxkl_avg": 20.772502899169922, |
|
"policy/clipfrac_avg": 0.73828125, |
|
"policy/entropy_avg": 0.927978515625, |
|
"step": 15, |
|
"val/clipfrac_avg": 0.0, |
|
"val/num_eos_tokens": 12, |
|
"val/ratio": 1.0191609859466553, |
|
"val/ratio_var": 0.00307083735242486 |
|
}, |
|
{ |
|
"episode": 1024, |
|
"epoch": 0.014045290575664887, |
|
"eps": 6, |
|
"loss/policy_avg": -0.07566041499376297, |
|
"loss/value_avg": 0.0, |
|
"lr": 2.9769820971867007e-06, |
|
"objective/entropy": 53.13662338256836, |
|
"objective/kl": 2.4811532497406006, |
|
"objective/non_score_reward": -0.24811533093452454, |
|
"objective/rlhf_reward": -1.2548893690109253, |
|
"objective/scores": -1.0078125, |
|
"policy/approxkl_avg": 20.665164947509766, |
|
"policy/clipfrac_avg": 0.7314453125, |
|
"policy/entropy_avg": 0.989776611328125, |
|
"step": 20, |
|
"val/clipfrac_avg": 0.0, |
|
"val/num_eos_tokens": 11, |
|
"val/ratio": 1.011010766029358, |
|
"val/ratio_var": 0.004201602190732956 |
|
}, |
|
{ |
|
"episode": 1280, |
|
"epoch": 0.01755661321958111, |
|
"eps": 6, |
|
"loss/policy_avg": -0.08593496680259705, |
|
"loss/value_avg": 0.0, |
|
"lr": 2.9693094629156014e-06, |
|
"objective/entropy": 53.72633743286133, |
|
"objective/kl": 3.3111624717712402, |
|
"objective/non_score_reward": -0.3311161994934082, |
|
"objective/rlhf_reward": -1.339456558227539, |
|
"objective/scores": -1.0078125, |
|
"policy/approxkl_avg": 25.559288024902344, |
|
"policy/clipfrac_avg": 0.7353515625, |
|
"policy/entropy_avg": 0.997894287109375, |
|
"step": 25, |
|
"val/clipfrac_avg": 0.0, |
|
"val/num_eos_tokens": 13, |
|
"val/ratio": 1.0134021043777466, |
|
"val/ratio_var": 0.0019979747012257576 |
|
}, |
|
{ |
|
"episode": 1536, |
|
"epoch": 0.021067935863497332, |
|
"eps": 6, |
|
"loss/policy_avg": -0.09734417498111725, |
|
"loss/value_avg": 0.0, |
|
"lr": 2.9616368286445014e-06, |
|
"objective/entropy": 51.259735107421875, |
|
"objective/kl": 5.089182376861572, |
|
"objective/non_score_reward": -0.5089181661605835, |
|
"objective/rlhf_reward": -1.2202520370483398, |
|
"objective/scores": -0.7109375, |
|
"policy/approxkl_avg": 29.841636657714844, |
|
"policy/clipfrac_avg": 0.736328125, |
|
"policy/entropy_avg": 0.960479736328125, |
|
"step": 30, |
|
"val/clipfrac_avg": 0.0, |
|
"val/num_eos_tokens": 26, |
|
"val/ratio": 1.0178756713867188, |
|
"val/ratio_var": 0.009866585955023766 |
|
}, |
|
{ |
|
"episode": 1792, |
|
"epoch": 0.024579258507413555, |
|
"eps": 6, |
|
"loss/policy_avg": -0.06831618398427963, |
|
"loss/value_avg": 0.0, |
|
"lr": 2.9539641943734013e-06, |
|
"objective/entropy": 40.643272399902344, |
|
"objective/kl": 6.974010944366455, |
|
"objective/non_score_reward": -0.6974011063575745, |
|
"objective/rlhf_reward": -1.2684605121612549, |
|
"objective/scores": -0.5703125, |
|
"policy/approxkl_avg": 35.33942413330078, |
|
"policy/clipfrac_avg": 0.6982421875, |
|
"policy/entropy_avg": 0.7505035400390625, |
|
"step": 35, |
|
"val/clipfrac_avg": 0.0, |
|
"val/num_eos_tokens": 16, |
|
"val/ratio": 1.00449800491333, |
|
"val/ratio_var": 0.0022142010275274515 |
|
}, |
|
{ |
|
"episode": 2048, |
|
"epoch": 0.028090581151329775, |
|
"eps": 6, |
|
"loss/policy_avg": -0.04068079590797424, |
|
"loss/value_avg": 0.0, |
|
"lr": 2.946291560102302e-06, |
|
"objective/entropy": 23.142562866210938, |
|
"objective/kl": 8.180486679077148, |
|
"objective/non_score_reward": -0.8180487155914307, |
|
"objective/rlhf_reward": -1.0729957818984985, |
|
"objective/scores": -0.255859375, |
|
"policy/approxkl_avg": 23.68307876586914, |
|
"policy/clipfrac_avg": 0.5859375, |
|
"policy/entropy_avg": 0.4361400604248047, |
|
"step": 40, |
|
"val/clipfrac_avg": 0.0, |
|
"val/num_eos_tokens": 8, |
|
"val/ratio": 1.0077030658721924, |
|
"val/ratio_var": 0.0024766812566667795 |
|
}, |
|
{ |
|
"episode": 2304, |
|
"epoch": 0.031601903795246, |
|
"eps": 6, |
|
"loss/policy_avg": -0.07307010889053345, |
|
"loss/value_avg": 0.0, |
|
"lr": 2.938618925831202e-06, |
|
"objective/entropy": 19.376842498779297, |
|
"objective/kl": 8.770210266113281, |
|
"objective/non_score_reward": -0.8770210146903992, |
|
"objective/rlhf_reward": -1.0002652406692505, |
|
"objective/scores": -0.12353515625, |
|
"policy/approxkl_avg": 31.00873565673828, |
|
"policy/clipfrac_avg": 0.5302734375, |
|
"policy/entropy_avg": 0.33237457275390625, |
|
"step": 45, |
|
"val/clipfrac_avg": 0.0, |
|
"val/num_eos_tokens": 20, |
|
"val/ratio": 0.996111273765564, |
|
"val/ratio_var": 0.001100091845728457 |
|
}, |
|
{ |
|
"episode": 2560, |
|
"epoch": 0.03511322643916222, |
|
"eps": 6, |
|
"loss/policy_avg": -0.04584116116166115, |
|
"loss/value_avg": 0.0, |
|
"lr": 2.9309462915601027e-06, |
|
"objective/entropy": 11.984097480773926, |
|
"objective/kl": 8.4966402053833, |
|
"objective/non_score_reward": -0.849664032459259, |
|
"objective/rlhf_reward": -0.8017911911010742, |
|
"objective/scores": 0.0478515625, |
|
"policy/approxkl_avg": 22.561037063598633, |
|
"policy/clipfrac_avg": 0.451171875, |
|
"policy/entropy_avg": 0.19393539428710938, |
|
"step": 50, |
|
"val/clipfrac_avg": 0.0, |
|
"val/num_eos_tokens": 20, |
|
"val/ratio": 0.9952375888824463, |
|
"val/ratio_var": 0.000761833623982966 |
|
}, |
|
{ |
|
"episode": 2816, |
|
"epoch": 0.03862454908307844, |
|
"eps": 5, |
|
"loss/policy_avg": -0.029720915481448174, |
|
"loss/value_avg": 0.0, |
|
"lr": 2.9232736572890026e-06, |
|
"objective/entropy": 4.9489898681640625, |
|
"objective/kl": 8.733837127685547, |
|
"objective/non_score_reward": -0.8733837604522705, |
|
"objective/rlhf_reward": -0.7492713928222656, |
|
"objective/scores": 0.1240234375, |
|
"policy/approxkl_avg": 16.253189086914062, |
|
"policy/clipfrac_avg": 0.341796875, |
|
"policy/entropy_avg": 0.07728099822998047, |
|
"step": 55, |
|
"val/clipfrac_avg": 0.0, |
|
"val/num_eos_tokens": 18, |
|
"val/ratio": 0.9972053170204163, |
|
"val/ratio_var": 0.00032430028659291565 |
|
}, |
|
{ |
|
"episode": 3072, |
|
"epoch": 0.042135871726994664, |
|
"eps": 5, |
|
"loss/policy_avg": -0.01298562902957201, |
|
"loss/value_avg": 0.0, |
|
"lr": 2.9156010230179026e-06, |
|
"objective/entropy": 1.3101667165756226, |
|
"objective/kl": 8.699792861938477, |
|
"objective/non_score_reward": -0.8699792623519897, |
|
"objective/rlhf_reward": -0.5752952098846436, |
|
"objective/scores": 0.294921875, |
|
"policy/approxkl_avg": 2.27925968170166, |
|
"policy/clipfrac_avg": 0.236328125, |
|
"policy/entropy_avg": 0.02513742446899414, |
|
"step": 60, |
|
"val/clipfrac_avg": 0.0, |
|
"val/num_eos_tokens": 20, |
|
"val/ratio": 1.0017118453979492, |
|
"val/ratio_var": 0.00016639505338389426 |
|
}, |
|
{ |
|
"episode": 3328, |
|
"epoch": 0.04564719437091089, |
|
"eps": 5, |
|
"loss/policy_avg": -0.02618303708732128, |
|
"loss/value_avg": 0.0, |
|
"lr": 2.9079283887468033e-06, |
|
"objective/entropy": 2.3685269355773926, |
|
"objective/kl": 9.208517074584961, |
|
"objective/non_score_reward": -0.9208516478538513, |
|
"objective/rlhf_reward": -0.5182289481163025, |
|
"objective/scores": 0.40234375, |
|
"policy/approxkl_avg": 2.6189699172973633, |
|
"policy/clipfrac_avg": 0.310546875, |
|
"policy/entropy_avg": 0.04020071029663086, |
|
"step": 65, |
|
"val/clipfrac_avg": 0.0, |
|
"val/num_eos_tokens": 20, |
|
"val/ratio": 1.003983497619629, |
|
"val/ratio_var": 0.0009448421187698841 |
|
}, |
|
{ |
|
"episode": 3584, |
|
"epoch": 0.04915851701482711, |
|
"eps": 5, |
|
"loss/policy_avg": -0.02327096462249756, |
|
"loss/value_avg": 0.0, |
|
"lr": 2.9002557544757032e-06, |
|
"objective/entropy": 2.0416018962860107, |
|
"objective/kl": 9.701976776123047, |
|
"objective/non_score_reward": -0.9701976776123047, |
|
"objective/rlhf_reward": -0.49486449360847473, |
|
"objective/scores": 0.474609375, |
|
"policy/approxkl_avg": 1.271956443786621, |
|
"policy/clipfrac_avg": 0.2734375, |
|
"policy/entropy_avg": 0.041253089904785156, |
|
"step": 70, |
|
"val/clipfrac_avg": 0.0, |
|
"val/num_eos_tokens": 16, |
|
"val/ratio": 1.0039558410644531, |
|
"val/ratio_var": 0.00041477559716440737 |
|
}, |
|
{ |
|
"episode": 3840, |
|
"epoch": 0.052669839658743334, |
|
"eps": 5, |
|
"loss/policy_avg": -0.033096276223659515, |
|
"loss/value_avg": 0.0, |
|
"lr": 2.892583120204604e-06, |
|
"objective/entropy": 2.7795495986938477, |
|
"objective/kl": 10.028523445129395, |
|
"objective/non_score_reward": -1.0028523206710815, |
|
"objective/rlhf_reward": -0.46555712819099426, |
|
"objective/scores": 0.5390625, |
|
"policy/approxkl_avg": 3.055203676223755, |
|
"policy/clipfrac_avg": 0.3427734375, |
|
"policy/entropy_avg": 0.053270816802978516, |
|
"step": 75, |
|
"val/clipfrac_avg": 0.0, |
|
"val/num_eos_tokens": 23, |
|
"val/ratio": 1.0012407302856445, |
|
"val/ratio_var": 0.00011274257121840492 |
|
}, |
|
{ |
|
"episode": 4096, |
|
"epoch": 0.05618116230265955, |
|
"eps": 5, |
|
"loss/policy_avg": -0.01961323618888855, |
|
"loss/value_avg": 0.0, |
|
"lr": 2.884910485933504e-06, |
|
"objective/entropy": 2.5525641441345215, |
|
"objective/kl": 10.111019134521484, |
|
"objective/non_score_reward": -1.0111019611358643, |
|
"objective/rlhf_reward": -0.510233461856842, |
|
"objective/scores": 0.5, |
|
"policy/approxkl_avg": 1.331697940826416, |
|
"policy/clipfrac_avg": 0.2861328125, |
|
"policy/entropy_avg": 0.048857688903808594, |
|
"step": 80, |
|
"val/clipfrac_avg": 0.0, |
|
"val/num_eos_tokens": 25, |
|
"val/ratio": 1.011049509048462, |
|
"val/ratio_var": 0.004252108279615641 |
|
}, |
|
{ |
|
"episode": 4352, |
|
"epoch": 0.05969248494657577, |
|
"eps": 5, |
|
"loss/policy_avg": -0.009127877652645111, |
|
"loss/value_avg": 0.0, |
|
"lr": 2.877237851662404e-06, |
|
"objective/entropy": 3.016789674758911, |
|
"objective/kl": 11.257818222045898, |
|
"objective/non_score_reward": -1.125781774520874, |
|
"objective/rlhf_reward": -0.4276960492134094, |
|
"objective/scores": 0.69921875, |
|
"policy/approxkl_avg": 1.4772686958312988, |
|
"policy/clipfrac_avg": 0.35546875, |
|
"policy/entropy_avg": 0.053719520568847656, |
|
"step": 85, |
|
"val/clipfrac_avg": 0.0, |
|
"val/num_eos_tokens": 6, |
|
"val/ratio": 1.0042904615402222, |
|
"val/ratio_var": 0.0008556774700991809 |
|
}, |
|
{ |
|
"episode": 4608, |
|
"epoch": 0.063203807590492, |
|
"eps": 5, |
|
"loss/policy_avg": -0.025049656629562378, |
|
"loss/value_avg": 0.0, |
|
"lr": 2.8695652173913046e-06, |
|
"objective/entropy": 2.5907459259033203, |
|
"objective/kl": 10.457273483276367, |
|
"objective/non_score_reward": -1.0457274913787842, |
|
"objective/rlhf_reward": -0.3816419839859009, |
|
"objective/scores": 0.6640625, |
|
"policy/approxkl_avg": 2.3460922241210938, |
|
"policy/clipfrac_avg": 0.322265625, |
|
"policy/entropy_avg": 0.04626178741455078, |
|
"step": 90, |
|
"val/clipfrac_avg": 0.0, |
|
"val/num_eos_tokens": 11, |
|
"val/ratio": 1.0003862380981445, |
|
"val/ratio_var": 7.93520302977413e-05 |
|
}, |
|
{ |
|
"episode": 4864, |
|
"epoch": 0.06671513023440821, |
|
"eps": 5, |
|
"loss/policy_avg": -0.01828361675143242, |
|
"loss/value_avg": 0.0, |
|
"lr": 2.8618925831202045e-06, |
|
"objective/entropy": 2.397810220718384, |
|
"objective/kl": 10.732559204101562, |
|
"objective/non_score_reward": -1.073256015777588, |
|
"objective/rlhf_reward": -0.35966813564300537, |
|
"objective/scores": 0.71484375, |
|
"policy/approxkl_avg": 1.1093428134918213, |
|
"policy/clipfrac_avg": 0.32421875, |
|
"policy/entropy_avg": 0.041881561279296875, |
|
"step": 95, |
|
"val/clipfrac_avg": 0.0, |
|
"val/num_eos_tokens": 15, |
|
"val/ratio": 1.0054664611816406, |
|
"val/ratio_var": 0.0017973663052543998 |
|
}, |
|
{ |
|
"episode": 5120, |
|
"epoch": 0.07022645287832444, |
|
"eps": 5, |
|
"loss/policy_avg": -0.04088423401117325, |
|
"loss/value_avg": 0.0, |
|
"lr": 2.8542199488491053e-06, |
|
"objective/entropy": 2.343449592590332, |
|
"objective/kl": 11.780994415283203, |
|
"objective/non_score_reward": -1.1780993938446045, |
|
"objective/rlhf_reward": -0.4628324806690216, |
|
"objective/scores": 0.71484375, |
|
"policy/approxkl_avg": 0.894420325756073, |
|
"policy/clipfrac_avg": 0.46875, |
|
"policy/entropy_avg": 0.04486083984375, |
|
"step": 100, |
|
"val/clipfrac_avg": 0.0, |
|
"val/num_eos_tokens": 11, |
|
"val/ratio": 1.0009559392929077, |
|
"val/ratio_var": 4.804596756002866e-05 |
|
}, |
|
{ |
|
"episode": 5376, |
|
"epoch": 0.07373777552224066, |
|
"eps": 5, |
|
"loss/policy_avg": -0.020697183907032013, |
|
"loss/value_avg": 0.0, |
|
"lr": 2.846547314578005e-06, |
|
"objective/entropy": 1.9023351669311523, |
|
"objective/kl": 10.29288101196289, |
|
"objective/non_score_reward": -1.0292882919311523, |
|
"objective/rlhf_reward": -0.29047834873199463, |
|
"objective/scores": 0.73828125, |
|
"policy/approxkl_avg": 0.9143690466880798, |
|
"policy/clipfrac_avg": 0.373046875, |
|
"policy/entropy_avg": 0.028568267822265625, |
|
"step": 105, |
|
"val/clipfrac_avg": 0.0, |
|
"val/num_eos_tokens": 10, |
|
"val/ratio": 1.000715732574463, |
|
"val/ratio_var": 4.201457340968773e-05 |
|
}, |
|
{ |
|
"episode": 5632, |
|
"epoch": 0.07724909816615688, |
|
"eps": 5, |
|
"loss/policy_avg": -0.012633640319108963, |
|
"loss/value_avg": 0.0, |
|
"lr": 2.8388746803069055e-06, |
|
"objective/entropy": 1.3839142322540283, |
|
"objective/kl": 10.57151985168457, |
|
"objective/non_score_reward": -1.0571520328521729, |
|
"objective/rlhf_reward": -0.2935946583747864, |
|
"objective/scores": 0.765625, |
|
"policy/approxkl_avg": 0.6525547504425049, |
|
"policy/clipfrac_avg": 0.2646484375, |
|
"policy/entropy_avg": 0.0345916748046875, |
|
"step": 110, |
|
"val/clipfrac_avg": 0.0, |
|
"val/num_eos_tokens": 10, |
|
"val/ratio": 0.9999199509620667, |
|
"val/ratio_var": 2.6978697860613465e-05 |
|
}, |
|
{ |
|
"episode": 5888, |
|
"epoch": 0.0807604208100731, |
|
"eps": 5, |
|
"loss/policy_avg": -0.026668714359402657, |
|
"loss/value_avg": 0.0, |
|
"lr": 2.831202046035806e-06, |
|
"objective/entropy": 2.17741322517395, |
|
"objective/kl": 11.39688491821289, |
|
"objective/non_score_reward": -1.139688491821289, |
|
"objective/rlhf_reward": -0.3027456998825073, |
|
"objective/scores": 0.8359375, |
|
"policy/approxkl_avg": 8.829752922058105, |
|
"policy/clipfrac_avg": 0.35546875, |
|
"policy/entropy_avg": 0.034277915954589844, |
|
"step": 115, |
|
"val/clipfrac_avg": 0.0, |
|
"val/num_eos_tokens": 8, |
|
"val/ratio": 1.0012441873550415, |
|
"val/ratio_var": 9.009366476675496e-05 |
|
}, |
|
{ |
|
"episode": 6144, |
|
"epoch": 0.08427174345398933, |
|
"eps": 5, |
|
"loss/policy_avg": -0.011602860875427723, |
|
"loss/value_avg": 0.0, |
|
"lr": 2.823529411764706e-06, |
|
"objective/entropy": 1.418602466583252, |
|
"objective/kl": 10.246469497680664, |
|
"objective/non_score_reward": -1.0246469974517822, |
|
"objective/rlhf_reward": -0.22599510848522186, |
|
"objective/scores": 0.796875, |
|
"policy/approxkl_avg": 0.31790149211883545, |
|
"policy/clipfrac_avg": 0.2314453125, |
|
"policy/entropy_avg": 0.028847694396972656, |
|
"step": 120, |
|
"val/clipfrac_avg": 0.0, |
|
"val/num_eos_tokens": 9, |
|
"val/ratio": 1.0009679794311523, |
|
"val/ratio_var": 3.900106457876973e-05 |
|
}, |
|
{ |
|
"episode": 6400, |
|
"epoch": 0.08778306609790555, |
|
"eps": 5, |
|
"loss/policy_avg": -0.0157505851238966, |
|
"loss/value_avg": 0.0, |
|
"lr": 2.8158567774936066e-06, |
|
"objective/entropy": 1.936393141746521, |
|
"objective/kl": 10.550077438354492, |
|
"objective/non_score_reward": -1.0550076961517334, |
|
"objective/rlhf_reward": -0.252943217754364, |
|
"objective/scores": 0.80078125, |
|
"policy/approxkl_avg": 6.545133113861084, |
|
"policy/clipfrac_avg": 0.341796875, |
|
"policy/entropy_avg": 0.039971351623535156, |
|
"step": 125, |
|
"val/clipfrac_avg": 0.0, |
|
"val/num_eos_tokens": 12, |
|
"val/ratio": 1.0001187324523926, |
|
"val/ratio_var": 0.00011527155584190041 |
|
}, |
|
{ |
|
"episode": 6656, |
|
"epoch": 0.09129438874182177, |
|
"eps": 5, |
|
"loss/policy_avg": -0.00908716581761837, |
|
"loss/value_avg": 0.0, |
|
"lr": 2.8081841432225065e-06, |
|
"objective/entropy": 1.9167767763137817, |
|
"objective/kl": 10.831771850585938, |
|
"objective/non_score_reward": -1.0831772089004517, |
|
"objective/rlhf_reward": -0.24270595610141754, |
|
"objective/scores": 0.83984375, |
|
"policy/approxkl_avg": 13.507976531982422, |
|
"policy/clipfrac_avg": 0.25, |
|
"policy/entropy_avg": 0.034499168395996094, |
|
"step": 130, |
|
"val/clipfrac_avg": 0.0, |
|
"val/num_eos_tokens": 7, |
|
"val/ratio": 1.0004911422729492, |
|
"val/ratio_var": 0.00018595268193166703 |
|
}, |
|
{ |
|
"episode": 6912, |
|
"epoch": 0.094805711385738, |
|
"eps": 5, |
|
"loss/policy_avg": -0.017197387292981148, |
|
"loss/value_avg": 0.0, |
|
"lr": 2.800511508951407e-06, |
|
"objective/entropy": 1.7237651348114014, |
|
"objective/kl": 11.095592498779297, |
|
"objective/non_score_reward": -1.1095592975616455, |
|
"objective/rlhf_reward": -0.21057555079460144, |
|
"objective/scores": 0.8984375, |
|
"policy/approxkl_avg": 2.7560040950775146, |
|
"policy/clipfrac_avg": 0.2841796875, |
|
"policy/entropy_avg": 0.032952308654785156, |
|
"step": 135, |
|
"val/clipfrac_avg": 0.0, |
|
"val/num_eos_tokens": 2, |
|
"val/ratio": 0.9994020462036133, |
|
"val/ratio_var": 3.074964843108319e-05 |
|
}, |
|
{ |
|
"episode": 7168, |
|
"epoch": 0.09831703402965422, |
|
"eps": 5, |
|
"loss/policy_avg": -0.012010859325528145, |
|
"loss/value_avg": 0.0, |
|
"lr": 2.792838874680307e-06, |
|
"objective/entropy": 1.5862581729888916, |
|
"objective/kl": 10.674396514892578, |
|
"objective/non_score_reward": -1.0674396753311157, |
|
"objective/rlhf_reward": -0.14433012902736664, |
|
"objective/scores": 0.921875, |
|
"policy/approxkl_avg": 1.1186727285385132, |
|
"policy/clipfrac_avg": 0.2783203125, |
|
"policy/entropy_avg": 0.0295562744140625, |
|
"step": 140, |
|
"val/clipfrac_avg": 0.0, |
|
"val/num_eos_tokens": 13, |
|
"val/ratio": 1.0007727146148682, |
|
"val/ratio_var": 4.557183274300769e-05 |
|
}, |
|
{ |
|
"episode": 7424, |
|
"epoch": 0.10182835667357044, |
|
"eps": 5, |
|
"loss/policy_avg": -0.013728385791182518, |
|
"loss/value_avg": 0.0, |
|
"lr": 2.785166240409207e-06, |
|
"objective/entropy": 1.5388869047164917, |
|
"objective/kl": 10.359582901000977, |
|
"objective/non_score_reward": -1.035958170890808, |
|
"objective/rlhf_reward": -0.14511710405349731, |
|
"objective/scores": 0.890625, |
|
"policy/approxkl_avg": 0.5204602479934692, |
|
"policy/clipfrac_avg": 0.283203125, |
|
"policy/entropy_avg": 0.028924942016601562, |
|
"step": 145, |
|
"val/clipfrac_avg": 0.0, |
|
"val/num_eos_tokens": 14, |
|
"val/ratio": 1.056097149848938, |
|
"val/ratio_var": 0.13372056186199188 |
|
}, |
|
{ |
|
"episode": 7680, |
|
"epoch": 0.10533967931748667, |
|
"eps": 5, |
|
"loss/policy_avg": -0.014945434406399727, |
|
"loss/value_avg": 0.0, |
|
"lr": 2.7774936061381074e-06, |
|
"objective/entropy": 2.0769755840301514, |
|
"objective/kl": 11.147063255310059, |
|
"objective/non_score_reward": -1.11470627784729, |
|
"objective/rlhf_reward": -0.08940108120441437, |
|
"objective/scores": 1.0234375, |
|
"policy/approxkl_avg": 0.5961493253707886, |
|
"policy/clipfrac_avg": 0.3681640625, |
|
"policy/entropy_avg": 0.037804603576660156, |
|
"step": 150, |
|
"val/clipfrac_avg": 0.0, |
|
"val/num_eos_tokens": 13, |
|
"val/ratio": 1.0033739805221558, |
|
"val/ratio_var": 0.00030022990540601313 |
|
}, |
|
{ |
|
"episode": 7936, |
|
"epoch": 0.10885100196140288, |
|
"eps": 5, |
|
"loss/policy_avg": -0.02276831492781639, |
|
"loss/value_avg": 0.0, |
|
"lr": 2.7698209718670078e-06, |
|
"objective/entropy": 2.1412830352783203, |
|
"objective/kl": 11.697949409484863, |
|
"objective/non_score_reward": -1.169795036315918, |
|
"objective/rlhf_reward": -0.13582009077072144, |
|
"objective/scores": 1.03125, |
|
"policy/approxkl_avg": 0.7155288457870483, |
|
"policy/clipfrac_avg": 0.3193359375, |
|
"policy/entropy_avg": 0.037835121154785156, |
|
"step": 155, |
|
"val/clipfrac_avg": 0.0, |
|
"val/num_eos_tokens": 13, |
|
"val/ratio": 1.0014090538024902, |
|
"val/ratio_var": 5.2470270020421594e-05 |
|
}, |
|
{ |
|
"episode": 8192, |
|
"epoch": 0.1123623246053191, |
|
"eps": 5, |
|
"loss/policy_avg": -0.013076605275273323, |
|
"loss/value_avg": 0.0, |
|
"lr": 2.762148337595908e-06, |
|
"objective/entropy": 1.634714126586914, |
|
"objective/kl": 11.629154205322266, |
|
"objective/non_score_reward": -1.1629154682159424, |
|
"objective/rlhf_reward": -0.28488799929618835, |
|
"objective/scores": 0.87890625, |
|
"policy/approxkl_avg": 0.4181188941001892, |
|
"policy/clipfrac_avg": 0.3037109375, |
|
"policy/entropy_avg": 0.029273509979248047, |
|
"step": 160, |
|
"val/clipfrac_avg": 0.0, |
|
"val/num_eos_tokens": 15, |
|
"val/ratio": 1.0008339881896973, |
|
"val/ratio_var": 1.4662801731901709e-05 |
|
}, |
|
{ |
|
"episode": 8448, |
|
"epoch": 0.11587364724923532, |
|
"eps": 5, |
|
"loss/policy_avg": -0.01651182770729065, |
|
"loss/value_avg": 0.0, |
|
"lr": 2.7544757033248085e-06, |
|
"objective/entropy": 1.9540742635726929, |
|
"objective/kl": 11.4830322265625, |
|
"objective/non_score_reward": -1.1483032703399658, |
|
"objective/rlhf_reward": -0.05983233451843262, |
|
"objective/scores": 1.0859375, |
|
"policy/approxkl_avg": 18.791297912597656, |
|
"policy/clipfrac_avg": 0.2880859375, |
|
"policy/entropy_avg": 0.03601264953613281, |
|
"step": 165, |
|
"val/clipfrac_avg": 0.0, |
|
"val/num_eos_tokens": 16, |
|
"val/ratio": 1.0220942497253418, |
|
"val/ratio_var": 0.02208283357322216 |
|
}, |
|
{ |
|
"episode": 8704, |
|
"epoch": 0.11938496989315155, |
|
"eps": 5, |
|
"loss/policy_avg": -0.013821810483932495, |
|
"loss/value_avg": 0.0, |
|
"lr": 2.7468030690537084e-06, |
|
"objective/entropy": 1.6243339776992798, |
|
"objective/kl": 11.435280799865723, |
|
"objective/non_score_reward": -1.1435281038284302, |
|
"objective/rlhf_reward": -0.12443088740110397, |
|
"objective/scores": 1.015625, |
|
"policy/approxkl_avg": 0.29013216495513916, |
|
"policy/clipfrac_avg": 0.28125, |
|
"policy/entropy_avg": 0.03498649597167969, |
|
"step": 170, |
|
"val/clipfrac_avg": 0.0, |
|
"val/num_eos_tokens": 15, |
|
"val/ratio": 1.0027971267700195, |
|
"val/ratio_var": 0.0002298366161994636 |
|
}, |
|
{ |
|
"episode": 8960, |
|
"epoch": 0.12289629253706777, |
|
"eps": 5, |
|
"loss/policy_avg": -0.011003649793565273, |
|
"loss/value_avg": 0.0, |
|
"lr": 2.7391304347826087e-06, |
|
"objective/entropy": 2.000375986099243, |
|
"objective/kl": 11.78514575958252, |
|
"objective/non_score_reward": -1.1785145998001099, |
|
"objective/rlhf_reward": -0.2609584331512451, |
|
"objective/scores": 0.91796875, |
|
"policy/approxkl_avg": 0.8603074550628662, |
|
"policy/clipfrac_avg": 0.2998046875, |
|
"policy/entropy_avg": 0.034775733947753906, |
|
"step": 175, |
|
"val/clipfrac_avg": 0.0, |
|
"val/num_eos_tokens": 19, |
|
"val/ratio": 1.0012288093566895, |
|
"val/ratio_var": 3.532394111971371e-05 |
|
}, |
|
{ |
|
"episode": 9216, |
|
"epoch": 0.126407615180984, |
|
"eps": 5, |
|
"loss/policy_avg": -0.010885423980653286, |
|
"loss/value_avg": 0.0, |
|
"lr": 2.731457800511509e-06, |
|
"objective/entropy": 1.5240473747253418, |
|
"objective/kl": 12.420597076416016, |
|
"objective/non_score_reward": -1.2420598268508911, |
|
"objective/rlhf_reward": -0.16641265153884888, |
|
"objective/scores": 1.078125, |
|
"policy/approxkl_avg": 0.46217110753059387, |
|
"policy/clipfrac_avg": 0.2783203125, |
|
"policy/entropy_avg": 0.029424667358398438, |
|
"step": 180, |
|
"val/clipfrac_avg": 0.0, |
|
"val/num_eos_tokens": 20, |
|
"val/ratio": 1.0007582902908325, |
|
"val/ratio_var": 2.4759892767178826e-05 |
|
}, |
|
{ |
|
"episode": 9472, |
|
"epoch": 0.12991893782490022, |
|
"eps": 5, |
|
"loss/policy_avg": -0.01097183395177126, |
|
"loss/value_avg": 0.0, |
|
"lr": 2.7237851662404094e-06, |
|
"objective/entropy": 1.6292238235473633, |
|
"objective/kl": 12.73173713684082, |
|
"objective/non_score_reward": -1.2731736898422241, |
|
"objective/rlhf_reward": -0.10916168242692947, |
|
"objective/scores": 1.1640625, |
|
"policy/approxkl_avg": 0.5525862574577332, |
|
"policy/clipfrac_avg": 0.310546875, |
|
"policy/entropy_avg": 0.031815528869628906, |
|
"step": 185, |
|
"val/clipfrac_avg": 0.0, |
|
"val/num_eos_tokens": 22, |
|
"val/ratio": 1.0027148723602295, |
|
"val/ratio_var": 0.00016600274830125272 |
|
}, |
|
{ |
|
"episode": 9728, |
|
"epoch": 0.13343026046881643, |
|
"eps": 5, |
|
"loss/policy_avg": -0.010572239756584167, |
|
"loss/value_avg": 0.0, |
|
"lr": 2.7161125319693097e-06, |
|
"objective/entropy": 2.028618335723877, |
|
"objective/kl": 12.439943313598633, |
|
"objective/non_score_reward": -1.2439942359924316, |
|
"objective/rlhf_reward": -0.06748821586370468, |
|
"objective/scores": 1.171875, |
|
"policy/approxkl_avg": 0.4930054843425751, |
|
"policy/clipfrac_avg": 0.2841796875, |
|
"policy/entropy_avg": 0.03688812255859375, |
|
"step": 190, |
|
"val/clipfrac_avg": 0.0, |
|
"val/num_eos_tokens": 22, |
|
"val/ratio": 1.001340627670288, |
|
"val/ratio_var": 4.4035481550963596e-05 |
|
}, |
|
{ |
|
"episode": 9984, |
|
"epoch": 0.13694158311273266, |
|
"eps": 5, |
|
"loss/policy_avg": -0.019254155457019806, |
|
"loss/value_avg": 0.0, |
|
"lr": 2.7084398976982097e-06, |
|
"objective/entropy": 2.295351266860962, |
|
"objective/kl": 13.32223892211914, |
|
"objective/non_score_reward": -1.332223892211914, |
|
"objective/rlhf_reward": -0.1836824268102646, |
|
"objective/scores": 1.1484375, |
|
"policy/approxkl_avg": 3.1426281929016113, |
|
"policy/clipfrac_avg": 0.3251953125, |
|
"policy/entropy_avg": 0.03939247131347656, |
|
"step": 195, |
|
"val/clipfrac_avg": 0.0, |
|
"val/num_eos_tokens": 17, |
|
"val/ratio": 1.0032271146774292, |
|
"val/ratio_var": 0.00019827872165478766 |
|
}, |
|
{ |
|
"episode": 10240, |
|
"epoch": 0.14045290575664887, |
|
"eps": 5, |
|
"loss/policy_avg": -0.018122296780347824, |
|
"loss/value_avg": 0.0, |
|
"lr": 2.70076726342711e-06, |
|
"objective/entropy": 2.345075845718384, |
|
"objective/kl": 12.536066055297852, |
|
"objective/non_score_reward": -1.2536065578460693, |
|
"objective/rlhf_reward": -0.056986674666404724, |
|
"objective/scores": 1.1953125, |
|
"policy/approxkl_avg": 27.5201473236084, |
|
"policy/clipfrac_avg": 0.3046875, |
|
"policy/entropy_avg": 0.04156017303466797, |
|
"step": 200, |
|
"val/clipfrac_avg": 0.0, |
|
"val/num_eos_tokens": 20, |
|
"val/ratio": 0.9993807077407837, |
|
"val/ratio_var": 0.00011275127326371148 |
|
}, |
|
{ |
|
"episode": 10496, |
|
"epoch": 0.1439642284005651, |
|
"eps": 5, |
|
"loss/policy_avg": -0.019295353442430496, |
|
"loss/value_avg": 0.0, |
|
"lr": 2.6930946291560103e-06, |
|
"objective/entropy": 2.091012477874756, |
|
"objective/kl": 12.746508598327637, |
|
"objective/non_score_reward": -1.2746508121490479, |
|
"objective/rlhf_reward": -0.09065462648868561, |
|
"objective/scores": 1.1875, |
|
"policy/approxkl_avg": 0.5554059743881226, |
|
"policy/clipfrac_avg": 0.2998046875, |
|
"policy/entropy_avg": 0.03620719909667969, |
|
"step": 205, |
|
"val/clipfrac_avg": 0.0, |
|
"val/num_eos_tokens": 17, |
|
"val/ratio": 1.001387119293213, |
|
"val/ratio_var": 3.4958156902575865e-05 |
|
}, |
|
{ |
|
"episode": 10752, |
|
"epoch": 0.14747555104448132, |
|
"eps": 5, |
|
"loss/policy_avg": -0.010203800164163113, |
|
"loss/value_avg": 0.0, |
|
"lr": 2.6854219948849107e-06, |
|
"objective/entropy": 2.1808600425720215, |
|
"objective/kl": 12.404802322387695, |
|
"objective/non_score_reward": -1.2404803037643433, |
|
"objective/rlhf_reward": -0.059675075113773346, |
|
"objective/scores": 1.1796875, |
|
"policy/approxkl_avg": 0.5876989364624023, |
|
"policy/clipfrac_avg": 0.27734375, |
|
"policy/entropy_avg": 0.041385650634765625, |
|
"step": 210, |
|
"val/clipfrac_avg": 0.0, |
|
"val/num_eos_tokens": 14, |
|
"val/ratio": 1.0035128593444824, |
|
"val/ratio_var": 0.0004931605653837323 |
|
}, |
|
{ |
|
"episode": 11008, |
|
"epoch": 0.15098687368839755, |
|
"eps": 5, |
|
"loss/policy_avg": -0.018955286592245102, |
|
"loss/value_avg": 0.0, |
|
"lr": 2.677749360613811e-06, |
|
"objective/entropy": 1.968322992324829, |
|
"objective/kl": 13.322561264038086, |
|
"objective/non_score_reward": -1.3322560787200928, |
|
"objective/rlhf_reward": -0.0670965313911438, |
|
"objective/scores": 1.265625, |
|
"policy/approxkl_avg": 0.39782679080963135, |
|
"policy/clipfrac_avg": 0.373046875, |
|
"policy/entropy_avg": 0.03279399871826172, |
|
"step": 215, |
|
"val/clipfrac_avg": 0.0, |
|
"val/num_eos_tokens": 14, |
|
"val/ratio": 1.0017969608306885, |
|
"val/ratio_var": 6.843745359219611e-05 |
|
}, |
|
{ |
|
"episode": 11264, |
|
"epoch": 0.15449819633231376, |
|
"eps": 5, |
|
"loss/policy_avg": -0.014947709627449512, |
|
"loss/value_avg": 0.0, |
|
"lr": 2.670076726342711e-06, |
|
"objective/entropy": 1.7985560894012451, |
|
"objective/kl": 12.856376647949219, |
|
"objective/non_score_reward": -1.285637617111206, |
|
"objective/rlhf_reward": -0.03251491114497185, |
|
"objective/scores": 1.25, |
|
"policy/approxkl_avg": 0.4516296982765198, |
|
"policy/clipfrac_avg": 0.3671875, |
|
"policy/entropy_avg": 0.031859397888183594, |
|
"step": 220, |
|
"val/clipfrac_avg": 0.0, |
|
"val/num_eos_tokens": 13, |
|
"val/ratio": 1.0009992122650146, |
|
"val/ratio_var": 4.1194460209226236e-05 |
|
}, |
|
{ |
|
"episode": 11520, |
|
"epoch": 0.15800951897623, |
|
"eps": 5, |
|
"loss/policy_avg": -0.019819077104330063, |
|
"loss/value_avg": 0.0, |
|
"lr": 2.6624040920716113e-06, |
|
"objective/entropy": 1.5284242630004883, |
|
"objective/kl": 14.283391952514648, |
|
"objective/non_score_reward": -1.4283392429351807, |
|
"objective/rlhf_reward": -0.014965277165174484, |
|
"objective/scores": 1.4140625, |
|
"policy/approxkl_avg": 1.5518393516540527, |
|
"policy/clipfrac_avg": 0.2744140625, |
|
"policy/entropy_avg": 0.026048660278320312, |
|
"step": 225, |
|
"val/clipfrac_avg": 0.0, |
|
"val/num_eos_tokens": 8, |
|
"val/ratio": 1.0007925033569336, |
|
"val/ratio_var": 3.721785105881281e-05 |
|
}, |
|
{ |
|
"episode": 11776, |
|
"epoch": 0.1615208416201462, |
|
"eps": 5, |
|
"loss/policy_avg": -0.015632648020982742, |
|
"loss/value_avg": 0.0, |
|
"lr": 2.6547314578005116e-06, |
|
"objective/entropy": 1.5101430416107178, |
|
"objective/kl": 13.435927391052246, |
|
"objective/non_score_reward": -1.3435927629470825, |
|
"objective/rlhf_reward": 0.017792798578739166, |
|
"objective/scores": 1.359375, |
|
"policy/approxkl_avg": 0.22922199964523315, |
|
"policy/clipfrac_avg": 0.271484375, |
|
"policy/entropy_avg": 0.025536060333251953, |
|
"step": 230, |
|
"val/clipfrac_avg": 0.0, |
|
"val/num_eos_tokens": 11, |
|
"val/ratio": 1.0355898141860962, |
|
"val/ratio_var": 0.09009659290313721 |
|
}, |
|
{ |
|
"episode": 12032, |
|
"epoch": 0.16503216426406245, |
|
"eps": 5, |
|
"loss/policy_avg": -0.014460040256381035, |
|
"loss/value_avg": 0.0, |
|
"lr": 2.647058823529412e-06, |
|
"objective/entropy": 1.412046194076538, |
|
"objective/kl": 13.981653213500977, |
|
"objective/non_score_reward": -1.3981653451919556, |
|
"objective/rlhf_reward": -0.19434592127799988, |
|
"objective/scores": 1.203125, |
|
"policy/approxkl_avg": 0.48358476161956787, |
|
"policy/clipfrac_avg": 0.287109375, |
|
"policy/entropy_avg": 0.027116775512695312, |
|
"step": 235, |
|
"val/clipfrac_avg": 0.0, |
|
"val/num_eos_tokens": 19, |
|
"val/ratio": 1.0002431869506836, |
|
"val/ratio_var": 1.317531587119447e-05 |
|
}, |
|
{ |
|
"episode": 12288, |
|
"epoch": 0.16854348690797866, |
|
"eps": 5, |
|
"loss/policy_avg": -0.0144148338586092, |
|
"loss/value_avg": 0.0, |
|
"lr": 2.6393861892583123e-06, |
|
"objective/entropy": 1.5728825330734253, |
|
"objective/kl": 13.091099739074707, |
|
"objective/non_score_reward": -1.3091099262237549, |
|
"objective/rlhf_reward": -0.12438549101352692, |
|
"objective/scores": 1.1875, |
|
"policy/approxkl_avg": 0.5084937810897827, |
|
"policy/clipfrac_avg": 0.2587890625, |
|
"policy/entropy_avg": 0.028881072998046875, |
|
"step": 240, |
|
"val/clipfrac_avg": 0.0, |
|
"val/num_eos_tokens": 14, |
|
"val/ratio": 1.004534125328064, |
|
"val/ratio_var": 0.001037073670886457 |
|
}, |
|
{ |
|
"episode": 12544, |
|
"epoch": 0.17205480955189487, |
|
"eps": 5, |
|
"loss/policy_avg": -0.02535724639892578, |
|
"loss/value_avg": 0.0, |
|
"lr": 2.6317135549872122e-06, |
|
"objective/entropy": 1.6895666122436523, |
|
"objective/kl": 13.036446571350098, |
|
"objective/non_score_reward": -1.3036446571350098, |
|
"objective/rlhf_reward": -0.08131173253059387, |
|
"objective/scores": 1.21875, |
|
"policy/approxkl_avg": 1.397173285484314, |
|
"policy/clipfrac_avg": 0.296875, |
|
"policy/entropy_avg": 0.025877952575683594, |
|
"step": 245, |
|
"val/clipfrac_avg": 0.0, |
|
"val/num_eos_tokens": 11, |
|
"val/ratio": 0.9993641972541809, |
|
"val/ratio_var": 3.554957584128715e-05 |
|
}, |
|
{ |
|
"episode": 12800, |
|
"epoch": 0.1755661321958111, |
|
"eps": 5, |
|
"loss/policy_avg": -0.013989413157105446, |
|
"loss/value_avg": 0.0, |
|
"lr": 2.6240409207161126e-06, |
|
"objective/entropy": 1.4321318864822388, |
|
"objective/kl": 13.751260757446289, |
|
"objective/non_score_reward": -1.3751261234283447, |
|
"objective/rlhf_reward": 0.024946460500359535, |
|
"objective/scores": 1.3984375, |
|
"policy/approxkl_avg": 0.3265579044818878, |
|
"policy/clipfrac_avg": 0.3095703125, |
|
"policy/entropy_avg": 0.02507495880126953, |
|
"step": 250, |
|
"val/clipfrac_avg": 0.0, |
|
"val/num_eos_tokens": 14, |
|
"val/ratio": 1.0016669034957886, |
|
"val/ratio_var": 3.951354665332474e-05 |
|
}, |
|
{ |
|
"episode": 13056, |
|
"epoch": 0.1790774548397273, |
|
"eps": 5, |
|
"loss/policy_avg": -0.01614242233335972, |
|
"loss/value_avg": 0.0, |
|
"lr": 2.616368286445013e-06, |
|
"objective/entropy": 1.2477443218231201, |
|
"objective/kl": 14.385757446289062, |
|
"objective/non_score_reward": -1.4385757446289062, |
|
"objective/rlhf_reward": -0.048571567982435226, |
|
"objective/scores": 1.390625, |
|
"policy/approxkl_avg": 0.38643181324005127, |
|
"policy/clipfrac_avg": 0.33203125, |
|
"policy/entropy_avg": 0.02417755126953125, |
|
"step": 255, |
|
"val/clipfrac_avg": 0.0, |
|
"val/num_eos_tokens": 19, |
|
"val/ratio": 1.0020815134048462, |
|
"val/ratio_var": 0.00012469623470678926 |
|
}, |
|
{ |
|
"episode": 13312, |
|
"epoch": 0.18258877748364355, |
|
"eps": 5, |
|
"loss/policy_avg": -0.013632966205477715, |
|
"loss/value_avg": 0.0, |
|
"lr": 2.6086956521739132e-06, |
|
"objective/entropy": 1.5228471755981445, |
|
"objective/kl": 14.862211227416992, |
|
"objective/non_score_reward": -1.486221194267273, |
|
"objective/rlhf_reward": -0.07545565813779831, |
|
"objective/scores": 1.40625, |
|
"policy/approxkl_avg": 2.011383056640625, |
|
"policy/clipfrac_avg": 0.3359375, |
|
"policy/entropy_avg": 0.027433395385742188, |
|
"step": 260, |
|
"val/clipfrac_avg": 0.0, |
|
"val/num_eos_tokens": 17, |
|
"val/ratio": 0.9998003244400024, |
|
"val/ratio_var": 2.738163857429754e-05 |
|
}, |
|
{ |
|
"episode": 13568, |
|
"epoch": 0.18610010012755976, |
|
"eps": 5, |
|
"loss/policy_avg": -0.020372817292809486, |
|
"loss/value_avg": 0.0, |
|
"lr": 2.6010230179028136e-06, |
|
"objective/entropy": 1.633180856704712, |
|
"objective/kl": 14.094629287719727, |
|
"objective/non_score_reward": -1.4094629287719727, |
|
"objective/rlhf_reward": -0.015713702887296677, |
|
"objective/scores": 1.390625, |
|
"policy/approxkl_avg": 0.47778478264808655, |
|
"policy/clipfrac_avg": 0.3701171875, |
|
"policy/entropy_avg": 0.02643442153930664, |
|
"step": 265, |
|
"val/clipfrac_avg": 0.0, |
|
"val/num_eos_tokens": 14, |
|
"val/ratio": 1.0079278945922852, |
|
"val/ratio_var": 0.0023215855471789837 |
|
}, |
|
{ |
|
"episode": 13824, |
|
"epoch": 0.189611422771476, |
|
"eps": 5, |
|
"loss/policy_avg": -0.01413625106215477, |
|
"loss/value_avg": 0.0, |
|
"lr": 2.5933503836317135e-06, |
|
"objective/entropy": 1.2899070978164673, |
|
"objective/kl": 14.59975528717041, |
|
"objective/non_score_reward": -1.4599756002426147, |
|
"objective/rlhf_reward": -0.09675531834363937, |
|
"objective/scores": 1.359375, |
|
"policy/approxkl_avg": 0.4568091630935669, |
|
"policy/clipfrac_avg": 0.3076171875, |
|
"policy/entropy_avg": 0.02667713165283203, |
|
"step": 270, |
|
"val/clipfrac_avg": 0.0, |
|
"val/num_eos_tokens": 19, |
|
"val/ratio": 1.003631830215454, |
|
"val/ratio_var": 0.00041694415267556906 |
|
}, |
|
{ |
|
"episode": 14080, |
|
"epoch": 0.1931227454153922, |
|
"eps": 5, |
|
"loss/policy_avg": -0.018304049968719482, |
|
"loss/value_avg": 0.0, |
|
"lr": 2.585677749360614e-06, |
|
"objective/entropy": 1.3464603424072266, |
|
"objective/kl": 14.915502548217773, |
|
"objective/non_score_reward": -1.491550326347351, |
|
"objective/rlhf_reward": -0.013601185753941536, |
|
"objective/scores": 1.4765625, |
|
"policy/approxkl_avg": 0.5154660940170288, |
|
"policy/clipfrac_avg": 0.3251953125, |
|
"policy/entropy_avg": 0.024927139282226562, |
|
"step": 275, |
|
"val/clipfrac_avg": 0.0, |
|
"val/num_eos_tokens": 20, |
|
"val/ratio": 1.0014865398406982, |
|
"val/ratio_var": 8.263294148491696e-05 |
|
}, |
|
{ |
|
"episode": 14336, |
|
"epoch": 0.19663406805930844, |
|
"eps": 5, |
|
"loss/policy_avg": -0.009162629023194313, |
|
"loss/value_avg": 0.0, |
|
"lr": 2.578005115089514e-06, |
|
"objective/entropy": 1.3251242637634277, |
|
"objective/kl": 14.600137710571289, |
|
"objective/non_score_reward": -1.460013747215271, |
|
"objective/rlhf_reward": -0.10571230947971344, |
|
"objective/scores": 1.3515625, |
|
"policy/approxkl_avg": 0.4187917411327362, |
|
"policy/clipfrac_avg": 0.3046875, |
|
"policy/entropy_avg": 0.023657798767089844, |
|
"step": 280, |
|
"val/clipfrac_avg": 0.0, |
|
"val/num_eos_tokens": 16, |
|
"val/ratio": 1.008554458618164, |
|
"val/ratio_var": 0.001467635971494019 |
|
}, |
|
{ |
|
"episode": 14592, |
|
"epoch": 0.20014539070322465, |
|
"eps": 5, |
|
"loss/policy_avg": -0.01609072834253311, |
|
"loss/value_avg": 0.0, |
|
"lr": 2.5703324808184145e-06, |
|
"objective/entropy": 1.3078004121780396, |
|
"objective/kl": 14.999523162841797, |
|
"objective/non_score_reward": -1.4999523162841797, |
|
"objective/rlhf_reward": -0.15238332748413086, |
|
"objective/scores": 1.3515625, |
|
"policy/approxkl_avg": 0.3968128561973572, |
|
"policy/clipfrac_avg": 0.36328125, |
|
"policy/entropy_avg": 0.023943424224853516, |
|
"step": 285, |
|
"val/clipfrac_avg": 0.0, |
|
"val/num_eos_tokens": 12, |
|
"val/ratio": 1.0019521713256836, |
|
"val/ratio_var": 8.228437945945188e-05 |
|
}, |
|
{ |
|
"episode": 14848, |
|
"epoch": 0.2036567133471409, |
|
"eps": 5, |
|
"loss/policy_avg": -0.014186807908117771, |
|
"loss/value_avg": 0.0, |
|
"lr": 2.562659846547315e-06, |
|
"objective/entropy": 1.2583755254745483, |
|
"objective/kl": 15.623100280761719, |
|
"objective/non_score_reward": -1.5623100996017456, |
|
"objective/rlhf_reward": -0.09625323116779327, |
|
"objective/scores": 1.46875, |
|
"policy/approxkl_avg": 0.5678977370262146, |
|
"policy/clipfrac_avg": 0.3076171875, |
|
"policy/entropy_avg": 0.024990558624267578, |
|
"step": 290, |
|
"val/clipfrac_avg": 0.0, |
|
"val/num_eos_tokens": 13, |
|
"val/ratio": 1.0014848709106445, |
|
"val/ratio_var": 4.219371476210654e-05 |
|
}, |
|
{ |
|
"episode": 15104, |
|
"epoch": 0.2071680359910571, |
|
"eps": 5, |
|
"loss/policy_avg": -0.013804701156914234, |
|
"loss/value_avg": 0.0, |
|
"lr": 2.5549872122762148e-06, |
|
"objective/entropy": 1.568720817565918, |
|
"objective/kl": 14.687668800354004, |
|
"objective/non_score_reward": -1.4687669277191162, |
|
"objective/rlhf_reward": -0.17009752988815308, |
|
"objective/scores": 1.296875, |
|
"policy/approxkl_avg": 0.3046334981918335, |
|
"policy/clipfrac_avg": 0.26953125, |
|
"policy/entropy_avg": 0.027862548828125, |
|
"step": 295, |
|
"val/clipfrac_avg": 0.0, |
|
"val/num_eos_tokens": 16, |
|
"val/ratio": 1.0005717277526855, |
|
"val/ratio_var": 1.1324932529532816e-05 |
|
}, |
|
{ |
|
"episode": 15360, |
|
"epoch": 0.21067935863497333, |
|
"eps": 5, |
|
"loss/policy_avg": -0.018133502453565598, |
|
"loss/value_avg": 0.0, |
|
"lr": 2.547314578005115e-06, |
|
"objective/entropy": 1.2987349033355713, |
|
"objective/kl": 13.89183235168457, |
|
"objective/non_score_reward": -1.3891831636428833, |
|
"objective/rlhf_reward": -0.12500587105751038, |
|
"objective/scores": 1.265625, |
|
"policy/approxkl_avg": 0.31936001777648926, |
|
"policy/clipfrac_avg": 0.33984375, |
|
"policy/entropy_avg": 0.02469015121459961, |
|
"step": 300, |
|
"val/clipfrac_avg": 0.0, |
|
"val/num_eos_tokens": 18, |
|
"val/ratio": 1.0020601749420166, |
|
"val/ratio_var": 3.1293042411562055e-05 |
|
}, |
|
{ |
|
"episode": 15616, |
|
"epoch": 0.21419068127888954, |
|
"eps": 5, |
|
"loss/policy_avg": -0.01710616797208786, |
|
"loss/value_avg": 0.0, |
|
"lr": 2.5396419437340155e-06, |
|
"objective/entropy": 1.4288297891616821, |
|
"objective/kl": 14.952780723571777, |
|
"objective/non_score_reward": -1.4952781200408936, |
|
"objective/rlhf_reward": -0.15792769193649292, |
|
"objective/scores": 1.3359375, |
|
"policy/approxkl_avg": 0.6461950540542603, |
|
"policy/clipfrac_avg": 0.3125, |
|
"policy/entropy_avg": 0.02637958526611328, |
|
"step": 305, |
|
"val/clipfrac_avg": 0.0, |
|
"val/num_eos_tokens": 19, |
|
"val/ratio": 1.0013606548309326, |
|
"val/ratio_var": 4.203971548122354e-05 |
|
}, |
|
{ |
|
"episode": 15872, |
|
"epoch": 0.21770200392280575, |
|
"eps": 5, |
|
"loss/policy_avg": -0.016128187999129295, |
|
"loss/value_avg": 0.0, |
|
"lr": 2.531969309462916e-06, |
|
"objective/entropy": 1.3288850784301758, |
|
"objective/kl": 15.583921432495117, |
|
"objective/non_score_reward": -1.55839204788208, |
|
"objective/rlhf_reward": -0.07665687799453735, |
|
"objective/scores": 1.484375, |
|
"policy/approxkl_avg": 0.3284182548522949, |
|
"policy/clipfrac_avg": 0.328125, |
|
"policy/entropy_avg": 0.02404165267944336, |
|
"step": 310, |
|
"val/clipfrac_avg": 0.0, |
|
"val/num_eos_tokens": 16, |
|
"val/ratio": 1.0064442157745361, |
|
"val/ratio_var": 0.0015344778075814247 |
|
}, |
|
{ |
|
"episode": 16128, |
|
"epoch": 0.221213326566722, |
|
"eps": 5, |
|
"loss/policy_avg": -0.015278931707143784, |
|
"loss/value_avg": 0.0, |
|
"lr": 2.524296675191816e-06, |
|
"objective/entropy": 1.3236112594604492, |
|
"objective/kl": 14.773448944091797, |
|
"objective/non_score_reward": -1.4773449897766113, |
|
"objective/rlhf_reward": -0.08708612620830536, |
|
"objective/scores": 1.390625, |
|
"policy/approxkl_avg": 0.2980467975139618, |
|
"policy/clipfrac_avg": 0.34765625, |
|
"policy/entropy_avg": 0.02494335174560547, |
|
"step": 315, |
|
"val/clipfrac_avg": 0.0, |
|
"val/num_eos_tokens": 7, |
|
"val/ratio": 1.0008249282836914, |
|
"val/ratio_var": 1.927867742779199e-05 |
|
}, |
|
{ |
|
"episode": 16384, |
|
"epoch": 0.2247246492106382, |
|
"eps": 5, |
|
"loss/policy_avg": -0.020951703190803528, |
|
"loss/value_avg": 0.0, |
|
"lr": 2.516624040920716e-06, |
|
"objective/entropy": 1.2670817375183105, |
|
"objective/kl": 14.8348970413208, |
|
"objective/non_score_reward": -1.483489751815796, |
|
"objective/rlhf_reward": 0.03382519632577896, |
|
"objective/scores": 1.515625, |
|
"policy/approxkl_avg": 1.0973663330078125, |
|
"policy/clipfrac_avg": 0.361328125, |
|
"policy/entropy_avg": 0.02091073989868164, |
|
"step": 320, |
|
"val/clipfrac_avg": 0.0, |
|
"val/num_eos_tokens": 15, |
|
"val/ratio": 1.003138542175293, |
|
"val/ratio_var": 0.0001205340595333837 |
|
}, |
|
{ |
|
"episode": 16640, |
|
"epoch": 0.22823597185455444, |
|
"eps": 5, |
|
"loss/policy_avg": -0.006676271557807922, |
|
"loss/value_avg": 0.0, |
|
"lr": 2.5089514066496164e-06, |
|
"objective/entropy": 1.191056728363037, |
|
"objective/kl": 16.46404457092285, |
|
"objective/non_score_reward": -1.646404504776001, |
|
"objective/rlhf_reward": -0.14990828931331635, |
|
"objective/scores": 1.5, |
|
"policy/approxkl_avg": 0.31475046277046204, |
|
"policy/clipfrac_avg": 0.2578125, |
|
"policy/entropy_avg": 0.021608352661132812, |
|
"step": 325, |
|
"val/clipfrac_avg": 0.0, |
|
"val/num_eos_tokens": 25, |
|
"val/ratio": 1.000575065612793, |
|
"val/ratio_var": 1.735856494633481e-05 |
|
}, |
|
{ |
|
"episode": 16896, |
|
"epoch": 0.23174729449847065, |
|
"eps": 5, |
|
"loss/policy_avg": -0.02127978205680847, |
|
"loss/value_avg": 0.0, |
|
"lr": 2.5012787723785167e-06, |
|
"objective/entropy": 1.490570068359375, |
|
"objective/kl": 16.22044563293457, |
|
"objective/non_score_reward": -1.622044563293457, |
|
"objective/rlhf_reward": -0.015120631083846092, |
|
"objective/scores": 1.609375, |
|
"policy/approxkl_avg": 2.5871665477752686, |
|
"policy/clipfrac_avg": 0.35546875, |
|
"policy/entropy_avg": 0.027353286743164062, |
|
"step": 330, |
|
"val/clipfrac_avg": 0.0, |
|
"val/num_eos_tokens": 25, |
|
"val/ratio": 1.003063440322876, |
|
"val/ratio_var": 6.509448576252908e-05 |
|
}, |
|
{ |
|
"episode": 17152, |
|
"epoch": 0.23525861714238688, |
|
"eps": 5, |
|
"loss/policy_avg": -0.013108542189002037, |
|
"loss/value_avg": 0.0, |
|
"lr": 2.493606138107417e-06, |
|
"objective/entropy": 1.2718842029571533, |
|
"objective/kl": 16.047882080078125, |
|
"objective/non_score_reward": -1.604788064956665, |
|
"objective/rlhf_reward": -0.12415145337581635, |
|
"objective/scores": 1.484375, |
|
"policy/approxkl_avg": 0.2758824825286865, |
|
"policy/clipfrac_avg": 0.287109375, |
|
"policy/entropy_avg": 0.024268627166748047, |
|
"step": 335, |
|
"val/clipfrac_avg": 0.0, |
|
"val/num_eos_tokens": 16, |
|
"val/ratio": 1.0036962032318115, |
|
"val/ratio_var": 0.0003905180492438376 |
|
}, |
|
{ |
|
"episode": 17408, |
|
"epoch": 0.2387699397863031, |
|
"eps": 5, |
|
"loss/policy_avg": -0.014837839640676975, |
|
"loss/value_avg": 0.0, |
|
"lr": 2.4859335038363174e-06, |
|
"objective/entropy": 1.3406567573547363, |
|
"objective/kl": 16.428348541259766, |
|
"objective/non_score_reward": -1.642835021018982, |
|
"objective/rlhf_reward": 0.018702151253819466, |
|
"objective/scores": 1.6640625, |
|
"policy/approxkl_avg": 0.192110076546669, |
|
"policy/clipfrac_avg": 0.3388671875, |
|
"policy/entropy_avg": 0.025295734405517578, |
|
"step": 340, |
|
"val/clipfrac_avg": 0.0, |
|
"val/num_eos_tokens": 19, |
|
"val/ratio": 1.0008959770202637, |
|
"val/ratio_var": 1.205760781886056e-05 |
|
}, |
|
{ |
|
"episode": 17664, |
|
"epoch": 0.24228126243021933, |
|
"eps": 5, |
|
"loss/policy_avg": -0.01899782381951809, |
|
"loss/value_avg": 0.0, |
|
"lr": 2.4782608695652173e-06, |
|
"objective/entropy": 1.5880205631256104, |
|
"objective/kl": 15.775943756103516, |
|
"objective/non_score_reward": -1.577594518661499, |
|
"objective/rlhf_reward": -0.08363974094390869, |
|
"objective/scores": 1.4921875, |
|
"policy/approxkl_avg": 0.9254180192947388, |
|
"policy/clipfrac_avg": 0.3251953125, |
|
"policy/entropy_avg": 0.024907588958740234, |
|
"step": 345, |
|
"val/clipfrac_avg": 0.0, |
|
"val/num_eos_tokens": 15, |
|
"val/ratio": 1.000749111175537, |
|
"val/ratio_var": 2.8957187168998644e-05 |
|
}, |
|
{ |
|
"episode": 17920, |
|
"epoch": 0.24579258507413554, |
|
"eps": 5, |
|
"loss/policy_avg": -0.014669202268123627, |
|
"loss/value_avg": 0.0, |
|
"lr": 2.4705882352941177e-06, |
|
"objective/entropy": 1.2656543254852295, |
|
"objective/kl": 16.034730911254883, |
|
"objective/non_score_reward": -1.60347318649292, |
|
"objective/rlhf_reward": -0.011744961142539978, |
|
"objective/scores": 1.59375, |
|
"policy/approxkl_avg": 1.004683017730713, |
|
"policy/clipfrac_avg": 0.2646484375, |
|
"policy/entropy_avg": 0.023741722106933594, |
|
"step": 350, |
|
"val/clipfrac_avg": 0.0, |
|
"val/num_eos_tokens": 19, |
|
"val/ratio": 1.0009608268737793, |
|
"val/ratio_var": 4.649764014175162e-05 |
|
} |
|
], |
|
"logging_steps": 100, |
|
"max_steps": 391, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1.3716104077797742, |
|
"save_steps": 50, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 0, |
|
"train_batch_size": null, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|