drdapo_lora32_max4096 / trainer_state.json
yjyjyj98's picture
Upload folder using huggingface_hub
9c679c1 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.03142677561282212,
"eval_steps": 500,
"global_step": 100,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"completion_length": 387.5625,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 1722.0,
"completions/max_terminated_length": 929.0,
"completions/mean_length": 387.5625,
"completions/mean_terminated_length": 298.6000061035156,
"completions/min_length": 1.0,
"completions/min_terminated_length": 1.0,
"epoch": 0.00031426775612822125,
"frac_reward_zero_std": 0.0,
"grad_norm": 9.58870792388916,
"kl": 1.1056605577468872,
"learning_rate": 0.0,
"loss": 0.0442,
"num_tokens": 10321.0,
"reward": -8.78125,
"reward_std": 1.5290063619613647,
"rewards/check_answer/mean": -2.0,
"rewards/check_answer/std": 0.0,
"rewards/check_numbers/mean": -1.15625,
"rewards/check_numbers/std": 1.1212902069091797,
"rewards/format_and_language_reward_func/mean": -3.625,
"rewards/format_and_language_reward_func/std": 0.9574271440505981,
"rewards/match_format_approximately/mean": -2.0,
"rewards/match_format_approximately/std": 0.0,
"rewards/match_format_exactly/mean": 0.0,
"rewards/match_format_exactly/std": 0.0,
"step": 1
},
{
"completion_length": 427.625,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 1722.0,
"completions/max_terminated_length": 1380.0,
"completions/mean_length": 427.625,
"completions/mean_terminated_length": 341.3333435058594,
"completions/min_length": 1.0,
"completions/min_terminated_length": 1.0,
"epoch": 0.0006285355122564425,
"frac_reward_zero_std": 0.0,
"grad_norm": 15.610054016113281,
"kl": 0.5778560638427734,
"learning_rate": 5.000000000000001e-07,
"loss": 0.0231,
"num_tokens": 20955.0,
"reward": -9.5,
"reward_std": 1.6830127239227295,
"rewards/check_answer/mean": -2.0,
"rewards/check_answer/std": 0.0,
"rewards/check_numbers/mean": -1.75,
"rewards/check_numbers/std": 0.8366600275039673,
"rewards/format_and_language_reward_func/mean": -3.75,
"rewards/format_and_language_reward_func/std": 1.0,
"rewards/match_format_approximately/mean": -2.0,
"rewards/match_format_approximately/std": 0.0,
"rewards/match_format_exactly/mean": 0.0,
"rewards/match_format_exactly/std": 0.0,
"step": 2
},
{
"completion_length": 979.5,
"completions/clipped_ratio": 0.3125,
"completions/max_length": 1722.0,
"completions/max_terminated_length": 1507.0,
"completions/mean_length": 979.5,
"completions/mean_terminated_length": 642.0,
"completions/min_length": 1.0,
"completions/min_terminated_length": 1.0,
"epoch": 0.0009428032683846638,
"frac_reward_zero_std": 0.0,
"grad_norm": 4.994994163513184,
"kl": 0.2919767200946808,
"learning_rate": 1.0000000000000002e-06,
"loss": 0.0117,
"num_tokens": 40683.0,
"reward": -7.53125,
"reward_std": 1.3821797370910645,
"rewards/check_answer/mean": -2.0,
"rewards/check_answer/std": 0.0,
"rewards/check_numbers/mean": -0.40625,
"rewards/check_numbers/std": 1.3443554639816284,
"rewards/format_and_language_reward_func/mean": -3.125,
"rewards/format_and_language_reward_func/std": 0.5,
"rewards/match_format_approximately/mean": -2.0,
"rewards/match_format_approximately/std": 0.0,
"rewards/match_format_exactly/mean": 0.0,
"rewards/match_format_exactly/std": 0.0,
"step": 3
},
{
"completion_length": 836.3125,
"completions/clipped_ratio": 0.3125,
"completions/max_length": 1722.0,
"completions/max_terminated_length": 1175.0,
"completions/mean_length": 836.3125,
"completions/mean_terminated_length": 433.727294921875,
"completions/min_length": 1.0,
"completions/min_terminated_length": 1.0,
"epoch": 0.001257071024512885,
"frac_reward_zero_std": 0.0,
"grad_norm": 16.14759635925293,
"kl": 0.5689749717712402,
"learning_rate": 1.5e-06,
"loss": 0.0228,
"num_tokens": 58020.0,
"reward": -9.03125,
"reward_std": 1.9392420053482056,
"rewards/check_answer/mean": -2.0,
"rewards/check_answer/std": 0.0,
"rewards/check_numbers/mean": -1.40625,
"rewards/check_numbers/std": 1.0680004358291626,
"rewards/format_and_language_reward_func/mean": -3.625,
"rewards/format_and_language_reward_func/std": 0.9574271440505981,
"rewards/match_format_approximately/mean": -2.0,
"rewards/match_format_approximately/std": 0.0,
"rewards/match_format_exactly/mean": 0.0,
"rewards/match_format_exactly/std": 0.0,
"step": 4
},
{
"completion_length": 553.25,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 1722.0,
"completions/max_terminated_length": 1469.0,
"completions/mean_length": 553.25,
"completions/mean_terminated_length": 475.3333435058594,
"completions/min_length": 1.0,
"completions/min_terminated_length": 1.0,
"epoch": 0.0015713387806411063,
"frac_reward_zero_std": 0.0,
"grad_norm": 5.5043792724609375,
"kl": 0.5717646479606628,
"learning_rate": 2.0000000000000003e-06,
"loss": 0.0229,
"num_tokens": 70976.0,
"reward": -8.40625,
"reward_std": 0.9375,
"rewards/check_answer/mean": -2.0,
"rewards/check_answer/std": 0.0,
"rewards/check_numbers/mean": -1.03125,
"rewards/check_numbers/std": 1.007782220840454,
"rewards/format_and_language_reward_func/mean": -3.375,
"rewards/format_and_language_reward_func/std": 0.8062257766723633,
"rewards/match_format_approximately/mean": -2.0,
"rewards/match_format_approximately/std": 0.0,
"rewards/match_format_exactly/mean": 0.0,
"rewards/match_format_exactly/std": 0.0,
"step": 5
},
{
"completion_length": 750.9375,
"completions/clipped_ratio": 0.125,
"completions/max_length": 1722.0,
"completions/max_terminated_length": 1676.0,
"completions/mean_length": 750.9375,
"completions/mean_terminated_length": 612.2142944335938,
"completions/min_length": 1.0,
"completions/min_terminated_length": 1.0,
"epoch": 0.0018856065367693275,
"frac_reward_zero_std": 0.0,
"grad_norm": 7.5527191162109375,
"kl": 0.26640018820762634,
"learning_rate": 2.5e-06,
"loss": 0.0107,
"num_tokens": 87127.0,
"reward": -8.40625,
"reward_std": 1.3351925611495972,
"rewards/check_answer/mean": -2.0,
"rewards/check_answer/std": 0.0,
"rewards/check_numbers/mean": -0.90625,
"rewards/check_numbers/std": 1.1138334274291992,
"rewards/format_and_language_reward_func/mean": -3.5,
"rewards/format_and_language_reward_func/std": 0.8944272398948669,
"rewards/match_format_approximately/mean": -2.0,
"rewards/match_format_approximately/std": 0.0,
"rewards/match_format_exactly/mean": 0.0,
"rewards/match_format_exactly/std": 0.0,
"step": 6
},
{
"completion_length": 822.625,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1699.0,
"completions/max_terminated_length": 1699.0,
"completions/mean_length": 822.625,
"completions/mean_terminated_length": 822.625,
"completions/min_length": 1.0,
"completions/min_terminated_length": 1.0,
"epoch": 0.0021998742928975488,
"frac_reward_zero_std": 0.0,
"grad_norm": 6.19105863571167,
"kl": 0.17402563989162445,
"learning_rate": 3e-06,
"loss": 0.007,
"num_tokens": 104165.0,
"reward": -7.96875,
"reward_std": 1.063370943069458,
"rewards/check_answer/mean": -2.0,
"rewards/check_answer/std": 0.0,
"rewards/check_numbers/mean": -0.71875,
"rewards/check_numbers/std": 0.8750000596046448,
"rewards/format_and_language_reward_func/mean": -3.25,
"rewards/format_and_language_reward_func/std": 0.6831300854682922,
"rewards/match_format_approximately/mean": -2.0,
"rewards/match_format_approximately/std": 0.0,
"rewards/match_format_exactly/mean": 0.0,
"rewards/match_format_exactly/std": 0.0,
"step": 7
},
{
"completion_length": 1419.1875,
"completions/clipped_ratio": 0.6875,
"completions/max_length": 1722.0,
"completions/max_terminated_length": 1149.0,
"completions/mean_length": 1419.1875,
"completions/mean_terminated_length": 753.0,
"completions/min_length": 366.0,
"completions/min_terminated_length": 366.0,
"epoch": 0.00251414204902577,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.1348615139722824,
"kl": 0.029216211289167404,
"learning_rate": 3.5e-06,
"loss": 0.0012,
"num_tokens": 130604.0,
"reward": -8.0625,
"reward_std": 0.625,
"rewards/check_answer/mean": -2.0,
"rewards/check_answer/std": 0.0,
"rewards/check_numbers/mean": -0.9375,
"rewards/check_numbers/std": 0.75,
"rewards/format_and_language_reward_func/mean": -3.125,
"rewards/format_and_language_reward_func/std": 0.5,
"rewards/match_format_approximately/mean": -2.0,
"rewards/match_format_approximately/std": 0.0,
"rewards/match_format_exactly/mean": 0.0,
"rewards/match_format_exactly/std": 0.0,
"step": 8
},
{
"completion_length": 1018.125,
"completions/clipped_ratio": 0.375,
"completions/max_length": 1722.0,
"completions/max_terminated_length": 1565.0,
"completions/mean_length": 1018.125,
"completions/mean_terminated_length": 595.7999877929688,
"completions/min_length": 1.0,
"completions/min_terminated_length": 1.0,
"epoch": 0.0028284098051539913,
"frac_reward_zero_std": 0.0,
"grad_norm": 7.937878131866455,
"kl": 0.42939966917037964,
"learning_rate": 4.000000000000001e-06,
"loss": 0.0172,
"num_tokens": 151170.0,
"reward": -7.8125,
"reward_std": 1.1593647003173828,
"rewards/check_answer/mean": -2.0,
"rewards/check_answer/std": 0.0,
"rewards/check_numbers/mean": -0.6875,
"rewards/check_numbers/std": 0.9639329314231873,
"rewards/format_and_language_reward_func/mean": -3.125,
"rewards/format_and_language_reward_func/std": 0.5,
"rewards/match_format_approximately/mean": -2.0,
"rewards/match_format_approximately/std": 0.0,
"rewards/match_format_exactly/mean": 0.0,
"rewards/match_format_exactly/std": 0.0,
"step": 9
},
{
"completion_length": 1018.625,
"completions/clipped_ratio": 0.3125,
"completions/max_length": 1722.0,
"completions/max_terminated_length": 1317.0,
"completions/mean_length": 1018.625,
"completions/mean_terminated_length": 698.9091186523438,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"epoch": 0.0031426775612822125,
"frac_reward_zero_std": 0.0,
"grad_norm": 3.0097506046295166,
"kl": 0.15729668736457825,
"learning_rate": 4.5e-06,
"loss": 0.0063,
"num_tokens": 171388.0,
"reward": -7.75,
"reward_std": 0.8080127239227295,
"rewards/check_answer/mean": -2.0,
"rewards/check_answer/std": 0.0,
"rewards/check_numbers/mean": -0.75,
"rewards/check_numbers/std": 0.7745966911315918,
"rewards/format_and_language_reward_func/mean": -3.0,
"rewards/format_and_language_reward_func/std": 0.0,
"rewards/match_format_approximately/mean": -2.0,
"rewards/match_format_approximately/std": 0.0,
"rewards/match_format_exactly/mean": 0.0,
"rewards/match_format_exactly/std": 0.0,
"step": 10
},
{
"completion_length": 1189.8125,
"completions/clipped_ratio": 0.3125,
"completions/max_length": 1722.0,
"completions/max_terminated_length": 1696.0,
"completions/mean_length": 1189.8125,
"completions/mean_terminated_length": 947.9091186523438,
"completions/min_length": 423.0,
"completions/min_terminated_length": 423.0,
"epoch": 0.0034569453174104338,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.136498361825943,
"kl": 0.05319710448384285,
"learning_rate": 5e-06,
"loss": 0.0021,
"num_tokens": 194373.0,
"reward": -7.75,
"reward_std": 0.8080127239227295,
"rewards/check_answer/mean": -2.0,
"rewards/check_answer/std": 0.0,
"rewards/check_numbers/mean": -0.75,
"rewards/check_numbers/std": 0.7745966911315918,
"rewards/format_and_language_reward_func/mean": -3.0,
"rewards/format_and_language_reward_func/std": 0.0,
"rewards/match_format_approximately/mean": -2.0,
"rewards/match_format_approximately/std": 0.0,
"rewards/match_format_exactly/mean": 0.0,
"rewards/match_format_exactly/std": 0.0,
"step": 11
},
{
"completion_length": 1122.0,
"completions/clipped_ratio": 0.1875,
"completions/max_length": 1722.0,
"completions/max_terminated_length": 1665.0,
"completions/mean_length": 1122.0,
"completions/mean_terminated_length": 983.5385131835938,
"completions/min_length": 498.0,
"completions/min_terminated_length": 498.0,
"epoch": 0.003771213073538655,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.0965699553489685,
"kl": 0.06320463120937347,
"learning_rate": 4.944444444444445e-06,
"loss": 0.0025,
"num_tokens": 216261.0,
"reward": -7.65625,
"reward_std": 0.40400636196136475,
"rewards/check_answer/mean": -2.0,
"rewards/check_answer/std": 0.0,
"rewards/check_numbers/mean": -0.65625,
"rewards/check_numbers/std": 0.7685213088989258,
"rewards/format_and_language_reward_func/mean": -3.0,
"rewards/format_and_language_reward_func/std": 0.0,
"rewards/match_format_approximately/mean": -2.0,
"rewards/match_format_approximately/std": 0.0,
"rewards/match_format_exactly/mean": 0.0,
"rewards/match_format_exactly/std": 0.0,
"step": 12
},
{
"completion_length": 1341.75,
"completions/clipped_ratio": 0.4375,
"completions/max_length": 1722.0,
"completions/max_terminated_length": 1607.0,
"completions/mean_length": 1341.75,
"completions/mean_terminated_length": 1046.0,
"completions/min_length": 592.0,
"completions/min_terminated_length": 592.0,
"epoch": 0.004085480829666876,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.07173436135053635,
"kl": 0.04135803505778313,
"learning_rate": 4.888888888888889e-06,
"loss": 0.0017,
"num_tokens": 241453.0,
"reward": -7.84375,
"reward_std": 0.40400636196136475,
"rewards/check_answer/mean": -2.0,
"rewards/check_answer/std": 0.0,
"rewards/check_numbers/mean": -0.84375,
"rewards/check_numbers/std": 0.7685213088989258,
"rewards/format_and_language_reward_func/mean": -3.0,
"rewards/format_and_language_reward_func/std": 0.0,
"rewards/match_format_approximately/mean": -2.0,
"rewards/match_format_approximately/std": 0.0,
"rewards/match_format_exactly/mean": 0.0,
"rewards/match_format_exactly/std": 0.0,
"step": 13
},
{
"completion_length": 1283.8125,
"completions/clipped_ratio": 0.625,
"completions/max_length": 1722.0,
"completions/max_terminated_length": 1221.0,
"completions/mean_length": 1283.8125,
"completions/mean_terminated_length": 553.5,
"completions/min_length": 1.0,
"completions/min_terminated_length": 1.0,
"epoch": 0.0043997485857950975,
"frac_reward_zero_std": 0.0,
"grad_norm": 6.01732063293457,
"kl": 0.17246384918689728,
"learning_rate": 4.833333333333333e-06,
"loss": 0.0069,
"num_tokens": 266634.0,
"reward": -7.84375,
"reward_std": 1.1508427858352661,
"rewards/check_answer/mean": -2.0,
"rewards/check_answer/std": 0.0,
"rewards/check_numbers/mean": -0.71875,
"rewards/check_numbers/std": 0.8750000596046448,
"rewards/format_and_language_reward_func/mean": -3.125,
"rewards/format_and_language_reward_func/std": 0.5,
"rewards/match_format_approximately/mean": -2.0,
"rewards/match_format_approximately/std": 0.0,
"rewards/match_format_exactly/mean": 0.0,
"rewards/match_format_exactly/std": 0.0,
"step": 14
},
{
"completion_length": 1189.6875,
"completions/clipped_ratio": 0.3125,
"completions/max_length": 1722.0,
"completions/max_terminated_length": 1694.0,
"completions/mean_length": 1189.6875,
"completions/mean_terminated_length": 947.727294921875,
"completions/min_length": 36.0,
"completions/min_terminated_length": 36.0,
"epoch": 0.004714016341923318,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.6227664351463318,
"kl": 0.08904990553855896,
"learning_rate": 4.777777777777778e-06,
"loss": 0.0036,
"num_tokens": 289373.0,
"reward": -6.90625,
"reward_std": 1.0290063619613647,
"rewards/check_answer/mean": -2.0,
"rewards/check_answer/std": 0.0,
"rewards/check_numbers/mean": 0.09375,
"rewards/check_numbers/std": 1.827737808227539,
"rewards/format_and_language_reward_func/mean": -3.0,
"rewards/format_and_language_reward_func/std": 0.0,
"rewards/match_format_approximately/mean": -2.0,
"rewards/match_format_approximately/std": 0.0,
"rewards/match_format_exactly/mean": 0.0,
"rewards/match_format_exactly/std": 0.0,
"step": 15
},
{
"completion_length": 1361.0,
"completions/clipped_ratio": 0.625,
"completions/max_length": 1722.0,
"completions/max_terminated_length": 1314.0,
"completions/mean_length": 1361.0,
"completions/mean_terminated_length": 759.3333740234375,
"completions/min_length": 148.0,
"completions/min_terminated_length": 148.0,
"epoch": 0.00502828409805154,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.12482193857431412,
"kl": 0.05782376229763031,
"learning_rate": 4.722222222222222e-06,
"loss": 0.0023,
"num_tokens": 314701.0,
"reward": -8.09375,
"reward_std": 0.9840351343154907,
"rewards/check_answer/mean": -2.0,
"rewards/check_answer/std": 0.0,
"rewards/check_numbers/mean": -0.84375,
"rewards/check_numbers/std": 0.7685213088989258,
"rewards/format_and_language_reward_func/mean": -3.25,
"rewards/format_and_language_reward_func/std": 0.6831300854682922,
"rewards/match_format_approximately/mean": -2.0,
"rewards/match_format_approximately/std": 0.0,
"rewards/match_format_exactly/mean": 0.0,
"rewards/match_format_exactly/std": 0.0,
"step": 16
},
{
"completion_length": 1330.625,
"completions/clipped_ratio": 0.5625,
"completions/max_length": 1722.0,
"completions/max_terminated_length": 1668.0,
"completions/mean_length": 1330.625,
"completions/mean_terminated_length": 827.4285888671875,
"completions/min_length": 14.0,
"completions/min_terminated_length": 14.0,
"epoch": 0.005342551854179761,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.09849988669157028,
"kl": 0.07715655118227005,
"learning_rate": 4.666666666666667e-06,
"loss": 0.0031,
"num_tokens": 339795.0,
"reward": -7.9375,
"reward_std": 0.375,
"rewards/check_answer/mean": -2.0,
"rewards/check_answer/std": 0.0,
"rewards/check_numbers/mean": -0.9375,
"rewards/check_numbers/std": 0.75,
"rewards/format_and_language_reward_func/mean": -3.0,
"rewards/format_and_language_reward_func/std": 0.0,
"rewards/match_format_approximately/mean": -2.0,
"rewards/match_format_approximately/std": 0.0,
"rewards/match_format_exactly/mean": 0.0,
"rewards/match_format_exactly/std": 0.0,
"step": 17
},
{
"completion_length": 1300.3125,
"completions/clipped_ratio": 0.5,
"completions/max_length": 1722.0,
"completions/max_terminated_length": 1540.0,
"completions/mean_length": 1300.3125,
"completions/mean_terminated_length": 878.625,
"completions/min_length": 5.0,
"completions/min_terminated_length": 5.0,
"epoch": 0.0056568196103079825,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.448221445083618,
"kl": 0.3021676242351532,
"learning_rate": 4.611111111111112e-06,
"loss": 0.0121,
"num_tokens": 364600.0,
"reward": -8.3125,
"reward_std": 1.2111132144927979,
"rewards/check_answer/mean": -2.0,
"rewards/check_answer/std": 0.0,
"rewards/check_numbers/mean": -0.9375,
"rewards/check_numbers/std": 0.75,
"rewards/format_and_language_reward_func/mean": -3.375,
"rewards/format_and_language_reward_func/std": 0.8062257766723633,
"rewards/match_format_approximately/mean": -2.0,
"rewards/match_format_approximately/std": 0.0,
"rewards/match_format_exactly/mean": 0.0,
"rewards/match_format_exactly/std": 0.0,
"step": 18
},
{
"completion_length": 1257.625,
"completions/clipped_ratio": 0.375,
"completions/max_length": 1722.0,
"completions/max_terminated_length": 1613.0,
"completions/mean_length": 1257.625,
"completions/mean_terminated_length": 979.0,
"completions/min_length": 311.0,
"completions/min_terminated_length": 311.0,
"epoch": 0.005971087366436203,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.12901267409324646,
"kl": 0.06548095494508743,
"learning_rate": 4.555555555555556e-06,
"loss": 0.0026,
"num_tokens": 388078.0,
"reward": -7.84375,
"reward_std": 0.8176814913749695,
"rewards/check_answer/mean": -2.0,
"rewards/check_answer/std": 0.0,
"rewards/check_numbers/mean": -0.59375,
"rewards/check_numbers/std": 1.7050782442092896,
"rewards/format_and_language_reward_func/mean": -3.25,
"rewards/format_and_language_reward_func/std": 0.6831300854682922,
"rewards/match_format_approximately/mean": -2.0,
"rewards/match_format_approximately/std": 0.0,
"rewards/match_format_exactly/mean": 0.0,
"rewards/match_format_exactly/std": 0.0,
"step": 19
},
{
"completion_length": 1573.6875,
"completions/clipped_ratio": 0.75,
"completions/max_length": 1722.0,
"completions/max_terminated_length": 1592.0,
"completions/mean_length": 1573.6875,
"completions/mean_terminated_length": 1128.75,
"completions/min_length": 825.0,
"completions/min_terminated_length": 825.0,
"epoch": 0.006285355122564425,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.10170631110668182,
"kl": 0.04396039992570877,
"learning_rate": 4.5e-06,
"loss": 0.0018,
"num_tokens": 417161.0,
"reward": -8.375,
"reward_std": 0.9611132144927979,
"rewards/check_answer/mean": -2.0,
"rewards/check_answer/std": 0.0,
"rewards/check_numbers/mean": -1.125,
"rewards/check_numbers/std": 0.670820415019989,
"rewards/format_and_language_reward_func/mean": -3.25,
"rewards/format_and_language_reward_func/std": 0.6831300854682922,
"rewards/match_format_approximately/mean": -2.0,
"rewards/match_format_approximately/std": 0.0,
"rewards/match_format_exactly/mean": 0.0,
"rewards/match_format_exactly/std": 0.0,
"step": 20
},
{
"completion_length": 1380.5,
"completions/clipped_ratio": 0.375,
"completions/max_length": 1722.0,
"completions/max_terminated_length": 1633.0,
"completions/mean_length": 1380.5,
"completions/mean_terminated_length": 1175.5999755859375,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.006599622878692646,
"frac_reward_zero_std": 0.25,
"grad_norm": 2.5239837169647217,
"kl": 0.16142641007900238,
"learning_rate": 4.444444444444444e-06,
"loss": 0.0065,
"num_tokens": 443361.0,
"reward": -8.03125,
"reward_std": 0.6205127239227295,
"rewards/check_answer/mean": -2.0,
"rewards/check_answer/std": 0.0,
"rewards/check_numbers/mean": -1.03125,
"rewards/check_numbers/std": 0.7180703282356262,
"rewards/format_and_language_reward_func/mean": -3.0,
"rewards/format_and_language_reward_func/std": 0.0,
"rewards/match_format_approximately/mean": -2.0,
"rewards/match_format_approximately/std": 0.0,
"rewards/match_format_exactly/mean": 0.0,
"rewards/match_format_exactly/std": 0.0,
"step": 21
},
{
"completion_length": 1384.25,
"completions/clipped_ratio": 0.5625,
"completions/max_length": 1722.0,
"completions/max_terminated_length": 1601.0,
"completions/mean_length": 1384.25,
"completions/mean_terminated_length": 950.0000610351562,
"completions/min_length": 427.0,
"completions/min_terminated_length": 427.0,
"epoch": 0.0069138906348208675,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.09287308901548386,
"kl": 0.06003550812602043,
"learning_rate": 4.388888888888889e-06,
"loss": 0.0024,
"num_tokens": 469325.0,
"reward": -7.78125,
"reward_std": 0.4375,
"rewards/check_answer/mean": -2.0,
"rewards/check_answer/std": 0.0,
"rewards/check_numbers/mean": -0.65625,
"rewards/check_numbers/std": 0.7685213088989258,
"rewards/format_and_language_reward_func/mean": -3.125,
"rewards/format_and_language_reward_func/std": 0.5,
"rewards/match_format_approximately/mean": -2.0,
"rewards/match_format_approximately/std": 0.0,
"rewards/match_format_exactly/mean": 0.0,
"rewards/match_format_exactly/std": 0.0,
"step": 22
},
{
"completion_length": 1367.4375,
"completions/clipped_ratio": 0.4375,
"completions/max_length": 1722.0,
"completions/max_terminated_length": 1606.0,
"completions/mean_length": 1367.4375,
"completions/mean_terminated_length": 1091.6666259765625,
"completions/min_length": 531.0,
"completions/min_terminated_length": 531.0,
"epoch": 0.007228158390949088,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.3170417845249176,
"kl": 0.06963013857603073,
"learning_rate": 4.333333333333334e-06,
"loss": 0.0028,
"num_tokens": 495200.0,
"reward": -7.1875,
"reward_std": 0.375,
"rewards/check_answer/mean": -2.0,
"rewards/check_answer/std": 0.0,
"rewards/check_numbers/mean": -0.1875,
"rewards/check_numbers/std": 0.5123475790023804,
"rewards/format_and_language_reward_func/mean": -3.0,
"rewards/format_and_language_reward_func/std": 0.0,
"rewards/match_format_approximately/mean": -2.0,
"rewards/match_format_approximately/std": 0.0,
"rewards/match_format_exactly/mean": 0.0,
"rewards/match_format_exactly/std": 0.0,
"step": 23
},
{
"completion_length": 1551.6875,
"completions/clipped_ratio": 0.8125,
"completions/max_length": 1722.0,
"completions/max_terminated_length": 1699.0,
"completions/mean_length": 1551.6875,
"completions/mean_terminated_length": 813.6666870117188,
"completions/min_length": 243.0,
"completions/min_terminated_length": 243.0,
"epoch": 0.00754242614707731,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.20077864825725555,
"kl": 0.05942363664507866,
"learning_rate": 4.277777777777778e-06,
"loss": 0.0024,
"num_tokens": 523435.0,
"reward": -8.28125,
"reward_std": 0.9840351343154907,
"rewards/check_answer/mean": -2.0,
"rewards/check_answer/std": 0.0,
"rewards/check_numbers/mean": -1.03125,
"rewards/check_numbers/std": 0.7180703282356262,
"rewards/format_and_language_reward_func/mean": -3.25,
"rewards/format_and_language_reward_func/std": 0.6831300854682922,
"rewards/match_format_approximately/mean": -2.0,
"rewards/match_format_approximately/std": 0.0,
"rewards/match_format_exactly/mean": 0.0,
"rewards/match_format_exactly/std": 0.0,
"step": 24
},
{
"completion_length": 1567.25,
"completions/clipped_ratio": 0.875,
"completions/max_length": 1722.0,
"completions/max_terminated_length": 833.0,
"completions/mean_length": 1567.25,
"completions/mean_terminated_length": 484.0,
"completions/min_length": 135.0,
"completions/min_terminated_length": 135.0,
"epoch": 0.00785669390320553,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.15381832420825958,
"kl": 0.05258103832602501,
"learning_rate": 4.222222222222223e-06,
"loss": 0.0021,
"num_tokens": 552719.0,
"reward": -7.5,
"reward_std": 0.661700427532196,
"rewards/check_answer/mean": -2.0,
"rewards/check_answer/std": 0.0,
"rewards/check_numbers/mean": -0.375,
"rewards/check_numbers/std": 0.670820415019989,
"rewards/format_and_language_reward_func/mean": -3.125,
"rewards/format_and_language_reward_func/std": 0.5,
"rewards/match_format_approximately/mean": -2.0,
"rewards/match_format_approximately/std": 0.0,
"rewards/match_format_exactly/mean": 0.0,
"rewards/match_format_exactly/std": 0.0,
"step": 25
},
{
"completion_length": 1234.5625,
"completions/clipped_ratio": 0.5,
"completions/max_length": 1722.0,
"completions/max_terminated_length": 1246.0,
"completions/mean_length": 1234.5625,
"completions/mean_terminated_length": 747.125,
"completions/min_length": 43.0,
"completions/min_terminated_length": 43.0,
"epoch": 0.008170961659333752,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.2516714334487915,
"kl": 0.07844924181699753,
"learning_rate": 4.166666666666667e-06,
"loss": 0.0031,
"num_tokens": 576460.0,
"reward": -7.75,
"reward_std": 0.5915063619613647,
"rewards/check_answer/mean": -2.0,
"rewards/check_answer/std": 0.0,
"rewards/check_numbers/mean": -0.75,
"rewards/check_numbers/std": 0.7745966911315918,
"rewards/format_and_language_reward_func/mean": -3.0,
"rewards/format_and_language_reward_func/std": 0.0,
"rewards/match_format_approximately/mean": -2.0,
"rewards/match_format_approximately/std": 0.0,
"rewards/match_format_exactly/mean": 0.0,
"rewards/match_format_exactly/std": 0.0,
"step": 26
},
{
"completion_length": 1580.875,
"completions/clipped_ratio": 0.6875,
"completions/max_length": 1722.0,
"completions/max_terminated_length": 1688.0,
"completions/mean_length": 1580.875,
"completions/mean_terminated_length": 1270.4000244140625,
"completions/min_length": 815.0,
"completions/min_terminated_length": 815.0,
"epoch": 0.008485229415461974,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.08427742123603821,
"kl": 0.040127161890268326,
"learning_rate": 4.111111111111111e-06,
"loss": 0.0016,
"num_tokens": 605630.0,
"reward": -7.84375,
"reward_std": 0.5625,
"rewards/check_answer/mean": -2.0,
"rewards/check_answer/std": 0.0,
"rewards/check_numbers/mean": -0.84375,
"rewards/check_numbers/std": 0.7685213088989258,
"rewards/format_and_language_reward_func/mean": -3.0,
"rewards/format_and_language_reward_func/std": 0.0,
"rewards/match_format_approximately/mean": -2.0,
"rewards/match_format_approximately/std": 0.0,
"rewards/match_format_exactly/mean": 0.0,
"rewards/match_format_exactly/std": 0.0,
"step": 27
},
{
"completion_length": 1360.1875,
"completions/clipped_ratio": 0.5,
"completions/max_length": 1722.0,
"completions/max_terminated_length": 1577.0,
"completions/mean_length": 1360.1875,
"completions/mean_terminated_length": 998.375,
"completions/min_length": 415.0,
"completions/min_terminated_length": 415.0,
"epoch": 0.008799497171590195,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.13703206181526184,
"kl": 0.05611160770058632,
"learning_rate": 4.055555555555556e-06,
"loss": 0.0022,
"num_tokens": 631529.0,
"reward": -8.21875,
"reward_std": 0.9407068490982056,
"rewards/check_answer/mean": -2.0,
"rewards/check_answer/std": 0.0,
"rewards/check_numbers/mean": -0.84375,
"rewards/check_numbers/std": 0.7685213088989258,
"rewards/format_and_language_reward_func/mean": -3.375,
"rewards/format_and_language_reward_func/std": 0.8062257766723633,
"rewards/match_format_approximately/mean": -2.0,
"rewards/match_format_approximately/std": 0.0,
"rewards/match_format_exactly/mean": 0.0,
"rewards/match_format_exactly/std": 0.0,
"step": 28
},
{
"completion_length": 1515.625,
"completions/clipped_ratio": 0.6875,
"completions/max_length": 1722.0,
"completions/max_terminated_length": 1188.0,
"completions/mean_length": 1515.625,
"completions/mean_terminated_length": 1061.5999755859375,
"completions/min_length": 871.0,
"completions/min_terminated_length": 871.0,
"epoch": 0.009113764927718416,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.1371777057647705,
"kl": 0.05326192080974579,
"learning_rate": 4.000000000000001e-06,
"loss": 0.0021,
"num_tokens": 659651.0,
"reward": -8.09375,
"reward_std": 0.9040063619613647,
"rewards/check_answer/mean": -2.0,
"rewards/check_answer/std": 0.0,
"rewards/check_numbers/mean": -0.84375,
"rewards/check_numbers/std": 0.7685213088989258,
"rewards/format_and_language_reward_func/mean": -3.25,
"rewards/format_and_language_reward_func/std": 0.6831300854682922,
"rewards/match_format_approximately/mean": -2.0,
"rewards/match_format_approximately/std": 0.0,
"rewards/match_format_exactly/mean": 0.0,
"rewards/match_format_exactly/std": 0.0,
"step": 29
},
{
"completion_length": 1422.0625,
"completions/clipped_ratio": 0.5625,
"completions/max_length": 1722.0,
"completions/max_terminated_length": 1427.0,
"completions/mean_length": 1422.0625,
"completions/mean_terminated_length": 1036.4285888671875,
"completions/min_length": 1.0,
"completions/min_terminated_length": 1.0,
"epoch": 0.009428032683846637,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.34744393825531,
"kl": 2.590277671813965,
"learning_rate": 3.944444444444445e-06,
"loss": 0.1036,
"num_tokens": 686368.0,
"reward": -7.84375,
"reward_std": 1.1540063619613647,
"rewards/check_answer/mean": -2.0,
"rewards/check_answer/std": 0.0,
"rewards/check_numbers/mean": -0.71875,
"rewards/check_numbers/std": 0.8750000596046448,
"rewards/format_and_language_reward_func/mean": -3.125,
"rewards/format_and_language_reward_func/std": 0.5,
"rewards/match_format_approximately/mean": -2.0,
"rewards/match_format_approximately/std": 0.0,
"rewards/match_format_exactly/mean": 0.0,
"rewards/match_format_exactly/std": 0.0,
"step": 30
},
{
"completion_length": 1592.75,
"completions/clipped_ratio": 0.8125,
"completions/max_length": 1722.0,
"completions/max_terminated_length": 1348.0,
"completions/mean_length": 1592.75,
"completions/mean_terminated_length": 1032.666748046875,
"completions/min_length": 537.0,
"completions/min_terminated_length": 537.0,
"epoch": 0.00974230043997486,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.09155543893575668,
"kl": 0.05414074286818504,
"learning_rate": 3.88888888888889e-06,
"loss": 0.0022,
"num_tokens": 715988.0,
"reward": -7.40625,
"reward_std": 0.4375,
"rewards/check_answer/mean": -2.0,
"rewards/check_answer/std": 0.0,
"rewards/check_numbers/mean": -0.28125,
"rewards/check_numbers/std": 0.6046693325042725,
"rewards/format_and_language_reward_func/mean": -3.125,
"rewards/format_and_language_reward_func/std": 0.5,
"rewards/match_format_approximately/mean": -2.0,
"rewards/match_format_approximately/std": 0.0,
"rewards/match_format_exactly/mean": 0.0,
"rewards/match_format_exactly/std": 0.0,
"step": 31
},
{
"completion_length": 1555.5625,
"completions/clipped_ratio": 0.75,
"completions/max_length": 1722.0,
"completions/max_terminated_length": 1635.0,
"completions/mean_length": 1555.5625,
"completions/mean_terminated_length": 1056.25,
"completions/min_length": 619.0,
"completions/min_terminated_length": 619.0,
"epoch": 0.01005656819610308,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.15684477984905243,
"kl": 0.05063142254948616,
"learning_rate": 3.833333333333334e-06,
"loss": 0.002,
"num_tokens": 744669.0,
"reward": -8.40625,
"reward_std": 1.0148502588272095,
"rewards/check_answer/mean": -2.0,
"rewards/check_answer/std": 0.0,
"rewards/check_numbers/mean": -1.03125,
"rewards/check_numbers/std": 0.7180703282356262,
"rewards/format_and_language_reward_func/mean": -3.375,
"rewards/format_and_language_reward_func/std": 0.8062257766723633,
"rewards/match_format_approximately/mean": -2.0,
"rewards/match_format_approximately/std": 0.0,
"rewards/match_format_exactly/mean": 0.0,
"rewards/match_format_exactly/std": 0.0,
"step": 32
},
{
"completion_length": 1546.9375,
"completions/clipped_ratio": 0.75,
"completions/max_length": 1722.0,
"completions/max_terminated_length": 1335.0,
"completions/mean_length": 1546.9375,
"completions/mean_terminated_length": 1021.75,
"completions/min_length": 747.0,
"completions/min_terminated_length": 747.0,
"epoch": 0.010370835952231301,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.07121387869119644,
"kl": 0.032333556562662125,
"learning_rate": 3.777777777777778e-06,
"loss": 0.0013,
"num_tokens": 773388.0,
"reward": -7.46875,
"reward_std": 0.5625,
"rewards/check_answer/mean": -2.0,
"rewards/check_answer/std": 0.0,
"rewards/check_numbers/mean": -0.46875,
"rewards/check_numbers/std": 0.7180703282356262,
"rewards/format_and_language_reward_func/mean": -3.0,
"rewards/format_and_language_reward_func/std": 0.0,
"rewards/match_format_approximately/mean": -2.0,
"rewards/match_format_approximately/std": 0.0,
"rewards/match_format_exactly/mean": 0.0,
"rewards/match_format_exactly/std": 0.0,
"step": 33
},
{
"completion_length": 1376.4375,
"completions/clipped_ratio": 0.625,
"completions/max_length": 1722.0,
"completions/max_terminated_length": 1313.0,
"completions/mean_length": 1376.4375,
"completions/mean_terminated_length": 800.5,
"completions/min_length": 5.0,
"completions/min_terminated_length": 5.0,
"epoch": 0.010685103708359522,
"frac_reward_zero_std": 0.25,
"grad_norm": 1.1596808433532715,
"kl": 0.12857064604759216,
"learning_rate": 3.7222222222222225e-06,
"loss": 0.0051,
"num_tokens": 799511.0,
"reward": -7.5,
"reward_std": 0.661700427532196,
"rewards/check_answer/mean": -2.0,
"rewards/check_answer/std": 0.0,
"rewards/check_numbers/mean": -0.375,
"rewards/check_numbers/std": 0.670820415019989,
"rewards/format_and_language_reward_func/mean": -3.125,
"rewards/format_and_language_reward_func/std": 0.5,
"rewards/match_format_approximately/mean": -2.0,
"rewards/match_format_approximately/std": 0.0,
"rewards/match_format_exactly/mean": 0.0,
"rewards/match_format_exactly/std": 0.0,
"step": 34
},
{
"completion_length": 1699.6875,
"completions/clipped_ratio": 0.875,
"completions/max_length": 1722.0,
"completions/max_terminated_length": 1711.0,
"completions/mean_length": 1699.6875,
"completions/mean_terminated_length": 1543.5,
"completions/min_length": 1376.0,
"completions/min_terminated_length": 1376.0,
"epoch": 0.010999371464487744,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.11480734497308731,
"kl": 0.04761524498462677,
"learning_rate": 3.6666666666666666e-06,
"loss": 0.0019,
"num_tokens": 830966.0,
"reward": -7.96875,
"reward_std": 1.2129219770431519,
"rewards/check_answer/mean": -2.0,
"rewards/check_answer/std": 0.0,
"rewards/check_numbers/mean": -0.46875,
"rewards/check_numbers/std": 0.7180703282356262,
"rewards/format_and_language_reward_func/mean": -3.5,
"rewards/format_and_language_reward_func/std": 0.8944272398948669,
"rewards/match_format_approximately/mean": -2.0,
"rewards/match_format_approximately/std": 0.0,
"rewards/match_format_exactly/mean": 0.0,
"rewards/match_format_exactly/std": 0.0,
"step": 35
},
{
"completion_length": 1583.75,
"completions/clipped_ratio": 0.75,
"completions/max_length": 1722.0,
"completions/max_terminated_length": 1432.0,
"completions/mean_length": 1583.75,
"completions/mean_terminated_length": 1169.0,
"completions/min_length": 686.0,
"completions/min_terminated_length": 686.0,
"epoch": 0.011313639220615965,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.34630483388900757,
"kl": 0.07976571470499039,
"learning_rate": 3.6111111111111115e-06,
"loss": 0.0032,
"num_tokens": 859930.0,
"reward": -8.09375,
"reward_std": 0.9840351343154907,
"rewards/check_answer/mean": -2.0,
"rewards/check_answer/std": 0.0,
"rewards/check_numbers/mean": -0.84375,
"rewards/check_numbers/std": 0.7685213088989258,
"rewards/format_and_language_reward_func/mean": -3.25,
"rewards/format_and_language_reward_func/std": 0.6831300854682922,
"rewards/match_format_approximately/mean": -2.0,
"rewards/match_format_approximately/std": 0.0,
"rewards/match_format_exactly/mean": 0.0,
"rewards/match_format_exactly/std": 0.0,
"step": 36
},
{
"completion_length": 1338.875,
"completions/clipped_ratio": 0.6875,
"completions/max_length": 1722.0,
"completions/max_terminated_length": 622.0,
"completions/mean_length": 1338.875,
"completions/mean_terminated_length": 496.0,
"completions/min_length": 286.0,
"completions/min_terminated_length": 286.0,
"epoch": 0.011627906976744186,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.1614941507577896,
"kl": 0.06494183838367462,
"learning_rate": 3.555555555555556e-06,
"loss": 0.0026,
"num_tokens": 885120.0,
"reward": -7.6875,
"reward_std": 0.8080127239227295,
"rewards/check_answer/mean": -2.0,
"rewards/check_answer/std": 0.0,
"rewards/check_numbers/mean": -0.5625,
"rewards/check_numbers/std": 0.75,
"rewards/format_and_language_reward_func/mean": -3.125,
"rewards/format_and_language_reward_func/std": 0.5,
"rewards/match_format_approximately/mean": -2.0,
"rewards/match_format_approximately/std": 0.0,
"rewards/match_format_exactly/mean": 0.0,
"rewards/match_format_exactly/std": 0.0,
"step": 37
},
{
"completion_length": 1480.0625,
"completions/clipped_ratio": 0.5625,
"completions/max_length": 1722.0,
"completions/max_terminated_length": 1611.0,
"completions/mean_length": 1480.0625,
"completions/mean_terminated_length": 1169.0,
"completions/min_length": 525.0,
"completions/min_terminated_length": 525.0,
"epoch": 0.011942174732872407,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.07541613280773163,
"kl": 0.052320241928100586,
"learning_rate": 3.5e-06,
"loss": 0.0021,
"num_tokens": 912537.0,
"reward": -8.0,
"reward_std": 0.28867512941360474,
"rewards/check_answer/mean": -2.0,
"rewards/check_answer/std": 0.0,
"rewards/check_numbers/mean": -0.75,
"rewards/check_numbers/std": 0.7745966911315918,
"rewards/format_and_language_reward_func/mean": -3.25,
"rewards/format_and_language_reward_func/std": 0.6831300854682922,
"rewards/match_format_approximately/mean": -2.0,
"rewards/match_format_approximately/std": 0.0,
"rewards/match_format_exactly/mean": 0.0,
"rewards/match_format_exactly/std": 0.0,
"step": 38
},
{
"completion_length": 1373.75,
"completions/clipped_ratio": 0.5625,
"completions/max_length": 1722.0,
"completions/max_terminated_length": 1538.0,
"completions/mean_length": 1373.75,
"completions/mean_terminated_length": 926.0000610351562,
"completions/min_length": 489.0,
"completions/min_terminated_length": 489.0,
"epoch": 0.01225644248900063,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.16380992531776428,
"kl": 0.06148176267743111,
"learning_rate": 3.444444444444445e-06,
"loss": 0.0025,
"num_tokens": 938505.0,
"reward": -8.0,
"reward_std": 1.0497419834136963,
"rewards/check_answer/mean": -2.0,
"rewards/check_answer/std": 0.0,
"rewards/check_numbers/mean": -0.75,
"rewards/check_numbers/std": 0.7745966911315918,
"rewards/format_and_language_reward_func/mean": -3.25,
"rewards/format_and_language_reward_func/std": 0.6831300854682922,
"rewards/match_format_approximately/mean": -2.0,
"rewards/match_format_approximately/std": 0.0,
"rewards/match_format_exactly/mean": 0.0,
"rewards/match_format_exactly/std": 0.0,
"step": 39
},
{
"completion_length": 1519.1875,
"completions/clipped_ratio": 0.75,
"completions/max_length": 1722.0,
"completions/max_terminated_length": 1336.0,
"completions/mean_length": 1519.1875,
"completions/mean_terminated_length": 910.75,
"completions/min_length": 601.0,
"completions/min_terminated_length": 601.0,
"epoch": 0.01257071024512885,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.10772477090358734,
"kl": 0.04433707520365715,
"learning_rate": 3.3888888888888893e-06,
"loss": 0.0018,
"num_tokens": 966940.0,
"reward": -7.59375,
"reward_std": 0.7340351343154907,
"rewards/check_answer/mean": -2.0,
"rewards/check_answer/std": 0.0,
"rewards/check_numbers/mean": -0.46875,
"rewards/check_numbers/std": 0.7180703282356262,
"rewards/format_and_language_reward_func/mean": -3.125,
"rewards/format_and_language_reward_func/std": 0.5,
"rewards/match_format_approximately/mean": -2.0,
"rewards/match_format_approximately/std": 0.0,
"rewards/match_format_exactly/mean": 0.0,
"rewards/match_format_exactly/std": 0.0,
"step": 40
},
{
"completion_length": 1453.9375,
"completions/clipped_ratio": 0.625,
"completions/max_length": 1722.0,
"completions/max_terminated_length": 1432.0,
"completions/mean_length": 1453.9375,
"completions/mean_terminated_length": 1007.1666870117188,
"completions/min_length": 350.0,
"completions/min_terminated_length": 350.0,
"epoch": 0.012884978001257071,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.07585709542036057,
"kl": 0.04436139389872551,
"learning_rate": 3.3333333333333333e-06,
"loss": 0.0018,
"num_tokens": 993859.0,
"reward": -8.125,
"reward_std": 0.5773502588272095,
"rewards/check_answer/mean": -2.0,
"rewards/check_answer/std": 0.0,
"rewards/check_numbers/mean": -0.75,
"rewards/check_numbers/std": 0.7745966911315918,
"rewards/format_and_language_reward_func/mean": -3.375,
"rewards/format_and_language_reward_func/std": 0.8062257766723633,
"rewards/match_format_approximately/mean": -2.0,
"rewards/match_format_approximately/std": 0.0,
"rewards/match_format_exactly/mean": 0.0,
"rewards/match_format_exactly/std": 0.0,
"step": 41
},
{
"completion_length": 1421.5,
"completions/clipped_ratio": 0.5625,
"completions/max_length": 1722.0,
"completions/max_terminated_length": 1470.0,
"completions/mean_length": 1421.5,
"completions/mean_terminated_length": 1035.1429443359375,
"completions/min_length": 118.0,
"completions/min_terminated_length": 118.0,
"epoch": 0.013199245757385292,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.5277220010757446,
"kl": 0.06066755950450897,
"learning_rate": 3.277777777777778e-06,
"loss": 0.0024,
"num_tokens": 1020571.0,
"reward": -8.125,
"reward_std": 1.0517165660858154,
"rewards/check_answer/mean": -2.0,
"rewards/check_answer/std": 0.0,
"rewards/check_numbers/mean": -0.75,
"rewards/check_numbers/std": 0.7745966911315918,
"rewards/format_and_language_reward_func/mean": -3.375,
"rewards/format_and_language_reward_func/std": 0.8062257766723633,
"rewards/match_format_approximately/mean": -2.0,
"rewards/match_format_approximately/std": 0.0,
"rewards/match_format_exactly/mean": 0.0,
"rewards/match_format_exactly/std": 0.0,
"step": 42
},
{
"completion_length": 1514.6875,
"completions/clipped_ratio": 0.6875,
"completions/max_length": 1722.0,
"completions/max_terminated_length": 1326.0,
"completions/mean_length": 1514.6875,
"completions/mean_terminated_length": 1058.5999755859375,
"completions/min_length": 833.0,
"completions/min_terminated_length": 833.0,
"epoch": 0.013513513513513514,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.10104304552078247,
"kl": 0.05354408547282219,
"learning_rate": 3.2222222222222227e-06,
"loss": 0.0021,
"num_tokens": 1049086.0,
"reward": -7.46875,
"reward_std": 0.5625,
"rewards/check_answer/mean": -2.0,
"rewards/check_answer/std": 0.0,
"rewards/check_numbers/mean": -0.46875,
"rewards/check_numbers/std": 0.7180703282356262,
"rewards/format_and_language_reward_func/mean": -3.0,
"rewards/format_and_language_reward_func/std": 0.0,
"rewards/match_format_approximately/mean": -2.0,
"rewards/match_format_approximately/std": 0.0,
"rewards/match_format_exactly/mean": 0.0,
"rewards/match_format_exactly/std": 0.0,
"step": 43
},
{
"completion_length": 1431.4375,
"completions/clipped_ratio": 0.625,
"completions/max_length": 1722.0,
"completions/max_terminated_length": 1554.0,
"completions/mean_length": 1431.4375,
"completions/mean_terminated_length": 947.1666870117188,
"completions/min_length": 320.0,
"completions/min_terminated_length": 320.0,
"epoch": 0.013827781269641735,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.11680426448583603,
"kl": 0.05350031703710556,
"learning_rate": 3.1666666666666667e-06,
"loss": 0.0021,
"num_tokens": 1075785.0,
"reward": -7.46875,
"reward_std": 0.6205127239227295,
"rewards/check_answer/mean": -2.0,
"rewards/check_answer/std": 0.0,
"rewards/check_numbers/mean": -0.46875,
"rewards/check_numbers/std": 0.7180703282356262,
"rewards/format_and_language_reward_func/mean": -3.0,
"rewards/format_and_language_reward_func/std": 0.0,
"rewards/match_format_approximately/mean": -2.0,
"rewards/match_format_approximately/std": 0.0,
"rewards/match_format_exactly/mean": 0.0,
"rewards/match_format_exactly/std": 0.0,
"step": 44
},
{
"completion_length": 1583.875,
"completions/clipped_ratio": 0.625,
"completions/max_length": 1722.0,
"completions/max_terminated_length": 1720.0,
"completions/mean_length": 1583.875,
"completions/mean_terminated_length": 1353.666748046875,
"completions/min_length": 573.0,
"completions/min_terminated_length": 573.0,
"epoch": 0.014142049025769956,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.10828514397144318,
"kl": 0.05207566171884537,
"learning_rate": 3.1111111111111116e-06,
"loss": 0.0021,
"num_tokens": 1105015.0,
"reward": -7.25,
"reward_std": 1.1668819189071655,
"rewards/check_answer/mean": -2.0,
"rewards/check_answer/std": 0.0,
"rewards/check_numbers/mean": -0.125,
"rewards/check_numbers/std": 1.5864006280899048,
"rewards/format_and_language_reward_func/mean": -3.125,
"rewards/format_and_language_reward_func/std": 0.5,
"rewards/match_format_approximately/mean": -2.0,
"rewards/match_format_approximately/std": 0.0,
"rewards/match_format_exactly/mean": 0.0,
"rewards/match_format_exactly/std": 0.0,
"step": 45
},
{
"completion_length": 1366.1875,
"completions/clipped_ratio": 0.5625,
"completions/max_length": 1722.0,
"completions/max_terminated_length": 1669.0,
"completions/mean_length": 1366.1875,
"completions/mean_terminated_length": 908.71435546875,
"completions/min_length": 283.0,
"completions/min_terminated_length": 283.0,
"epoch": 0.014456316781898177,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.09319330006837845,
"kl": 0.044255051761865616,
"learning_rate": 3.055555555555556e-06,
"loss": 0.0018,
"num_tokens": 1130758.0,
"reward": -7.375,
"reward_std": 0.375,
"rewards/check_answer/mean": -2.0,
"rewards/check_answer/std": 0.0,
"rewards/check_numbers/mean": -0.375,
"rewards/check_numbers/std": 0.670820415019989,
"rewards/format_and_language_reward_func/mean": -3.0,
"rewards/format_and_language_reward_func/std": 0.0,
"rewards/match_format_approximately/mean": -2.0,
"rewards/match_format_approximately/std": 0.0,
"rewards/match_format_exactly/mean": 0.0,
"rewards/match_format_exactly/std": 0.0,
"step": 46
},
{
"completion_length": 1549.25,
"completions/clipped_ratio": 0.75,
"completions/max_length": 1722.0,
"completions/max_terminated_length": 1445.0,
"completions/mean_length": 1549.25,
"completions/mean_terminated_length": 1031.0,
"completions/min_length": 798.0,
"completions/min_terminated_length": 798.0,
"epoch": 0.0147705845380264,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.1097634956240654,
"kl": 0.04721890389919281,
"learning_rate": 3e-06,
"loss": 0.0019,
"num_tokens": 1159726.0,
"reward": -7.75,
"reward_std": 0.375,
"rewards/check_answer/mean": -2.0,
"rewards/check_answer/std": 0.0,
"rewards/check_numbers/mean": -0.75,
"rewards/check_numbers/std": 0.7745966911315918,
"rewards/format_and_language_reward_func/mean": -3.0,
"rewards/format_and_language_reward_func/std": 0.0,
"rewards/match_format_approximately/mean": -2.0,
"rewards/match_format_approximately/std": 0.0,
"rewards/match_format_exactly/mean": 0.0,
"rewards/match_format_exactly/std": 0.0,
"step": 47
},
{
"completion_length": 1559.8125,
"completions/clipped_ratio": 0.8125,
"completions/max_length": 1722.0,
"completions/max_terminated_length": 1060.0,
"completions/mean_length": 1559.8125,
"completions/mean_terminated_length": 857.0,
"completions/min_length": 737.0,
"completions/min_terminated_length": 737.0,
"epoch": 0.01508485229415462,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.13540787994861603,
"kl": 0.052234258502721786,
"learning_rate": 2.944444444444445e-06,
"loss": 0.0021,
"num_tokens": 1188527.0,
"reward": -8.1875,
"reward_std": 1.1715351343154907,
"rewards/check_answer/mean": -2.0,
"rewards/check_answer/std": 0.0,
"rewards/check_numbers/mean": -0.9375,
"rewards/check_numbers/std": 0.75,
"rewards/format_and_language_reward_func/mean": -3.25,
"rewards/format_and_language_reward_func/std": 0.6831300854682922,
"rewards/match_format_approximately/mean": -2.0,
"rewards/match_format_approximately/std": 0.0,
"rewards/match_format_exactly/mean": 0.0,
"rewards/match_format_exactly/std": 0.0,
"step": 48
},
{
"completion_length": 1568.125,
"completions/clipped_ratio": 0.6875,
"completions/max_length": 1722.0,
"completions/max_terminated_length": 1591.0,
"completions/mean_length": 1568.125,
"completions/mean_terminated_length": 1229.5999755859375,
"completions/min_length": 958.0,
"completions/min_terminated_length": 958.0,
"epoch": 0.015399120050282841,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.11936990916728973,
"kl": 0.056975651532411575,
"learning_rate": 2.888888888888889e-06,
"loss": 0.0023,
"num_tokens": 1217941.0,
"reward": -8.1875,
"reward_std": 0.8571338653564453,
"rewards/check_answer/mean": -2.0,
"rewards/check_answer/std": 0.0,
"rewards/check_numbers/mean": -0.9375,
"rewards/check_numbers/std": 0.75,
"rewards/format_and_language_reward_func/mean": -3.25,
"rewards/format_and_language_reward_func/std": 0.6831300854682922,
"rewards/match_format_approximately/mean": -2.0,
"rewards/match_format_approximately/std": 0.0,
"rewards/match_format_exactly/mean": 0.0,
"rewards/match_format_exactly/std": 0.0,
"step": 49
},
{
"completion_length": 1495.4375,
"completions/clipped_ratio": 0.6875,
"completions/max_length": 1722.0,
"completions/max_terminated_length": 1325.0,
"completions/mean_length": 1495.4375,
"completions/mean_terminated_length": 997.0,
"completions/min_length": 445.0,
"completions/min_terminated_length": 445.0,
"epoch": 0.01571338780641106,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.1257868856191635,
"kl": 0.051786769181489944,
"learning_rate": 2.8333333333333335e-06,
"loss": 0.0021,
"num_tokens": 1245944.0,
"reward": -7.6875,
"reward_std": 0.7895780801773071,
"rewards/check_answer/mean": -2.0,
"rewards/check_answer/std": 0.0,
"rewards/check_numbers/mean": -0.5625,
"rewards/check_numbers/std": 0.75,
"rewards/format_and_language_reward_func/mean": -3.125,
"rewards/format_and_language_reward_func/std": 0.5,
"rewards/match_format_approximately/mean": -2.0,
"rewards/match_format_approximately/std": 0.0,
"rewards/match_format_exactly/mean": 0.0,
"rewards/match_format_exactly/std": 0.0,
"step": 50
},
{
"completion_length": 1586.0,
"completions/clipped_ratio": 0.6875,
"completions/max_length": 1722.0,
"completions/max_terminated_length": 1700.0,
"completions/mean_length": 1586.0,
"completions/mean_terminated_length": 1286.800048828125,
"completions/min_length": 820.0,
"completions/min_terminated_length": 820.0,
"epoch": 0.016027655562539284,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.11521671712398529,
"kl": 0.047557681798934937,
"learning_rate": 2.7777777777777783e-06,
"loss": 0.0019,
"num_tokens": 1275304.0,
"reward": -8.0,
"reward_std": 0.6636751294136047,
"rewards/check_answer/mean": -2.0,
"rewards/check_answer/std": 0.0,
"rewards/check_numbers/mean": -0.75,
"rewards/check_numbers/std": 0.7745966911315918,
"rewards/format_and_language_reward_func/mean": -3.25,
"rewards/format_and_language_reward_func/std": 0.6831300854682922,
"rewards/match_format_approximately/mean": -2.0,
"rewards/match_format_approximately/std": 0.0,
"rewards/match_format_exactly/mean": 0.0,
"rewards/match_format_exactly/std": 0.0,
"step": 51
},
{
"completion_length": 1589.6875,
"completions/clipped_ratio": 0.75,
"completions/max_length": 1722.0,
"completions/max_terminated_length": 1535.0,
"completions/mean_length": 1589.6875,
"completions/mean_terminated_length": 1192.75,
"completions/min_length": 863.0,
"completions/min_terminated_length": 863.0,
"epoch": 0.016341923318667503,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.24990025162696838,
"kl": 0.0706791803240776,
"learning_rate": 2.7222222222222224e-06,
"loss": 0.0028,
"num_tokens": 1304927.0,
"reward": -7.5625,
"reward_std": 0.21650634706020355,
"rewards/check_answer/mean": -2.0,
"rewards/check_answer/std": 0.0,
"rewards/check_numbers/mean": -0.5625,
"rewards/check_numbers/std": 0.75,
"rewards/format_and_language_reward_func/mean": -3.0,
"rewards/format_and_language_reward_func/std": 0.0,
"rewards/match_format_approximately/mean": -2.0,
"rewards/match_format_approximately/std": 0.0,
"rewards/match_format_exactly/mean": 0.0,
"rewards/match_format_exactly/std": 0.0,
"step": 52
},
{
"completion_length": 1505.0625,
"completions/clipped_ratio": 0.6875,
"completions/max_length": 1722.0,
"completions/max_terminated_length": 1408.0,
"completions/mean_length": 1505.0625,
"completions/mean_terminated_length": 1027.800048828125,
"completions/min_length": 826.0,
"completions/min_terminated_length": 826.0,
"epoch": 0.016656191074795726,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.12817677855491638,
"kl": 0.05584558844566345,
"learning_rate": 2.666666666666667e-06,
"loss": 0.0022,
"num_tokens": 1332692.0,
"reward": -7.625,
"reward_std": 0.875,
"rewards/check_answer/mean": -2.0,
"rewards/check_answer/std": 0.0,
"rewards/check_numbers/mean": -0.375,
"rewards/check_numbers/std": 0.670820415019989,
"rewards/format_and_language_reward_func/mean": -3.25,
"rewards/format_and_language_reward_func/std": 0.6831300854682922,
"rewards/match_format_approximately/mean": -2.0,
"rewards/match_format_approximately/std": 0.0,
"rewards/match_format_exactly/mean": 0.0,
"rewards/match_format_exactly/std": 0.0,
"step": 53
},
{
"completion_length": 1617.8125,
"completions/clipped_ratio": 0.75,
"completions/max_length": 1722.0,
"completions/max_terminated_length": 1689.0,
"completions/mean_length": 1617.8125,
"completions/mean_terminated_length": 1305.25,
"completions/min_length": 897.0,
"completions/min_terminated_length": 897.0,
"epoch": 0.01697045883092395,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.11597556620836258,
"kl": 0.05314599350094795,
"learning_rate": 2.6111111111111113e-06,
"loss": 0.0021,
"num_tokens": 1362965.0,
"reward": -7.8125,
"reward_std": 0.661700427532196,
"rewards/check_answer/mean": -2.0,
"rewards/check_answer/std": 0.0,
"rewards/check_numbers/mean": -0.5625,
"rewards/check_numbers/std": 0.75,
"rewards/format_and_language_reward_func/mean": -3.25,
"rewards/format_and_language_reward_func/std": 0.6831300854682922,
"rewards/match_format_approximately/mean": -2.0,
"rewards/match_format_approximately/std": 0.0,
"rewards/match_format_exactly/mean": 0.0,
"rewards/match_format_exactly/std": 0.0,
"step": 54
},
{
"completion_length": 1612.125,
"completions/clipped_ratio": 0.8125,
"completions/max_length": 1722.0,
"completions/max_terminated_length": 1567.0,
"completions/mean_length": 1612.125,
"completions/mean_terminated_length": 1136.0,
"completions/min_length": 707.0,
"completions/min_terminated_length": 707.0,
"epoch": 0.017284726587052168,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.07392556965351105,
"kl": 0.04997054487466812,
"learning_rate": 2.5555555555555557e-06,
"loss": 0.002,
"num_tokens": 1393151.0,
"reward": -7.46875,
"reward_std": 0.1875,
"rewards/check_answer/mean": -2.0,
"rewards/check_answer/std": 0.0,
"rewards/check_numbers/mean": -0.46875,
"rewards/check_numbers/std": 0.7180703282356262,
"rewards/format_and_language_reward_func/mean": -3.0,
"rewards/format_and_language_reward_func/std": 0.0,
"rewards/match_format_approximately/mean": -2.0,
"rewards/match_format_approximately/std": 0.0,
"rewards/match_format_exactly/mean": 0.0,
"rewards/match_format_exactly/std": 0.0,
"step": 55
},
{
"completion_length": 1571.625,
"completions/clipped_ratio": 0.8125,
"completions/max_length": 1722.0,
"completions/max_terminated_length": 1313.0,
"completions/mean_length": 1571.625,
"completions/mean_terminated_length": 920.0,
"completions/min_length": 403.0,
"completions/min_terminated_length": 403.0,
"epoch": 0.01759899434318039,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.10698790848255157,
"kl": 0.040371235460042953,
"learning_rate": 2.5e-06,
"loss": 0.0016,
"num_tokens": 1422181.0,
"reward": -7.9375,
"reward_std": 0.5915063619613647,
"rewards/check_answer/mean": -2.0,
"rewards/check_answer/std": 0.0,
"rewards/check_numbers/mean": -0.9375,
"rewards/check_numbers/std": 0.75,
"rewards/format_and_language_reward_func/mean": -3.0,
"rewards/format_and_language_reward_func/std": 0.0,
"rewards/match_format_approximately/mean": -2.0,
"rewards/match_format_approximately/std": 0.0,
"rewards/match_format_exactly/mean": 0.0,
"rewards/match_format_exactly/std": 0.0,
"step": 56
},
{
"completion_length": 1594.9375,
"completions/clipped_ratio": 0.875,
"completions/max_length": 1722.0,
"completions/max_terminated_length": 707.0,
"completions/mean_length": 1594.9375,
"completions/mean_terminated_length": 705.5,
"completions/min_length": 704.0,
"completions/min_terminated_length": 704.0,
"epoch": 0.017913262099308613,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.6847302317619324,
"kl": 0.04597615823149681,
"learning_rate": 2.4444444444444447e-06,
"loss": 0.0018,
"num_tokens": 1451644.0,
"reward": -7.59375,
"reward_std": 0.6205127239227295,
"rewards/check_answer/mean": -2.0,
"rewards/check_answer/std": 0.0,
"rewards/check_numbers/mean": -0.46875,
"rewards/check_numbers/std": 0.7180703282356262,
"rewards/format_and_language_reward_func/mean": -3.125,
"rewards/format_and_language_reward_func/std": 0.5,
"rewards/match_format_approximately/mean": -2.0,
"rewards/match_format_approximately/std": 0.0,
"rewards/match_format_exactly/mean": 0.0,
"rewards/match_format_exactly/std": 0.0,
"step": 57
},
{
"completion_length": 1503.375,
"completions/clipped_ratio": 0.8125,
"completions/max_length": 1722.0,
"completions/max_terminated_length": 640.0,
"completions/mean_length": 1503.375,
"completions/mean_terminated_length": 556.0,
"completions/min_length": 475.0,
"completions/min_terminated_length": 475.0,
"epoch": 0.018227529855436832,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.09949938207864761,
"kl": 0.056429892778396606,
"learning_rate": 2.388888888888889e-06,
"loss": 0.0023,
"num_tokens": 1479182.0,
"reward": -7.96875,
"reward_std": 0.6540063619613647,
"rewards/check_answer/mean": -2.0,
"rewards/check_answer/std": 0.0,
"rewards/check_numbers/mean": -0.84375,
"rewards/check_numbers/std": 0.7685213088989258,
"rewards/format_and_language_reward_func/mean": -3.125,
"rewards/format_and_language_reward_func/std": 0.5,
"rewards/match_format_approximately/mean": -2.0,
"rewards/match_format_approximately/std": 0.0,
"rewards/match_format_exactly/mean": 0.0,
"rewards/match_format_exactly/std": 0.0,
"step": 58
},
{
"completion_length": 1563.0,
"completions/clipped_ratio": 0.6875,
"completions/max_length": 1722.0,
"completions/max_terminated_length": 1411.0,
"completions/mean_length": 1563.0,
"completions/mean_terminated_length": 1213.2000732421875,
"completions/min_length": 990.0,
"completions/min_terminated_length": 990.0,
"epoch": 0.018541797611565054,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.11543877422809601,
"kl": 0.035633672028779984,
"learning_rate": 2.3333333333333336e-06,
"loss": 0.0014,
"num_tokens": 1508062.0,
"reward": -7.65625,
"reward_std": 0.7790063619613647,
"rewards/check_answer/mean": -2.0,
"rewards/check_answer/std": 0.0,
"rewards/check_numbers/mean": -0.65625,
"rewards/check_numbers/std": 0.7685213088989258,
"rewards/format_and_language_reward_func/mean": -3.0,
"rewards/format_and_language_reward_func/std": 0.0,
"rewards/match_format_approximately/mean": -2.0,
"rewards/match_format_approximately/std": 0.0,
"rewards/match_format_exactly/mean": 0.0,
"rewards/match_format_exactly/std": 0.0,
"step": 59
},
{
"completion_length": 1707.75,
"completions/clipped_ratio": 0.8125,
"completions/max_length": 1722.0,
"completions/max_terminated_length": 1691.0,
"completions/mean_length": 1707.75,
"completions/mean_terminated_length": 1646.0,
"completions/min_length": 1605.0,
"completions/min_terminated_length": 1605.0,
"epoch": 0.018856065367693273,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.0639866441488266,
"kl": 0.05725252255797386,
"learning_rate": 2.277777777777778e-06,
"loss": 0.0023,
"num_tokens": 1539650.0,
"reward": -7.25,
"reward_std": 0.28867512941360474,
"rewards/check_answer/mean": -2.0,
"rewards/check_answer/std": 0.0,
"rewards/check_numbers/mean": 0.0,
"rewards/check_numbers/std": 0.0,
"rewards/format_and_language_reward_func/mean": -3.25,
"rewards/format_and_language_reward_func/std": 0.6831300854682922,
"rewards/match_format_approximately/mean": -2.0,
"rewards/match_format_approximately/std": 0.0,
"rewards/match_format_exactly/mean": 0.0,
"rewards/match_format_exactly/std": 0.0,
"step": 60
},
{
"completion_length": 1535.1875,
"completions/clipped_ratio": 0.75,
"completions/max_length": 1722.0,
"completions/max_terminated_length": 1665.0,
"completions/mean_length": 1535.1875,
"completions/mean_terminated_length": 974.75,
"completions/min_length": 374.0,
"completions/min_terminated_length": 374.0,
"epoch": 0.019170333123821496,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.2154313325881958,
"kl": 0.06772614270448685,
"learning_rate": 2.222222222222222e-06,
"loss": 0.0027,
"num_tokens": 1567677.0,
"reward": -8.09375,
"reward_std": 0.9407067894935608,
"rewards/check_answer/mean": -2.0,
"rewards/check_answer/std": 0.0,
"rewards/check_numbers/mean": -0.84375,
"rewards/check_numbers/std": 0.7685213088989258,
"rewards/format_and_language_reward_func/mean": -3.25,
"rewards/format_and_language_reward_func/std": 0.6831300854682922,
"rewards/match_format_approximately/mean": -2.0,
"rewards/match_format_approximately/std": 0.0,
"rewards/match_format_exactly/mean": 0.0,
"rewards/match_format_exactly/std": 0.0,
"step": 61
},
{
"completion_length": 1480.75,
"completions/clipped_ratio": 0.75,
"completions/max_length": 1722.0,
"completions/max_terminated_length": 1193.0,
"completions/mean_length": 1480.75,
"completions/mean_terminated_length": 757.0,
"completions/min_length": 323.0,
"completions/min_terminated_length": 323.0,
"epoch": 0.01948460087994972,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.18533650040626526,
"kl": 0.04637778922915459,
"learning_rate": 2.166666666666667e-06,
"loss": 0.0019,
"num_tokens": 1595629.0,
"reward": -7.5625,
"reward_std": 0.8080127239227295,
"rewards/check_answer/mean": -2.0,
"rewards/check_answer/std": 0.0,
"rewards/check_numbers/mean": -0.5625,
"rewards/check_numbers/std": 0.75,
"rewards/format_and_language_reward_func/mean": -3.0,
"rewards/format_and_language_reward_func/std": 0.0,
"rewards/match_format_approximately/mean": -2.0,
"rewards/match_format_approximately/std": 0.0,
"rewards/match_format_exactly/mean": 0.0,
"rewards/match_format_exactly/std": 0.0,
"step": 62
},
{
"completion_length": 1604.8125,
"completions/clipped_ratio": 0.8125,
"completions/max_length": 1722.0,
"completions/max_terminated_length": 1672.0,
"completions/mean_length": 1604.8125,
"completions/mean_terminated_length": 1097.0,
"completions/min_length": 560.0,
"completions/min_terminated_length": 560.0,
"epoch": 0.019798868636077938,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.17045044898986816,
"kl": 0.05853426456451416,
"learning_rate": 2.1111111111111114e-06,
"loss": 0.0023,
"num_tokens": 1624594.0,
"reward": -8.40625,
"reward_std": 0.7628755569458008,
"rewards/check_answer/mean": -2.0,
"rewards/check_answer/std": 0.0,
"rewards/check_numbers/mean": -1.03125,
"rewards/check_numbers/std": 0.7180703282356262,
"rewards/format_and_language_reward_func/mean": -3.375,
"rewards/format_and_language_reward_func/std": 0.8062257766723633,
"rewards/match_format_approximately/mean": -2.0,
"rewards/match_format_approximately/std": 0.0,
"rewards/match_format_exactly/mean": 0.0,
"rewards/match_format_exactly/std": 0.0,
"step": 63
},
{
"completion_length": 1406.9375,
"completions/clipped_ratio": 0.625,
"completions/max_length": 1722.0,
"completions/max_terminated_length": 1612.0,
"completions/mean_length": 1406.9375,
"completions/mean_terminated_length": 881.8333740234375,
"completions/min_length": 494.0,
"completions/min_terminated_length": 494.0,
"epoch": 0.02011313639220616,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.13805745542049408,
"kl": 0.045212242752313614,
"learning_rate": 2.0555555555555555e-06,
"loss": 0.0018,
"num_tokens": 1650865.0,
"reward": -8.15625,
"reward_std": 0.4955126941204071,
"rewards/check_answer/mean": -2.0,
"rewards/check_answer/std": 0.0,
"rewards/check_numbers/mean": -1.03125,
"rewards/check_numbers/std": 0.7180703282356262,
"rewards/format_and_language_reward_func/mean": -3.125,
"rewards/format_and_language_reward_func/std": 0.5,
"rewards/match_format_approximately/mean": -2.0,
"rewards/match_format_approximately/std": 0.0,
"rewards/match_format_exactly/mean": 0.0,
"rewards/match_format_exactly/std": 0.0,
"step": 64
},
{
"completion_length": 1511.0625,
"completions/clipped_ratio": 0.625,
"completions/max_length": 1722.0,
"completions/max_terminated_length": 1656.0,
"completions/mean_length": 1511.0625,
"completions/mean_terminated_length": 1159.5,
"completions/min_length": 661.0,
"completions/min_terminated_length": 661.0,
"epoch": 0.02042740414833438,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.10470445454120636,
"kl": 0.040612928569316864,
"learning_rate": 2.0000000000000003e-06,
"loss": 0.0016,
"num_tokens": 1679182.0,
"reward": -8.0,
"reward_std": 0.9917292594909668,
"rewards/check_answer/mean": -2.0,
"rewards/check_answer/std": 0.0,
"rewards/check_numbers/mean": -0.75,
"rewards/check_numbers/std": 0.7745966911315918,
"rewards/format_and_language_reward_func/mean": -3.25,
"rewards/format_and_language_reward_func/std": 0.6831300854682922,
"rewards/match_format_approximately/mean": -2.0,
"rewards/match_format_approximately/std": 0.0,
"rewards/match_format_exactly/mean": 0.0,
"rewards/match_format_exactly/std": 0.0,
"step": 65
},
{
"completion_length": 1688.3125,
"completions/clipped_ratio": 0.875,
"completions/max_length": 1722.0,
"completions/max_terminated_length": 1570.0,
"completions/mean_length": 1688.3125,
"completions/mean_terminated_length": 1452.5,
"completions/min_length": 1335.0,
"completions/min_terminated_length": 1335.0,
"epoch": 0.020741671904462602,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.0923866257071495,
"kl": 0.04577171802520752,
"learning_rate": 1.944444444444445e-06,
"loss": 0.0018,
"num_tokens": 1710331.0,
"reward": -6.875,
"reward_std": 0.9716878533363342,
"rewards/check_answer/mean": -2.0,
"rewards/check_answer/std": 0.0,
"rewards/check_numbers/mean": 0.25,
"rewards/check_numbers/std": 1.3662601709365845,
"rewards/format_and_language_reward_func/mean": -3.125,
"rewards/format_and_language_reward_func/std": 0.5,
"rewards/match_format_approximately/mean": -2.0,
"rewards/match_format_approximately/std": 0.0,
"rewards/match_format_exactly/mean": 0.0,
"rewards/match_format_exactly/std": 0.0,
"step": 66
},
{
"completion_length": 1652.3125,
"completions/clipped_ratio": 0.9375,
"completions/max_length": 1722.0,
"completions/max_terminated_length": 607.0,
"completions/mean_length": 1652.3125,
"completions/mean_terminated_length": 607.0,
"completions/min_length": 607.0,
"completions/min_terminated_length": 607.0,
"epoch": 0.021055939660590824,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.13661737740039825,
"kl": 0.0424213632941246,
"learning_rate": 1.888888888888889e-06,
"loss": 0.0017,
"num_tokens": 1741012.0,
"reward": -7.71875,
"reward_std": 0.8185844421386719,
"rewards/check_answer/mean": -2.0,
"rewards/check_answer/std": 0.0,
"rewards/check_numbers/mean": -0.46875,
"rewards/check_numbers/std": 0.7180703282356262,
"rewards/format_and_language_reward_func/mean": -3.25,
"rewards/format_and_language_reward_func/std": 0.6831300854682922,
"rewards/match_format_approximately/mean": -2.0,
"rewards/match_format_approximately/std": 0.0,
"rewards/match_format_exactly/mean": 0.0,
"rewards/match_format_exactly/std": 0.0,
"step": 67
},
{
"completion_length": 1409.125,
"completions/clipped_ratio": 0.625,
"completions/max_length": 1722.0,
"completions/max_terminated_length": 1383.0,
"completions/mean_length": 1409.125,
"completions/mean_terminated_length": 887.6666870117188,
"completions/min_length": 636.0,
"completions/min_terminated_length": 636.0,
"epoch": 0.021370207416719043,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.15749689936637878,
"kl": 0.0645652636885643,
"learning_rate": 1.8333333333333333e-06,
"loss": 0.0026,
"num_tokens": 1767746.0,
"reward": -8.125,
"reward_std": 1.5154354572296143,
"rewards/check_answer/mean": -2.0,
"rewards/check_answer/std": 0.0,
"rewards/check_numbers/mean": -0.375,
"rewards/check_numbers/std": 0.670820415019989,
"rewards/format_and_language_reward_func/mean": -3.75,
"rewards/format_and_language_reward_func/std": 1.0,
"rewards/match_format_approximately/mean": -2.0,
"rewards/match_format_approximately/std": 0.0,
"rewards/match_format_exactly/mean": 0.0,
"rewards/match_format_exactly/std": 0.0,
"step": 68
},
{
"completion_length": 1533.5,
"completions/clipped_ratio": 0.6875,
"completions/max_length": 1722.0,
"completions/max_terminated_length": 1719.0,
"completions/mean_length": 1533.5,
"completions/mean_terminated_length": 1118.800048828125,
"completions/min_length": 381.0,
"completions/min_terminated_length": 381.0,
"epoch": 0.021684475172847266,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.10066410154104233,
"kl": 0.0493154339492321,
"learning_rate": 1.777777777777778e-06,
"loss": 0.002,
"num_tokens": 1795630.0,
"reward": -8.09375,
"reward_std": 0.7028881907463074,
"rewards/check_answer/mean": -2.0,
"rewards/check_answer/std": 0.0,
"rewards/check_numbers/mean": -0.84375,
"rewards/check_numbers/std": 0.7685213088989258,
"rewards/format_and_language_reward_func/mean": -3.25,
"rewards/format_and_language_reward_func/std": 0.6831300854682922,
"rewards/match_format_approximately/mean": -2.0,
"rewards/match_format_approximately/std": 0.0,
"rewards/match_format_exactly/mean": 0.0,
"rewards/match_format_exactly/std": 0.0,
"step": 69
},
{
"completion_length": 1512.1875,
"completions/clipped_ratio": 0.75,
"completions/max_length": 1722.0,
"completions/max_terminated_length": 1365.0,
"completions/mean_length": 1512.1875,
"completions/mean_terminated_length": 882.75,
"completions/min_length": 438.0,
"completions/min_terminated_length": 438.0,
"epoch": 0.02199874292897549,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.08896202594041824,
"kl": 0.054354239255189896,
"learning_rate": 1.7222222222222224e-06,
"loss": 0.0022,
"num_tokens": 1823761.0,
"reward": -7.6875,
"reward_std": 0.46650636196136475,
"rewards/check_answer/mean": -2.0,
"rewards/check_answer/std": 0.0,
"rewards/check_numbers/mean": -0.5625,
"rewards/check_numbers/std": 0.75,
"rewards/format_and_language_reward_func/mean": -3.125,
"rewards/format_and_language_reward_func/std": 0.5,
"rewards/match_format_approximately/mean": -2.0,
"rewards/match_format_approximately/std": 0.0,
"rewards/match_format_exactly/mean": 0.0,
"rewards/match_format_exactly/std": 0.0,
"step": 70
},
{
"completion_length": 1664.4375,
"completions/clipped_ratio": 0.875,
"completions/max_length": 1722.0,
"completions/max_terminated_length": 1470.0,
"completions/mean_length": 1664.4375,
"completions/mean_terminated_length": 1261.5,
"completions/min_length": 1053.0,
"completions/min_terminated_length": 1053.0,
"epoch": 0.022313010685103708,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.0697084367275238,
"kl": 0.031285081058740616,
"learning_rate": 1.6666666666666667e-06,
"loss": 0.0013,
"num_tokens": 1854820.0,
"reward": -7.5625,
"reward_std": 0.375,
"rewards/check_answer/mean": -2.0,
"rewards/check_answer/std": 0.0,
"rewards/check_numbers/mean": -0.5625,
"rewards/check_numbers/std": 0.75,
"rewards/format_and_language_reward_func/mean": -3.0,
"rewards/format_and_language_reward_func/std": 0.0,
"rewards/match_format_approximately/mean": -2.0,
"rewards/match_format_approximately/std": 0.0,
"rewards/match_format_exactly/mean": 0.0,
"rewards/match_format_exactly/std": 0.0,
"step": 71
},
{
"completion_length": 1451.5,
"completions/clipped_ratio": 0.625,
"completions/max_length": 1722.0,
"completions/max_terminated_length": 1602.0,
"completions/mean_length": 1451.5,
"completions/mean_terminated_length": 1000.6666870117188,
"completions/min_length": 462.0,
"completions/min_terminated_length": 462.0,
"epoch": 0.02262727844123193,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.1362772285938263,
"kl": 0.057278823107481,
"learning_rate": 1.6111111111111113e-06,
"loss": 0.0023,
"num_tokens": 1882008.0,
"reward": -7.875,
"reward_std": 1.118110179901123,
"rewards/check_answer/mean": -2.0,
"rewards/check_answer/std": 0.0,
"rewards/check_numbers/mean": -0.375,
"rewards/check_numbers/std": 0.670820415019989,
"rewards/format_and_language_reward_func/mean": -3.5,
"rewards/format_and_language_reward_func/std": 0.8944272398948669,
"rewards/match_format_approximately/mean": -2.0,
"rewards/match_format_approximately/std": 0.0,
"rewards/match_format_exactly/mean": 0.0,
"rewards/match_format_exactly/std": 0.0,
"step": 72
},
{
"completion_length": 1348.5,
"completions/clipped_ratio": 0.5,
"completions/max_length": 1722.0,
"completions/max_terminated_length": 1501.0,
"completions/mean_length": 1348.5,
"completions/mean_terminated_length": 975.0,
"completions/min_length": 235.0,
"completions/min_terminated_length": 235.0,
"epoch": 0.02294154619736015,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.11800263077020645,
"kl": 0.05172204598784447,
"learning_rate": 1.5555555555555558e-06,
"loss": 0.0021,
"num_tokens": 1907992.0,
"reward": -7.78125,
"reward_std": 0.4375,
"rewards/check_answer/mean": -2.0,
"rewards/check_answer/std": 0.0,
"rewards/check_numbers/mean": -0.65625,
"rewards/check_numbers/std": 0.7685213088989258,
"rewards/format_and_language_reward_func/mean": -3.125,
"rewards/format_and_language_reward_func/std": 0.5,
"rewards/match_format_approximately/mean": -2.0,
"rewards/match_format_approximately/std": 0.0,
"rewards/match_format_exactly/mean": 0.0,
"rewards/match_format_exactly/std": 0.0,
"step": 73
},
{
"completion_length": 1543.5625,
"completions/clipped_ratio": 0.625,
"completions/max_length": 1722.0,
"completions/max_terminated_length": 1672.0,
"completions/mean_length": 1543.5625,
"completions/mean_terminated_length": 1246.166748046875,
"completions/min_length": 603.0,
"completions/min_terminated_length": 603.0,
"epoch": 0.023255813953488372,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.12982743978500366,
"kl": 0.064728744328022,
"learning_rate": 1.5e-06,
"loss": 0.0026,
"num_tokens": 1936533.0,
"reward": -7.53125,
"reward_std": 0.6926814913749695,
"rewards/check_answer/mean": -2.0,
"rewards/check_answer/std": 0.0,
"rewards/check_numbers/mean": -0.28125,
"rewards/check_numbers/std": 0.6046693325042725,
"rewards/format_and_language_reward_func/mean": -3.25,
"rewards/format_and_language_reward_func/std": 0.6831300854682922,
"rewards/match_format_approximately/mean": -2.0,
"rewards/match_format_approximately/std": 0.0,
"rewards/match_format_exactly/mean": 0.0,
"rewards/match_format_exactly/std": 0.0,
"step": 74
},
{
"completion_length": 1450.4375,
"completions/clipped_ratio": 0.75,
"completions/max_length": 1722.0,
"completions/max_terminated_length": 1525.0,
"completions/mean_length": 1450.4375,
"completions/mean_terminated_length": 635.75,
"completions/min_length": 71.0,
"completions/min_terminated_length": 71.0,
"epoch": 0.023570081709616594,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.081117182970047,
"kl": 0.05186208337545395,
"learning_rate": 1.4444444444444445e-06,
"loss": 0.0021,
"num_tokens": 1963812.0,
"reward": -7.75,
"reward_std": 0.5386751294136047,
"rewards/check_answer/mean": -2.0,
"rewards/check_answer/std": 0.0,
"rewards/check_numbers/mean": -0.375,
"rewards/check_numbers/std": 0.670820415019989,
"rewards/format_and_language_reward_func/mean": -3.375,
"rewards/format_and_language_reward_func/std": 0.8062257766723633,
"rewards/match_format_approximately/mean": -2.0,
"rewards/match_format_approximately/std": 0.0,
"rewards/match_format_exactly/mean": 0.0,
"rewards/match_format_exactly/std": 0.0,
"step": 75
},
{
"completion_length": 1392.25,
"completions/clipped_ratio": 0.6875,
"completions/max_length": 1722.0,
"completions/max_terminated_length": 1499.0,
"completions/mean_length": 1392.25,
"completions/mean_terminated_length": 666.7999877929688,
"completions/min_length": 31.0,
"completions/min_terminated_length": 31.0,
"epoch": 0.023884349465744813,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.37952759861946106,
"kl": 0.06186853349208832,
"learning_rate": 1.3888888888888892e-06,
"loss": 0.0025,
"num_tokens": 1990520.0,
"reward": -7.65625,
"reward_std": 0.6205127239227295,
"rewards/check_answer/mean": -2.0,
"rewards/check_answer/std": 0.0,
"rewards/check_numbers/mean": -0.65625,
"rewards/check_numbers/std": 0.7685213088989258,
"rewards/format_and_language_reward_func/mean": -3.0,
"rewards/format_and_language_reward_func/std": 0.0,
"rewards/match_format_approximately/mean": -2.0,
"rewards/match_format_approximately/std": 0.0,
"rewards/match_format_exactly/mean": 0.0,
"rewards/match_format_exactly/std": 0.0,
"step": 76
},
{
"completion_length": 1473.9375,
"completions/clipped_ratio": 0.6875,
"completions/max_length": 1722.0,
"completions/max_terminated_length": 1490.0,
"completions/mean_length": 1473.9375,
"completions/mean_terminated_length": 928.2000122070312,
"completions/min_length": 420.0,
"completions/min_terminated_length": 420.0,
"epoch": 0.024198617221873036,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.1677270531654358,
"kl": 0.053330324590206146,
"learning_rate": 1.3333333333333334e-06,
"loss": 0.0021,
"num_tokens": 2018063.0,
"reward": -7.75,
"reward_std": 0.75,
"rewards/check_answer/mean": -2.0,
"rewards/check_answer/std": 0.0,
"rewards/check_numbers/mean": -0.75,
"rewards/check_numbers/std": 0.7745966911315918,
"rewards/format_and_language_reward_func/mean": -3.0,
"rewards/format_and_language_reward_func/std": 0.0,
"rewards/match_format_approximately/mean": -2.0,
"rewards/match_format_approximately/std": 0.0,
"rewards/match_format_exactly/mean": 0.0,
"rewards/match_format_exactly/std": 0.0,
"step": 77
},
{
"completion_length": 1513.625,
"completions/clipped_ratio": 0.6875,
"completions/max_length": 1722.0,
"completions/max_terminated_length": 1547.0,
"completions/mean_length": 1513.625,
"completions/mean_terminated_length": 1055.2000732421875,
"completions/min_length": 599.0,
"completions/min_terminated_length": 599.0,
"epoch": 0.02451288497800126,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.1181650385260582,
"kl": 0.05255034193396568,
"learning_rate": 1.2777777777777779e-06,
"loss": 0.0021,
"num_tokens": 2046037.0,
"reward": -7.65625,
"reward_std": 0.6926814913749695,
"rewards/check_answer/mean": -2.0,
"rewards/check_answer/std": 0.0,
"rewards/check_numbers/mean": -0.28125,
"rewards/check_numbers/std": 0.6046693325042725,
"rewards/format_and_language_reward_func/mean": -3.375,
"rewards/format_and_language_reward_func/std": 0.8062257766723633,
"rewards/match_format_approximately/mean": -2.0,
"rewards/match_format_approximately/std": 0.0,
"rewards/match_format_exactly/mean": 0.0,
"rewards/match_format_exactly/std": 0.0,
"step": 78
},
{
"completion_length": 1411.25,
"completions/clipped_ratio": 0.5625,
"completions/max_length": 1722.0,
"completions/max_terminated_length": 1426.0,
"completions/mean_length": 1411.25,
"completions/mean_terminated_length": 1011.71435546875,
"completions/min_length": 793.0,
"completions/min_terminated_length": 793.0,
"epoch": 0.024827152734129478,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.1206207275390625,
"kl": 0.05586665868759155,
"learning_rate": 1.2222222222222223e-06,
"loss": 0.0022,
"num_tokens": 2072805.0,
"reward": -7.78125,
"reward_std": 0.8977102637290955,
"rewards/check_answer/mean": -2.0,
"rewards/check_answer/std": 0.0,
"rewards/check_numbers/mean": -0.28125,
"rewards/check_numbers/std": 0.6046693325042725,
"rewards/format_and_language_reward_func/mean": -3.5,
"rewards/format_and_language_reward_func/std": 0.8944272398948669,
"rewards/match_format_approximately/mean": -2.0,
"rewards/match_format_approximately/std": 0.0,
"rewards/match_format_exactly/mean": 0.0,
"rewards/match_format_exactly/std": 0.0,
"step": 79
},
{
"completion_length": 1497.875,
"completions/clipped_ratio": 0.75,
"completions/max_length": 1722.0,
"completions/max_terminated_length": 1400.0,
"completions/mean_length": 1497.875,
"completions/mean_terminated_length": 825.5,
"completions/min_length": 494.0,
"completions/min_terminated_length": 494.0,
"epoch": 0.0251414204902577,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.15405312180519104,
"kl": 0.05234729126095772,
"learning_rate": 1.1666666666666668e-06,
"loss": 0.0021,
"num_tokens": 2100631.0,
"reward": -7.5,
"reward_std": 1.9904643297195435,
"rewards/check_answer/mean": -2.0,
"rewards/check_answer/std": 0.0,
"rewards/check_numbers/mean": -0.125,
"rewards/check_numbers/std": 1.5864006280899048,
"rewards/format_and_language_reward_func/mean": -3.375,
"rewards/format_and_language_reward_func/std": 0.8062257766723633,
"rewards/match_format_approximately/mean": -2.0,
"rewards/match_format_approximately/std": 0.0,
"rewards/match_format_exactly/mean": 0.0,
"rewards/match_format_exactly/std": 0.0,
"step": 80
},
{
"completion_length": 1425.0,
"completions/clipped_ratio": 0.625,
"completions/max_length": 1722.0,
"completions/max_terminated_length": 1576.0,
"completions/mean_length": 1425.0,
"completions/mean_terminated_length": 930.0,
"completions/min_length": 79.0,
"completions/min_terminated_length": 79.0,
"epoch": 0.02545568824638592,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.11791858822107315,
"kl": 0.04198841378092766,
"learning_rate": 1.111111111111111e-06,
"loss": 0.0017,
"num_tokens": 2127275.0,
"reward": -7.46875,
"reward_std": 0.6205127239227295,
"rewards/check_answer/mean": -2.0,
"rewards/check_answer/std": 0.0,
"rewards/check_numbers/mean": -0.46875,
"rewards/check_numbers/std": 0.7180703282356262,
"rewards/format_and_language_reward_func/mean": -3.0,
"rewards/format_and_language_reward_func/std": 0.0,
"rewards/match_format_approximately/mean": -2.0,
"rewards/match_format_approximately/std": 0.0,
"rewards/match_format_exactly/mean": 0.0,
"rewards/match_format_exactly/std": 0.0,
"step": 81
},
{
"completion_length": 1682.5,
"completions/clipped_ratio": 0.8125,
"completions/max_length": 1722.0,
"completions/max_terminated_length": 1662.0,
"completions/mean_length": 1682.5,
"completions/mean_terminated_length": 1511.3333740234375,
"completions/min_length": 1430.0,
"completions/min_terminated_length": 1430.0,
"epoch": 0.025769956002514142,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.1258944272994995,
"kl": 0.03991963341832161,
"learning_rate": 1.0555555555555557e-06,
"loss": 0.0016,
"num_tokens": 2157831.0,
"reward": -7.96875,
"reward_std": 0.8125,
"rewards/check_answer/mean": -2.0,
"rewards/check_answer/std": 0.0,
"rewards/check_numbers/mean": -0.84375,
"rewards/check_numbers/std": 0.7685213088989258,
"rewards/format_and_language_reward_func/mean": -3.125,
"rewards/format_and_language_reward_func/std": 0.5,
"rewards/match_format_approximately/mean": -2.0,
"rewards/match_format_approximately/std": 0.0,
"rewards/match_format_exactly/mean": 0.0,
"rewards/match_format_exactly/std": 0.0,
"step": 82
},
{
"completion_length": 1560.0,
"completions/clipped_ratio": 0.8125,
"completions/max_length": 1722.0,
"completions/max_terminated_length": 1502.0,
"completions/mean_length": 1560.0,
"completions/mean_terminated_length": 858.0,
"completions/min_length": 199.0,
"completions/min_terminated_length": 199.0,
"epoch": 0.026084223758642364,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.25008276104927063,
"kl": 0.04483529552817345,
"learning_rate": 1.0000000000000002e-06,
"loss": 0.0018,
"num_tokens": 2186663.0,
"reward": -7.5625,
"reward_std": 0.5915063619613647,
"rewards/check_answer/mean": -2.0,
"rewards/check_answer/std": 0.0,
"rewards/check_numbers/mean": -0.5625,
"rewards/check_numbers/std": 0.75,
"rewards/format_and_language_reward_func/mean": -3.0,
"rewards/format_and_language_reward_func/std": 0.0,
"rewards/match_format_approximately/mean": -2.0,
"rewards/match_format_approximately/std": 0.0,
"rewards/match_format_exactly/mean": 0.0,
"rewards/match_format_exactly/std": 0.0,
"step": 83
},
{
"completion_length": 1363.8125,
"completions/clipped_ratio": 0.625,
"completions/max_length": 1722.0,
"completions/max_terminated_length": 890.0,
"completions/mean_length": 1363.8125,
"completions/mean_terminated_length": 766.8333740234375,
"completions/min_length": 623.0,
"completions/min_terminated_length": 623.0,
"epoch": 0.026398491514770583,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.11882040649652481,
"kl": 0.04816675931215286,
"learning_rate": 9.444444444444445e-07,
"loss": 0.0019,
"num_tokens": 2212692.0,
"reward": -7.78125,
"reward_std": 0.6540063619613647,
"rewards/check_answer/mean": -2.0,
"rewards/check_answer/std": 0.0,
"rewards/check_numbers/mean": -0.65625,
"rewards/check_numbers/std": 0.7685213088989258,
"rewards/format_and_language_reward_func/mean": -3.125,
"rewards/format_and_language_reward_func/std": 0.5,
"rewards/match_format_approximately/mean": -2.0,
"rewards/match_format_approximately/std": 0.0,
"rewards/match_format_exactly/mean": 0.0,
"rewards/match_format_exactly/std": 0.0,
"step": 84
},
{
"completion_length": 1652.3125,
"completions/clipped_ratio": 0.8125,
"completions/max_length": 1722.0,
"completions/max_terminated_length": 1646.0,
"completions/mean_length": 1652.3125,
"completions/mean_terminated_length": 1350.3333740234375,
"completions/min_length": 1045.0,
"completions/min_terminated_length": 1045.0,
"epoch": 0.026712759270898806,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.11614203453063965,
"kl": 0.0454854890704155,
"learning_rate": 8.88888888888889e-07,
"loss": 0.0018,
"num_tokens": 2243069.0,
"reward": -8.0,
"reward_std": 0.9117004871368408,
"rewards/check_answer/mean": -2.0,
"rewards/check_answer/std": 0.0,
"rewards/check_numbers/mean": -0.75,
"rewards/check_numbers/std": 0.7745966911315918,
"rewards/format_and_language_reward_func/mean": -3.25,
"rewards/format_and_language_reward_func/std": 0.6831300854682922,
"rewards/match_format_approximately/mean": -2.0,
"rewards/match_format_approximately/std": 0.0,
"rewards/match_format_exactly/mean": 0.0,
"rewards/match_format_exactly/std": 0.0,
"step": 85
},
{
"completion_length": 1576.625,
"completions/clipped_ratio": 0.75,
"completions/max_length": 1722.0,
"completions/max_terminated_length": 1560.0,
"completions/mean_length": 1576.625,
"completions/mean_terminated_length": 1140.5,
"completions/min_length": 801.0,
"completions/min_terminated_length": 801.0,
"epoch": 0.02702702702702703,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.09951747953891754,
"kl": 0.04343428835272789,
"learning_rate": 8.333333333333333e-07,
"loss": 0.0017,
"num_tokens": 2272123.0,
"reward": -7.4375,
"reward_std": 0.9503755569458008,
"rewards/check_answer/mean": -2.0,
"rewards/check_answer/std": 0.0,
"rewards/check_numbers/mean": -0.3125,
"rewards/check_numbers/std": 1.6520190238952637,
"rewards/format_and_language_reward_func/mean": -3.125,
"rewards/format_and_language_reward_func/std": 0.5,
"rewards/match_format_approximately/mean": -2.0,
"rewards/match_format_approximately/std": 0.0,
"rewards/match_format_exactly/mean": 0.0,
"rewards/match_format_exactly/std": 0.0,
"step": 86
},
{
"completion_length": 1552.3125,
"completions/clipped_ratio": 0.8125,
"completions/max_length": 1722.0,
"completions/max_terminated_length": 1073.0,
"completions/mean_length": 1552.3125,
"completions/mean_terminated_length": 817.0,
"completions/min_length": 665.0,
"completions/min_terminated_length": 665.0,
"epoch": 0.027341294783155248,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.1279648393392563,
"kl": 0.04489857330918312,
"learning_rate": 7.777777777777779e-07,
"loss": 0.0018,
"num_tokens": 2300748.0,
"reward": -7.96875,
"reward_std": 0.9505414962768555,
"rewards/check_answer/mean": -2.0,
"rewards/check_answer/std": 0.0,
"rewards/check_numbers/mean": -0.84375,
"rewards/check_numbers/std": 0.7685213088989258,
"rewards/format_and_language_reward_func/mean": -3.125,
"rewards/format_and_language_reward_func/std": 0.5,
"rewards/match_format_approximately/mean": -2.0,
"rewards/match_format_approximately/std": 0.0,
"rewards/match_format_exactly/mean": 0.0,
"rewards/match_format_exactly/std": 0.0,
"step": 87
},
{
"completion_length": 1594.375,
"completions/clipped_ratio": 0.8125,
"completions/max_length": 1722.0,
"completions/max_terminated_length": 1438.0,
"completions/mean_length": 1594.375,
"completions/mean_terminated_length": 1041.3333740234375,
"completions/min_length": 780.0,
"completions/min_terminated_length": 780.0,
"epoch": 0.02765556253928347,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.18847358226776123,
"kl": 0.03859600052237511,
"learning_rate": 7.222222222222222e-07,
"loss": 0.0015,
"num_tokens": 2330330.0,
"reward": -7.96875,
"reward_std": 0.8125,
"rewards/check_answer/mean": -2.0,
"rewards/check_answer/std": 0.0,
"rewards/check_numbers/mean": -0.84375,
"rewards/check_numbers/std": 0.7685213088989258,
"rewards/format_and_language_reward_func/mean": -3.125,
"rewards/format_and_language_reward_func/std": 0.5,
"rewards/match_format_approximately/mean": -2.0,
"rewards/match_format_approximately/std": 0.0,
"rewards/match_format_exactly/mean": 0.0,
"rewards/match_format_exactly/std": 0.0,
"step": 88
},
{
"completion_length": 1683.1875,
"completions/clipped_ratio": 0.875,
"completions/max_length": 1722.0,
"completions/max_terminated_length": 1547.0,
"completions/mean_length": 1683.1875,
"completions/mean_terminated_length": 1411.5,
"completions/min_length": 1276.0,
"completions/min_terminated_length": 1276.0,
"epoch": 0.02796983029541169,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.10217194259166718,
"kl": 0.044257864356040955,
"learning_rate": 6.666666666666667e-07,
"loss": 0.0018,
"num_tokens": 2361789.0,
"reward": -7.6875,
"reward_std": 1.0060844421386719,
"rewards/check_answer/mean": -2.0,
"rewards/check_answer/std": 0.0,
"rewards/check_numbers/mean": -0.5625,
"rewards/check_numbers/std": 0.75,
"rewards/format_and_language_reward_func/mean": -3.125,
"rewards/format_and_language_reward_func/std": 0.5,
"rewards/match_format_approximately/mean": -2.0,
"rewards/match_format_approximately/std": 0.0,
"rewards/match_format_exactly/mean": 0.0,
"rewards/match_format_exactly/std": 0.0,
"step": 89
},
{
"completion_length": 1638.3125,
"completions/clipped_ratio": 0.8125,
"completions/max_length": 1722.0,
"completions/max_terminated_length": 1547.0,
"completions/mean_length": 1638.3125,
"completions/mean_terminated_length": 1275.666748046875,
"completions/min_length": 1068.0,
"completions/min_terminated_length": 1068.0,
"epoch": 0.028284098051539912,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.6536535620689392,
"kl": 0.07140226662158966,
"learning_rate": 6.111111111111112e-07,
"loss": 0.0029,
"num_tokens": 2392042.0,
"reward": -7.53125,
"reward_std": 0.9040063619613647,
"rewards/check_answer/mean": -2.0,
"rewards/check_answer/std": 0.0,
"rewards/check_numbers/mean": -0.28125,
"rewards/check_numbers/std": 0.6046693325042725,
"rewards/format_and_language_reward_func/mean": -3.25,
"rewards/format_and_language_reward_func/std": 0.6831300854682922,
"rewards/match_format_approximately/mean": -2.0,
"rewards/match_format_approximately/std": 0.0,
"rewards/match_format_exactly/mean": 0.0,
"rewards/match_format_exactly/std": 0.0,
"step": 90
},
{
"completion_length": 1547.75,
"completions/clipped_ratio": 0.75,
"completions/max_length": 1722.0,
"completions/max_terminated_length": 1282.0,
"completions/mean_length": 1547.75,
"completions/mean_terminated_length": 1025.0,
"completions/min_length": 848.0,
"completions/min_terminated_length": 848.0,
"epoch": 0.028598365807668134,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.12291199713945389,
"kl": 0.05366697907447815,
"learning_rate": 5.555555555555555e-07,
"loss": 0.0021,
"num_tokens": 2420674.0,
"reward": -7.78125,
"reward_std": 0.8705127239227295,
"rewards/check_answer/mean": -2.0,
"rewards/check_answer/std": 0.0,
"rewards/check_numbers/mean": -0.65625,
"rewards/check_numbers/std": 0.7685213088989258,
"rewards/format_and_language_reward_func/mean": -3.125,
"rewards/format_and_language_reward_func/std": 0.5,
"rewards/match_format_approximately/mean": -2.0,
"rewards/match_format_approximately/std": 0.0,
"rewards/match_format_exactly/mean": 0.0,
"rewards/match_format_exactly/std": 0.0,
"step": 91
},
{
"completion_length": 1515.5625,
"completions/clipped_ratio": 0.625,
"completions/max_length": 1722.0,
"completions/max_terminated_length": 1680.0,
"completions/mean_length": 1515.5625,
"completions/mean_terminated_length": 1171.5,
"completions/min_length": 73.0,
"completions/min_terminated_length": 73.0,
"epoch": 0.028912633563796353,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.11394577473402023,
"kl": 0.04896366223692894,
"learning_rate": 5.000000000000001e-07,
"loss": 0.002,
"num_tokens": 2448871.0,
"reward": -6.71875,
"reward_std": 0.8125,
"rewards/check_answer/mean": -2.0,
"rewards/check_answer/std": 0.0,
"rewards/check_numbers/mean": 0.28125,
"rewards/check_numbers/std": 1.7220990657806396,
"rewards/format_and_language_reward_func/mean": -3.0,
"rewards/format_and_language_reward_func/std": 0.0,
"rewards/match_format_approximately/mean": -2.0,
"rewards/match_format_approximately/std": 0.0,
"rewards/match_format_exactly/mean": 0.0,
"rewards/match_format_exactly/std": 0.0,
"step": 92
},
{
"completion_length": 1545.1875,
"completions/clipped_ratio": 0.625,
"completions/max_length": 1722.0,
"completions/max_terminated_length": 1656.0,
"completions/mean_length": 1545.1875,
"completions/mean_terminated_length": 1250.5,
"completions/min_length": 537.0,
"completions/min_terminated_length": 537.0,
"epoch": 0.029226901319924576,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.07623685151338577,
"kl": 0.046294040977954865,
"learning_rate": 4.444444444444445e-07,
"loss": 0.0019,
"num_tokens": 2477226.0,
"reward": -7.5625,
"reward_std": 0.375,
"rewards/check_answer/mean": -2.0,
"rewards/check_answer/std": 0.0,
"rewards/check_numbers/mean": -0.5625,
"rewards/check_numbers/std": 0.75,
"rewards/format_and_language_reward_func/mean": -3.0,
"rewards/format_and_language_reward_func/std": 0.0,
"rewards/match_format_approximately/mean": -2.0,
"rewards/match_format_approximately/std": 0.0,
"rewards/match_format_exactly/mean": 0.0,
"rewards/match_format_exactly/std": 0.0,
"step": 93
},
{
"completion_length": 1588.75,
"completions/clipped_ratio": 0.75,
"completions/max_length": 1722.0,
"completions/max_terminated_length": 1556.0,
"completions/mean_length": 1588.75,
"completions/mean_terminated_length": 1189.0,
"completions/min_length": 405.0,
"completions/min_terminated_length": 405.0,
"epoch": 0.0295411690760528,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.11318648606538773,
"kl": 0.04230440780520439,
"learning_rate": 3.8888888888888895e-07,
"loss": 0.0017,
"num_tokens": 2506806.0,
"reward": -7.75,
"reward_std": 0.5915063619613647,
"rewards/check_answer/mean": -2.0,
"rewards/check_answer/std": 0.0,
"rewards/check_numbers/mean": -0.75,
"rewards/check_numbers/std": 0.7745966911315918,
"rewards/format_and_language_reward_func/mean": -3.0,
"rewards/format_and_language_reward_func/std": 0.0,
"rewards/match_format_approximately/mean": -2.0,
"rewards/match_format_approximately/std": 0.0,
"rewards/match_format_exactly/mean": 0.0,
"rewards/match_format_exactly/std": 0.0,
"step": 94
},
{
"completion_length": 1620.875,
"completions/clipped_ratio": 0.875,
"completions/max_length": 1722.0,
"completions/max_terminated_length": 987.0,
"completions/mean_length": 1620.875,
"completions/mean_terminated_length": 913.0,
"completions/min_length": 839.0,
"completions/min_terminated_length": 839.0,
"epoch": 0.029855436832181018,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.11543525010347366,
"kl": 0.0602000392973423,
"learning_rate": 3.3333333333333335e-07,
"loss": 0.0024,
"num_tokens": 2536500.0,
"reward": -7.96875,
"reward_std": 0.6540063619613647,
"rewards/check_answer/mean": -2.0,
"rewards/check_answer/std": 0.0,
"rewards/check_numbers/mean": -0.84375,
"rewards/check_numbers/std": 0.7685213088989258,
"rewards/format_and_language_reward_func/mean": -3.125,
"rewards/format_and_language_reward_func/std": 0.5,
"rewards/match_format_approximately/mean": -2.0,
"rewards/match_format_approximately/std": 0.0,
"rewards/match_format_exactly/mean": 0.0,
"rewards/match_format_exactly/std": 0.0,
"step": 95
},
{
"completion_length": 1623.375,
"completions/clipped_ratio": 0.75,
"completions/max_length": 1722.0,
"completions/max_terminated_length": 1687.0,
"completions/mean_length": 1623.375,
"completions/mean_terminated_length": 1327.5,
"completions/min_length": 743.0,
"completions/min_terminated_length": 743.0,
"epoch": 0.03016970458830924,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.11837535351514816,
"kl": 0.04759529232978821,
"learning_rate": 2.7777777777777776e-07,
"loss": 0.0019,
"num_tokens": 2566598.0,
"reward": -8.1875,
"reward_std": 0.7165063619613647,
"rewards/check_answer/mean": -2.0,
"rewards/check_answer/std": 0.0,
"rewards/check_numbers/mean": -0.9375,
"rewards/check_numbers/std": 0.75,
"rewards/format_and_language_reward_func/mean": -3.25,
"rewards/format_and_language_reward_func/std": 0.6831300854682922,
"rewards/match_format_approximately/mean": -2.0,
"rewards/match_format_approximately/std": 0.0,
"rewards/match_format_exactly/mean": 0.0,
"rewards/match_format_exactly/std": 0.0,
"step": 96
},
{
"completion_length": 1617.0625,
"completions/clipped_ratio": 0.75,
"completions/max_length": 1722.0,
"completions/max_terminated_length": 1545.0,
"completions/mean_length": 1617.0625,
"completions/mean_terminated_length": 1302.25,
"completions/min_length": 868.0,
"completions/min_terminated_length": 868.0,
"epoch": 0.03048397234443746,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.11196030676364899,
"kl": 0.04893610253930092,
"learning_rate": 2.2222222222222224e-07,
"loss": 0.002,
"num_tokens": 2596579.0,
"reward": -7.4375,
"reward_std": 0.7165063619613647,
"rewards/check_answer/mean": -2.0,
"rewards/check_answer/std": 0.0,
"rewards/check_numbers/mean": -0.1875,
"rewards/check_numbers/std": 0.5123475790023804,
"rewards/format_and_language_reward_func/mean": -3.25,
"rewards/format_and_language_reward_func/std": 0.6831300854682922,
"rewards/match_format_approximately/mean": -2.0,
"rewards/match_format_approximately/std": 0.0,
"rewards/match_format_exactly/mean": 0.0,
"rewards/match_format_exactly/std": 0.0,
"step": 97
},
{
"completion_length": 1362.0625,
"completions/clipped_ratio": 0.5625,
"completions/max_length": 1722.0,
"completions/max_terminated_length": 1402.0,
"completions/mean_length": 1362.0625,
"completions/mean_terminated_length": 899.2857666015625,
"completions/min_length": 364.0,
"completions/min_terminated_length": 364.0,
"epoch": 0.030798240100565682,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.2802422046661377,
"kl": 0.07202958315610886,
"learning_rate": 1.6666666666666668e-07,
"loss": 0.0029,
"num_tokens": 2622492.0,
"reward": -7.375,
"reward_std": 0.4330126941204071,
"rewards/check_answer/mean": -2.0,
"rewards/check_answer/std": 0.0,
"rewards/check_numbers/mean": -0.375,
"rewards/check_numbers/std": 0.670820415019989,
"rewards/format_and_language_reward_func/mean": -3.0,
"rewards/format_and_language_reward_func/std": 0.0,
"rewards/match_format_approximately/mean": -2.0,
"rewards/match_format_approximately/std": 0.0,
"rewards/match_format_exactly/mean": 0.0,
"rewards/match_format_exactly/std": 0.0,
"step": 98
},
{
"completion_length": 1529.75,
"completions/clipped_ratio": 0.75,
"completions/max_length": 1722.0,
"completions/max_terminated_length": 1474.0,
"completions/mean_length": 1529.75,
"completions/mean_terminated_length": 953.0,
"completions/min_length": 371.0,
"completions/min_terminated_length": 371.0,
"epoch": 0.031112507856693904,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.27167099714279175,
"kl": 0.0705300122499466,
"learning_rate": 1.1111111111111112e-07,
"loss": 0.0028,
"num_tokens": 2650764.0,
"reward": -7.40625,
"reward_std": 0.6540063619613647,
"rewards/check_answer/mean": -2.0,
"rewards/check_answer/std": 0.0,
"rewards/check_numbers/mean": -0.28125,
"rewards/check_numbers/std": 0.6046693325042725,
"rewards/format_and_language_reward_func/mean": -3.125,
"rewards/format_and_language_reward_func/std": 0.5,
"rewards/match_format_approximately/mean": -2.0,
"rewards/match_format_approximately/std": 0.0,
"rewards/match_format_exactly/mean": 0.0,
"rewards/match_format_exactly/std": 0.0,
"step": 99
},
{
"completion_length": 1459.75,
"completions/clipped_ratio": 0.6875,
"completions/max_length": 1722.0,
"completions/max_terminated_length": 1133.0,
"completions/mean_length": 1459.75,
"completions/mean_terminated_length": 882.7999877929688,
"completions/min_length": 436.0,
"completions/min_terminated_length": 436.0,
"epoch": 0.03142677561282212,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.11008132994174957,
"kl": 0.04374002292752266,
"learning_rate": 5.555555555555556e-08,
"loss": 0.0017,
"num_tokens": 2678396.0,
"reward": -7.90625,
"reward_std": 1.233162522315979,
"rewards/check_answer/mean": -2.0,
"rewards/check_answer/std": 0.0,
"rewards/check_numbers/mean": -0.65625,
"rewards/check_numbers/std": 0.7685213088989258,
"rewards/format_and_language_reward_func/mean": -3.25,
"rewards/format_and_language_reward_func/std": 0.6831300854682922,
"rewards/match_format_approximately/mean": -2.0,
"rewards/match_format_approximately/std": 0.0,
"rewards/match_format_exactly/mean": 0.0,
"rewards/match_format_exactly/std": 0.0,
"step": 100
}
],
"logging_steps": 1,
"max_steps": 100,
"num_input_tokens_seen": 2678396,
"num_train_epochs": 1,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}