|
{ |
|
"best_global_step": null, |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.03142677561282212, |
|
"eval_steps": 500, |
|
"global_step": 100, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"completion_length": 387.5625, |
|
"completions/clipped_ratio": 0.0625, |
|
"completions/max_length": 1722.0, |
|
"completions/max_terminated_length": 929.0, |
|
"completions/mean_length": 387.5625, |
|
"completions/mean_terminated_length": 298.6000061035156, |
|
"completions/min_length": 1.0, |
|
"completions/min_terminated_length": 1.0, |
|
"epoch": 0.00031426775612822125, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 9.58870792388916, |
|
"kl": 1.1056605577468872, |
|
"learning_rate": 0.0, |
|
"loss": 0.0442, |
|
"num_tokens": 10321.0, |
|
"reward": -8.78125, |
|
"reward_std": 1.5290063619613647, |
|
"rewards/check_answer/mean": -2.0, |
|
"rewards/check_answer/std": 0.0, |
|
"rewards/check_numbers/mean": -1.15625, |
|
"rewards/check_numbers/std": 1.1212902069091797, |
|
"rewards/format_and_language_reward_func/mean": -3.625, |
|
"rewards/format_and_language_reward_func/std": 0.9574271440505981, |
|
"rewards/match_format_approximately/mean": -2.0, |
|
"rewards/match_format_approximately/std": 0.0, |
|
"rewards/match_format_exactly/mean": 0.0, |
|
"rewards/match_format_exactly/std": 0.0, |
|
"step": 1 |
|
}, |
|
{ |
|
"completion_length": 427.625, |
|
"completions/clipped_ratio": 0.0625, |
|
"completions/max_length": 1722.0, |
|
"completions/max_terminated_length": 1380.0, |
|
"completions/mean_length": 427.625, |
|
"completions/mean_terminated_length": 341.3333435058594, |
|
"completions/min_length": 1.0, |
|
"completions/min_terminated_length": 1.0, |
|
"epoch": 0.0006285355122564425, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 15.610054016113281, |
|
"kl": 0.5778560638427734, |
|
"learning_rate": 5.000000000000001e-07, |
|
"loss": 0.0231, |
|
"num_tokens": 20955.0, |
|
"reward": -9.5, |
|
"reward_std": 1.6830127239227295, |
|
"rewards/check_answer/mean": -2.0, |
|
"rewards/check_answer/std": 0.0, |
|
"rewards/check_numbers/mean": -1.75, |
|
"rewards/check_numbers/std": 0.8366600275039673, |
|
"rewards/format_and_language_reward_func/mean": -3.75, |
|
"rewards/format_and_language_reward_func/std": 1.0, |
|
"rewards/match_format_approximately/mean": -2.0, |
|
"rewards/match_format_approximately/std": 0.0, |
|
"rewards/match_format_exactly/mean": 0.0, |
|
"rewards/match_format_exactly/std": 0.0, |
|
"step": 2 |
|
}, |
|
{ |
|
"completion_length": 979.5, |
|
"completions/clipped_ratio": 0.3125, |
|
"completions/max_length": 1722.0, |
|
"completions/max_terminated_length": 1507.0, |
|
"completions/mean_length": 979.5, |
|
"completions/mean_terminated_length": 642.0, |
|
"completions/min_length": 1.0, |
|
"completions/min_terminated_length": 1.0, |
|
"epoch": 0.0009428032683846638, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 4.994994163513184, |
|
"kl": 0.2919767200946808, |
|
"learning_rate": 1.0000000000000002e-06, |
|
"loss": 0.0117, |
|
"num_tokens": 40683.0, |
|
"reward": -7.53125, |
|
"reward_std": 1.3821797370910645, |
|
"rewards/check_answer/mean": -2.0, |
|
"rewards/check_answer/std": 0.0, |
|
"rewards/check_numbers/mean": -0.40625, |
|
"rewards/check_numbers/std": 1.3443554639816284, |
|
"rewards/format_and_language_reward_func/mean": -3.125, |
|
"rewards/format_and_language_reward_func/std": 0.5, |
|
"rewards/match_format_approximately/mean": -2.0, |
|
"rewards/match_format_approximately/std": 0.0, |
|
"rewards/match_format_exactly/mean": 0.0, |
|
"rewards/match_format_exactly/std": 0.0, |
|
"step": 3 |
|
}, |
|
{ |
|
"completion_length": 836.3125, |
|
"completions/clipped_ratio": 0.3125, |
|
"completions/max_length": 1722.0, |
|
"completions/max_terminated_length": 1175.0, |
|
"completions/mean_length": 836.3125, |
|
"completions/mean_terminated_length": 433.727294921875, |
|
"completions/min_length": 1.0, |
|
"completions/min_terminated_length": 1.0, |
|
"epoch": 0.001257071024512885, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 16.14759635925293, |
|
"kl": 0.5689749717712402, |
|
"learning_rate": 1.5e-06, |
|
"loss": 0.0228, |
|
"num_tokens": 58020.0, |
|
"reward": -9.03125, |
|
"reward_std": 1.9392420053482056, |
|
"rewards/check_answer/mean": -2.0, |
|
"rewards/check_answer/std": 0.0, |
|
"rewards/check_numbers/mean": -1.40625, |
|
"rewards/check_numbers/std": 1.0680004358291626, |
|
"rewards/format_and_language_reward_func/mean": -3.625, |
|
"rewards/format_and_language_reward_func/std": 0.9574271440505981, |
|
"rewards/match_format_approximately/mean": -2.0, |
|
"rewards/match_format_approximately/std": 0.0, |
|
"rewards/match_format_exactly/mean": 0.0, |
|
"rewards/match_format_exactly/std": 0.0, |
|
"step": 4 |
|
}, |
|
{ |
|
"completion_length": 553.25, |
|
"completions/clipped_ratio": 0.0625, |
|
"completions/max_length": 1722.0, |
|
"completions/max_terminated_length": 1469.0, |
|
"completions/mean_length": 553.25, |
|
"completions/mean_terminated_length": 475.3333435058594, |
|
"completions/min_length": 1.0, |
|
"completions/min_terminated_length": 1.0, |
|
"epoch": 0.0015713387806411063, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 5.5043792724609375, |
|
"kl": 0.5717646479606628, |
|
"learning_rate": 2.0000000000000003e-06, |
|
"loss": 0.0229, |
|
"num_tokens": 70976.0, |
|
"reward": -8.40625, |
|
"reward_std": 0.9375, |
|
"rewards/check_answer/mean": -2.0, |
|
"rewards/check_answer/std": 0.0, |
|
"rewards/check_numbers/mean": -1.03125, |
|
"rewards/check_numbers/std": 1.007782220840454, |
|
"rewards/format_and_language_reward_func/mean": -3.375, |
|
"rewards/format_and_language_reward_func/std": 0.8062257766723633, |
|
"rewards/match_format_approximately/mean": -2.0, |
|
"rewards/match_format_approximately/std": 0.0, |
|
"rewards/match_format_exactly/mean": 0.0, |
|
"rewards/match_format_exactly/std": 0.0, |
|
"step": 5 |
|
}, |
|
{ |
|
"completion_length": 750.9375, |
|
"completions/clipped_ratio": 0.125, |
|
"completions/max_length": 1722.0, |
|
"completions/max_terminated_length": 1676.0, |
|
"completions/mean_length": 750.9375, |
|
"completions/mean_terminated_length": 612.2142944335938, |
|
"completions/min_length": 1.0, |
|
"completions/min_terminated_length": 1.0, |
|
"epoch": 0.0018856065367693275, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 7.5527191162109375, |
|
"kl": 0.26640018820762634, |
|
"learning_rate": 2.5e-06, |
|
"loss": 0.0107, |
|
"num_tokens": 87127.0, |
|
"reward": -8.40625, |
|
"reward_std": 1.3351925611495972, |
|
"rewards/check_answer/mean": -2.0, |
|
"rewards/check_answer/std": 0.0, |
|
"rewards/check_numbers/mean": -0.90625, |
|
"rewards/check_numbers/std": 1.1138334274291992, |
|
"rewards/format_and_language_reward_func/mean": -3.5, |
|
"rewards/format_and_language_reward_func/std": 0.8944272398948669, |
|
"rewards/match_format_approximately/mean": -2.0, |
|
"rewards/match_format_approximately/std": 0.0, |
|
"rewards/match_format_exactly/mean": 0.0, |
|
"rewards/match_format_exactly/std": 0.0, |
|
"step": 6 |
|
}, |
|
{ |
|
"completion_length": 822.625, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 1699.0, |
|
"completions/max_terminated_length": 1699.0, |
|
"completions/mean_length": 822.625, |
|
"completions/mean_terminated_length": 822.625, |
|
"completions/min_length": 1.0, |
|
"completions/min_terminated_length": 1.0, |
|
"epoch": 0.0021998742928975488, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 6.19105863571167, |
|
"kl": 0.17402563989162445, |
|
"learning_rate": 3e-06, |
|
"loss": 0.007, |
|
"num_tokens": 104165.0, |
|
"reward": -7.96875, |
|
"reward_std": 1.063370943069458, |
|
"rewards/check_answer/mean": -2.0, |
|
"rewards/check_answer/std": 0.0, |
|
"rewards/check_numbers/mean": -0.71875, |
|
"rewards/check_numbers/std": 0.8750000596046448, |
|
"rewards/format_and_language_reward_func/mean": -3.25, |
|
"rewards/format_and_language_reward_func/std": 0.6831300854682922, |
|
"rewards/match_format_approximately/mean": -2.0, |
|
"rewards/match_format_approximately/std": 0.0, |
|
"rewards/match_format_exactly/mean": 0.0, |
|
"rewards/match_format_exactly/std": 0.0, |
|
"step": 7 |
|
}, |
|
{ |
|
"completion_length": 1419.1875, |
|
"completions/clipped_ratio": 0.6875, |
|
"completions/max_length": 1722.0, |
|
"completions/max_terminated_length": 1149.0, |
|
"completions/mean_length": 1419.1875, |
|
"completions/mean_terminated_length": 753.0, |
|
"completions/min_length": 366.0, |
|
"completions/min_terminated_length": 366.0, |
|
"epoch": 0.00251414204902577, |
|
"frac_reward_zero_std": 0.25, |
|
"grad_norm": 0.1348615139722824, |
|
"kl": 0.029216211289167404, |
|
"learning_rate": 3.5e-06, |
|
"loss": 0.0012, |
|
"num_tokens": 130604.0, |
|
"reward": -8.0625, |
|
"reward_std": 0.625, |
|
"rewards/check_answer/mean": -2.0, |
|
"rewards/check_answer/std": 0.0, |
|
"rewards/check_numbers/mean": -0.9375, |
|
"rewards/check_numbers/std": 0.75, |
|
"rewards/format_and_language_reward_func/mean": -3.125, |
|
"rewards/format_and_language_reward_func/std": 0.5, |
|
"rewards/match_format_approximately/mean": -2.0, |
|
"rewards/match_format_approximately/std": 0.0, |
|
"rewards/match_format_exactly/mean": 0.0, |
|
"rewards/match_format_exactly/std": 0.0, |
|
"step": 8 |
|
}, |
|
{ |
|
"completion_length": 1018.125, |
|
"completions/clipped_ratio": 0.375, |
|
"completions/max_length": 1722.0, |
|
"completions/max_terminated_length": 1565.0, |
|
"completions/mean_length": 1018.125, |
|
"completions/mean_terminated_length": 595.7999877929688, |
|
"completions/min_length": 1.0, |
|
"completions/min_terminated_length": 1.0, |
|
"epoch": 0.0028284098051539913, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 7.937878131866455, |
|
"kl": 0.42939966917037964, |
|
"learning_rate": 4.000000000000001e-06, |
|
"loss": 0.0172, |
|
"num_tokens": 151170.0, |
|
"reward": -7.8125, |
|
"reward_std": 1.1593647003173828, |
|
"rewards/check_answer/mean": -2.0, |
|
"rewards/check_answer/std": 0.0, |
|
"rewards/check_numbers/mean": -0.6875, |
|
"rewards/check_numbers/std": 0.9639329314231873, |
|
"rewards/format_and_language_reward_func/mean": -3.125, |
|
"rewards/format_and_language_reward_func/std": 0.5, |
|
"rewards/match_format_approximately/mean": -2.0, |
|
"rewards/match_format_approximately/std": 0.0, |
|
"rewards/match_format_exactly/mean": 0.0, |
|
"rewards/match_format_exactly/std": 0.0, |
|
"step": 9 |
|
}, |
|
{ |
|
"completion_length": 1018.625, |
|
"completions/clipped_ratio": 0.3125, |
|
"completions/max_length": 1722.0, |
|
"completions/max_terminated_length": 1317.0, |
|
"completions/mean_length": 1018.625, |
|
"completions/mean_terminated_length": 698.9091186523438, |
|
"completions/min_length": 3.0, |
|
"completions/min_terminated_length": 3.0, |
|
"epoch": 0.0031426775612822125, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 3.0097506046295166, |
|
"kl": 0.15729668736457825, |
|
"learning_rate": 4.5e-06, |
|
"loss": 0.0063, |
|
"num_tokens": 171388.0, |
|
"reward": -7.75, |
|
"reward_std": 0.8080127239227295, |
|
"rewards/check_answer/mean": -2.0, |
|
"rewards/check_answer/std": 0.0, |
|
"rewards/check_numbers/mean": -0.75, |
|
"rewards/check_numbers/std": 0.7745966911315918, |
|
"rewards/format_and_language_reward_func/mean": -3.0, |
|
"rewards/format_and_language_reward_func/std": 0.0, |
|
"rewards/match_format_approximately/mean": -2.0, |
|
"rewards/match_format_approximately/std": 0.0, |
|
"rewards/match_format_exactly/mean": 0.0, |
|
"rewards/match_format_exactly/std": 0.0, |
|
"step": 10 |
|
}, |
|
{ |
|
"completion_length": 1189.8125, |
|
"completions/clipped_ratio": 0.3125, |
|
"completions/max_length": 1722.0, |
|
"completions/max_terminated_length": 1696.0, |
|
"completions/mean_length": 1189.8125, |
|
"completions/mean_terminated_length": 947.9091186523438, |
|
"completions/min_length": 423.0, |
|
"completions/min_terminated_length": 423.0, |
|
"epoch": 0.0034569453174104338, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.136498361825943, |
|
"kl": 0.05319710448384285, |
|
"learning_rate": 5e-06, |
|
"loss": 0.0021, |
|
"num_tokens": 194373.0, |
|
"reward": -7.75, |
|
"reward_std": 0.8080127239227295, |
|
"rewards/check_answer/mean": -2.0, |
|
"rewards/check_answer/std": 0.0, |
|
"rewards/check_numbers/mean": -0.75, |
|
"rewards/check_numbers/std": 0.7745966911315918, |
|
"rewards/format_and_language_reward_func/mean": -3.0, |
|
"rewards/format_and_language_reward_func/std": 0.0, |
|
"rewards/match_format_approximately/mean": -2.0, |
|
"rewards/match_format_approximately/std": 0.0, |
|
"rewards/match_format_exactly/mean": 0.0, |
|
"rewards/match_format_exactly/std": 0.0, |
|
"step": 11 |
|
}, |
|
{ |
|
"completion_length": 1122.0, |
|
"completions/clipped_ratio": 0.1875, |
|
"completions/max_length": 1722.0, |
|
"completions/max_terminated_length": 1665.0, |
|
"completions/mean_length": 1122.0, |
|
"completions/mean_terminated_length": 983.5385131835938, |
|
"completions/min_length": 498.0, |
|
"completions/min_terminated_length": 498.0, |
|
"epoch": 0.003771213073538655, |
|
"frac_reward_zero_std": 0.5, |
|
"grad_norm": 0.0965699553489685, |
|
"kl": 0.06320463120937347, |
|
"learning_rate": 4.944444444444445e-06, |
|
"loss": 0.0025, |
|
"num_tokens": 216261.0, |
|
"reward": -7.65625, |
|
"reward_std": 0.40400636196136475, |
|
"rewards/check_answer/mean": -2.0, |
|
"rewards/check_answer/std": 0.0, |
|
"rewards/check_numbers/mean": -0.65625, |
|
"rewards/check_numbers/std": 0.7685213088989258, |
|
"rewards/format_and_language_reward_func/mean": -3.0, |
|
"rewards/format_and_language_reward_func/std": 0.0, |
|
"rewards/match_format_approximately/mean": -2.0, |
|
"rewards/match_format_approximately/std": 0.0, |
|
"rewards/match_format_exactly/mean": 0.0, |
|
"rewards/match_format_exactly/std": 0.0, |
|
"step": 12 |
|
}, |
|
{ |
|
"completion_length": 1341.75, |
|
"completions/clipped_ratio": 0.4375, |
|
"completions/max_length": 1722.0, |
|
"completions/max_terminated_length": 1607.0, |
|
"completions/mean_length": 1341.75, |
|
"completions/mean_terminated_length": 1046.0, |
|
"completions/min_length": 592.0, |
|
"completions/min_terminated_length": 592.0, |
|
"epoch": 0.004085480829666876, |
|
"frac_reward_zero_std": 0.5, |
|
"grad_norm": 0.07173436135053635, |
|
"kl": 0.04135803505778313, |
|
"learning_rate": 4.888888888888889e-06, |
|
"loss": 0.0017, |
|
"num_tokens": 241453.0, |
|
"reward": -7.84375, |
|
"reward_std": 0.40400636196136475, |
|
"rewards/check_answer/mean": -2.0, |
|
"rewards/check_answer/std": 0.0, |
|
"rewards/check_numbers/mean": -0.84375, |
|
"rewards/check_numbers/std": 0.7685213088989258, |
|
"rewards/format_and_language_reward_func/mean": -3.0, |
|
"rewards/format_and_language_reward_func/std": 0.0, |
|
"rewards/match_format_approximately/mean": -2.0, |
|
"rewards/match_format_approximately/std": 0.0, |
|
"rewards/match_format_exactly/mean": 0.0, |
|
"rewards/match_format_exactly/std": 0.0, |
|
"step": 13 |
|
}, |
|
{ |
|
"completion_length": 1283.8125, |
|
"completions/clipped_ratio": 0.625, |
|
"completions/max_length": 1722.0, |
|
"completions/max_terminated_length": 1221.0, |
|
"completions/mean_length": 1283.8125, |
|
"completions/mean_terminated_length": 553.5, |
|
"completions/min_length": 1.0, |
|
"completions/min_terminated_length": 1.0, |
|
"epoch": 0.0043997485857950975, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 6.01732063293457, |
|
"kl": 0.17246384918689728, |
|
"learning_rate": 4.833333333333333e-06, |
|
"loss": 0.0069, |
|
"num_tokens": 266634.0, |
|
"reward": -7.84375, |
|
"reward_std": 1.1508427858352661, |
|
"rewards/check_answer/mean": -2.0, |
|
"rewards/check_answer/std": 0.0, |
|
"rewards/check_numbers/mean": -0.71875, |
|
"rewards/check_numbers/std": 0.8750000596046448, |
|
"rewards/format_and_language_reward_func/mean": -3.125, |
|
"rewards/format_and_language_reward_func/std": 0.5, |
|
"rewards/match_format_approximately/mean": -2.0, |
|
"rewards/match_format_approximately/std": 0.0, |
|
"rewards/match_format_exactly/mean": 0.0, |
|
"rewards/match_format_exactly/std": 0.0, |
|
"step": 14 |
|
}, |
|
{ |
|
"completion_length": 1189.6875, |
|
"completions/clipped_ratio": 0.3125, |
|
"completions/max_length": 1722.0, |
|
"completions/max_terminated_length": 1694.0, |
|
"completions/mean_length": 1189.6875, |
|
"completions/mean_terminated_length": 947.727294921875, |
|
"completions/min_length": 36.0, |
|
"completions/min_terminated_length": 36.0, |
|
"epoch": 0.004714016341923318, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.6227664351463318, |
|
"kl": 0.08904990553855896, |
|
"learning_rate": 4.777777777777778e-06, |
|
"loss": 0.0036, |
|
"num_tokens": 289373.0, |
|
"reward": -6.90625, |
|
"reward_std": 1.0290063619613647, |
|
"rewards/check_answer/mean": -2.0, |
|
"rewards/check_answer/std": 0.0, |
|
"rewards/check_numbers/mean": 0.09375, |
|
"rewards/check_numbers/std": 1.827737808227539, |
|
"rewards/format_and_language_reward_func/mean": -3.0, |
|
"rewards/format_and_language_reward_func/std": 0.0, |
|
"rewards/match_format_approximately/mean": -2.0, |
|
"rewards/match_format_approximately/std": 0.0, |
|
"rewards/match_format_exactly/mean": 0.0, |
|
"rewards/match_format_exactly/std": 0.0, |
|
"step": 15 |
|
}, |
|
{ |
|
"completion_length": 1361.0, |
|
"completions/clipped_ratio": 0.625, |
|
"completions/max_length": 1722.0, |
|
"completions/max_terminated_length": 1314.0, |
|
"completions/mean_length": 1361.0, |
|
"completions/mean_terminated_length": 759.3333740234375, |
|
"completions/min_length": 148.0, |
|
"completions/min_terminated_length": 148.0, |
|
"epoch": 0.00502828409805154, |
|
"frac_reward_zero_std": 0.25, |
|
"grad_norm": 0.12482193857431412, |
|
"kl": 0.05782376229763031, |
|
"learning_rate": 4.722222222222222e-06, |
|
"loss": 0.0023, |
|
"num_tokens": 314701.0, |
|
"reward": -8.09375, |
|
"reward_std": 0.9840351343154907, |
|
"rewards/check_answer/mean": -2.0, |
|
"rewards/check_answer/std": 0.0, |
|
"rewards/check_numbers/mean": -0.84375, |
|
"rewards/check_numbers/std": 0.7685213088989258, |
|
"rewards/format_and_language_reward_func/mean": -3.25, |
|
"rewards/format_and_language_reward_func/std": 0.6831300854682922, |
|
"rewards/match_format_approximately/mean": -2.0, |
|
"rewards/match_format_approximately/std": 0.0, |
|
"rewards/match_format_exactly/mean": 0.0, |
|
"rewards/match_format_exactly/std": 0.0, |
|
"step": 16 |
|
}, |
|
{ |
|
"completion_length": 1330.625, |
|
"completions/clipped_ratio": 0.5625, |
|
"completions/max_length": 1722.0, |
|
"completions/max_terminated_length": 1668.0, |
|
"completions/mean_length": 1330.625, |
|
"completions/mean_terminated_length": 827.4285888671875, |
|
"completions/min_length": 14.0, |
|
"completions/min_terminated_length": 14.0, |
|
"epoch": 0.005342551854179761, |
|
"frac_reward_zero_std": 0.5, |
|
"grad_norm": 0.09849988669157028, |
|
"kl": 0.07715655118227005, |
|
"learning_rate": 4.666666666666667e-06, |
|
"loss": 0.0031, |
|
"num_tokens": 339795.0, |
|
"reward": -7.9375, |
|
"reward_std": 0.375, |
|
"rewards/check_answer/mean": -2.0, |
|
"rewards/check_answer/std": 0.0, |
|
"rewards/check_numbers/mean": -0.9375, |
|
"rewards/check_numbers/std": 0.75, |
|
"rewards/format_and_language_reward_func/mean": -3.0, |
|
"rewards/format_and_language_reward_func/std": 0.0, |
|
"rewards/match_format_approximately/mean": -2.0, |
|
"rewards/match_format_approximately/std": 0.0, |
|
"rewards/match_format_exactly/mean": 0.0, |
|
"rewards/match_format_exactly/std": 0.0, |
|
"step": 17 |
|
}, |
|
{ |
|
"completion_length": 1300.3125, |
|
"completions/clipped_ratio": 0.5, |
|
"completions/max_length": 1722.0, |
|
"completions/max_terminated_length": 1540.0, |
|
"completions/mean_length": 1300.3125, |
|
"completions/mean_terminated_length": 878.625, |
|
"completions/min_length": 5.0, |
|
"completions/min_terminated_length": 5.0, |
|
"epoch": 0.0056568196103079825, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 2.448221445083618, |
|
"kl": 0.3021676242351532, |
|
"learning_rate": 4.611111111111112e-06, |
|
"loss": 0.0121, |
|
"num_tokens": 364600.0, |
|
"reward": -8.3125, |
|
"reward_std": 1.2111132144927979, |
|
"rewards/check_answer/mean": -2.0, |
|
"rewards/check_answer/std": 0.0, |
|
"rewards/check_numbers/mean": -0.9375, |
|
"rewards/check_numbers/std": 0.75, |
|
"rewards/format_and_language_reward_func/mean": -3.375, |
|
"rewards/format_and_language_reward_func/std": 0.8062257766723633, |
|
"rewards/match_format_approximately/mean": -2.0, |
|
"rewards/match_format_approximately/std": 0.0, |
|
"rewards/match_format_exactly/mean": 0.0, |
|
"rewards/match_format_exactly/std": 0.0, |
|
"step": 18 |
|
}, |
|
{ |
|
"completion_length": 1257.625, |
|
"completions/clipped_ratio": 0.375, |
|
"completions/max_length": 1722.0, |
|
"completions/max_terminated_length": 1613.0, |
|
"completions/mean_length": 1257.625, |
|
"completions/mean_terminated_length": 979.0, |
|
"completions/min_length": 311.0, |
|
"completions/min_terminated_length": 311.0, |
|
"epoch": 0.005971087366436203, |
|
"frac_reward_zero_std": 0.25, |
|
"grad_norm": 0.12901267409324646, |
|
"kl": 0.06548095494508743, |
|
"learning_rate": 4.555555555555556e-06, |
|
"loss": 0.0026, |
|
"num_tokens": 388078.0, |
|
"reward": -7.84375, |
|
"reward_std": 0.8176814913749695, |
|
"rewards/check_answer/mean": -2.0, |
|
"rewards/check_answer/std": 0.0, |
|
"rewards/check_numbers/mean": -0.59375, |
|
"rewards/check_numbers/std": 1.7050782442092896, |
|
"rewards/format_and_language_reward_func/mean": -3.25, |
|
"rewards/format_and_language_reward_func/std": 0.6831300854682922, |
|
"rewards/match_format_approximately/mean": -2.0, |
|
"rewards/match_format_approximately/std": 0.0, |
|
"rewards/match_format_exactly/mean": 0.0, |
|
"rewards/match_format_exactly/std": 0.0, |
|
"step": 19 |
|
}, |
|
{ |
|
"completion_length": 1573.6875, |
|
"completions/clipped_ratio": 0.75, |
|
"completions/max_length": 1722.0, |
|
"completions/max_terminated_length": 1592.0, |
|
"completions/mean_length": 1573.6875, |
|
"completions/mean_terminated_length": 1128.75, |
|
"completions/min_length": 825.0, |
|
"completions/min_terminated_length": 825.0, |
|
"epoch": 0.006285355122564425, |
|
"frac_reward_zero_std": 0.25, |
|
"grad_norm": 0.10170631110668182, |
|
"kl": 0.04396039992570877, |
|
"learning_rate": 4.5e-06, |
|
"loss": 0.0018, |
|
"num_tokens": 417161.0, |
|
"reward": -8.375, |
|
"reward_std": 0.9611132144927979, |
|
"rewards/check_answer/mean": -2.0, |
|
"rewards/check_answer/std": 0.0, |
|
"rewards/check_numbers/mean": -1.125, |
|
"rewards/check_numbers/std": 0.670820415019989, |
|
"rewards/format_and_language_reward_func/mean": -3.25, |
|
"rewards/format_and_language_reward_func/std": 0.6831300854682922, |
|
"rewards/match_format_approximately/mean": -2.0, |
|
"rewards/match_format_approximately/std": 0.0, |
|
"rewards/match_format_exactly/mean": 0.0, |
|
"rewards/match_format_exactly/std": 0.0, |
|
"step": 20 |
|
}, |
|
{ |
|
"completion_length": 1380.5, |
|
"completions/clipped_ratio": 0.375, |
|
"completions/max_length": 1722.0, |
|
"completions/max_terminated_length": 1633.0, |
|
"completions/mean_length": 1380.5, |
|
"completions/mean_terminated_length": 1175.5999755859375, |
|
"completions/min_length": 2.0, |
|
"completions/min_terminated_length": 2.0, |
|
"epoch": 0.006599622878692646, |
|
"frac_reward_zero_std": 0.25, |
|
"grad_norm": 2.5239837169647217, |
|
"kl": 0.16142641007900238, |
|
"learning_rate": 4.444444444444444e-06, |
|
"loss": 0.0065, |
|
"num_tokens": 443361.0, |
|
"reward": -8.03125, |
|
"reward_std": 0.6205127239227295, |
|
"rewards/check_answer/mean": -2.0, |
|
"rewards/check_answer/std": 0.0, |
|
"rewards/check_numbers/mean": -1.03125, |
|
"rewards/check_numbers/std": 0.7180703282356262, |
|
"rewards/format_and_language_reward_func/mean": -3.0, |
|
"rewards/format_and_language_reward_func/std": 0.0, |
|
"rewards/match_format_approximately/mean": -2.0, |
|
"rewards/match_format_approximately/std": 0.0, |
|
"rewards/match_format_exactly/mean": 0.0, |
|
"rewards/match_format_exactly/std": 0.0, |
|
"step": 21 |
|
}, |
|
{ |
|
"completion_length": 1384.25, |
|
"completions/clipped_ratio": 0.5625, |
|
"completions/max_length": 1722.0, |
|
"completions/max_terminated_length": 1601.0, |
|
"completions/mean_length": 1384.25, |
|
"completions/mean_terminated_length": 950.0000610351562, |
|
"completions/min_length": 427.0, |
|
"completions/min_terminated_length": 427.0, |
|
"epoch": 0.0069138906348208675, |
|
"frac_reward_zero_std": 0.5, |
|
"grad_norm": 0.09287308901548386, |
|
"kl": 0.06003550812602043, |
|
"learning_rate": 4.388888888888889e-06, |
|
"loss": 0.0024, |
|
"num_tokens": 469325.0, |
|
"reward": -7.78125, |
|
"reward_std": 0.4375, |
|
"rewards/check_answer/mean": -2.0, |
|
"rewards/check_answer/std": 0.0, |
|
"rewards/check_numbers/mean": -0.65625, |
|
"rewards/check_numbers/std": 0.7685213088989258, |
|
"rewards/format_and_language_reward_func/mean": -3.125, |
|
"rewards/format_and_language_reward_func/std": 0.5, |
|
"rewards/match_format_approximately/mean": -2.0, |
|
"rewards/match_format_approximately/std": 0.0, |
|
"rewards/match_format_exactly/mean": 0.0, |
|
"rewards/match_format_exactly/std": 0.0, |
|
"step": 22 |
|
}, |
|
{ |
|
"completion_length": 1367.4375, |
|
"completions/clipped_ratio": 0.4375, |
|
"completions/max_length": 1722.0, |
|
"completions/max_terminated_length": 1606.0, |
|
"completions/mean_length": 1367.4375, |
|
"completions/mean_terminated_length": 1091.6666259765625, |
|
"completions/min_length": 531.0, |
|
"completions/min_terminated_length": 531.0, |
|
"epoch": 0.007228158390949088, |
|
"frac_reward_zero_std": 0.5, |
|
"grad_norm": 0.3170417845249176, |
|
"kl": 0.06963013857603073, |
|
"learning_rate": 4.333333333333334e-06, |
|
"loss": 0.0028, |
|
"num_tokens": 495200.0, |
|
"reward": -7.1875, |
|
"reward_std": 0.375, |
|
"rewards/check_answer/mean": -2.0, |
|
"rewards/check_answer/std": 0.0, |
|
"rewards/check_numbers/mean": -0.1875, |
|
"rewards/check_numbers/std": 0.5123475790023804, |
|
"rewards/format_and_language_reward_func/mean": -3.0, |
|
"rewards/format_and_language_reward_func/std": 0.0, |
|
"rewards/match_format_approximately/mean": -2.0, |
|
"rewards/match_format_approximately/std": 0.0, |
|
"rewards/match_format_exactly/mean": 0.0, |
|
"rewards/match_format_exactly/std": 0.0, |
|
"step": 23 |
|
}, |
|
{ |
|
"completion_length": 1551.6875, |
|
"completions/clipped_ratio": 0.8125, |
|
"completions/max_length": 1722.0, |
|
"completions/max_terminated_length": 1699.0, |
|
"completions/mean_length": 1551.6875, |
|
"completions/mean_terminated_length": 813.6666870117188, |
|
"completions/min_length": 243.0, |
|
"completions/min_terminated_length": 243.0, |
|
"epoch": 0.00754242614707731, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.20077864825725555, |
|
"kl": 0.05942363664507866, |
|
"learning_rate": 4.277777777777778e-06, |
|
"loss": 0.0024, |
|
"num_tokens": 523435.0, |
|
"reward": -8.28125, |
|
"reward_std": 0.9840351343154907, |
|
"rewards/check_answer/mean": -2.0, |
|
"rewards/check_answer/std": 0.0, |
|
"rewards/check_numbers/mean": -1.03125, |
|
"rewards/check_numbers/std": 0.7180703282356262, |
|
"rewards/format_and_language_reward_func/mean": -3.25, |
|
"rewards/format_and_language_reward_func/std": 0.6831300854682922, |
|
"rewards/match_format_approximately/mean": -2.0, |
|
"rewards/match_format_approximately/std": 0.0, |
|
"rewards/match_format_exactly/mean": 0.0, |
|
"rewards/match_format_exactly/std": 0.0, |
|
"step": 24 |
|
}, |
|
{ |
|
"completion_length": 1567.25, |
|
"completions/clipped_ratio": 0.875, |
|
"completions/max_length": 1722.0, |
|
"completions/max_terminated_length": 833.0, |
|
"completions/mean_length": 1567.25, |
|
"completions/mean_terminated_length": 484.0, |
|
"completions/min_length": 135.0, |
|
"completions/min_terminated_length": 135.0, |
|
"epoch": 0.00785669390320553, |
|
"frac_reward_zero_std": 0.25, |
|
"grad_norm": 0.15381832420825958, |
|
"kl": 0.05258103832602501, |
|
"learning_rate": 4.222222222222223e-06, |
|
"loss": 0.0021, |
|
"num_tokens": 552719.0, |
|
"reward": -7.5, |
|
"reward_std": 0.661700427532196, |
|
"rewards/check_answer/mean": -2.0, |
|
"rewards/check_answer/std": 0.0, |
|
"rewards/check_numbers/mean": -0.375, |
|
"rewards/check_numbers/std": 0.670820415019989, |
|
"rewards/format_and_language_reward_func/mean": -3.125, |
|
"rewards/format_and_language_reward_func/std": 0.5, |
|
"rewards/match_format_approximately/mean": -2.0, |
|
"rewards/match_format_approximately/std": 0.0, |
|
"rewards/match_format_exactly/mean": 0.0, |
|
"rewards/match_format_exactly/std": 0.0, |
|
"step": 25 |
|
}, |
|
{ |
|
"completion_length": 1234.5625, |
|
"completions/clipped_ratio": 0.5, |
|
"completions/max_length": 1722.0, |
|
"completions/max_terminated_length": 1246.0, |
|
"completions/mean_length": 1234.5625, |
|
"completions/mean_terminated_length": 747.125, |
|
"completions/min_length": 43.0, |
|
"completions/min_terminated_length": 43.0, |
|
"epoch": 0.008170961659333752, |
|
"frac_reward_zero_std": 0.25, |
|
"grad_norm": 0.2516714334487915, |
|
"kl": 0.07844924181699753, |
|
"learning_rate": 4.166666666666667e-06, |
|
"loss": 0.0031, |
|
"num_tokens": 576460.0, |
|
"reward": -7.75, |
|
"reward_std": 0.5915063619613647, |
|
"rewards/check_answer/mean": -2.0, |
|
"rewards/check_answer/std": 0.0, |
|
"rewards/check_numbers/mean": -0.75, |
|
"rewards/check_numbers/std": 0.7745966911315918, |
|
"rewards/format_and_language_reward_func/mean": -3.0, |
|
"rewards/format_and_language_reward_func/std": 0.0, |
|
"rewards/match_format_approximately/mean": -2.0, |
|
"rewards/match_format_approximately/std": 0.0, |
|
"rewards/match_format_exactly/mean": 0.0, |
|
"rewards/match_format_exactly/std": 0.0, |
|
"step": 26 |
|
}, |
|
{ |
|
"completion_length": 1580.875, |
|
"completions/clipped_ratio": 0.6875, |
|
"completions/max_length": 1722.0, |
|
"completions/max_terminated_length": 1688.0, |
|
"completions/mean_length": 1580.875, |
|
"completions/mean_terminated_length": 1270.4000244140625, |
|
"completions/min_length": 815.0, |
|
"completions/min_terminated_length": 815.0, |
|
"epoch": 0.008485229415461974, |
|
"frac_reward_zero_std": 0.25, |
|
"grad_norm": 0.08427742123603821, |
|
"kl": 0.040127161890268326, |
|
"learning_rate": 4.111111111111111e-06, |
|
"loss": 0.0016, |
|
"num_tokens": 605630.0, |
|
"reward": -7.84375, |
|
"reward_std": 0.5625, |
|
"rewards/check_answer/mean": -2.0, |
|
"rewards/check_answer/std": 0.0, |
|
"rewards/check_numbers/mean": -0.84375, |
|
"rewards/check_numbers/std": 0.7685213088989258, |
|
"rewards/format_and_language_reward_func/mean": -3.0, |
|
"rewards/format_and_language_reward_func/std": 0.0, |
|
"rewards/match_format_approximately/mean": -2.0, |
|
"rewards/match_format_approximately/std": 0.0, |
|
"rewards/match_format_exactly/mean": 0.0, |
|
"rewards/match_format_exactly/std": 0.0, |
|
"step": 27 |
|
}, |
|
{ |
|
"completion_length": 1360.1875, |
|
"completions/clipped_ratio": 0.5, |
|
"completions/max_length": 1722.0, |
|
"completions/max_terminated_length": 1577.0, |
|
"completions/mean_length": 1360.1875, |
|
"completions/mean_terminated_length": 998.375, |
|
"completions/min_length": 415.0, |
|
"completions/min_terminated_length": 415.0, |
|
"epoch": 0.008799497171590195, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.13703206181526184, |
|
"kl": 0.05611160770058632, |
|
"learning_rate": 4.055555555555556e-06, |
|
"loss": 0.0022, |
|
"num_tokens": 631529.0, |
|
"reward": -8.21875, |
|
"reward_std": 0.9407068490982056, |
|
"rewards/check_answer/mean": -2.0, |
|
"rewards/check_answer/std": 0.0, |
|
"rewards/check_numbers/mean": -0.84375, |
|
"rewards/check_numbers/std": 0.7685213088989258, |
|
"rewards/format_and_language_reward_func/mean": -3.375, |
|
"rewards/format_and_language_reward_func/std": 0.8062257766723633, |
|
"rewards/match_format_approximately/mean": -2.0, |
|
"rewards/match_format_approximately/std": 0.0, |
|
"rewards/match_format_exactly/mean": 0.0, |
|
"rewards/match_format_exactly/std": 0.0, |
|
"step": 28 |
|
}, |
|
{ |
|
"completion_length": 1515.625, |
|
"completions/clipped_ratio": 0.6875, |
|
"completions/max_length": 1722.0, |
|
"completions/max_terminated_length": 1188.0, |
|
"completions/mean_length": 1515.625, |
|
"completions/mean_terminated_length": 1061.5999755859375, |
|
"completions/min_length": 871.0, |
|
"completions/min_terminated_length": 871.0, |
|
"epoch": 0.009113764927718416, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.1371777057647705, |
|
"kl": 0.05326192080974579, |
|
"learning_rate": 4.000000000000001e-06, |
|
"loss": 0.0021, |
|
"num_tokens": 659651.0, |
|
"reward": -8.09375, |
|
"reward_std": 0.9040063619613647, |
|
"rewards/check_answer/mean": -2.0, |
|
"rewards/check_answer/std": 0.0, |
|
"rewards/check_numbers/mean": -0.84375, |
|
"rewards/check_numbers/std": 0.7685213088989258, |
|
"rewards/format_and_language_reward_func/mean": -3.25, |
|
"rewards/format_and_language_reward_func/std": 0.6831300854682922, |
|
"rewards/match_format_approximately/mean": -2.0, |
|
"rewards/match_format_approximately/std": 0.0, |
|
"rewards/match_format_exactly/mean": 0.0, |
|
"rewards/match_format_exactly/std": 0.0, |
|
"step": 29 |
|
}, |
|
{ |
|
"completion_length": 1422.0625, |
|
"completions/clipped_ratio": 0.5625, |
|
"completions/max_length": 1722.0, |
|
"completions/max_terminated_length": 1427.0, |
|
"completions/mean_length": 1422.0625, |
|
"completions/mean_terminated_length": 1036.4285888671875, |
|
"completions/min_length": 1.0, |
|
"completions/min_terminated_length": 1.0, |
|
"epoch": 0.009428032683846637, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 1.34744393825531, |
|
"kl": 2.590277671813965, |
|
"learning_rate": 3.944444444444445e-06, |
|
"loss": 0.1036, |
|
"num_tokens": 686368.0, |
|
"reward": -7.84375, |
|
"reward_std": 1.1540063619613647, |
|
"rewards/check_answer/mean": -2.0, |
|
"rewards/check_answer/std": 0.0, |
|
"rewards/check_numbers/mean": -0.71875, |
|
"rewards/check_numbers/std": 0.8750000596046448, |
|
"rewards/format_and_language_reward_func/mean": -3.125, |
|
"rewards/format_and_language_reward_func/std": 0.5, |
|
"rewards/match_format_approximately/mean": -2.0, |
|
"rewards/match_format_approximately/std": 0.0, |
|
"rewards/match_format_exactly/mean": 0.0, |
|
"rewards/match_format_exactly/std": 0.0, |
|
"step": 30 |
|
}, |
|
{ |
|
"completion_length": 1592.75, |
|
"completions/clipped_ratio": 0.8125, |
|
"completions/max_length": 1722.0, |
|
"completions/max_terminated_length": 1348.0, |
|
"completions/mean_length": 1592.75, |
|
"completions/mean_terminated_length": 1032.666748046875, |
|
"completions/min_length": 537.0, |
|
"completions/min_terminated_length": 537.0, |
|
"epoch": 0.00974230043997486, |
|
"frac_reward_zero_std": 0.5, |
|
"grad_norm": 0.09155543893575668, |
|
"kl": 0.05414074286818504, |
|
"learning_rate": 3.88888888888889e-06, |
|
"loss": 0.0022, |
|
"num_tokens": 715988.0, |
|
"reward": -7.40625, |
|
"reward_std": 0.4375, |
|
"rewards/check_answer/mean": -2.0, |
|
"rewards/check_answer/std": 0.0, |
|
"rewards/check_numbers/mean": -0.28125, |
|
"rewards/check_numbers/std": 0.6046693325042725, |
|
"rewards/format_and_language_reward_func/mean": -3.125, |
|
"rewards/format_and_language_reward_func/std": 0.5, |
|
"rewards/match_format_approximately/mean": -2.0, |
|
"rewards/match_format_approximately/std": 0.0, |
|
"rewards/match_format_exactly/mean": 0.0, |
|
"rewards/match_format_exactly/std": 0.0, |
|
"step": 31 |
|
}, |
|
{ |
|
"completion_length": 1555.5625, |
|
"completions/clipped_ratio": 0.75, |
|
"completions/max_length": 1722.0, |
|
"completions/max_terminated_length": 1635.0, |
|
"completions/mean_length": 1555.5625, |
|
"completions/mean_terminated_length": 1056.25, |
|
"completions/min_length": 619.0, |
|
"completions/min_terminated_length": 619.0, |
|
"epoch": 0.01005656819610308, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.15684477984905243, |
|
"kl": 0.05063142254948616, |
|
"learning_rate": 3.833333333333334e-06, |
|
"loss": 0.002, |
|
"num_tokens": 744669.0, |
|
"reward": -8.40625, |
|
"reward_std": 1.0148502588272095, |
|
"rewards/check_answer/mean": -2.0, |
|
"rewards/check_answer/std": 0.0, |
|
"rewards/check_numbers/mean": -1.03125, |
|
"rewards/check_numbers/std": 0.7180703282356262, |
|
"rewards/format_and_language_reward_func/mean": -3.375, |
|
"rewards/format_and_language_reward_func/std": 0.8062257766723633, |
|
"rewards/match_format_approximately/mean": -2.0, |
|
"rewards/match_format_approximately/std": 0.0, |
|
"rewards/match_format_exactly/mean": 0.0, |
|
"rewards/match_format_exactly/std": 0.0, |
|
"step": 32 |
|
}, |
|
{ |
|
"completion_length": 1546.9375, |
|
"completions/clipped_ratio": 0.75, |
|
"completions/max_length": 1722.0, |
|
"completions/max_terminated_length": 1335.0, |
|
"completions/mean_length": 1546.9375, |
|
"completions/mean_terminated_length": 1021.75, |
|
"completions/min_length": 747.0, |
|
"completions/min_terminated_length": 747.0, |
|
"epoch": 0.010370835952231301, |
|
"frac_reward_zero_std": 0.25, |
|
"grad_norm": 0.07121387869119644, |
|
"kl": 0.032333556562662125, |
|
"learning_rate": 3.777777777777778e-06, |
|
"loss": 0.0013, |
|
"num_tokens": 773388.0, |
|
"reward": -7.46875, |
|
"reward_std": 0.5625, |
|
"rewards/check_answer/mean": -2.0, |
|
"rewards/check_answer/std": 0.0, |
|
"rewards/check_numbers/mean": -0.46875, |
|
"rewards/check_numbers/std": 0.7180703282356262, |
|
"rewards/format_and_language_reward_func/mean": -3.0, |
|
"rewards/format_and_language_reward_func/std": 0.0, |
|
"rewards/match_format_approximately/mean": -2.0, |
|
"rewards/match_format_approximately/std": 0.0, |
|
"rewards/match_format_exactly/mean": 0.0, |
|
"rewards/match_format_exactly/std": 0.0, |
|
"step": 33 |
|
}, |
|
{ |
|
"completion_length": 1376.4375, |
|
"completions/clipped_ratio": 0.625, |
|
"completions/max_length": 1722.0, |
|
"completions/max_terminated_length": 1313.0, |
|
"completions/mean_length": 1376.4375, |
|
"completions/mean_terminated_length": 800.5, |
|
"completions/min_length": 5.0, |
|
"completions/min_terminated_length": 5.0, |
|
"epoch": 0.010685103708359522, |
|
"frac_reward_zero_std": 0.25, |
|
"grad_norm": 1.1596808433532715, |
|
"kl": 0.12857064604759216, |
|
"learning_rate": 3.7222222222222225e-06, |
|
"loss": 0.0051, |
|
"num_tokens": 799511.0, |
|
"reward": -7.5, |
|
"reward_std": 0.661700427532196, |
|
"rewards/check_answer/mean": -2.0, |
|
"rewards/check_answer/std": 0.0, |
|
"rewards/check_numbers/mean": -0.375, |
|
"rewards/check_numbers/std": 0.670820415019989, |
|
"rewards/format_and_language_reward_func/mean": -3.125, |
|
"rewards/format_and_language_reward_func/std": 0.5, |
|
"rewards/match_format_approximately/mean": -2.0, |
|
"rewards/match_format_approximately/std": 0.0, |
|
"rewards/match_format_exactly/mean": 0.0, |
|
"rewards/match_format_exactly/std": 0.0, |
|
"step": 34 |
|
}, |
|
{ |
|
"completion_length": 1699.6875, |
|
"completions/clipped_ratio": 0.875, |
|
"completions/max_length": 1722.0, |
|
"completions/max_terminated_length": 1711.0, |
|
"completions/mean_length": 1699.6875, |
|
"completions/mean_terminated_length": 1543.5, |
|
"completions/min_length": 1376.0, |
|
"completions/min_terminated_length": 1376.0, |
|
"epoch": 0.010999371464487744, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.11480734497308731, |
|
"kl": 0.04761524498462677, |
|
"learning_rate": 3.6666666666666666e-06, |
|
"loss": 0.0019, |
|
"num_tokens": 830966.0, |
|
"reward": -7.96875, |
|
"reward_std": 1.2129219770431519, |
|
"rewards/check_answer/mean": -2.0, |
|
"rewards/check_answer/std": 0.0, |
|
"rewards/check_numbers/mean": -0.46875, |
|
"rewards/check_numbers/std": 0.7180703282356262, |
|
"rewards/format_and_language_reward_func/mean": -3.5, |
|
"rewards/format_and_language_reward_func/std": 0.8944272398948669, |
|
"rewards/match_format_approximately/mean": -2.0, |
|
"rewards/match_format_approximately/std": 0.0, |
|
"rewards/match_format_exactly/mean": 0.0, |
|
"rewards/match_format_exactly/std": 0.0, |
|
"step": 35 |
|
}, |
|
{ |
|
"completion_length": 1583.75, |
|
"completions/clipped_ratio": 0.75, |
|
"completions/max_length": 1722.0, |
|
"completions/max_terminated_length": 1432.0, |
|
"completions/mean_length": 1583.75, |
|
"completions/mean_terminated_length": 1169.0, |
|
"completions/min_length": 686.0, |
|
"completions/min_terminated_length": 686.0, |
|
"epoch": 0.011313639220615965, |
|
"frac_reward_zero_std": 0.25, |
|
"grad_norm": 0.34630483388900757, |
|
"kl": 0.07976571470499039, |
|
"learning_rate": 3.6111111111111115e-06, |
|
"loss": 0.0032, |
|
"num_tokens": 859930.0, |
|
"reward": -8.09375, |
|
"reward_std": 0.9840351343154907, |
|
"rewards/check_answer/mean": -2.0, |
|
"rewards/check_answer/std": 0.0, |
|
"rewards/check_numbers/mean": -0.84375, |
|
"rewards/check_numbers/std": 0.7685213088989258, |
|
"rewards/format_and_language_reward_func/mean": -3.25, |
|
"rewards/format_and_language_reward_func/std": 0.6831300854682922, |
|
"rewards/match_format_approximately/mean": -2.0, |
|
"rewards/match_format_approximately/std": 0.0, |
|
"rewards/match_format_exactly/mean": 0.0, |
|
"rewards/match_format_exactly/std": 0.0, |
|
"step": 36 |
|
}, |
|
{ |
|
"completion_length": 1338.875, |
|
"completions/clipped_ratio": 0.6875, |
|
"completions/max_length": 1722.0, |
|
"completions/max_terminated_length": 622.0, |
|
"completions/mean_length": 1338.875, |
|
"completions/mean_terminated_length": 496.0, |
|
"completions/min_length": 286.0, |
|
"completions/min_terminated_length": 286.0, |
|
"epoch": 0.011627906976744186, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.1614941507577896, |
|
"kl": 0.06494183838367462, |
|
"learning_rate": 3.555555555555556e-06, |
|
"loss": 0.0026, |
|
"num_tokens": 885120.0, |
|
"reward": -7.6875, |
|
"reward_std": 0.8080127239227295, |
|
"rewards/check_answer/mean": -2.0, |
|
"rewards/check_answer/std": 0.0, |
|
"rewards/check_numbers/mean": -0.5625, |
|
"rewards/check_numbers/std": 0.75, |
|
"rewards/format_and_language_reward_func/mean": -3.125, |
|
"rewards/format_and_language_reward_func/std": 0.5, |
|
"rewards/match_format_approximately/mean": -2.0, |
|
"rewards/match_format_approximately/std": 0.0, |
|
"rewards/match_format_exactly/mean": 0.0, |
|
"rewards/match_format_exactly/std": 0.0, |
|
"step": 37 |
|
}, |
|
{ |
|
"completion_length": 1480.0625, |
|
"completions/clipped_ratio": 0.5625, |
|
"completions/max_length": 1722.0, |
|
"completions/max_terminated_length": 1611.0, |
|
"completions/mean_length": 1480.0625, |
|
"completions/mean_terminated_length": 1169.0, |
|
"completions/min_length": 525.0, |
|
"completions/min_terminated_length": 525.0, |
|
"epoch": 0.011942174732872407, |
|
"frac_reward_zero_std": 0.75, |
|
"grad_norm": 0.07541613280773163, |
|
"kl": 0.052320241928100586, |
|
"learning_rate": 3.5e-06, |
|
"loss": 0.0021, |
|
"num_tokens": 912537.0, |
|
"reward": -8.0, |
|
"reward_std": 0.28867512941360474, |
|
"rewards/check_answer/mean": -2.0, |
|
"rewards/check_answer/std": 0.0, |
|
"rewards/check_numbers/mean": -0.75, |
|
"rewards/check_numbers/std": 0.7745966911315918, |
|
"rewards/format_and_language_reward_func/mean": -3.25, |
|
"rewards/format_and_language_reward_func/std": 0.6831300854682922, |
|
"rewards/match_format_approximately/mean": -2.0, |
|
"rewards/match_format_approximately/std": 0.0, |
|
"rewards/match_format_exactly/mean": 0.0, |
|
"rewards/match_format_exactly/std": 0.0, |
|
"step": 38 |
|
}, |
|
{ |
|
"completion_length": 1373.75, |
|
"completions/clipped_ratio": 0.5625, |
|
"completions/max_length": 1722.0, |
|
"completions/max_terminated_length": 1538.0, |
|
"completions/mean_length": 1373.75, |
|
"completions/mean_terminated_length": 926.0000610351562, |
|
"completions/min_length": 489.0, |
|
"completions/min_terminated_length": 489.0, |
|
"epoch": 0.01225644248900063, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.16380992531776428, |
|
"kl": 0.06148176267743111, |
|
"learning_rate": 3.444444444444445e-06, |
|
"loss": 0.0025, |
|
"num_tokens": 938505.0, |
|
"reward": -8.0, |
|
"reward_std": 1.0497419834136963, |
|
"rewards/check_answer/mean": -2.0, |
|
"rewards/check_answer/std": 0.0, |
|
"rewards/check_numbers/mean": -0.75, |
|
"rewards/check_numbers/std": 0.7745966911315918, |
|
"rewards/format_and_language_reward_func/mean": -3.25, |
|
"rewards/format_and_language_reward_func/std": 0.6831300854682922, |
|
"rewards/match_format_approximately/mean": -2.0, |
|
"rewards/match_format_approximately/std": 0.0, |
|
"rewards/match_format_exactly/mean": 0.0, |
|
"rewards/match_format_exactly/std": 0.0, |
|
"step": 39 |
|
}, |
|
{ |
|
"completion_length": 1519.1875, |
|
"completions/clipped_ratio": 0.75, |
|
"completions/max_length": 1722.0, |
|
"completions/max_terminated_length": 1336.0, |
|
"completions/mean_length": 1519.1875, |
|
"completions/mean_terminated_length": 910.75, |
|
"completions/min_length": 601.0, |
|
"completions/min_terminated_length": 601.0, |
|
"epoch": 0.01257071024512885, |
|
"frac_reward_zero_std": 0.25, |
|
"grad_norm": 0.10772477090358734, |
|
"kl": 0.04433707520365715, |
|
"learning_rate": 3.3888888888888893e-06, |
|
"loss": 0.0018, |
|
"num_tokens": 966940.0, |
|
"reward": -7.59375, |
|
"reward_std": 0.7340351343154907, |
|
"rewards/check_answer/mean": -2.0, |
|
"rewards/check_answer/std": 0.0, |
|
"rewards/check_numbers/mean": -0.46875, |
|
"rewards/check_numbers/std": 0.7180703282356262, |
|
"rewards/format_and_language_reward_func/mean": -3.125, |
|
"rewards/format_and_language_reward_func/std": 0.5, |
|
"rewards/match_format_approximately/mean": -2.0, |
|
"rewards/match_format_approximately/std": 0.0, |
|
"rewards/match_format_exactly/mean": 0.0, |
|
"rewards/match_format_exactly/std": 0.0, |
|
"step": 40 |
|
}, |
|
{ |
|
"completion_length": 1453.9375, |
|
"completions/clipped_ratio": 0.625, |
|
"completions/max_length": 1722.0, |
|
"completions/max_terminated_length": 1432.0, |
|
"completions/mean_length": 1453.9375, |
|
"completions/mean_terminated_length": 1007.1666870117188, |
|
"completions/min_length": 350.0, |
|
"completions/min_terminated_length": 350.0, |
|
"epoch": 0.012884978001257071, |
|
"frac_reward_zero_std": 0.5, |
|
"grad_norm": 0.07585709542036057, |
|
"kl": 0.04436139389872551, |
|
"learning_rate": 3.3333333333333333e-06, |
|
"loss": 0.0018, |
|
"num_tokens": 993859.0, |
|
"reward": -8.125, |
|
"reward_std": 0.5773502588272095, |
|
"rewards/check_answer/mean": -2.0, |
|
"rewards/check_answer/std": 0.0, |
|
"rewards/check_numbers/mean": -0.75, |
|
"rewards/check_numbers/std": 0.7745966911315918, |
|
"rewards/format_and_language_reward_func/mean": -3.375, |
|
"rewards/format_and_language_reward_func/std": 0.8062257766723633, |
|
"rewards/match_format_approximately/mean": -2.0, |
|
"rewards/match_format_approximately/std": 0.0, |
|
"rewards/match_format_exactly/mean": 0.0, |
|
"rewards/match_format_exactly/std": 0.0, |
|
"step": 41 |
|
}, |
|
{ |
|
"completion_length": 1421.5, |
|
"completions/clipped_ratio": 0.5625, |
|
"completions/max_length": 1722.0, |
|
"completions/max_terminated_length": 1470.0, |
|
"completions/mean_length": 1421.5, |
|
"completions/mean_terminated_length": 1035.1429443359375, |
|
"completions/min_length": 118.0, |
|
"completions/min_terminated_length": 118.0, |
|
"epoch": 0.013199245757385292, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.5277220010757446, |
|
"kl": 0.06066755950450897, |
|
"learning_rate": 3.277777777777778e-06, |
|
"loss": 0.0024, |
|
"num_tokens": 1020571.0, |
|
"reward": -8.125, |
|
"reward_std": 1.0517165660858154, |
|
"rewards/check_answer/mean": -2.0, |
|
"rewards/check_answer/std": 0.0, |
|
"rewards/check_numbers/mean": -0.75, |
|
"rewards/check_numbers/std": 0.7745966911315918, |
|
"rewards/format_and_language_reward_func/mean": -3.375, |
|
"rewards/format_and_language_reward_func/std": 0.8062257766723633, |
|
"rewards/match_format_approximately/mean": -2.0, |
|
"rewards/match_format_approximately/std": 0.0, |
|
"rewards/match_format_exactly/mean": 0.0, |
|
"rewards/match_format_exactly/std": 0.0, |
|
"step": 42 |
|
}, |
|
{ |
|
"completion_length": 1514.6875, |
|
"completions/clipped_ratio": 0.6875, |
|
"completions/max_length": 1722.0, |
|
"completions/max_terminated_length": 1326.0, |
|
"completions/mean_length": 1514.6875, |
|
"completions/mean_terminated_length": 1058.5999755859375, |
|
"completions/min_length": 833.0, |
|
"completions/min_terminated_length": 833.0, |
|
"epoch": 0.013513513513513514, |
|
"frac_reward_zero_std": 0.25, |
|
"grad_norm": 0.10104304552078247, |
|
"kl": 0.05354408547282219, |
|
"learning_rate": 3.2222222222222227e-06, |
|
"loss": 0.0021, |
|
"num_tokens": 1049086.0, |
|
"reward": -7.46875, |
|
"reward_std": 0.5625, |
|
"rewards/check_answer/mean": -2.0, |
|
"rewards/check_answer/std": 0.0, |
|
"rewards/check_numbers/mean": -0.46875, |
|
"rewards/check_numbers/std": 0.7180703282356262, |
|
"rewards/format_and_language_reward_func/mean": -3.0, |
|
"rewards/format_and_language_reward_func/std": 0.0, |
|
"rewards/match_format_approximately/mean": -2.0, |
|
"rewards/match_format_approximately/std": 0.0, |
|
"rewards/match_format_exactly/mean": 0.0, |
|
"rewards/match_format_exactly/std": 0.0, |
|
"step": 43 |
|
}, |
|
{ |
|
"completion_length": 1431.4375, |
|
"completions/clipped_ratio": 0.625, |
|
"completions/max_length": 1722.0, |
|
"completions/max_terminated_length": 1554.0, |
|
"completions/mean_length": 1431.4375, |
|
"completions/mean_terminated_length": 947.1666870117188, |
|
"completions/min_length": 320.0, |
|
"completions/min_terminated_length": 320.0, |
|
"epoch": 0.013827781269641735, |
|
"frac_reward_zero_std": 0.25, |
|
"grad_norm": 0.11680426448583603, |
|
"kl": 0.05350031703710556, |
|
"learning_rate": 3.1666666666666667e-06, |
|
"loss": 0.0021, |
|
"num_tokens": 1075785.0, |
|
"reward": -7.46875, |
|
"reward_std": 0.6205127239227295, |
|
"rewards/check_answer/mean": -2.0, |
|
"rewards/check_answer/std": 0.0, |
|
"rewards/check_numbers/mean": -0.46875, |
|
"rewards/check_numbers/std": 0.7180703282356262, |
|
"rewards/format_and_language_reward_func/mean": -3.0, |
|
"rewards/format_and_language_reward_func/std": 0.0, |
|
"rewards/match_format_approximately/mean": -2.0, |
|
"rewards/match_format_approximately/std": 0.0, |
|
"rewards/match_format_exactly/mean": 0.0, |
|
"rewards/match_format_exactly/std": 0.0, |
|
"step": 44 |
|
}, |
|
{ |
|
"completion_length": 1583.875, |
|
"completions/clipped_ratio": 0.625, |
|
"completions/max_length": 1722.0, |
|
"completions/max_terminated_length": 1720.0, |
|
"completions/mean_length": 1583.875, |
|
"completions/mean_terminated_length": 1353.666748046875, |
|
"completions/min_length": 573.0, |
|
"completions/min_terminated_length": 573.0, |
|
"epoch": 0.014142049025769956, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.10828514397144318, |
|
"kl": 0.05207566171884537, |
|
"learning_rate": 3.1111111111111116e-06, |
|
"loss": 0.0021, |
|
"num_tokens": 1105015.0, |
|
"reward": -7.25, |
|
"reward_std": 1.1668819189071655, |
|
"rewards/check_answer/mean": -2.0, |
|
"rewards/check_answer/std": 0.0, |
|
"rewards/check_numbers/mean": -0.125, |
|
"rewards/check_numbers/std": 1.5864006280899048, |
|
"rewards/format_and_language_reward_func/mean": -3.125, |
|
"rewards/format_and_language_reward_func/std": 0.5, |
|
"rewards/match_format_approximately/mean": -2.0, |
|
"rewards/match_format_approximately/std": 0.0, |
|
"rewards/match_format_exactly/mean": 0.0, |
|
"rewards/match_format_exactly/std": 0.0, |
|
"step": 45 |
|
}, |
|
{ |
|
"completion_length": 1366.1875, |
|
"completions/clipped_ratio": 0.5625, |
|
"completions/max_length": 1722.0, |
|
"completions/max_terminated_length": 1669.0, |
|
"completions/mean_length": 1366.1875, |
|
"completions/mean_terminated_length": 908.71435546875, |
|
"completions/min_length": 283.0, |
|
"completions/min_terminated_length": 283.0, |
|
"epoch": 0.014456316781898177, |
|
"frac_reward_zero_std": 0.5, |
|
"grad_norm": 0.09319330006837845, |
|
"kl": 0.044255051761865616, |
|
"learning_rate": 3.055555555555556e-06, |
|
"loss": 0.0018, |
|
"num_tokens": 1130758.0, |
|
"reward": -7.375, |
|
"reward_std": 0.375, |
|
"rewards/check_answer/mean": -2.0, |
|
"rewards/check_answer/std": 0.0, |
|
"rewards/check_numbers/mean": -0.375, |
|
"rewards/check_numbers/std": 0.670820415019989, |
|
"rewards/format_and_language_reward_func/mean": -3.0, |
|
"rewards/format_and_language_reward_func/std": 0.0, |
|
"rewards/match_format_approximately/mean": -2.0, |
|
"rewards/match_format_approximately/std": 0.0, |
|
"rewards/match_format_exactly/mean": 0.0, |
|
"rewards/match_format_exactly/std": 0.0, |
|
"step": 46 |
|
}, |
|
{ |
|
"completion_length": 1549.25, |
|
"completions/clipped_ratio": 0.75, |
|
"completions/max_length": 1722.0, |
|
"completions/max_terminated_length": 1445.0, |
|
"completions/mean_length": 1549.25, |
|
"completions/mean_terminated_length": 1031.0, |
|
"completions/min_length": 798.0, |
|
"completions/min_terminated_length": 798.0, |
|
"epoch": 0.0147705845380264, |
|
"frac_reward_zero_std": 0.5, |
|
"grad_norm": 0.1097634956240654, |
|
"kl": 0.04721890389919281, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0019, |
|
"num_tokens": 1159726.0, |
|
"reward": -7.75, |
|
"reward_std": 0.375, |
|
"rewards/check_answer/mean": -2.0, |
|
"rewards/check_answer/std": 0.0, |
|
"rewards/check_numbers/mean": -0.75, |
|
"rewards/check_numbers/std": 0.7745966911315918, |
|
"rewards/format_and_language_reward_func/mean": -3.0, |
|
"rewards/format_and_language_reward_func/std": 0.0, |
|
"rewards/match_format_approximately/mean": -2.0, |
|
"rewards/match_format_approximately/std": 0.0, |
|
"rewards/match_format_exactly/mean": 0.0, |
|
"rewards/match_format_exactly/std": 0.0, |
|
"step": 47 |
|
}, |
|
{ |
|
"completion_length": 1559.8125, |
|
"completions/clipped_ratio": 0.8125, |
|
"completions/max_length": 1722.0, |
|
"completions/max_terminated_length": 1060.0, |
|
"completions/mean_length": 1559.8125, |
|
"completions/mean_terminated_length": 857.0, |
|
"completions/min_length": 737.0, |
|
"completions/min_terminated_length": 737.0, |
|
"epoch": 0.01508485229415462, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.13540787994861603, |
|
"kl": 0.052234258502721786, |
|
"learning_rate": 2.944444444444445e-06, |
|
"loss": 0.0021, |
|
"num_tokens": 1188527.0, |
|
"reward": -8.1875, |
|
"reward_std": 1.1715351343154907, |
|
"rewards/check_answer/mean": -2.0, |
|
"rewards/check_answer/std": 0.0, |
|
"rewards/check_numbers/mean": -0.9375, |
|
"rewards/check_numbers/std": 0.75, |
|
"rewards/format_and_language_reward_func/mean": -3.25, |
|
"rewards/format_and_language_reward_func/std": 0.6831300854682922, |
|
"rewards/match_format_approximately/mean": -2.0, |
|
"rewards/match_format_approximately/std": 0.0, |
|
"rewards/match_format_exactly/mean": 0.0, |
|
"rewards/match_format_exactly/std": 0.0, |
|
"step": 48 |
|
}, |
|
{ |
|
"completion_length": 1568.125, |
|
"completions/clipped_ratio": 0.6875, |
|
"completions/max_length": 1722.0, |
|
"completions/max_terminated_length": 1591.0, |
|
"completions/mean_length": 1568.125, |
|
"completions/mean_terminated_length": 1229.5999755859375, |
|
"completions/min_length": 958.0, |
|
"completions/min_terminated_length": 958.0, |
|
"epoch": 0.015399120050282841, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.11936990916728973, |
|
"kl": 0.056975651532411575, |
|
"learning_rate": 2.888888888888889e-06, |
|
"loss": 0.0023, |
|
"num_tokens": 1217941.0, |
|
"reward": -8.1875, |
|
"reward_std": 0.8571338653564453, |
|
"rewards/check_answer/mean": -2.0, |
|
"rewards/check_answer/std": 0.0, |
|
"rewards/check_numbers/mean": -0.9375, |
|
"rewards/check_numbers/std": 0.75, |
|
"rewards/format_and_language_reward_func/mean": -3.25, |
|
"rewards/format_and_language_reward_func/std": 0.6831300854682922, |
|
"rewards/match_format_approximately/mean": -2.0, |
|
"rewards/match_format_approximately/std": 0.0, |
|
"rewards/match_format_exactly/mean": 0.0, |
|
"rewards/match_format_exactly/std": 0.0, |
|
"step": 49 |
|
}, |
|
{ |
|
"completion_length": 1495.4375, |
|
"completions/clipped_ratio": 0.6875, |
|
"completions/max_length": 1722.0, |
|
"completions/max_terminated_length": 1325.0, |
|
"completions/mean_length": 1495.4375, |
|
"completions/mean_terminated_length": 997.0, |
|
"completions/min_length": 445.0, |
|
"completions/min_terminated_length": 445.0, |
|
"epoch": 0.01571338780641106, |
|
"frac_reward_zero_std": 0.25, |
|
"grad_norm": 0.1257868856191635, |
|
"kl": 0.051786769181489944, |
|
"learning_rate": 2.8333333333333335e-06, |
|
"loss": 0.0021, |
|
"num_tokens": 1245944.0, |
|
"reward": -7.6875, |
|
"reward_std": 0.7895780801773071, |
|
"rewards/check_answer/mean": -2.0, |
|
"rewards/check_answer/std": 0.0, |
|
"rewards/check_numbers/mean": -0.5625, |
|
"rewards/check_numbers/std": 0.75, |
|
"rewards/format_and_language_reward_func/mean": -3.125, |
|
"rewards/format_and_language_reward_func/std": 0.5, |
|
"rewards/match_format_approximately/mean": -2.0, |
|
"rewards/match_format_approximately/std": 0.0, |
|
"rewards/match_format_exactly/mean": 0.0, |
|
"rewards/match_format_exactly/std": 0.0, |
|
"step": 50 |
|
}, |
|
{ |
|
"completion_length": 1586.0, |
|
"completions/clipped_ratio": 0.6875, |
|
"completions/max_length": 1722.0, |
|
"completions/max_terminated_length": 1700.0, |
|
"completions/mean_length": 1586.0, |
|
"completions/mean_terminated_length": 1286.800048828125, |
|
"completions/min_length": 820.0, |
|
"completions/min_terminated_length": 820.0, |
|
"epoch": 0.016027655562539284, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.11521671712398529, |
|
"kl": 0.047557681798934937, |
|
"learning_rate": 2.7777777777777783e-06, |
|
"loss": 0.0019, |
|
"num_tokens": 1275304.0, |
|
"reward": -8.0, |
|
"reward_std": 0.6636751294136047, |
|
"rewards/check_answer/mean": -2.0, |
|
"rewards/check_answer/std": 0.0, |
|
"rewards/check_numbers/mean": -0.75, |
|
"rewards/check_numbers/std": 0.7745966911315918, |
|
"rewards/format_and_language_reward_func/mean": -3.25, |
|
"rewards/format_and_language_reward_func/std": 0.6831300854682922, |
|
"rewards/match_format_approximately/mean": -2.0, |
|
"rewards/match_format_approximately/std": 0.0, |
|
"rewards/match_format_exactly/mean": 0.0, |
|
"rewards/match_format_exactly/std": 0.0, |
|
"step": 51 |
|
}, |
|
{ |
|
"completion_length": 1589.6875, |
|
"completions/clipped_ratio": 0.75, |
|
"completions/max_length": 1722.0, |
|
"completions/max_terminated_length": 1535.0, |
|
"completions/mean_length": 1589.6875, |
|
"completions/mean_terminated_length": 1192.75, |
|
"completions/min_length": 863.0, |
|
"completions/min_terminated_length": 863.0, |
|
"epoch": 0.016341923318667503, |
|
"frac_reward_zero_std": 0.75, |
|
"grad_norm": 0.24990025162696838, |
|
"kl": 0.0706791803240776, |
|
"learning_rate": 2.7222222222222224e-06, |
|
"loss": 0.0028, |
|
"num_tokens": 1304927.0, |
|
"reward": -7.5625, |
|
"reward_std": 0.21650634706020355, |
|
"rewards/check_answer/mean": -2.0, |
|
"rewards/check_answer/std": 0.0, |
|
"rewards/check_numbers/mean": -0.5625, |
|
"rewards/check_numbers/std": 0.75, |
|
"rewards/format_and_language_reward_func/mean": -3.0, |
|
"rewards/format_and_language_reward_func/std": 0.0, |
|
"rewards/match_format_approximately/mean": -2.0, |
|
"rewards/match_format_approximately/std": 0.0, |
|
"rewards/match_format_exactly/mean": 0.0, |
|
"rewards/match_format_exactly/std": 0.0, |
|
"step": 52 |
|
}, |
|
{ |
|
"completion_length": 1505.0625, |
|
"completions/clipped_ratio": 0.6875, |
|
"completions/max_length": 1722.0, |
|
"completions/max_terminated_length": 1408.0, |
|
"completions/mean_length": 1505.0625, |
|
"completions/mean_terminated_length": 1027.800048828125, |
|
"completions/min_length": 826.0, |
|
"completions/min_terminated_length": 826.0, |
|
"epoch": 0.016656191074795726, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.12817677855491638, |
|
"kl": 0.05584558844566345, |
|
"learning_rate": 2.666666666666667e-06, |
|
"loss": 0.0022, |
|
"num_tokens": 1332692.0, |
|
"reward": -7.625, |
|
"reward_std": 0.875, |
|
"rewards/check_answer/mean": -2.0, |
|
"rewards/check_answer/std": 0.0, |
|
"rewards/check_numbers/mean": -0.375, |
|
"rewards/check_numbers/std": 0.670820415019989, |
|
"rewards/format_and_language_reward_func/mean": -3.25, |
|
"rewards/format_and_language_reward_func/std": 0.6831300854682922, |
|
"rewards/match_format_approximately/mean": -2.0, |
|
"rewards/match_format_approximately/std": 0.0, |
|
"rewards/match_format_exactly/mean": 0.0, |
|
"rewards/match_format_exactly/std": 0.0, |
|
"step": 53 |
|
}, |
|
{ |
|
"completion_length": 1617.8125, |
|
"completions/clipped_ratio": 0.75, |
|
"completions/max_length": 1722.0, |
|
"completions/max_terminated_length": 1689.0, |
|
"completions/mean_length": 1617.8125, |
|
"completions/mean_terminated_length": 1305.25, |
|
"completions/min_length": 897.0, |
|
"completions/min_terminated_length": 897.0, |
|
"epoch": 0.01697045883092395, |
|
"frac_reward_zero_std": 0.25, |
|
"grad_norm": 0.11597556620836258, |
|
"kl": 0.05314599350094795, |
|
"learning_rate": 2.6111111111111113e-06, |
|
"loss": 0.0021, |
|
"num_tokens": 1362965.0, |
|
"reward": -7.8125, |
|
"reward_std": 0.661700427532196, |
|
"rewards/check_answer/mean": -2.0, |
|
"rewards/check_answer/std": 0.0, |
|
"rewards/check_numbers/mean": -0.5625, |
|
"rewards/check_numbers/std": 0.75, |
|
"rewards/format_and_language_reward_func/mean": -3.25, |
|
"rewards/format_and_language_reward_func/std": 0.6831300854682922, |
|
"rewards/match_format_approximately/mean": -2.0, |
|
"rewards/match_format_approximately/std": 0.0, |
|
"rewards/match_format_exactly/mean": 0.0, |
|
"rewards/match_format_exactly/std": 0.0, |
|
"step": 54 |
|
}, |
|
{ |
|
"completion_length": 1612.125, |
|
"completions/clipped_ratio": 0.8125, |
|
"completions/max_length": 1722.0, |
|
"completions/max_terminated_length": 1567.0, |
|
"completions/mean_length": 1612.125, |
|
"completions/mean_terminated_length": 1136.0, |
|
"completions/min_length": 707.0, |
|
"completions/min_terminated_length": 707.0, |
|
"epoch": 0.017284726587052168, |
|
"frac_reward_zero_std": 0.75, |
|
"grad_norm": 0.07392556965351105, |
|
"kl": 0.04997054487466812, |
|
"learning_rate": 2.5555555555555557e-06, |
|
"loss": 0.002, |
|
"num_tokens": 1393151.0, |
|
"reward": -7.46875, |
|
"reward_std": 0.1875, |
|
"rewards/check_answer/mean": -2.0, |
|
"rewards/check_answer/std": 0.0, |
|
"rewards/check_numbers/mean": -0.46875, |
|
"rewards/check_numbers/std": 0.7180703282356262, |
|
"rewards/format_and_language_reward_func/mean": -3.0, |
|
"rewards/format_and_language_reward_func/std": 0.0, |
|
"rewards/match_format_approximately/mean": -2.0, |
|
"rewards/match_format_approximately/std": 0.0, |
|
"rewards/match_format_exactly/mean": 0.0, |
|
"rewards/match_format_exactly/std": 0.0, |
|
"step": 55 |
|
}, |
|
{ |
|
"completion_length": 1571.625, |
|
"completions/clipped_ratio": 0.8125, |
|
"completions/max_length": 1722.0, |
|
"completions/max_terminated_length": 1313.0, |
|
"completions/mean_length": 1571.625, |
|
"completions/mean_terminated_length": 920.0, |
|
"completions/min_length": 403.0, |
|
"completions/min_terminated_length": 403.0, |
|
"epoch": 0.01759899434318039, |
|
"frac_reward_zero_std": 0.25, |
|
"grad_norm": 0.10698790848255157, |
|
"kl": 0.040371235460042953, |
|
"learning_rate": 2.5e-06, |
|
"loss": 0.0016, |
|
"num_tokens": 1422181.0, |
|
"reward": -7.9375, |
|
"reward_std": 0.5915063619613647, |
|
"rewards/check_answer/mean": -2.0, |
|
"rewards/check_answer/std": 0.0, |
|
"rewards/check_numbers/mean": -0.9375, |
|
"rewards/check_numbers/std": 0.75, |
|
"rewards/format_and_language_reward_func/mean": -3.0, |
|
"rewards/format_and_language_reward_func/std": 0.0, |
|
"rewards/match_format_approximately/mean": -2.0, |
|
"rewards/match_format_approximately/std": 0.0, |
|
"rewards/match_format_exactly/mean": 0.0, |
|
"rewards/match_format_exactly/std": 0.0, |
|
"step": 56 |
|
}, |
|
{ |
|
"completion_length": 1594.9375, |
|
"completions/clipped_ratio": 0.875, |
|
"completions/max_length": 1722.0, |
|
"completions/max_terminated_length": 707.0, |
|
"completions/mean_length": 1594.9375, |
|
"completions/mean_terminated_length": 705.5, |
|
"completions/min_length": 704.0, |
|
"completions/min_terminated_length": 704.0, |
|
"epoch": 0.017913262099308613, |
|
"frac_reward_zero_std": 0.25, |
|
"grad_norm": 0.6847302317619324, |
|
"kl": 0.04597615823149681, |
|
"learning_rate": 2.4444444444444447e-06, |
|
"loss": 0.0018, |
|
"num_tokens": 1451644.0, |
|
"reward": -7.59375, |
|
"reward_std": 0.6205127239227295, |
|
"rewards/check_answer/mean": -2.0, |
|
"rewards/check_answer/std": 0.0, |
|
"rewards/check_numbers/mean": -0.46875, |
|
"rewards/check_numbers/std": 0.7180703282356262, |
|
"rewards/format_and_language_reward_func/mean": -3.125, |
|
"rewards/format_and_language_reward_func/std": 0.5, |
|
"rewards/match_format_approximately/mean": -2.0, |
|
"rewards/match_format_approximately/std": 0.0, |
|
"rewards/match_format_exactly/mean": 0.0, |
|
"rewards/match_format_exactly/std": 0.0, |
|
"step": 57 |
|
}, |
|
{ |
|
"completion_length": 1503.375, |
|
"completions/clipped_ratio": 0.8125, |
|
"completions/max_length": 1722.0, |
|
"completions/max_terminated_length": 640.0, |
|
"completions/mean_length": 1503.375, |
|
"completions/mean_terminated_length": 556.0, |
|
"completions/min_length": 475.0, |
|
"completions/min_terminated_length": 475.0, |
|
"epoch": 0.018227529855436832, |
|
"frac_reward_zero_std": 0.25, |
|
"grad_norm": 0.09949938207864761, |
|
"kl": 0.056429892778396606, |
|
"learning_rate": 2.388888888888889e-06, |
|
"loss": 0.0023, |
|
"num_tokens": 1479182.0, |
|
"reward": -7.96875, |
|
"reward_std": 0.6540063619613647, |
|
"rewards/check_answer/mean": -2.0, |
|
"rewards/check_answer/std": 0.0, |
|
"rewards/check_numbers/mean": -0.84375, |
|
"rewards/check_numbers/std": 0.7685213088989258, |
|
"rewards/format_and_language_reward_func/mean": -3.125, |
|
"rewards/format_and_language_reward_func/std": 0.5, |
|
"rewards/match_format_approximately/mean": -2.0, |
|
"rewards/match_format_approximately/std": 0.0, |
|
"rewards/match_format_exactly/mean": 0.0, |
|
"rewards/match_format_exactly/std": 0.0, |
|
"step": 58 |
|
}, |
|
{ |
|
"completion_length": 1563.0, |
|
"completions/clipped_ratio": 0.6875, |
|
"completions/max_length": 1722.0, |
|
"completions/max_terminated_length": 1411.0, |
|
"completions/mean_length": 1563.0, |
|
"completions/mean_terminated_length": 1213.2000732421875, |
|
"completions/min_length": 990.0, |
|
"completions/min_terminated_length": 990.0, |
|
"epoch": 0.018541797611565054, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.11543877422809601, |
|
"kl": 0.035633672028779984, |
|
"learning_rate": 2.3333333333333336e-06, |
|
"loss": 0.0014, |
|
"num_tokens": 1508062.0, |
|
"reward": -7.65625, |
|
"reward_std": 0.7790063619613647, |
|
"rewards/check_answer/mean": -2.0, |
|
"rewards/check_answer/std": 0.0, |
|
"rewards/check_numbers/mean": -0.65625, |
|
"rewards/check_numbers/std": 0.7685213088989258, |
|
"rewards/format_and_language_reward_func/mean": -3.0, |
|
"rewards/format_and_language_reward_func/std": 0.0, |
|
"rewards/match_format_approximately/mean": -2.0, |
|
"rewards/match_format_approximately/std": 0.0, |
|
"rewards/match_format_exactly/mean": 0.0, |
|
"rewards/match_format_exactly/std": 0.0, |
|
"step": 59 |
|
}, |
|
{ |
|
"completion_length": 1707.75, |
|
"completions/clipped_ratio": 0.8125, |
|
"completions/max_length": 1722.0, |
|
"completions/max_terminated_length": 1691.0, |
|
"completions/mean_length": 1707.75, |
|
"completions/mean_terminated_length": 1646.0, |
|
"completions/min_length": 1605.0, |
|
"completions/min_terminated_length": 1605.0, |
|
"epoch": 0.018856065367693273, |
|
"frac_reward_zero_std": 0.75, |
|
"grad_norm": 0.0639866441488266, |
|
"kl": 0.05725252255797386, |
|
"learning_rate": 2.277777777777778e-06, |
|
"loss": 0.0023, |
|
"num_tokens": 1539650.0, |
|
"reward": -7.25, |
|
"reward_std": 0.28867512941360474, |
|
"rewards/check_answer/mean": -2.0, |
|
"rewards/check_answer/std": 0.0, |
|
"rewards/check_numbers/mean": 0.0, |
|
"rewards/check_numbers/std": 0.0, |
|
"rewards/format_and_language_reward_func/mean": -3.25, |
|
"rewards/format_and_language_reward_func/std": 0.6831300854682922, |
|
"rewards/match_format_approximately/mean": -2.0, |
|
"rewards/match_format_approximately/std": 0.0, |
|
"rewards/match_format_exactly/mean": 0.0, |
|
"rewards/match_format_exactly/std": 0.0, |
|
"step": 60 |
|
}, |
|
{ |
|
"completion_length": 1535.1875, |
|
"completions/clipped_ratio": 0.75, |
|
"completions/max_length": 1722.0, |
|
"completions/max_terminated_length": 1665.0, |
|
"completions/mean_length": 1535.1875, |
|
"completions/mean_terminated_length": 974.75, |
|
"completions/min_length": 374.0, |
|
"completions/min_terminated_length": 374.0, |
|
"epoch": 0.019170333123821496, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.2154313325881958, |
|
"kl": 0.06772614270448685, |
|
"learning_rate": 2.222222222222222e-06, |
|
"loss": 0.0027, |
|
"num_tokens": 1567677.0, |
|
"reward": -8.09375, |
|
"reward_std": 0.9407067894935608, |
|
"rewards/check_answer/mean": -2.0, |
|
"rewards/check_answer/std": 0.0, |
|
"rewards/check_numbers/mean": -0.84375, |
|
"rewards/check_numbers/std": 0.7685213088989258, |
|
"rewards/format_and_language_reward_func/mean": -3.25, |
|
"rewards/format_and_language_reward_func/std": 0.6831300854682922, |
|
"rewards/match_format_approximately/mean": -2.0, |
|
"rewards/match_format_approximately/std": 0.0, |
|
"rewards/match_format_exactly/mean": 0.0, |
|
"rewards/match_format_exactly/std": 0.0, |
|
"step": 61 |
|
}, |
|
{ |
|
"completion_length": 1480.75, |
|
"completions/clipped_ratio": 0.75, |
|
"completions/max_length": 1722.0, |
|
"completions/max_terminated_length": 1193.0, |
|
"completions/mean_length": 1480.75, |
|
"completions/mean_terminated_length": 757.0, |
|
"completions/min_length": 323.0, |
|
"completions/min_terminated_length": 323.0, |
|
"epoch": 0.01948460087994972, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.18533650040626526, |
|
"kl": 0.04637778922915459, |
|
"learning_rate": 2.166666666666667e-06, |
|
"loss": 0.0019, |
|
"num_tokens": 1595629.0, |
|
"reward": -7.5625, |
|
"reward_std": 0.8080127239227295, |
|
"rewards/check_answer/mean": -2.0, |
|
"rewards/check_answer/std": 0.0, |
|
"rewards/check_numbers/mean": -0.5625, |
|
"rewards/check_numbers/std": 0.75, |
|
"rewards/format_and_language_reward_func/mean": -3.0, |
|
"rewards/format_and_language_reward_func/std": 0.0, |
|
"rewards/match_format_approximately/mean": -2.0, |
|
"rewards/match_format_approximately/std": 0.0, |
|
"rewards/match_format_exactly/mean": 0.0, |
|
"rewards/match_format_exactly/std": 0.0, |
|
"step": 62 |
|
}, |
|
{ |
|
"completion_length": 1604.8125, |
|
"completions/clipped_ratio": 0.8125, |
|
"completions/max_length": 1722.0, |
|
"completions/max_terminated_length": 1672.0, |
|
"completions/mean_length": 1604.8125, |
|
"completions/mean_terminated_length": 1097.0, |
|
"completions/min_length": 560.0, |
|
"completions/min_terminated_length": 560.0, |
|
"epoch": 0.019798868636077938, |
|
"frac_reward_zero_std": 0.25, |
|
"grad_norm": 0.17045044898986816, |
|
"kl": 0.05853426456451416, |
|
"learning_rate": 2.1111111111111114e-06, |
|
"loss": 0.0023, |
|
"num_tokens": 1624594.0, |
|
"reward": -8.40625, |
|
"reward_std": 0.7628755569458008, |
|
"rewards/check_answer/mean": -2.0, |
|
"rewards/check_answer/std": 0.0, |
|
"rewards/check_numbers/mean": -1.03125, |
|
"rewards/check_numbers/std": 0.7180703282356262, |
|
"rewards/format_and_language_reward_func/mean": -3.375, |
|
"rewards/format_and_language_reward_func/std": 0.8062257766723633, |
|
"rewards/match_format_approximately/mean": -2.0, |
|
"rewards/match_format_approximately/std": 0.0, |
|
"rewards/match_format_exactly/mean": 0.0, |
|
"rewards/match_format_exactly/std": 0.0, |
|
"step": 63 |
|
}, |
|
{ |
|
"completion_length": 1406.9375, |
|
"completions/clipped_ratio": 0.625, |
|
"completions/max_length": 1722.0, |
|
"completions/max_terminated_length": 1612.0, |
|
"completions/mean_length": 1406.9375, |
|
"completions/mean_terminated_length": 881.8333740234375, |
|
"completions/min_length": 494.0, |
|
"completions/min_terminated_length": 494.0, |
|
"epoch": 0.02011313639220616, |
|
"frac_reward_zero_std": 0.25, |
|
"grad_norm": 0.13805745542049408, |
|
"kl": 0.045212242752313614, |
|
"learning_rate": 2.0555555555555555e-06, |
|
"loss": 0.0018, |
|
"num_tokens": 1650865.0, |
|
"reward": -8.15625, |
|
"reward_std": 0.4955126941204071, |
|
"rewards/check_answer/mean": -2.0, |
|
"rewards/check_answer/std": 0.0, |
|
"rewards/check_numbers/mean": -1.03125, |
|
"rewards/check_numbers/std": 0.7180703282356262, |
|
"rewards/format_and_language_reward_func/mean": -3.125, |
|
"rewards/format_and_language_reward_func/std": 0.5, |
|
"rewards/match_format_approximately/mean": -2.0, |
|
"rewards/match_format_approximately/std": 0.0, |
|
"rewards/match_format_exactly/mean": 0.0, |
|
"rewards/match_format_exactly/std": 0.0, |
|
"step": 64 |
|
}, |
|
{ |
|
"completion_length": 1511.0625, |
|
"completions/clipped_ratio": 0.625, |
|
"completions/max_length": 1722.0, |
|
"completions/max_terminated_length": 1656.0, |
|
"completions/mean_length": 1511.0625, |
|
"completions/mean_terminated_length": 1159.5, |
|
"completions/min_length": 661.0, |
|
"completions/min_terminated_length": 661.0, |
|
"epoch": 0.02042740414833438, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.10470445454120636, |
|
"kl": 0.040612928569316864, |
|
"learning_rate": 2.0000000000000003e-06, |
|
"loss": 0.0016, |
|
"num_tokens": 1679182.0, |
|
"reward": -8.0, |
|
"reward_std": 0.9917292594909668, |
|
"rewards/check_answer/mean": -2.0, |
|
"rewards/check_answer/std": 0.0, |
|
"rewards/check_numbers/mean": -0.75, |
|
"rewards/check_numbers/std": 0.7745966911315918, |
|
"rewards/format_and_language_reward_func/mean": -3.25, |
|
"rewards/format_and_language_reward_func/std": 0.6831300854682922, |
|
"rewards/match_format_approximately/mean": -2.0, |
|
"rewards/match_format_approximately/std": 0.0, |
|
"rewards/match_format_exactly/mean": 0.0, |
|
"rewards/match_format_exactly/std": 0.0, |
|
"step": 65 |
|
}, |
|
{ |
|
"completion_length": 1688.3125, |
|
"completions/clipped_ratio": 0.875, |
|
"completions/max_length": 1722.0, |
|
"completions/max_terminated_length": 1570.0, |
|
"completions/mean_length": 1688.3125, |
|
"completions/mean_terminated_length": 1452.5, |
|
"completions/min_length": 1335.0, |
|
"completions/min_terminated_length": 1335.0, |
|
"epoch": 0.020741671904462602, |
|
"frac_reward_zero_std": 0.25, |
|
"grad_norm": 0.0923866257071495, |
|
"kl": 0.04577171802520752, |
|
"learning_rate": 1.944444444444445e-06, |
|
"loss": 0.0018, |
|
"num_tokens": 1710331.0, |
|
"reward": -6.875, |
|
"reward_std": 0.9716878533363342, |
|
"rewards/check_answer/mean": -2.0, |
|
"rewards/check_answer/std": 0.0, |
|
"rewards/check_numbers/mean": 0.25, |
|
"rewards/check_numbers/std": 1.3662601709365845, |
|
"rewards/format_and_language_reward_func/mean": -3.125, |
|
"rewards/format_and_language_reward_func/std": 0.5, |
|
"rewards/match_format_approximately/mean": -2.0, |
|
"rewards/match_format_approximately/std": 0.0, |
|
"rewards/match_format_exactly/mean": 0.0, |
|
"rewards/match_format_exactly/std": 0.0, |
|
"step": 66 |
|
}, |
|
{ |
|
"completion_length": 1652.3125, |
|
"completions/clipped_ratio": 0.9375, |
|
"completions/max_length": 1722.0, |
|
"completions/max_terminated_length": 607.0, |
|
"completions/mean_length": 1652.3125, |
|
"completions/mean_terminated_length": 607.0, |
|
"completions/min_length": 607.0, |
|
"completions/min_terminated_length": 607.0, |
|
"epoch": 0.021055939660590824, |
|
"frac_reward_zero_std": 0.25, |
|
"grad_norm": 0.13661737740039825, |
|
"kl": 0.0424213632941246, |
|
"learning_rate": 1.888888888888889e-06, |
|
"loss": 0.0017, |
|
"num_tokens": 1741012.0, |
|
"reward": -7.71875, |
|
"reward_std": 0.8185844421386719, |
|
"rewards/check_answer/mean": -2.0, |
|
"rewards/check_answer/std": 0.0, |
|
"rewards/check_numbers/mean": -0.46875, |
|
"rewards/check_numbers/std": 0.7180703282356262, |
|
"rewards/format_and_language_reward_func/mean": -3.25, |
|
"rewards/format_and_language_reward_func/std": 0.6831300854682922, |
|
"rewards/match_format_approximately/mean": -2.0, |
|
"rewards/match_format_approximately/std": 0.0, |
|
"rewards/match_format_exactly/mean": 0.0, |
|
"rewards/match_format_exactly/std": 0.0, |
|
"step": 67 |
|
}, |
|
{ |
|
"completion_length": 1409.125, |
|
"completions/clipped_ratio": 0.625, |
|
"completions/max_length": 1722.0, |
|
"completions/max_terminated_length": 1383.0, |
|
"completions/mean_length": 1409.125, |
|
"completions/mean_terminated_length": 887.6666870117188, |
|
"completions/min_length": 636.0, |
|
"completions/min_terminated_length": 636.0, |
|
"epoch": 0.021370207416719043, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.15749689936637878, |
|
"kl": 0.0645652636885643, |
|
"learning_rate": 1.8333333333333333e-06, |
|
"loss": 0.0026, |
|
"num_tokens": 1767746.0, |
|
"reward": -8.125, |
|
"reward_std": 1.5154354572296143, |
|
"rewards/check_answer/mean": -2.0, |
|
"rewards/check_answer/std": 0.0, |
|
"rewards/check_numbers/mean": -0.375, |
|
"rewards/check_numbers/std": 0.670820415019989, |
|
"rewards/format_and_language_reward_func/mean": -3.75, |
|
"rewards/format_and_language_reward_func/std": 1.0, |
|
"rewards/match_format_approximately/mean": -2.0, |
|
"rewards/match_format_approximately/std": 0.0, |
|
"rewards/match_format_exactly/mean": 0.0, |
|
"rewards/match_format_exactly/std": 0.0, |
|
"step": 68 |
|
}, |
|
{ |
|
"completion_length": 1533.5, |
|
"completions/clipped_ratio": 0.6875, |
|
"completions/max_length": 1722.0, |
|
"completions/max_terminated_length": 1719.0, |
|
"completions/mean_length": 1533.5, |
|
"completions/mean_terminated_length": 1118.800048828125, |
|
"completions/min_length": 381.0, |
|
"completions/min_terminated_length": 381.0, |
|
"epoch": 0.021684475172847266, |
|
"frac_reward_zero_std": 0.25, |
|
"grad_norm": 0.10066410154104233, |
|
"kl": 0.0493154339492321, |
|
"learning_rate": 1.777777777777778e-06, |
|
"loss": 0.002, |
|
"num_tokens": 1795630.0, |
|
"reward": -8.09375, |
|
"reward_std": 0.7028881907463074, |
|
"rewards/check_answer/mean": -2.0, |
|
"rewards/check_answer/std": 0.0, |
|
"rewards/check_numbers/mean": -0.84375, |
|
"rewards/check_numbers/std": 0.7685213088989258, |
|
"rewards/format_and_language_reward_func/mean": -3.25, |
|
"rewards/format_and_language_reward_func/std": 0.6831300854682922, |
|
"rewards/match_format_approximately/mean": -2.0, |
|
"rewards/match_format_approximately/std": 0.0, |
|
"rewards/match_format_exactly/mean": 0.0, |
|
"rewards/match_format_exactly/std": 0.0, |
|
"step": 69 |
|
}, |
|
{ |
|
"completion_length": 1512.1875, |
|
"completions/clipped_ratio": 0.75, |
|
"completions/max_length": 1722.0, |
|
"completions/max_terminated_length": 1365.0, |
|
"completions/mean_length": 1512.1875, |
|
"completions/mean_terminated_length": 882.75, |
|
"completions/min_length": 438.0, |
|
"completions/min_terminated_length": 438.0, |
|
"epoch": 0.02199874292897549, |
|
"frac_reward_zero_std": 0.5, |
|
"grad_norm": 0.08896202594041824, |
|
"kl": 0.054354239255189896, |
|
"learning_rate": 1.7222222222222224e-06, |
|
"loss": 0.0022, |
|
"num_tokens": 1823761.0, |
|
"reward": -7.6875, |
|
"reward_std": 0.46650636196136475, |
|
"rewards/check_answer/mean": -2.0, |
|
"rewards/check_answer/std": 0.0, |
|
"rewards/check_numbers/mean": -0.5625, |
|
"rewards/check_numbers/std": 0.75, |
|
"rewards/format_and_language_reward_func/mean": -3.125, |
|
"rewards/format_and_language_reward_func/std": 0.5, |
|
"rewards/match_format_approximately/mean": -2.0, |
|
"rewards/match_format_approximately/std": 0.0, |
|
"rewards/match_format_exactly/mean": 0.0, |
|
"rewards/match_format_exactly/std": 0.0, |
|
"step": 70 |
|
}, |
|
{ |
|
"completion_length": 1664.4375, |
|
"completions/clipped_ratio": 0.875, |
|
"completions/max_length": 1722.0, |
|
"completions/max_terminated_length": 1470.0, |
|
"completions/mean_length": 1664.4375, |
|
"completions/mean_terminated_length": 1261.5, |
|
"completions/min_length": 1053.0, |
|
"completions/min_terminated_length": 1053.0, |
|
"epoch": 0.022313010685103708, |
|
"frac_reward_zero_std": 0.5, |
|
"grad_norm": 0.0697084367275238, |
|
"kl": 0.031285081058740616, |
|
"learning_rate": 1.6666666666666667e-06, |
|
"loss": 0.0013, |
|
"num_tokens": 1854820.0, |
|
"reward": -7.5625, |
|
"reward_std": 0.375, |
|
"rewards/check_answer/mean": -2.0, |
|
"rewards/check_answer/std": 0.0, |
|
"rewards/check_numbers/mean": -0.5625, |
|
"rewards/check_numbers/std": 0.75, |
|
"rewards/format_and_language_reward_func/mean": -3.0, |
|
"rewards/format_and_language_reward_func/std": 0.0, |
|
"rewards/match_format_approximately/mean": -2.0, |
|
"rewards/match_format_approximately/std": 0.0, |
|
"rewards/match_format_exactly/mean": 0.0, |
|
"rewards/match_format_exactly/std": 0.0, |
|
"step": 71 |
|
}, |
|
{ |
|
"completion_length": 1451.5, |
|
"completions/clipped_ratio": 0.625, |
|
"completions/max_length": 1722.0, |
|
"completions/max_terminated_length": 1602.0, |
|
"completions/mean_length": 1451.5, |
|
"completions/mean_terminated_length": 1000.6666870117188, |
|
"completions/min_length": 462.0, |
|
"completions/min_terminated_length": 462.0, |
|
"epoch": 0.02262727844123193, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.1362772285938263, |
|
"kl": 0.057278823107481, |
|
"learning_rate": 1.6111111111111113e-06, |
|
"loss": 0.0023, |
|
"num_tokens": 1882008.0, |
|
"reward": -7.875, |
|
"reward_std": 1.118110179901123, |
|
"rewards/check_answer/mean": -2.0, |
|
"rewards/check_answer/std": 0.0, |
|
"rewards/check_numbers/mean": -0.375, |
|
"rewards/check_numbers/std": 0.670820415019989, |
|
"rewards/format_and_language_reward_func/mean": -3.5, |
|
"rewards/format_and_language_reward_func/std": 0.8944272398948669, |
|
"rewards/match_format_approximately/mean": -2.0, |
|
"rewards/match_format_approximately/std": 0.0, |
|
"rewards/match_format_exactly/mean": 0.0, |
|
"rewards/match_format_exactly/std": 0.0, |
|
"step": 72 |
|
}, |
|
{ |
|
"completion_length": 1348.5, |
|
"completions/clipped_ratio": 0.5, |
|
"completions/max_length": 1722.0, |
|
"completions/max_terminated_length": 1501.0, |
|
"completions/mean_length": 1348.5, |
|
"completions/mean_terminated_length": 975.0, |
|
"completions/min_length": 235.0, |
|
"completions/min_terminated_length": 235.0, |
|
"epoch": 0.02294154619736015, |
|
"frac_reward_zero_std": 0.25, |
|
"grad_norm": 0.11800263077020645, |
|
"kl": 0.05172204598784447, |
|
"learning_rate": 1.5555555555555558e-06, |
|
"loss": 0.0021, |
|
"num_tokens": 1907992.0, |
|
"reward": -7.78125, |
|
"reward_std": 0.4375, |
|
"rewards/check_answer/mean": -2.0, |
|
"rewards/check_answer/std": 0.0, |
|
"rewards/check_numbers/mean": -0.65625, |
|
"rewards/check_numbers/std": 0.7685213088989258, |
|
"rewards/format_and_language_reward_func/mean": -3.125, |
|
"rewards/format_and_language_reward_func/std": 0.5, |
|
"rewards/match_format_approximately/mean": -2.0, |
|
"rewards/match_format_approximately/std": 0.0, |
|
"rewards/match_format_exactly/mean": 0.0, |
|
"rewards/match_format_exactly/std": 0.0, |
|
"step": 73 |
|
}, |
|
{ |
|
"completion_length": 1543.5625, |
|
"completions/clipped_ratio": 0.625, |
|
"completions/max_length": 1722.0, |
|
"completions/max_terminated_length": 1672.0, |
|
"completions/mean_length": 1543.5625, |
|
"completions/mean_terminated_length": 1246.166748046875, |
|
"completions/min_length": 603.0, |
|
"completions/min_terminated_length": 603.0, |
|
"epoch": 0.023255813953488372, |
|
"frac_reward_zero_std": 0.25, |
|
"grad_norm": 0.12982743978500366, |
|
"kl": 0.064728744328022, |
|
"learning_rate": 1.5e-06, |
|
"loss": 0.0026, |
|
"num_tokens": 1936533.0, |
|
"reward": -7.53125, |
|
"reward_std": 0.6926814913749695, |
|
"rewards/check_answer/mean": -2.0, |
|
"rewards/check_answer/std": 0.0, |
|
"rewards/check_numbers/mean": -0.28125, |
|
"rewards/check_numbers/std": 0.6046693325042725, |
|
"rewards/format_and_language_reward_func/mean": -3.25, |
|
"rewards/format_and_language_reward_func/std": 0.6831300854682922, |
|
"rewards/match_format_approximately/mean": -2.0, |
|
"rewards/match_format_approximately/std": 0.0, |
|
"rewards/match_format_exactly/mean": 0.0, |
|
"rewards/match_format_exactly/std": 0.0, |
|
"step": 74 |
|
}, |
|
{ |
|
"completion_length": 1450.4375, |
|
"completions/clipped_ratio": 0.75, |
|
"completions/max_length": 1722.0, |
|
"completions/max_terminated_length": 1525.0, |
|
"completions/mean_length": 1450.4375, |
|
"completions/mean_terminated_length": 635.75, |
|
"completions/min_length": 71.0, |
|
"completions/min_terminated_length": 71.0, |
|
"epoch": 0.023570081709616594, |
|
"frac_reward_zero_std": 0.5, |
|
"grad_norm": 0.081117182970047, |
|
"kl": 0.05186208337545395, |
|
"learning_rate": 1.4444444444444445e-06, |
|
"loss": 0.0021, |
|
"num_tokens": 1963812.0, |
|
"reward": -7.75, |
|
"reward_std": 0.5386751294136047, |
|
"rewards/check_answer/mean": -2.0, |
|
"rewards/check_answer/std": 0.0, |
|
"rewards/check_numbers/mean": -0.375, |
|
"rewards/check_numbers/std": 0.670820415019989, |
|
"rewards/format_and_language_reward_func/mean": -3.375, |
|
"rewards/format_and_language_reward_func/std": 0.8062257766723633, |
|
"rewards/match_format_approximately/mean": -2.0, |
|
"rewards/match_format_approximately/std": 0.0, |
|
"rewards/match_format_exactly/mean": 0.0, |
|
"rewards/match_format_exactly/std": 0.0, |
|
"step": 75 |
|
}, |
|
{ |
|
"completion_length": 1392.25, |
|
"completions/clipped_ratio": 0.6875, |
|
"completions/max_length": 1722.0, |
|
"completions/max_terminated_length": 1499.0, |
|
"completions/mean_length": 1392.25, |
|
"completions/mean_terminated_length": 666.7999877929688, |
|
"completions/min_length": 31.0, |
|
"completions/min_terminated_length": 31.0, |
|
"epoch": 0.023884349465744813, |
|
"frac_reward_zero_std": 0.25, |
|
"grad_norm": 0.37952759861946106, |
|
"kl": 0.06186853349208832, |
|
"learning_rate": 1.3888888888888892e-06, |
|
"loss": 0.0025, |
|
"num_tokens": 1990520.0, |
|
"reward": -7.65625, |
|
"reward_std": 0.6205127239227295, |
|
"rewards/check_answer/mean": -2.0, |
|
"rewards/check_answer/std": 0.0, |
|
"rewards/check_numbers/mean": -0.65625, |
|
"rewards/check_numbers/std": 0.7685213088989258, |
|
"rewards/format_and_language_reward_func/mean": -3.0, |
|
"rewards/format_and_language_reward_func/std": 0.0, |
|
"rewards/match_format_approximately/mean": -2.0, |
|
"rewards/match_format_approximately/std": 0.0, |
|
"rewards/match_format_exactly/mean": 0.0, |
|
"rewards/match_format_exactly/std": 0.0, |
|
"step": 76 |
|
}, |
|
{ |
|
"completion_length": 1473.9375, |
|
"completions/clipped_ratio": 0.6875, |
|
"completions/max_length": 1722.0, |
|
"completions/max_terminated_length": 1490.0, |
|
"completions/mean_length": 1473.9375, |
|
"completions/mean_terminated_length": 928.2000122070312, |
|
"completions/min_length": 420.0, |
|
"completions/min_terminated_length": 420.0, |
|
"epoch": 0.024198617221873036, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.1677270531654358, |
|
"kl": 0.053330324590206146, |
|
"learning_rate": 1.3333333333333334e-06, |
|
"loss": 0.0021, |
|
"num_tokens": 2018063.0, |
|
"reward": -7.75, |
|
"reward_std": 0.75, |
|
"rewards/check_answer/mean": -2.0, |
|
"rewards/check_answer/std": 0.0, |
|
"rewards/check_numbers/mean": -0.75, |
|
"rewards/check_numbers/std": 0.7745966911315918, |
|
"rewards/format_and_language_reward_func/mean": -3.0, |
|
"rewards/format_and_language_reward_func/std": 0.0, |
|
"rewards/match_format_approximately/mean": -2.0, |
|
"rewards/match_format_approximately/std": 0.0, |
|
"rewards/match_format_exactly/mean": 0.0, |
|
"rewards/match_format_exactly/std": 0.0, |
|
"step": 77 |
|
}, |
|
{ |
|
"completion_length": 1513.625, |
|
"completions/clipped_ratio": 0.6875, |
|
"completions/max_length": 1722.0, |
|
"completions/max_terminated_length": 1547.0, |
|
"completions/mean_length": 1513.625, |
|
"completions/mean_terminated_length": 1055.2000732421875, |
|
"completions/min_length": 599.0, |
|
"completions/min_terminated_length": 599.0, |
|
"epoch": 0.02451288497800126, |
|
"frac_reward_zero_std": 0.25, |
|
"grad_norm": 0.1181650385260582, |
|
"kl": 0.05255034193396568, |
|
"learning_rate": 1.2777777777777779e-06, |
|
"loss": 0.0021, |
|
"num_tokens": 2046037.0, |
|
"reward": -7.65625, |
|
"reward_std": 0.6926814913749695, |
|
"rewards/check_answer/mean": -2.0, |
|
"rewards/check_answer/std": 0.0, |
|
"rewards/check_numbers/mean": -0.28125, |
|
"rewards/check_numbers/std": 0.6046693325042725, |
|
"rewards/format_and_language_reward_func/mean": -3.375, |
|
"rewards/format_and_language_reward_func/std": 0.8062257766723633, |
|
"rewards/match_format_approximately/mean": -2.0, |
|
"rewards/match_format_approximately/std": 0.0, |
|
"rewards/match_format_exactly/mean": 0.0, |
|
"rewards/match_format_exactly/std": 0.0, |
|
"step": 78 |
|
}, |
|
{ |
|
"completion_length": 1411.25, |
|
"completions/clipped_ratio": 0.5625, |
|
"completions/max_length": 1722.0, |
|
"completions/max_terminated_length": 1426.0, |
|
"completions/mean_length": 1411.25, |
|
"completions/mean_terminated_length": 1011.71435546875, |
|
"completions/min_length": 793.0, |
|
"completions/min_terminated_length": 793.0, |
|
"epoch": 0.024827152734129478, |
|
"frac_reward_zero_std": 0.25, |
|
"grad_norm": 0.1206207275390625, |
|
"kl": 0.05586665868759155, |
|
"learning_rate": 1.2222222222222223e-06, |
|
"loss": 0.0022, |
|
"num_tokens": 2072805.0, |
|
"reward": -7.78125, |
|
"reward_std": 0.8977102637290955, |
|
"rewards/check_answer/mean": -2.0, |
|
"rewards/check_answer/std": 0.0, |
|
"rewards/check_numbers/mean": -0.28125, |
|
"rewards/check_numbers/std": 0.6046693325042725, |
|
"rewards/format_and_language_reward_func/mean": -3.5, |
|
"rewards/format_and_language_reward_func/std": 0.8944272398948669, |
|
"rewards/match_format_approximately/mean": -2.0, |
|
"rewards/match_format_approximately/std": 0.0, |
|
"rewards/match_format_exactly/mean": 0.0, |
|
"rewards/match_format_exactly/std": 0.0, |
|
"step": 79 |
|
}, |
|
{ |
|
"completion_length": 1497.875, |
|
"completions/clipped_ratio": 0.75, |
|
"completions/max_length": 1722.0, |
|
"completions/max_terminated_length": 1400.0, |
|
"completions/mean_length": 1497.875, |
|
"completions/mean_terminated_length": 825.5, |
|
"completions/min_length": 494.0, |
|
"completions/min_terminated_length": 494.0, |
|
"epoch": 0.0251414204902577, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.15405312180519104, |
|
"kl": 0.05234729126095772, |
|
"learning_rate": 1.1666666666666668e-06, |
|
"loss": 0.0021, |
|
"num_tokens": 2100631.0, |
|
"reward": -7.5, |
|
"reward_std": 1.9904643297195435, |
|
"rewards/check_answer/mean": -2.0, |
|
"rewards/check_answer/std": 0.0, |
|
"rewards/check_numbers/mean": -0.125, |
|
"rewards/check_numbers/std": 1.5864006280899048, |
|
"rewards/format_and_language_reward_func/mean": -3.375, |
|
"rewards/format_and_language_reward_func/std": 0.8062257766723633, |
|
"rewards/match_format_approximately/mean": -2.0, |
|
"rewards/match_format_approximately/std": 0.0, |
|
"rewards/match_format_exactly/mean": 0.0, |
|
"rewards/match_format_exactly/std": 0.0, |
|
"step": 80 |
|
}, |
|
{ |
|
"completion_length": 1425.0, |
|
"completions/clipped_ratio": 0.625, |
|
"completions/max_length": 1722.0, |
|
"completions/max_terminated_length": 1576.0, |
|
"completions/mean_length": 1425.0, |
|
"completions/mean_terminated_length": 930.0, |
|
"completions/min_length": 79.0, |
|
"completions/min_terminated_length": 79.0, |
|
"epoch": 0.02545568824638592, |
|
"frac_reward_zero_std": 0.25, |
|
"grad_norm": 0.11791858822107315, |
|
"kl": 0.04198841378092766, |
|
"learning_rate": 1.111111111111111e-06, |
|
"loss": 0.0017, |
|
"num_tokens": 2127275.0, |
|
"reward": -7.46875, |
|
"reward_std": 0.6205127239227295, |
|
"rewards/check_answer/mean": -2.0, |
|
"rewards/check_answer/std": 0.0, |
|
"rewards/check_numbers/mean": -0.46875, |
|
"rewards/check_numbers/std": 0.7180703282356262, |
|
"rewards/format_and_language_reward_func/mean": -3.0, |
|
"rewards/format_and_language_reward_func/std": 0.0, |
|
"rewards/match_format_approximately/mean": -2.0, |
|
"rewards/match_format_approximately/std": 0.0, |
|
"rewards/match_format_exactly/mean": 0.0, |
|
"rewards/match_format_exactly/std": 0.0, |
|
"step": 81 |
|
}, |
|
{ |
|
"completion_length": 1682.5, |
|
"completions/clipped_ratio": 0.8125, |
|
"completions/max_length": 1722.0, |
|
"completions/max_terminated_length": 1662.0, |
|
"completions/mean_length": 1682.5, |
|
"completions/mean_terminated_length": 1511.3333740234375, |
|
"completions/min_length": 1430.0, |
|
"completions/min_terminated_length": 1430.0, |
|
"epoch": 0.025769956002514142, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.1258944272994995, |
|
"kl": 0.03991963341832161, |
|
"learning_rate": 1.0555555555555557e-06, |
|
"loss": 0.0016, |
|
"num_tokens": 2157831.0, |
|
"reward": -7.96875, |
|
"reward_std": 0.8125, |
|
"rewards/check_answer/mean": -2.0, |
|
"rewards/check_answer/std": 0.0, |
|
"rewards/check_numbers/mean": -0.84375, |
|
"rewards/check_numbers/std": 0.7685213088989258, |
|
"rewards/format_and_language_reward_func/mean": -3.125, |
|
"rewards/format_and_language_reward_func/std": 0.5, |
|
"rewards/match_format_approximately/mean": -2.0, |
|
"rewards/match_format_approximately/std": 0.0, |
|
"rewards/match_format_exactly/mean": 0.0, |
|
"rewards/match_format_exactly/std": 0.0, |
|
"step": 82 |
|
}, |
|
{ |
|
"completion_length": 1560.0, |
|
"completions/clipped_ratio": 0.8125, |
|
"completions/max_length": 1722.0, |
|
"completions/max_terminated_length": 1502.0, |
|
"completions/mean_length": 1560.0, |
|
"completions/mean_terminated_length": 858.0, |
|
"completions/min_length": 199.0, |
|
"completions/min_terminated_length": 199.0, |
|
"epoch": 0.026084223758642364, |
|
"frac_reward_zero_std": 0.25, |
|
"grad_norm": 0.25008276104927063, |
|
"kl": 0.04483529552817345, |
|
"learning_rate": 1.0000000000000002e-06, |
|
"loss": 0.0018, |
|
"num_tokens": 2186663.0, |
|
"reward": -7.5625, |
|
"reward_std": 0.5915063619613647, |
|
"rewards/check_answer/mean": -2.0, |
|
"rewards/check_answer/std": 0.0, |
|
"rewards/check_numbers/mean": -0.5625, |
|
"rewards/check_numbers/std": 0.75, |
|
"rewards/format_and_language_reward_func/mean": -3.0, |
|
"rewards/format_and_language_reward_func/std": 0.0, |
|
"rewards/match_format_approximately/mean": -2.0, |
|
"rewards/match_format_approximately/std": 0.0, |
|
"rewards/match_format_exactly/mean": 0.0, |
|
"rewards/match_format_exactly/std": 0.0, |
|
"step": 83 |
|
}, |
|
{ |
|
"completion_length": 1363.8125, |
|
"completions/clipped_ratio": 0.625, |
|
"completions/max_length": 1722.0, |
|
"completions/max_terminated_length": 890.0, |
|
"completions/mean_length": 1363.8125, |
|
"completions/mean_terminated_length": 766.8333740234375, |
|
"completions/min_length": 623.0, |
|
"completions/min_terminated_length": 623.0, |
|
"epoch": 0.026398491514770583, |
|
"frac_reward_zero_std": 0.25, |
|
"grad_norm": 0.11882040649652481, |
|
"kl": 0.04816675931215286, |
|
"learning_rate": 9.444444444444445e-07, |
|
"loss": 0.0019, |
|
"num_tokens": 2212692.0, |
|
"reward": -7.78125, |
|
"reward_std": 0.6540063619613647, |
|
"rewards/check_answer/mean": -2.0, |
|
"rewards/check_answer/std": 0.0, |
|
"rewards/check_numbers/mean": -0.65625, |
|
"rewards/check_numbers/std": 0.7685213088989258, |
|
"rewards/format_and_language_reward_func/mean": -3.125, |
|
"rewards/format_and_language_reward_func/std": 0.5, |
|
"rewards/match_format_approximately/mean": -2.0, |
|
"rewards/match_format_approximately/std": 0.0, |
|
"rewards/match_format_exactly/mean": 0.0, |
|
"rewards/match_format_exactly/std": 0.0, |
|
"step": 84 |
|
}, |
|
{ |
|
"completion_length": 1652.3125, |
|
"completions/clipped_ratio": 0.8125, |
|
"completions/max_length": 1722.0, |
|
"completions/max_terminated_length": 1646.0, |
|
"completions/mean_length": 1652.3125, |
|
"completions/mean_terminated_length": 1350.3333740234375, |
|
"completions/min_length": 1045.0, |
|
"completions/min_terminated_length": 1045.0, |
|
"epoch": 0.026712759270898806, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.11614203453063965, |
|
"kl": 0.0454854890704155, |
|
"learning_rate": 8.88888888888889e-07, |
|
"loss": 0.0018, |
|
"num_tokens": 2243069.0, |
|
"reward": -8.0, |
|
"reward_std": 0.9117004871368408, |
|
"rewards/check_answer/mean": -2.0, |
|
"rewards/check_answer/std": 0.0, |
|
"rewards/check_numbers/mean": -0.75, |
|
"rewards/check_numbers/std": 0.7745966911315918, |
|
"rewards/format_and_language_reward_func/mean": -3.25, |
|
"rewards/format_and_language_reward_func/std": 0.6831300854682922, |
|
"rewards/match_format_approximately/mean": -2.0, |
|
"rewards/match_format_approximately/std": 0.0, |
|
"rewards/match_format_exactly/mean": 0.0, |
|
"rewards/match_format_exactly/std": 0.0, |
|
"step": 85 |
|
}, |
|
{ |
|
"completion_length": 1576.625, |
|
"completions/clipped_ratio": 0.75, |
|
"completions/max_length": 1722.0, |
|
"completions/max_terminated_length": 1560.0, |
|
"completions/mean_length": 1576.625, |
|
"completions/mean_terminated_length": 1140.5, |
|
"completions/min_length": 801.0, |
|
"completions/min_terminated_length": 801.0, |
|
"epoch": 0.02702702702702703, |
|
"frac_reward_zero_std": 0.25, |
|
"grad_norm": 0.09951747953891754, |
|
"kl": 0.04343428835272789, |
|
"learning_rate": 8.333333333333333e-07, |
|
"loss": 0.0017, |
|
"num_tokens": 2272123.0, |
|
"reward": -7.4375, |
|
"reward_std": 0.9503755569458008, |
|
"rewards/check_answer/mean": -2.0, |
|
"rewards/check_answer/std": 0.0, |
|
"rewards/check_numbers/mean": -0.3125, |
|
"rewards/check_numbers/std": 1.6520190238952637, |
|
"rewards/format_and_language_reward_func/mean": -3.125, |
|
"rewards/format_and_language_reward_func/std": 0.5, |
|
"rewards/match_format_approximately/mean": -2.0, |
|
"rewards/match_format_approximately/std": 0.0, |
|
"rewards/match_format_exactly/mean": 0.0, |
|
"rewards/match_format_exactly/std": 0.0, |
|
"step": 86 |
|
}, |
|
{ |
|
"completion_length": 1552.3125, |
|
"completions/clipped_ratio": 0.8125, |
|
"completions/max_length": 1722.0, |
|
"completions/max_terminated_length": 1073.0, |
|
"completions/mean_length": 1552.3125, |
|
"completions/mean_terminated_length": 817.0, |
|
"completions/min_length": 665.0, |
|
"completions/min_terminated_length": 665.0, |
|
"epoch": 0.027341294783155248, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.1279648393392563, |
|
"kl": 0.04489857330918312, |
|
"learning_rate": 7.777777777777779e-07, |
|
"loss": 0.0018, |
|
"num_tokens": 2300748.0, |
|
"reward": -7.96875, |
|
"reward_std": 0.9505414962768555, |
|
"rewards/check_answer/mean": -2.0, |
|
"rewards/check_answer/std": 0.0, |
|
"rewards/check_numbers/mean": -0.84375, |
|
"rewards/check_numbers/std": 0.7685213088989258, |
|
"rewards/format_and_language_reward_func/mean": -3.125, |
|
"rewards/format_and_language_reward_func/std": 0.5, |
|
"rewards/match_format_approximately/mean": -2.0, |
|
"rewards/match_format_approximately/std": 0.0, |
|
"rewards/match_format_exactly/mean": 0.0, |
|
"rewards/match_format_exactly/std": 0.0, |
|
"step": 87 |
|
}, |
|
{ |
|
"completion_length": 1594.375, |
|
"completions/clipped_ratio": 0.8125, |
|
"completions/max_length": 1722.0, |
|
"completions/max_terminated_length": 1438.0, |
|
"completions/mean_length": 1594.375, |
|
"completions/mean_terminated_length": 1041.3333740234375, |
|
"completions/min_length": 780.0, |
|
"completions/min_terminated_length": 780.0, |
|
"epoch": 0.02765556253928347, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.18847358226776123, |
|
"kl": 0.03859600052237511, |
|
"learning_rate": 7.222222222222222e-07, |
|
"loss": 0.0015, |
|
"num_tokens": 2330330.0, |
|
"reward": -7.96875, |
|
"reward_std": 0.8125, |
|
"rewards/check_answer/mean": -2.0, |
|
"rewards/check_answer/std": 0.0, |
|
"rewards/check_numbers/mean": -0.84375, |
|
"rewards/check_numbers/std": 0.7685213088989258, |
|
"rewards/format_and_language_reward_func/mean": -3.125, |
|
"rewards/format_and_language_reward_func/std": 0.5, |
|
"rewards/match_format_approximately/mean": -2.0, |
|
"rewards/match_format_approximately/std": 0.0, |
|
"rewards/match_format_exactly/mean": 0.0, |
|
"rewards/match_format_exactly/std": 0.0, |
|
"step": 88 |
|
}, |
|
{ |
|
"completion_length": 1683.1875, |
|
"completions/clipped_ratio": 0.875, |
|
"completions/max_length": 1722.0, |
|
"completions/max_terminated_length": 1547.0, |
|
"completions/mean_length": 1683.1875, |
|
"completions/mean_terminated_length": 1411.5, |
|
"completions/min_length": 1276.0, |
|
"completions/min_terminated_length": 1276.0, |
|
"epoch": 0.02796983029541169, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.10217194259166718, |
|
"kl": 0.044257864356040955, |
|
"learning_rate": 6.666666666666667e-07, |
|
"loss": 0.0018, |
|
"num_tokens": 2361789.0, |
|
"reward": -7.6875, |
|
"reward_std": 1.0060844421386719, |
|
"rewards/check_answer/mean": -2.0, |
|
"rewards/check_answer/std": 0.0, |
|
"rewards/check_numbers/mean": -0.5625, |
|
"rewards/check_numbers/std": 0.75, |
|
"rewards/format_and_language_reward_func/mean": -3.125, |
|
"rewards/format_and_language_reward_func/std": 0.5, |
|
"rewards/match_format_approximately/mean": -2.0, |
|
"rewards/match_format_approximately/std": 0.0, |
|
"rewards/match_format_exactly/mean": 0.0, |
|
"rewards/match_format_exactly/std": 0.0, |
|
"step": 89 |
|
}, |
|
{ |
|
"completion_length": 1638.3125, |
|
"completions/clipped_ratio": 0.8125, |
|
"completions/max_length": 1722.0, |
|
"completions/max_terminated_length": 1547.0, |
|
"completions/mean_length": 1638.3125, |
|
"completions/mean_terminated_length": 1275.666748046875, |
|
"completions/min_length": 1068.0, |
|
"completions/min_terminated_length": 1068.0, |
|
"epoch": 0.028284098051539912, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.6536535620689392, |
|
"kl": 0.07140226662158966, |
|
"learning_rate": 6.111111111111112e-07, |
|
"loss": 0.0029, |
|
"num_tokens": 2392042.0, |
|
"reward": -7.53125, |
|
"reward_std": 0.9040063619613647, |
|
"rewards/check_answer/mean": -2.0, |
|
"rewards/check_answer/std": 0.0, |
|
"rewards/check_numbers/mean": -0.28125, |
|
"rewards/check_numbers/std": 0.6046693325042725, |
|
"rewards/format_and_language_reward_func/mean": -3.25, |
|
"rewards/format_and_language_reward_func/std": 0.6831300854682922, |
|
"rewards/match_format_approximately/mean": -2.0, |
|
"rewards/match_format_approximately/std": 0.0, |
|
"rewards/match_format_exactly/mean": 0.0, |
|
"rewards/match_format_exactly/std": 0.0, |
|
"step": 90 |
|
}, |
|
{ |
|
"completion_length": 1547.75, |
|
"completions/clipped_ratio": 0.75, |
|
"completions/max_length": 1722.0, |
|
"completions/max_terminated_length": 1282.0, |
|
"completions/mean_length": 1547.75, |
|
"completions/mean_terminated_length": 1025.0, |
|
"completions/min_length": 848.0, |
|
"completions/min_terminated_length": 848.0, |
|
"epoch": 0.028598365807668134, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.12291199713945389, |
|
"kl": 0.05366697907447815, |
|
"learning_rate": 5.555555555555555e-07, |
|
"loss": 0.0021, |
|
"num_tokens": 2420674.0, |
|
"reward": -7.78125, |
|
"reward_std": 0.8705127239227295, |
|
"rewards/check_answer/mean": -2.0, |
|
"rewards/check_answer/std": 0.0, |
|
"rewards/check_numbers/mean": -0.65625, |
|
"rewards/check_numbers/std": 0.7685213088989258, |
|
"rewards/format_and_language_reward_func/mean": -3.125, |
|
"rewards/format_and_language_reward_func/std": 0.5, |
|
"rewards/match_format_approximately/mean": -2.0, |
|
"rewards/match_format_approximately/std": 0.0, |
|
"rewards/match_format_exactly/mean": 0.0, |
|
"rewards/match_format_exactly/std": 0.0, |
|
"step": 91 |
|
}, |
|
{ |
|
"completion_length": 1515.5625, |
|
"completions/clipped_ratio": 0.625, |
|
"completions/max_length": 1722.0, |
|
"completions/max_terminated_length": 1680.0, |
|
"completions/mean_length": 1515.5625, |
|
"completions/mean_terminated_length": 1171.5, |
|
"completions/min_length": 73.0, |
|
"completions/min_terminated_length": 73.0, |
|
"epoch": 0.028912633563796353, |
|
"frac_reward_zero_std": 0.25, |
|
"grad_norm": 0.11394577473402023, |
|
"kl": 0.04896366223692894, |
|
"learning_rate": 5.000000000000001e-07, |
|
"loss": 0.002, |
|
"num_tokens": 2448871.0, |
|
"reward": -6.71875, |
|
"reward_std": 0.8125, |
|
"rewards/check_answer/mean": -2.0, |
|
"rewards/check_answer/std": 0.0, |
|
"rewards/check_numbers/mean": 0.28125, |
|
"rewards/check_numbers/std": 1.7220990657806396, |
|
"rewards/format_and_language_reward_func/mean": -3.0, |
|
"rewards/format_and_language_reward_func/std": 0.0, |
|
"rewards/match_format_approximately/mean": -2.0, |
|
"rewards/match_format_approximately/std": 0.0, |
|
"rewards/match_format_exactly/mean": 0.0, |
|
"rewards/match_format_exactly/std": 0.0, |
|
"step": 92 |
|
}, |
|
{ |
|
"completion_length": 1545.1875, |
|
"completions/clipped_ratio": 0.625, |
|
"completions/max_length": 1722.0, |
|
"completions/max_terminated_length": 1656.0, |
|
"completions/mean_length": 1545.1875, |
|
"completions/mean_terminated_length": 1250.5, |
|
"completions/min_length": 537.0, |
|
"completions/min_terminated_length": 537.0, |
|
"epoch": 0.029226901319924576, |
|
"frac_reward_zero_std": 0.5, |
|
"grad_norm": 0.07623685151338577, |
|
"kl": 0.046294040977954865, |
|
"learning_rate": 4.444444444444445e-07, |
|
"loss": 0.0019, |
|
"num_tokens": 2477226.0, |
|
"reward": -7.5625, |
|
"reward_std": 0.375, |
|
"rewards/check_answer/mean": -2.0, |
|
"rewards/check_answer/std": 0.0, |
|
"rewards/check_numbers/mean": -0.5625, |
|
"rewards/check_numbers/std": 0.75, |
|
"rewards/format_and_language_reward_func/mean": -3.0, |
|
"rewards/format_and_language_reward_func/std": 0.0, |
|
"rewards/match_format_approximately/mean": -2.0, |
|
"rewards/match_format_approximately/std": 0.0, |
|
"rewards/match_format_exactly/mean": 0.0, |
|
"rewards/match_format_exactly/std": 0.0, |
|
"step": 93 |
|
}, |
|
{ |
|
"completion_length": 1588.75, |
|
"completions/clipped_ratio": 0.75, |
|
"completions/max_length": 1722.0, |
|
"completions/max_terminated_length": 1556.0, |
|
"completions/mean_length": 1588.75, |
|
"completions/mean_terminated_length": 1189.0, |
|
"completions/min_length": 405.0, |
|
"completions/min_terminated_length": 405.0, |
|
"epoch": 0.0295411690760528, |
|
"frac_reward_zero_std": 0.25, |
|
"grad_norm": 0.11318648606538773, |
|
"kl": 0.04230440780520439, |
|
"learning_rate": 3.8888888888888895e-07, |
|
"loss": 0.0017, |
|
"num_tokens": 2506806.0, |
|
"reward": -7.75, |
|
"reward_std": 0.5915063619613647, |
|
"rewards/check_answer/mean": -2.0, |
|
"rewards/check_answer/std": 0.0, |
|
"rewards/check_numbers/mean": -0.75, |
|
"rewards/check_numbers/std": 0.7745966911315918, |
|
"rewards/format_and_language_reward_func/mean": -3.0, |
|
"rewards/format_and_language_reward_func/std": 0.0, |
|
"rewards/match_format_approximately/mean": -2.0, |
|
"rewards/match_format_approximately/std": 0.0, |
|
"rewards/match_format_exactly/mean": 0.0, |
|
"rewards/match_format_exactly/std": 0.0, |
|
"step": 94 |
|
}, |
|
{ |
|
"completion_length": 1620.875, |
|
"completions/clipped_ratio": 0.875, |
|
"completions/max_length": 1722.0, |
|
"completions/max_terminated_length": 987.0, |
|
"completions/mean_length": 1620.875, |
|
"completions/mean_terminated_length": 913.0, |
|
"completions/min_length": 839.0, |
|
"completions/min_terminated_length": 839.0, |
|
"epoch": 0.029855436832181018, |
|
"frac_reward_zero_std": 0.25, |
|
"grad_norm": 0.11543525010347366, |
|
"kl": 0.0602000392973423, |
|
"learning_rate": 3.3333333333333335e-07, |
|
"loss": 0.0024, |
|
"num_tokens": 2536500.0, |
|
"reward": -7.96875, |
|
"reward_std": 0.6540063619613647, |
|
"rewards/check_answer/mean": -2.0, |
|
"rewards/check_answer/std": 0.0, |
|
"rewards/check_numbers/mean": -0.84375, |
|
"rewards/check_numbers/std": 0.7685213088989258, |
|
"rewards/format_and_language_reward_func/mean": -3.125, |
|
"rewards/format_and_language_reward_func/std": 0.5, |
|
"rewards/match_format_approximately/mean": -2.0, |
|
"rewards/match_format_approximately/std": 0.0, |
|
"rewards/match_format_exactly/mean": 0.0, |
|
"rewards/match_format_exactly/std": 0.0, |
|
"step": 95 |
|
}, |
|
{ |
|
"completion_length": 1623.375, |
|
"completions/clipped_ratio": 0.75, |
|
"completions/max_length": 1722.0, |
|
"completions/max_terminated_length": 1687.0, |
|
"completions/mean_length": 1623.375, |
|
"completions/mean_terminated_length": 1327.5, |
|
"completions/min_length": 743.0, |
|
"completions/min_terminated_length": 743.0, |
|
"epoch": 0.03016970458830924, |
|
"frac_reward_zero_std": 0.25, |
|
"grad_norm": 0.11837535351514816, |
|
"kl": 0.04759529232978821, |
|
"learning_rate": 2.7777777777777776e-07, |
|
"loss": 0.0019, |
|
"num_tokens": 2566598.0, |
|
"reward": -8.1875, |
|
"reward_std": 0.7165063619613647, |
|
"rewards/check_answer/mean": -2.0, |
|
"rewards/check_answer/std": 0.0, |
|
"rewards/check_numbers/mean": -0.9375, |
|
"rewards/check_numbers/std": 0.75, |
|
"rewards/format_and_language_reward_func/mean": -3.25, |
|
"rewards/format_and_language_reward_func/std": 0.6831300854682922, |
|
"rewards/match_format_approximately/mean": -2.0, |
|
"rewards/match_format_approximately/std": 0.0, |
|
"rewards/match_format_exactly/mean": 0.0, |
|
"rewards/match_format_exactly/std": 0.0, |
|
"step": 96 |
|
}, |
|
{ |
|
"completion_length": 1617.0625, |
|
"completions/clipped_ratio": 0.75, |
|
"completions/max_length": 1722.0, |
|
"completions/max_terminated_length": 1545.0, |
|
"completions/mean_length": 1617.0625, |
|
"completions/mean_terminated_length": 1302.25, |
|
"completions/min_length": 868.0, |
|
"completions/min_terminated_length": 868.0, |
|
"epoch": 0.03048397234443746, |
|
"frac_reward_zero_std": 0.25, |
|
"grad_norm": 0.11196030676364899, |
|
"kl": 0.04893610253930092, |
|
"learning_rate": 2.2222222222222224e-07, |
|
"loss": 0.002, |
|
"num_tokens": 2596579.0, |
|
"reward": -7.4375, |
|
"reward_std": 0.7165063619613647, |
|
"rewards/check_answer/mean": -2.0, |
|
"rewards/check_answer/std": 0.0, |
|
"rewards/check_numbers/mean": -0.1875, |
|
"rewards/check_numbers/std": 0.5123475790023804, |
|
"rewards/format_and_language_reward_func/mean": -3.25, |
|
"rewards/format_and_language_reward_func/std": 0.6831300854682922, |
|
"rewards/match_format_approximately/mean": -2.0, |
|
"rewards/match_format_approximately/std": 0.0, |
|
"rewards/match_format_exactly/mean": 0.0, |
|
"rewards/match_format_exactly/std": 0.0, |
|
"step": 97 |
|
}, |
|
{ |
|
"completion_length": 1362.0625, |
|
"completions/clipped_ratio": 0.5625, |
|
"completions/max_length": 1722.0, |
|
"completions/max_terminated_length": 1402.0, |
|
"completions/mean_length": 1362.0625, |
|
"completions/mean_terminated_length": 899.2857666015625, |
|
"completions/min_length": 364.0, |
|
"completions/min_terminated_length": 364.0, |
|
"epoch": 0.030798240100565682, |
|
"frac_reward_zero_std": 0.5, |
|
"grad_norm": 0.2802422046661377, |
|
"kl": 0.07202958315610886, |
|
"learning_rate": 1.6666666666666668e-07, |
|
"loss": 0.0029, |
|
"num_tokens": 2622492.0, |
|
"reward": -7.375, |
|
"reward_std": 0.4330126941204071, |
|
"rewards/check_answer/mean": -2.0, |
|
"rewards/check_answer/std": 0.0, |
|
"rewards/check_numbers/mean": -0.375, |
|
"rewards/check_numbers/std": 0.670820415019989, |
|
"rewards/format_and_language_reward_func/mean": -3.0, |
|
"rewards/format_and_language_reward_func/std": 0.0, |
|
"rewards/match_format_approximately/mean": -2.0, |
|
"rewards/match_format_approximately/std": 0.0, |
|
"rewards/match_format_exactly/mean": 0.0, |
|
"rewards/match_format_exactly/std": 0.0, |
|
"step": 98 |
|
}, |
|
{ |
|
"completion_length": 1529.75, |
|
"completions/clipped_ratio": 0.75, |
|
"completions/max_length": 1722.0, |
|
"completions/max_terminated_length": 1474.0, |
|
"completions/mean_length": 1529.75, |
|
"completions/mean_terminated_length": 953.0, |
|
"completions/min_length": 371.0, |
|
"completions/min_terminated_length": 371.0, |
|
"epoch": 0.031112507856693904, |
|
"frac_reward_zero_std": 0.25, |
|
"grad_norm": 0.27167099714279175, |
|
"kl": 0.0705300122499466, |
|
"learning_rate": 1.1111111111111112e-07, |
|
"loss": 0.0028, |
|
"num_tokens": 2650764.0, |
|
"reward": -7.40625, |
|
"reward_std": 0.6540063619613647, |
|
"rewards/check_answer/mean": -2.0, |
|
"rewards/check_answer/std": 0.0, |
|
"rewards/check_numbers/mean": -0.28125, |
|
"rewards/check_numbers/std": 0.6046693325042725, |
|
"rewards/format_and_language_reward_func/mean": -3.125, |
|
"rewards/format_and_language_reward_func/std": 0.5, |
|
"rewards/match_format_approximately/mean": -2.0, |
|
"rewards/match_format_approximately/std": 0.0, |
|
"rewards/match_format_exactly/mean": 0.0, |
|
"rewards/match_format_exactly/std": 0.0, |
|
"step": 99 |
|
}, |
|
{ |
|
"completion_length": 1459.75, |
|
"completions/clipped_ratio": 0.6875, |
|
"completions/max_length": 1722.0, |
|
"completions/max_terminated_length": 1133.0, |
|
"completions/mean_length": 1459.75, |
|
"completions/mean_terminated_length": 882.7999877929688, |
|
"completions/min_length": 436.0, |
|
"completions/min_terminated_length": 436.0, |
|
"epoch": 0.03142677561282212, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 0.11008132994174957, |
|
"kl": 0.04374002292752266, |
|
"learning_rate": 5.555555555555556e-08, |
|
"loss": 0.0017, |
|
"num_tokens": 2678396.0, |
|
"reward": -7.90625, |
|
"reward_std": 1.233162522315979, |
|
"rewards/check_answer/mean": -2.0, |
|
"rewards/check_answer/std": 0.0, |
|
"rewards/check_numbers/mean": -0.65625, |
|
"rewards/check_numbers/std": 0.7685213088989258, |
|
"rewards/format_and_language_reward_func/mean": -3.25, |
|
"rewards/format_and_language_reward_func/std": 0.6831300854682922, |
|
"rewards/match_format_approximately/mean": -2.0, |
|
"rewards/match_format_approximately/std": 0.0, |
|
"rewards/match_format_exactly/mean": 0.0, |
|
"rewards/match_format_exactly/std": 0.0, |
|
"step": 100 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 100, |
|
"num_input_tokens_seen": 2678396, |
|
"num_train_epochs": 1, |
|
"save_steps": 100, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 0.0, |
|
"train_batch_size": 16, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|