{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.03142677561282212, "eval_steps": 500, "global_step": 100, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 387.5625, "completions/clipped_ratio": 0.0625, "completions/max_length": 1722.0, "completions/max_terminated_length": 929.0, "completions/mean_length": 387.5625, "completions/mean_terminated_length": 298.6000061035156, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.00031426775612822125, "frac_reward_zero_std": 0.0, "grad_norm": 9.58870792388916, "kl": 1.1056605577468872, "learning_rate": 0.0, "loss": 0.0442, "num_tokens": 10321.0, "reward": -8.78125, "reward_std": 1.5290063619613647, "rewards/check_answer/mean": -2.0, "rewards/check_answer/std": 0.0, "rewards/check_numbers/mean": -1.15625, "rewards/check_numbers/std": 1.1212902069091797, "rewards/format_and_language_reward_func/mean": -3.625, "rewards/format_and_language_reward_func/std": 0.9574271440505981, "rewards/match_format_approximately/mean": -2.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 0.0, "rewards/match_format_exactly/std": 0.0, "step": 1 }, { "completion_length": 427.625, "completions/clipped_ratio": 0.0625, "completions/max_length": 1722.0, "completions/max_terminated_length": 1380.0, "completions/mean_length": 427.625, "completions/mean_terminated_length": 341.3333435058594, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.0006285355122564425, "frac_reward_zero_std": 0.0, "grad_norm": 15.610054016113281, "kl": 0.5778560638427734, "learning_rate": 5.000000000000001e-07, "loss": 0.0231, "num_tokens": 20955.0, "reward": -9.5, "reward_std": 1.6830127239227295, "rewards/check_answer/mean": -2.0, "rewards/check_answer/std": 0.0, "rewards/check_numbers/mean": -1.75, "rewards/check_numbers/std": 0.8366600275039673, "rewards/format_and_language_reward_func/mean": -3.75, "rewards/format_and_language_reward_func/std": 1.0, "rewards/match_format_approximately/mean": -2.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 0.0, "rewards/match_format_exactly/std": 0.0, "step": 2 }, { "completion_length": 979.5, "completions/clipped_ratio": 0.3125, "completions/max_length": 1722.0, "completions/max_terminated_length": 1507.0, "completions/mean_length": 979.5, "completions/mean_terminated_length": 642.0, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.0009428032683846638, "frac_reward_zero_std": 0.0, "grad_norm": 4.994994163513184, "kl": 0.2919767200946808, "learning_rate": 1.0000000000000002e-06, "loss": 0.0117, "num_tokens": 40683.0, "reward": -7.53125, "reward_std": 1.3821797370910645, "rewards/check_answer/mean": -2.0, "rewards/check_answer/std": 0.0, "rewards/check_numbers/mean": -0.40625, "rewards/check_numbers/std": 1.3443554639816284, "rewards/format_and_language_reward_func/mean": -3.125, "rewards/format_and_language_reward_func/std": 0.5, "rewards/match_format_approximately/mean": -2.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 0.0, "rewards/match_format_exactly/std": 0.0, "step": 3 }, { "completion_length": 836.3125, "completions/clipped_ratio": 0.3125, "completions/max_length": 1722.0, "completions/max_terminated_length": 1175.0, "completions/mean_length": 836.3125, "completions/mean_terminated_length": 433.727294921875, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.001257071024512885, "frac_reward_zero_std": 0.0, "grad_norm": 16.14759635925293, "kl": 0.5689749717712402, "learning_rate": 1.5e-06, "loss": 0.0228, "num_tokens": 58020.0, "reward": -9.03125, "reward_std": 1.9392420053482056, "rewards/check_answer/mean": -2.0, "rewards/check_answer/std": 0.0, "rewards/check_numbers/mean": -1.40625, "rewards/check_numbers/std": 1.0680004358291626, "rewards/format_and_language_reward_func/mean": -3.625, "rewards/format_and_language_reward_func/std": 0.9574271440505981, "rewards/match_format_approximately/mean": -2.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 0.0, "rewards/match_format_exactly/std": 0.0, "step": 4 }, { "completion_length": 553.25, "completions/clipped_ratio": 0.0625, "completions/max_length": 1722.0, "completions/max_terminated_length": 1469.0, "completions/mean_length": 553.25, "completions/mean_terminated_length": 475.3333435058594, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.0015713387806411063, "frac_reward_zero_std": 0.0, "grad_norm": 5.5043792724609375, "kl": 0.5717646479606628, "learning_rate": 2.0000000000000003e-06, "loss": 0.0229, "num_tokens": 70976.0, "reward": -8.40625, "reward_std": 0.9375, "rewards/check_answer/mean": -2.0, "rewards/check_answer/std": 0.0, "rewards/check_numbers/mean": -1.03125, "rewards/check_numbers/std": 1.007782220840454, "rewards/format_and_language_reward_func/mean": -3.375, "rewards/format_and_language_reward_func/std": 0.8062257766723633, "rewards/match_format_approximately/mean": -2.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 0.0, "rewards/match_format_exactly/std": 0.0, "step": 5 }, { "completion_length": 750.9375, "completions/clipped_ratio": 0.125, "completions/max_length": 1722.0, "completions/max_terminated_length": 1676.0, "completions/mean_length": 750.9375, "completions/mean_terminated_length": 612.2142944335938, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.0018856065367693275, "frac_reward_zero_std": 0.0, "grad_norm": 7.5527191162109375, "kl": 0.26640018820762634, "learning_rate": 2.5e-06, "loss": 0.0107, "num_tokens": 87127.0, "reward": -8.40625, "reward_std": 1.3351925611495972, "rewards/check_answer/mean": -2.0, "rewards/check_answer/std": 0.0, "rewards/check_numbers/mean": -0.90625, "rewards/check_numbers/std": 1.1138334274291992, "rewards/format_and_language_reward_func/mean": -3.5, "rewards/format_and_language_reward_func/std": 0.8944272398948669, "rewards/match_format_approximately/mean": -2.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 0.0, "rewards/match_format_exactly/std": 0.0, "step": 6 }, { "completion_length": 822.625, "completions/clipped_ratio": 0.0, "completions/max_length": 1699.0, "completions/max_terminated_length": 1699.0, "completions/mean_length": 822.625, "completions/mean_terminated_length": 822.625, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.0021998742928975488, "frac_reward_zero_std": 0.0, "grad_norm": 6.19105863571167, "kl": 0.17402563989162445, "learning_rate": 3e-06, "loss": 0.007, "num_tokens": 104165.0, "reward": -7.96875, "reward_std": 1.063370943069458, "rewards/check_answer/mean": -2.0, "rewards/check_answer/std": 0.0, "rewards/check_numbers/mean": -0.71875, "rewards/check_numbers/std": 0.8750000596046448, "rewards/format_and_language_reward_func/mean": -3.25, "rewards/format_and_language_reward_func/std": 0.6831300854682922, "rewards/match_format_approximately/mean": -2.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 0.0, "rewards/match_format_exactly/std": 0.0, "step": 7 }, { "completion_length": 1419.1875, "completions/clipped_ratio": 0.6875, "completions/max_length": 1722.0, "completions/max_terminated_length": 1149.0, "completions/mean_length": 1419.1875, "completions/mean_terminated_length": 753.0, "completions/min_length": 366.0, "completions/min_terminated_length": 366.0, "epoch": 0.00251414204902577, "frac_reward_zero_std": 0.25, "grad_norm": 0.1348615139722824, "kl": 0.029216211289167404, "learning_rate": 3.5e-06, "loss": 0.0012, "num_tokens": 130604.0, "reward": -8.0625, "reward_std": 0.625, "rewards/check_answer/mean": -2.0, "rewards/check_answer/std": 0.0, "rewards/check_numbers/mean": -0.9375, "rewards/check_numbers/std": 0.75, "rewards/format_and_language_reward_func/mean": -3.125, "rewards/format_and_language_reward_func/std": 0.5, "rewards/match_format_approximately/mean": -2.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 0.0, "rewards/match_format_exactly/std": 0.0, "step": 8 }, { "completion_length": 1018.125, "completions/clipped_ratio": 0.375, "completions/max_length": 1722.0, "completions/max_terminated_length": 1565.0, "completions/mean_length": 1018.125, "completions/mean_terminated_length": 595.7999877929688, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.0028284098051539913, "frac_reward_zero_std": 0.0, "grad_norm": 7.937878131866455, "kl": 0.42939966917037964, "learning_rate": 4.000000000000001e-06, "loss": 0.0172, "num_tokens": 151170.0, "reward": -7.8125, "reward_std": 1.1593647003173828, "rewards/check_answer/mean": -2.0, "rewards/check_answer/std": 0.0, "rewards/check_numbers/mean": -0.6875, "rewards/check_numbers/std": 0.9639329314231873, "rewards/format_and_language_reward_func/mean": -3.125, "rewards/format_and_language_reward_func/std": 0.5, "rewards/match_format_approximately/mean": -2.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 0.0, "rewards/match_format_exactly/std": 0.0, "step": 9 }, { "completion_length": 1018.625, "completions/clipped_ratio": 0.3125, "completions/max_length": 1722.0, "completions/max_terminated_length": 1317.0, "completions/mean_length": 1018.625, "completions/mean_terminated_length": 698.9091186523438, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "epoch": 0.0031426775612822125, "frac_reward_zero_std": 0.0, "grad_norm": 3.0097506046295166, "kl": 0.15729668736457825, "learning_rate": 4.5e-06, "loss": 0.0063, "num_tokens": 171388.0, "reward": -7.75, "reward_std": 0.8080127239227295, "rewards/check_answer/mean": -2.0, "rewards/check_answer/std": 0.0, "rewards/check_numbers/mean": -0.75, "rewards/check_numbers/std": 0.7745966911315918, "rewards/format_and_language_reward_func/mean": -3.0, "rewards/format_and_language_reward_func/std": 0.0, "rewards/match_format_approximately/mean": -2.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 0.0, "rewards/match_format_exactly/std": 0.0, "step": 10 }, { "completion_length": 1189.8125, "completions/clipped_ratio": 0.3125, "completions/max_length": 1722.0, "completions/max_terminated_length": 1696.0, "completions/mean_length": 1189.8125, "completions/mean_terminated_length": 947.9091186523438, "completions/min_length": 423.0, "completions/min_terminated_length": 423.0, "epoch": 0.0034569453174104338, "frac_reward_zero_std": 0.0, "grad_norm": 0.136498361825943, "kl": 0.05319710448384285, "learning_rate": 5e-06, "loss": 0.0021, "num_tokens": 194373.0, "reward": -7.75, "reward_std": 0.8080127239227295, "rewards/check_answer/mean": -2.0, "rewards/check_answer/std": 0.0, "rewards/check_numbers/mean": -0.75, "rewards/check_numbers/std": 0.7745966911315918, "rewards/format_and_language_reward_func/mean": -3.0, "rewards/format_and_language_reward_func/std": 0.0, "rewards/match_format_approximately/mean": -2.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 0.0, "rewards/match_format_exactly/std": 0.0, "step": 11 }, { "completion_length": 1122.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1722.0, "completions/max_terminated_length": 1665.0, "completions/mean_length": 1122.0, "completions/mean_terminated_length": 983.5385131835938, "completions/min_length": 498.0, "completions/min_terminated_length": 498.0, "epoch": 0.003771213073538655, "frac_reward_zero_std": 0.5, "grad_norm": 0.0965699553489685, "kl": 0.06320463120937347, "learning_rate": 4.944444444444445e-06, "loss": 0.0025, "num_tokens": 216261.0, "reward": -7.65625, "reward_std": 0.40400636196136475, "rewards/check_answer/mean": -2.0, "rewards/check_answer/std": 0.0, "rewards/check_numbers/mean": -0.65625, "rewards/check_numbers/std": 0.7685213088989258, "rewards/format_and_language_reward_func/mean": -3.0, "rewards/format_and_language_reward_func/std": 0.0, "rewards/match_format_approximately/mean": -2.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 0.0, "rewards/match_format_exactly/std": 0.0, "step": 12 }, { "completion_length": 1341.75, "completions/clipped_ratio": 0.4375, "completions/max_length": 1722.0, "completions/max_terminated_length": 1607.0, "completions/mean_length": 1341.75, "completions/mean_terminated_length": 1046.0, "completions/min_length": 592.0, "completions/min_terminated_length": 592.0, "epoch": 0.004085480829666876, "frac_reward_zero_std": 0.5, "grad_norm": 0.07173436135053635, "kl": 0.04135803505778313, "learning_rate": 4.888888888888889e-06, "loss": 0.0017, "num_tokens": 241453.0, "reward": -7.84375, "reward_std": 0.40400636196136475, "rewards/check_answer/mean": -2.0, "rewards/check_answer/std": 0.0, "rewards/check_numbers/mean": -0.84375, "rewards/check_numbers/std": 0.7685213088989258, "rewards/format_and_language_reward_func/mean": -3.0, "rewards/format_and_language_reward_func/std": 0.0, "rewards/match_format_approximately/mean": -2.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 0.0, "rewards/match_format_exactly/std": 0.0, "step": 13 }, { "completion_length": 1283.8125, "completions/clipped_ratio": 0.625, "completions/max_length": 1722.0, "completions/max_terminated_length": 1221.0, "completions/mean_length": 1283.8125, "completions/mean_terminated_length": 553.5, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.0043997485857950975, "frac_reward_zero_std": 0.0, "grad_norm": 6.01732063293457, "kl": 0.17246384918689728, "learning_rate": 4.833333333333333e-06, "loss": 0.0069, "num_tokens": 266634.0, "reward": -7.84375, "reward_std": 1.1508427858352661, "rewards/check_answer/mean": -2.0, "rewards/check_answer/std": 0.0, "rewards/check_numbers/mean": -0.71875, "rewards/check_numbers/std": 0.8750000596046448, "rewards/format_and_language_reward_func/mean": -3.125, "rewards/format_and_language_reward_func/std": 0.5, "rewards/match_format_approximately/mean": -2.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 0.0, "rewards/match_format_exactly/std": 0.0, "step": 14 }, { "completion_length": 1189.6875, "completions/clipped_ratio": 0.3125, "completions/max_length": 1722.0, "completions/max_terminated_length": 1694.0, "completions/mean_length": 1189.6875, "completions/mean_terminated_length": 947.727294921875, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.004714016341923318, "frac_reward_zero_std": 0.0, "grad_norm": 0.6227664351463318, "kl": 0.08904990553855896, "learning_rate": 4.777777777777778e-06, "loss": 0.0036, "num_tokens": 289373.0, "reward": -6.90625, "reward_std": 1.0290063619613647, "rewards/check_answer/mean": -2.0, "rewards/check_answer/std": 0.0, "rewards/check_numbers/mean": 0.09375, "rewards/check_numbers/std": 1.827737808227539, "rewards/format_and_language_reward_func/mean": -3.0, "rewards/format_and_language_reward_func/std": 0.0, "rewards/match_format_approximately/mean": -2.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 0.0, "rewards/match_format_exactly/std": 0.0, "step": 15 }, { "completion_length": 1361.0, "completions/clipped_ratio": 0.625, "completions/max_length": 1722.0, "completions/max_terminated_length": 1314.0, "completions/mean_length": 1361.0, "completions/mean_terminated_length": 759.3333740234375, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 0.00502828409805154, "frac_reward_zero_std": 0.25, "grad_norm": 0.12482193857431412, "kl": 0.05782376229763031, "learning_rate": 4.722222222222222e-06, "loss": 0.0023, "num_tokens": 314701.0, "reward": -8.09375, "reward_std": 0.9840351343154907, "rewards/check_answer/mean": -2.0, "rewards/check_answer/std": 0.0, "rewards/check_numbers/mean": -0.84375, "rewards/check_numbers/std": 0.7685213088989258, "rewards/format_and_language_reward_func/mean": -3.25, "rewards/format_and_language_reward_func/std": 0.6831300854682922, "rewards/match_format_approximately/mean": -2.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 0.0, "rewards/match_format_exactly/std": 0.0, "step": 16 }, { "completion_length": 1330.625, "completions/clipped_ratio": 0.5625, "completions/max_length": 1722.0, "completions/max_terminated_length": 1668.0, "completions/mean_length": 1330.625, "completions/mean_terminated_length": 827.4285888671875, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.005342551854179761, "frac_reward_zero_std": 0.5, "grad_norm": 0.09849988669157028, "kl": 0.07715655118227005, "learning_rate": 4.666666666666667e-06, "loss": 0.0031, "num_tokens": 339795.0, "reward": -7.9375, "reward_std": 0.375, "rewards/check_answer/mean": -2.0, "rewards/check_answer/std": 0.0, "rewards/check_numbers/mean": -0.9375, "rewards/check_numbers/std": 0.75, "rewards/format_and_language_reward_func/mean": -3.0, "rewards/format_and_language_reward_func/std": 0.0, "rewards/match_format_approximately/mean": -2.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 0.0, "rewards/match_format_exactly/std": 0.0, "step": 17 }, { "completion_length": 1300.3125, "completions/clipped_ratio": 0.5, "completions/max_length": 1722.0, "completions/max_terminated_length": 1540.0, "completions/mean_length": 1300.3125, "completions/mean_terminated_length": 878.625, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 0.0056568196103079825, "frac_reward_zero_std": 0.0, "grad_norm": 2.448221445083618, "kl": 0.3021676242351532, "learning_rate": 4.611111111111112e-06, "loss": 0.0121, "num_tokens": 364600.0, "reward": -8.3125, "reward_std": 1.2111132144927979, "rewards/check_answer/mean": -2.0, "rewards/check_answer/std": 0.0, "rewards/check_numbers/mean": -0.9375, "rewards/check_numbers/std": 0.75, "rewards/format_and_language_reward_func/mean": -3.375, "rewards/format_and_language_reward_func/std": 0.8062257766723633, "rewards/match_format_approximately/mean": -2.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 0.0, "rewards/match_format_exactly/std": 0.0, "step": 18 }, { "completion_length": 1257.625, "completions/clipped_ratio": 0.375, "completions/max_length": 1722.0, "completions/max_terminated_length": 1613.0, "completions/mean_length": 1257.625, "completions/mean_terminated_length": 979.0, "completions/min_length": 311.0, "completions/min_terminated_length": 311.0, "epoch": 0.005971087366436203, "frac_reward_zero_std": 0.25, "grad_norm": 0.12901267409324646, "kl": 0.06548095494508743, "learning_rate": 4.555555555555556e-06, "loss": 0.0026, "num_tokens": 388078.0, "reward": -7.84375, "reward_std": 0.8176814913749695, "rewards/check_answer/mean": -2.0, "rewards/check_answer/std": 0.0, "rewards/check_numbers/mean": -0.59375, "rewards/check_numbers/std": 1.7050782442092896, "rewards/format_and_language_reward_func/mean": -3.25, "rewards/format_and_language_reward_func/std": 0.6831300854682922, "rewards/match_format_approximately/mean": -2.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 0.0, "rewards/match_format_exactly/std": 0.0, "step": 19 }, { "completion_length": 1573.6875, "completions/clipped_ratio": 0.75, "completions/max_length": 1722.0, "completions/max_terminated_length": 1592.0, "completions/mean_length": 1573.6875, "completions/mean_terminated_length": 1128.75, "completions/min_length": 825.0, "completions/min_terminated_length": 825.0, "epoch": 0.006285355122564425, "frac_reward_zero_std": 0.25, "grad_norm": 0.10170631110668182, "kl": 0.04396039992570877, "learning_rate": 4.5e-06, "loss": 0.0018, "num_tokens": 417161.0, "reward": -8.375, "reward_std": 0.9611132144927979, "rewards/check_answer/mean": -2.0, "rewards/check_answer/std": 0.0, "rewards/check_numbers/mean": -1.125, "rewards/check_numbers/std": 0.670820415019989, "rewards/format_and_language_reward_func/mean": -3.25, "rewards/format_and_language_reward_func/std": 0.6831300854682922, "rewards/match_format_approximately/mean": -2.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 0.0, "rewards/match_format_exactly/std": 0.0, "step": 20 }, { "completion_length": 1380.5, "completions/clipped_ratio": 0.375, "completions/max_length": 1722.0, "completions/max_terminated_length": 1633.0, "completions/mean_length": 1380.5, "completions/mean_terminated_length": 1175.5999755859375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 0.006599622878692646, "frac_reward_zero_std": 0.25, "grad_norm": 2.5239837169647217, "kl": 0.16142641007900238, "learning_rate": 4.444444444444444e-06, "loss": 0.0065, "num_tokens": 443361.0, "reward": -8.03125, "reward_std": 0.6205127239227295, "rewards/check_answer/mean": -2.0, "rewards/check_answer/std": 0.0, "rewards/check_numbers/mean": -1.03125, "rewards/check_numbers/std": 0.7180703282356262, "rewards/format_and_language_reward_func/mean": -3.0, "rewards/format_and_language_reward_func/std": 0.0, "rewards/match_format_approximately/mean": -2.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 0.0, "rewards/match_format_exactly/std": 0.0, "step": 21 }, { "completion_length": 1384.25, "completions/clipped_ratio": 0.5625, "completions/max_length": 1722.0, "completions/max_terminated_length": 1601.0, "completions/mean_length": 1384.25, "completions/mean_terminated_length": 950.0000610351562, "completions/min_length": 427.0, "completions/min_terminated_length": 427.0, "epoch": 0.0069138906348208675, "frac_reward_zero_std": 0.5, "grad_norm": 0.09287308901548386, "kl": 0.06003550812602043, "learning_rate": 4.388888888888889e-06, "loss": 0.0024, "num_tokens": 469325.0, "reward": -7.78125, "reward_std": 0.4375, "rewards/check_answer/mean": -2.0, "rewards/check_answer/std": 0.0, "rewards/check_numbers/mean": -0.65625, "rewards/check_numbers/std": 0.7685213088989258, "rewards/format_and_language_reward_func/mean": -3.125, "rewards/format_and_language_reward_func/std": 0.5, "rewards/match_format_approximately/mean": -2.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 0.0, "rewards/match_format_exactly/std": 0.0, "step": 22 }, { "completion_length": 1367.4375, "completions/clipped_ratio": 0.4375, "completions/max_length": 1722.0, "completions/max_terminated_length": 1606.0, "completions/mean_length": 1367.4375, "completions/mean_terminated_length": 1091.6666259765625, "completions/min_length": 531.0, "completions/min_terminated_length": 531.0, "epoch": 0.007228158390949088, "frac_reward_zero_std": 0.5, "grad_norm": 0.3170417845249176, "kl": 0.06963013857603073, "learning_rate": 4.333333333333334e-06, "loss": 0.0028, "num_tokens": 495200.0, "reward": -7.1875, "reward_std": 0.375, "rewards/check_answer/mean": -2.0, "rewards/check_answer/std": 0.0, "rewards/check_numbers/mean": -0.1875, "rewards/check_numbers/std": 0.5123475790023804, "rewards/format_and_language_reward_func/mean": -3.0, "rewards/format_and_language_reward_func/std": 0.0, "rewards/match_format_approximately/mean": -2.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 0.0, "rewards/match_format_exactly/std": 0.0, "step": 23 }, { "completion_length": 1551.6875, "completions/clipped_ratio": 0.8125, "completions/max_length": 1722.0, "completions/max_terminated_length": 1699.0, "completions/mean_length": 1551.6875, "completions/mean_terminated_length": 813.6666870117188, "completions/min_length": 243.0, "completions/min_terminated_length": 243.0, "epoch": 0.00754242614707731, "frac_reward_zero_std": 0.0, "grad_norm": 0.20077864825725555, "kl": 0.05942363664507866, "learning_rate": 4.277777777777778e-06, "loss": 0.0024, "num_tokens": 523435.0, "reward": -8.28125, "reward_std": 0.9840351343154907, "rewards/check_answer/mean": -2.0, "rewards/check_answer/std": 0.0, "rewards/check_numbers/mean": -1.03125, "rewards/check_numbers/std": 0.7180703282356262, "rewards/format_and_language_reward_func/mean": -3.25, "rewards/format_and_language_reward_func/std": 0.6831300854682922, "rewards/match_format_approximately/mean": -2.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 0.0, "rewards/match_format_exactly/std": 0.0, "step": 24 }, { "completion_length": 1567.25, "completions/clipped_ratio": 0.875, "completions/max_length": 1722.0, "completions/max_terminated_length": 833.0, "completions/mean_length": 1567.25, "completions/mean_terminated_length": 484.0, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 0.00785669390320553, "frac_reward_zero_std": 0.25, "grad_norm": 0.15381832420825958, "kl": 0.05258103832602501, "learning_rate": 4.222222222222223e-06, "loss": 0.0021, "num_tokens": 552719.0, "reward": -7.5, "reward_std": 0.661700427532196, "rewards/check_answer/mean": -2.0, "rewards/check_answer/std": 0.0, "rewards/check_numbers/mean": -0.375, "rewards/check_numbers/std": 0.670820415019989, "rewards/format_and_language_reward_func/mean": -3.125, "rewards/format_and_language_reward_func/std": 0.5, "rewards/match_format_approximately/mean": -2.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 0.0, "rewards/match_format_exactly/std": 0.0, "step": 25 }, { "completion_length": 1234.5625, "completions/clipped_ratio": 0.5, "completions/max_length": 1722.0, "completions/max_terminated_length": 1246.0, "completions/mean_length": 1234.5625, "completions/mean_terminated_length": 747.125, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 0.008170961659333752, "frac_reward_zero_std": 0.25, "grad_norm": 0.2516714334487915, "kl": 0.07844924181699753, "learning_rate": 4.166666666666667e-06, "loss": 0.0031, "num_tokens": 576460.0, "reward": -7.75, "reward_std": 0.5915063619613647, "rewards/check_answer/mean": -2.0, "rewards/check_answer/std": 0.0, "rewards/check_numbers/mean": -0.75, "rewards/check_numbers/std": 0.7745966911315918, "rewards/format_and_language_reward_func/mean": -3.0, "rewards/format_and_language_reward_func/std": 0.0, "rewards/match_format_approximately/mean": -2.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 0.0, "rewards/match_format_exactly/std": 0.0, "step": 26 }, { "completion_length": 1580.875, "completions/clipped_ratio": 0.6875, "completions/max_length": 1722.0, "completions/max_terminated_length": 1688.0, "completions/mean_length": 1580.875, "completions/mean_terminated_length": 1270.4000244140625, "completions/min_length": 815.0, "completions/min_terminated_length": 815.0, "epoch": 0.008485229415461974, "frac_reward_zero_std": 0.25, "grad_norm": 0.08427742123603821, "kl": 0.040127161890268326, "learning_rate": 4.111111111111111e-06, "loss": 0.0016, "num_tokens": 605630.0, "reward": -7.84375, "reward_std": 0.5625, "rewards/check_answer/mean": -2.0, "rewards/check_answer/std": 0.0, "rewards/check_numbers/mean": -0.84375, "rewards/check_numbers/std": 0.7685213088989258, "rewards/format_and_language_reward_func/mean": -3.0, "rewards/format_and_language_reward_func/std": 0.0, "rewards/match_format_approximately/mean": -2.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 0.0, "rewards/match_format_exactly/std": 0.0, "step": 27 }, { "completion_length": 1360.1875, "completions/clipped_ratio": 0.5, "completions/max_length": 1722.0, "completions/max_terminated_length": 1577.0, "completions/mean_length": 1360.1875, "completions/mean_terminated_length": 998.375, "completions/min_length": 415.0, "completions/min_terminated_length": 415.0, "epoch": 0.008799497171590195, "frac_reward_zero_std": 0.0, "grad_norm": 0.13703206181526184, "kl": 0.05611160770058632, "learning_rate": 4.055555555555556e-06, "loss": 0.0022, "num_tokens": 631529.0, "reward": -8.21875, "reward_std": 0.9407068490982056, "rewards/check_answer/mean": -2.0, "rewards/check_answer/std": 0.0, "rewards/check_numbers/mean": -0.84375, "rewards/check_numbers/std": 0.7685213088989258, "rewards/format_and_language_reward_func/mean": -3.375, "rewards/format_and_language_reward_func/std": 0.8062257766723633, "rewards/match_format_approximately/mean": -2.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 0.0, "rewards/match_format_exactly/std": 0.0, "step": 28 }, { "completion_length": 1515.625, "completions/clipped_ratio": 0.6875, "completions/max_length": 1722.0, "completions/max_terminated_length": 1188.0, "completions/mean_length": 1515.625, "completions/mean_terminated_length": 1061.5999755859375, "completions/min_length": 871.0, "completions/min_terminated_length": 871.0, "epoch": 0.009113764927718416, "frac_reward_zero_std": 0.0, "grad_norm": 0.1371777057647705, "kl": 0.05326192080974579, "learning_rate": 4.000000000000001e-06, "loss": 0.0021, "num_tokens": 659651.0, "reward": -8.09375, "reward_std": 0.9040063619613647, "rewards/check_answer/mean": -2.0, "rewards/check_answer/std": 0.0, "rewards/check_numbers/mean": -0.84375, "rewards/check_numbers/std": 0.7685213088989258, "rewards/format_and_language_reward_func/mean": -3.25, "rewards/format_and_language_reward_func/std": 0.6831300854682922, "rewards/match_format_approximately/mean": -2.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 0.0, "rewards/match_format_exactly/std": 0.0, "step": 29 }, { "completion_length": 1422.0625, "completions/clipped_ratio": 0.5625, "completions/max_length": 1722.0, "completions/max_terminated_length": 1427.0, "completions/mean_length": 1422.0625, "completions/mean_terminated_length": 1036.4285888671875, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.009428032683846637, "frac_reward_zero_std": 0.0, "grad_norm": 1.34744393825531, "kl": 2.590277671813965, "learning_rate": 3.944444444444445e-06, "loss": 0.1036, "num_tokens": 686368.0, "reward": -7.84375, "reward_std": 1.1540063619613647, "rewards/check_answer/mean": -2.0, "rewards/check_answer/std": 0.0, "rewards/check_numbers/mean": -0.71875, "rewards/check_numbers/std": 0.8750000596046448, "rewards/format_and_language_reward_func/mean": -3.125, "rewards/format_and_language_reward_func/std": 0.5, "rewards/match_format_approximately/mean": -2.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 0.0, "rewards/match_format_exactly/std": 0.0, "step": 30 }, { "completion_length": 1592.75, "completions/clipped_ratio": 0.8125, "completions/max_length": 1722.0, "completions/max_terminated_length": 1348.0, "completions/mean_length": 1592.75, "completions/mean_terminated_length": 1032.666748046875, "completions/min_length": 537.0, "completions/min_terminated_length": 537.0, "epoch": 0.00974230043997486, "frac_reward_zero_std": 0.5, "grad_norm": 0.09155543893575668, "kl": 0.05414074286818504, "learning_rate": 3.88888888888889e-06, "loss": 0.0022, "num_tokens": 715988.0, "reward": -7.40625, "reward_std": 0.4375, "rewards/check_answer/mean": -2.0, "rewards/check_answer/std": 0.0, "rewards/check_numbers/mean": -0.28125, "rewards/check_numbers/std": 0.6046693325042725, "rewards/format_and_language_reward_func/mean": -3.125, "rewards/format_and_language_reward_func/std": 0.5, "rewards/match_format_approximately/mean": -2.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 0.0, "rewards/match_format_exactly/std": 0.0, "step": 31 }, { "completion_length": 1555.5625, "completions/clipped_ratio": 0.75, "completions/max_length": 1722.0, "completions/max_terminated_length": 1635.0, "completions/mean_length": 1555.5625, "completions/mean_terminated_length": 1056.25, "completions/min_length": 619.0, "completions/min_terminated_length": 619.0, "epoch": 0.01005656819610308, "frac_reward_zero_std": 0.0, "grad_norm": 0.15684477984905243, "kl": 0.05063142254948616, "learning_rate": 3.833333333333334e-06, "loss": 0.002, "num_tokens": 744669.0, "reward": -8.40625, "reward_std": 1.0148502588272095, "rewards/check_answer/mean": -2.0, "rewards/check_answer/std": 0.0, "rewards/check_numbers/mean": -1.03125, "rewards/check_numbers/std": 0.7180703282356262, "rewards/format_and_language_reward_func/mean": -3.375, "rewards/format_and_language_reward_func/std": 0.8062257766723633, "rewards/match_format_approximately/mean": -2.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 0.0, "rewards/match_format_exactly/std": 0.0, "step": 32 }, { "completion_length": 1546.9375, "completions/clipped_ratio": 0.75, "completions/max_length": 1722.0, "completions/max_terminated_length": 1335.0, "completions/mean_length": 1546.9375, "completions/mean_terminated_length": 1021.75, "completions/min_length": 747.0, "completions/min_terminated_length": 747.0, "epoch": 0.010370835952231301, "frac_reward_zero_std": 0.25, "grad_norm": 0.07121387869119644, "kl": 0.032333556562662125, "learning_rate": 3.777777777777778e-06, "loss": 0.0013, "num_tokens": 773388.0, "reward": -7.46875, "reward_std": 0.5625, "rewards/check_answer/mean": -2.0, "rewards/check_answer/std": 0.0, "rewards/check_numbers/mean": -0.46875, "rewards/check_numbers/std": 0.7180703282356262, "rewards/format_and_language_reward_func/mean": -3.0, "rewards/format_and_language_reward_func/std": 0.0, "rewards/match_format_approximately/mean": -2.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 0.0, "rewards/match_format_exactly/std": 0.0, "step": 33 }, { "completion_length": 1376.4375, "completions/clipped_ratio": 0.625, "completions/max_length": 1722.0, "completions/max_terminated_length": 1313.0, "completions/mean_length": 1376.4375, "completions/mean_terminated_length": 800.5, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 0.010685103708359522, "frac_reward_zero_std": 0.25, "grad_norm": 1.1596808433532715, "kl": 0.12857064604759216, "learning_rate": 3.7222222222222225e-06, "loss": 0.0051, "num_tokens": 799511.0, "reward": -7.5, "reward_std": 0.661700427532196, "rewards/check_answer/mean": -2.0, "rewards/check_answer/std": 0.0, "rewards/check_numbers/mean": -0.375, "rewards/check_numbers/std": 0.670820415019989, "rewards/format_and_language_reward_func/mean": -3.125, "rewards/format_and_language_reward_func/std": 0.5, "rewards/match_format_approximately/mean": -2.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 0.0, "rewards/match_format_exactly/std": 0.0, "step": 34 }, { "completion_length": 1699.6875, "completions/clipped_ratio": 0.875, "completions/max_length": 1722.0, "completions/max_terminated_length": 1711.0, "completions/mean_length": 1699.6875, "completions/mean_terminated_length": 1543.5, "completions/min_length": 1376.0, "completions/min_terminated_length": 1376.0, "epoch": 0.010999371464487744, "frac_reward_zero_std": 0.0, "grad_norm": 0.11480734497308731, "kl": 0.04761524498462677, "learning_rate": 3.6666666666666666e-06, "loss": 0.0019, "num_tokens": 830966.0, "reward": -7.96875, "reward_std": 1.2129219770431519, "rewards/check_answer/mean": -2.0, "rewards/check_answer/std": 0.0, "rewards/check_numbers/mean": -0.46875, "rewards/check_numbers/std": 0.7180703282356262, "rewards/format_and_language_reward_func/mean": -3.5, "rewards/format_and_language_reward_func/std": 0.8944272398948669, "rewards/match_format_approximately/mean": -2.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 0.0, "rewards/match_format_exactly/std": 0.0, "step": 35 }, { "completion_length": 1583.75, "completions/clipped_ratio": 0.75, "completions/max_length": 1722.0, "completions/max_terminated_length": 1432.0, "completions/mean_length": 1583.75, "completions/mean_terminated_length": 1169.0, "completions/min_length": 686.0, "completions/min_terminated_length": 686.0, "epoch": 0.011313639220615965, "frac_reward_zero_std": 0.25, "grad_norm": 0.34630483388900757, "kl": 0.07976571470499039, "learning_rate": 3.6111111111111115e-06, "loss": 0.0032, "num_tokens": 859930.0, "reward": -8.09375, "reward_std": 0.9840351343154907, "rewards/check_answer/mean": -2.0, "rewards/check_answer/std": 0.0, "rewards/check_numbers/mean": -0.84375, "rewards/check_numbers/std": 0.7685213088989258, "rewards/format_and_language_reward_func/mean": -3.25, "rewards/format_and_language_reward_func/std": 0.6831300854682922, "rewards/match_format_approximately/mean": -2.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 0.0, "rewards/match_format_exactly/std": 0.0, "step": 36 }, { "completion_length": 1338.875, "completions/clipped_ratio": 0.6875, "completions/max_length": 1722.0, "completions/max_terminated_length": 622.0, "completions/mean_length": 1338.875, "completions/mean_terminated_length": 496.0, "completions/min_length": 286.0, "completions/min_terminated_length": 286.0, "epoch": 0.011627906976744186, "frac_reward_zero_std": 0.0, "grad_norm": 0.1614941507577896, "kl": 0.06494183838367462, "learning_rate": 3.555555555555556e-06, "loss": 0.0026, "num_tokens": 885120.0, "reward": -7.6875, "reward_std": 0.8080127239227295, "rewards/check_answer/mean": -2.0, "rewards/check_answer/std": 0.0, "rewards/check_numbers/mean": -0.5625, "rewards/check_numbers/std": 0.75, "rewards/format_and_language_reward_func/mean": -3.125, "rewards/format_and_language_reward_func/std": 0.5, "rewards/match_format_approximately/mean": -2.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 0.0, "rewards/match_format_exactly/std": 0.0, "step": 37 }, { "completion_length": 1480.0625, "completions/clipped_ratio": 0.5625, "completions/max_length": 1722.0, "completions/max_terminated_length": 1611.0, "completions/mean_length": 1480.0625, "completions/mean_terminated_length": 1169.0, "completions/min_length": 525.0, "completions/min_terminated_length": 525.0, "epoch": 0.011942174732872407, "frac_reward_zero_std": 0.75, "grad_norm": 0.07541613280773163, "kl": 0.052320241928100586, "learning_rate": 3.5e-06, "loss": 0.0021, "num_tokens": 912537.0, "reward": -8.0, "reward_std": 0.28867512941360474, "rewards/check_answer/mean": -2.0, "rewards/check_answer/std": 0.0, "rewards/check_numbers/mean": -0.75, "rewards/check_numbers/std": 0.7745966911315918, "rewards/format_and_language_reward_func/mean": -3.25, "rewards/format_and_language_reward_func/std": 0.6831300854682922, "rewards/match_format_approximately/mean": -2.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 0.0, "rewards/match_format_exactly/std": 0.0, "step": 38 }, { "completion_length": 1373.75, "completions/clipped_ratio": 0.5625, "completions/max_length": 1722.0, "completions/max_terminated_length": 1538.0, "completions/mean_length": 1373.75, "completions/mean_terminated_length": 926.0000610351562, "completions/min_length": 489.0, "completions/min_terminated_length": 489.0, "epoch": 0.01225644248900063, "frac_reward_zero_std": 0.0, "grad_norm": 0.16380992531776428, "kl": 0.06148176267743111, "learning_rate": 3.444444444444445e-06, "loss": 0.0025, "num_tokens": 938505.0, "reward": -8.0, "reward_std": 1.0497419834136963, "rewards/check_answer/mean": -2.0, "rewards/check_answer/std": 0.0, "rewards/check_numbers/mean": -0.75, "rewards/check_numbers/std": 0.7745966911315918, "rewards/format_and_language_reward_func/mean": -3.25, "rewards/format_and_language_reward_func/std": 0.6831300854682922, "rewards/match_format_approximately/mean": -2.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 0.0, "rewards/match_format_exactly/std": 0.0, "step": 39 }, { "completion_length": 1519.1875, "completions/clipped_ratio": 0.75, "completions/max_length": 1722.0, "completions/max_terminated_length": 1336.0, "completions/mean_length": 1519.1875, "completions/mean_terminated_length": 910.75, "completions/min_length": 601.0, "completions/min_terminated_length": 601.0, "epoch": 0.01257071024512885, "frac_reward_zero_std": 0.25, "grad_norm": 0.10772477090358734, "kl": 0.04433707520365715, "learning_rate": 3.3888888888888893e-06, "loss": 0.0018, "num_tokens": 966940.0, "reward": -7.59375, "reward_std": 0.7340351343154907, "rewards/check_answer/mean": -2.0, "rewards/check_answer/std": 0.0, "rewards/check_numbers/mean": -0.46875, "rewards/check_numbers/std": 0.7180703282356262, "rewards/format_and_language_reward_func/mean": -3.125, "rewards/format_and_language_reward_func/std": 0.5, "rewards/match_format_approximately/mean": -2.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 0.0, "rewards/match_format_exactly/std": 0.0, "step": 40 }, { "completion_length": 1453.9375, "completions/clipped_ratio": 0.625, "completions/max_length": 1722.0, "completions/max_terminated_length": 1432.0, "completions/mean_length": 1453.9375, "completions/mean_terminated_length": 1007.1666870117188, "completions/min_length": 350.0, "completions/min_terminated_length": 350.0, "epoch": 0.012884978001257071, "frac_reward_zero_std": 0.5, "grad_norm": 0.07585709542036057, "kl": 0.04436139389872551, "learning_rate": 3.3333333333333333e-06, "loss": 0.0018, "num_tokens": 993859.0, "reward": -8.125, "reward_std": 0.5773502588272095, "rewards/check_answer/mean": -2.0, "rewards/check_answer/std": 0.0, "rewards/check_numbers/mean": -0.75, "rewards/check_numbers/std": 0.7745966911315918, "rewards/format_and_language_reward_func/mean": -3.375, "rewards/format_and_language_reward_func/std": 0.8062257766723633, "rewards/match_format_approximately/mean": -2.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 0.0, "rewards/match_format_exactly/std": 0.0, "step": 41 }, { "completion_length": 1421.5, "completions/clipped_ratio": 0.5625, "completions/max_length": 1722.0, "completions/max_terminated_length": 1470.0, "completions/mean_length": 1421.5, "completions/mean_terminated_length": 1035.1429443359375, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "epoch": 0.013199245757385292, "frac_reward_zero_std": 0.0, "grad_norm": 0.5277220010757446, "kl": 0.06066755950450897, "learning_rate": 3.277777777777778e-06, "loss": 0.0024, "num_tokens": 1020571.0, "reward": -8.125, "reward_std": 1.0517165660858154, "rewards/check_answer/mean": -2.0, "rewards/check_answer/std": 0.0, "rewards/check_numbers/mean": -0.75, "rewards/check_numbers/std": 0.7745966911315918, "rewards/format_and_language_reward_func/mean": -3.375, "rewards/format_and_language_reward_func/std": 0.8062257766723633, "rewards/match_format_approximately/mean": -2.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 0.0, "rewards/match_format_exactly/std": 0.0, "step": 42 }, { "completion_length": 1514.6875, "completions/clipped_ratio": 0.6875, "completions/max_length": 1722.0, "completions/max_terminated_length": 1326.0, "completions/mean_length": 1514.6875, "completions/mean_terminated_length": 1058.5999755859375, "completions/min_length": 833.0, "completions/min_terminated_length": 833.0, "epoch": 0.013513513513513514, "frac_reward_zero_std": 0.25, "grad_norm": 0.10104304552078247, "kl": 0.05354408547282219, "learning_rate": 3.2222222222222227e-06, "loss": 0.0021, "num_tokens": 1049086.0, "reward": -7.46875, "reward_std": 0.5625, "rewards/check_answer/mean": -2.0, "rewards/check_answer/std": 0.0, "rewards/check_numbers/mean": -0.46875, "rewards/check_numbers/std": 0.7180703282356262, "rewards/format_and_language_reward_func/mean": -3.0, "rewards/format_and_language_reward_func/std": 0.0, "rewards/match_format_approximately/mean": -2.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 0.0, "rewards/match_format_exactly/std": 0.0, "step": 43 }, { "completion_length": 1431.4375, "completions/clipped_ratio": 0.625, "completions/max_length": 1722.0, "completions/max_terminated_length": 1554.0, "completions/mean_length": 1431.4375, "completions/mean_terminated_length": 947.1666870117188, "completions/min_length": 320.0, "completions/min_terminated_length": 320.0, "epoch": 0.013827781269641735, "frac_reward_zero_std": 0.25, "grad_norm": 0.11680426448583603, "kl": 0.05350031703710556, "learning_rate": 3.1666666666666667e-06, "loss": 0.0021, "num_tokens": 1075785.0, "reward": -7.46875, "reward_std": 0.6205127239227295, "rewards/check_answer/mean": -2.0, "rewards/check_answer/std": 0.0, "rewards/check_numbers/mean": -0.46875, "rewards/check_numbers/std": 0.7180703282356262, "rewards/format_and_language_reward_func/mean": -3.0, "rewards/format_and_language_reward_func/std": 0.0, "rewards/match_format_approximately/mean": -2.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 0.0, "rewards/match_format_exactly/std": 0.0, "step": 44 }, { "completion_length": 1583.875, "completions/clipped_ratio": 0.625, "completions/max_length": 1722.0, "completions/max_terminated_length": 1720.0, "completions/mean_length": 1583.875, "completions/mean_terminated_length": 1353.666748046875, "completions/min_length": 573.0, "completions/min_terminated_length": 573.0, "epoch": 0.014142049025769956, "frac_reward_zero_std": 0.0, "grad_norm": 0.10828514397144318, "kl": 0.05207566171884537, "learning_rate": 3.1111111111111116e-06, "loss": 0.0021, "num_tokens": 1105015.0, "reward": -7.25, "reward_std": 1.1668819189071655, "rewards/check_answer/mean": -2.0, "rewards/check_answer/std": 0.0, "rewards/check_numbers/mean": -0.125, "rewards/check_numbers/std": 1.5864006280899048, "rewards/format_and_language_reward_func/mean": -3.125, "rewards/format_and_language_reward_func/std": 0.5, "rewards/match_format_approximately/mean": -2.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 0.0, "rewards/match_format_exactly/std": 0.0, "step": 45 }, { "completion_length": 1366.1875, "completions/clipped_ratio": 0.5625, "completions/max_length": 1722.0, "completions/max_terminated_length": 1669.0, "completions/mean_length": 1366.1875, "completions/mean_terminated_length": 908.71435546875, "completions/min_length": 283.0, "completions/min_terminated_length": 283.0, "epoch": 0.014456316781898177, "frac_reward_zero_std": 0.5, "grad_norm": 0.09319330006837845, "kl": 0.044255051761865616, "learning_rate": 3.055555555555556e-06, "loss": 0.0018, "num_tokens": 1130758.0, "reward": -7.375, "reward_std": 0.375, "rewards/check_answer/mean": -2.0, "rewards/check_answer/std": 0.0, "rewards/check_numbers/mean": -0.375, "rewards/check_numbers/std": 0.670820415019989, "rewards/format_and_language_reward_func/mean": -3.0, "rewards/format_and_language_reward_func/std": 0.0, "rewards/match_format_approximately/mean": -2.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 0.0, "rewards/match_format_exactly/std": 0.0, "step": 46 }, { "completion_length": 1549.25, "completions/clipped_ratio": 0.75, "completions/max_length": 1722.0, "completions/max_terminated_length": 1445.0, "completions/mean_length": 1549.25, "completions/mean_terminated_length": 1031.0, "completions/min_length": 798.0, "completions/min_terminated_length": 798.0, "epoch": 0.0147705845380264, "frac_reward_zero_std": 0.5, "grad_norm": 0.1097634956240654, "kl": 0.04721890389919281, "learning_rate": 3e-06, "loss": 0.0019, "num_tokens": 1159726.0, "reward": -7.75, "reward_std": 0.375, "rewards/check_answer/mean": -2.0, "rewards/check_answer/std": 0.0, "rewards/check_numbers/mean": -0.75, "rewards/check_numbers/std": 0.7745966911315918, "rewards/format_and_language_reward_func/mean": -3.0, "rewards/format_and_language_reward_func/std": 0.0, "rewards/match_format_approximately/mean": -2.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 0.0, "rewards/match_format_exactly/std": 0.0, "step": 47 }, { "completion_length": 1559.8125, "completions/clipped_ratio": 0.8125, "completions/max_length": 1722.0, "completions/max_terminated_length": 1060.0, "completions/mean_length": 1559.8125, "completions/mean_terminated_length": 857.0, "completions/min_length": 737.0, "completions/min_terminated_length": 737.0, "epoch": 0.01508485229415462, "frac_reward_zero_std": 0.0, "grad_norm": 0.13540787994861603, "kl": 0.052234258502721786, "learning_rate": 2.944444444444445e-06, "loss": 0.0021, "num_tokens": 1188527.0, "reward": -8.1875, "reward_std": 1.1715351343154907, "rewards/check_answer/mean": -2.0, "rewards/check_answer/std": 0.0, "rewards/check_numbers/mean": -0.9375, "rewards/check_numbers/std": 0.75, "rewards/format_and_language_reward_func/mean": -3.25, "rewards/format_and_language_reward_func/std": 0.6831300854682922, "rewards/match_format_approximately/mean": -2.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 0.0, "rewards/match_format_exactly/std": 0.0, "step": 48 }, { "completion_length": 1568.125, "completions/clipped_ratio": 0.6875, "completions/max_length": 1722.0, "completions/max_terminated_length": 1591.0, "completions/mean_length": 1568.125, "completions/mean_terminated_length": 1229.5999755859375, "completions/min_length": 958.0, "completions/min_terminated_length": 958.0, "epoch": 0.015399120050282841, "frac_reward_zero_std": 0.0, "grad_norm": 0.11936990916728973, "kl": 0.056975651532411575, "learning_rate": 2.888888888888889e-06, "loss": 0.0023, "num_tokens": 1217941.0, "reward": -8.1875, "reward_std": 0.8571338653564453, "rewards/check_answer/mean": -2.0, "rewards/check_answer/std": 0.0, "rewards/check_numbers/mean": -0.9375, "rewards/check_numbers/std": 0.75, "rewards/format_and_language_reward_func/mean": -3.25, "rewards/format_and_language_reward_func/std": 0.6831300854682922, "rewards/match_format_approximately/mean": -2.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 0.0, "rewards/match_format_exactly/std": 0.0, "step": 49 }, { "completion_length": 1495.4375, "completions/clipped_ratio": 0.6875, "completions/max_length": 1722.0, "completions/max_terminated_length": 1325.0, "completions/mean_length": 1495.4375, "completions/mean_terminated_length": 997.0, "completions/min_length": 445.0, "completions/min_terminated_length": 445.0, "epoch": 0.01571338780641106, "frac_reward_zero_std": 0.25, "grad_norm": 0.1257868856191635, "kl": 0.051786769181489944, "learning_rate": 2.8333333333333335e-06, "loss": 0.0021, "num_tokens": 1245944.0, "reward": -7.6875, "reward_std": 0.7895780801773071, "rewards/check_answer/mean": -2.0, "rewards/check_answer/std": 0.0, "rewards/check_numbers/mean": -0.5625, "rewards/check_numbers/std": 0.75, "rewards/format_and_language_reward_func/mean": -3.125, "rewards/format_and_language_reward_func/std": 0.5, "rewards/match_format_approximately/mean": -2.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 0.0, "rewards/match_format_exactly/std": 0.0, "step": 50 }, { "completion_length": 1586.0, "completions/clipped_ratio": 0.6875, "completions/max_length": 1722.0, "completions/max_terminated_length": 1700.0, "completions/mean_length": 1586.0, "completions/mean_terminated_length": 1286.800048828125, "completions/min_length": 820.0, "completions/min_terminated_length": 820.0, "epoch": 0.016027655562539284, "frac_reward_zero_std": 0.0, "grad_norm": 0.11521671712398529, "kl": 0.047557681798934937, "learning_rate": 2.7777777777777783e-06, "loss": 0.0019, "num_tokens": 1275304.0, "reward": -8.0, "reward_std": 0.6636751294136047, "rewards/check_answer/mean": -2.0, "rewards/check_answer/std": 0.0, "rewards/check_numbers/mean": -0.75, "rewards/check_numbers/std": 0.7745966911315918, "rewards/format_and_language_reward_func/mean": -3.25, "rewards/format_and_language_reward_func/std": 0.6831300854682922, "rewards/match_format_approximately/mean": -2.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 0.0, "rewards/match_format_exactly/std": 0.0, "step": 51 }, { "completion_length": 1589.6875, "completions/clipped_ratio": 0.75, "completions/max_length": 1722.0, "completions/max_terminated_length": 1535.0, "completions/mean_length": 1589.6875, "completions/mean_terminated_length": 1192.75, "completions/min_length": 863.0, "completions/min_terminated_length": 863.0, "epoch": 0.016341923318667503, "frac_reward_zero_std": 0.75, "grad_norm": 0.24990025162696838, "kl": 0.0706791803240776, "learning_rate": 2.7222222222222224e-06, "loss": 0.0028, "num_tokens": 1304927.0, "reward": -7.5625, "reward_std": 0.21650634706020355, "rewards/check_answer/mean": -2.0, "rewards/check_answer/std": 0.0, "rewards/check_numbers/mean": -0.5625, "rewards/check_numbers/std": 0.75, "rewards/format_and_language_reward_func/mean": -3.0, "rewards/format_and_language_reward_func/std": 0.0, "rewards/match_format_approximately/mean": -2.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 0.0, "rewards/match_format_exactly/std": 0.0, "step": 52 }, { "completion_length": 1505.0625, "completions/clipped_ratio": 0.6875, "completions/max_length": 1722.0, "completions/max_terminated_length": 1408.0, "completions/mean_length": 1505.0625, "completions/mean_terminated_length": 1027.800048828125, "completions/min_length": 826.0, "completions/min_terminated_length": 826.0, "epoch": 0.016656191074795726, "frac_reward_zero_std": 0.0, "grad_norm": 0.12817677855491638, "kl": 0.05584558844566345, "learning_rate": 2.666666666666667e-06, "loss": 0.0022, "num_tokens": 1332692.0, "reward": -7.625, "reward_std": 0.875, "rewards/check_answer/mean": -2.0, "rewards/check_answer/std": 0.0, "rewards/check_numbers/mean": -0.375, "rewards/check_numbers/std": 0.670820415019989, "rewards/format_and_language_reward_func/mean": -3.25, "rewards/format_and_language_reward_func/std": 0.6831300854682922, "rewards/match_format_approximately/mean": -2.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 0.0, "rewards/match_format_exactly/std": 0.0, "step": 53 }, { "completion_length": 1617.8125, "completions/clipped_ratio": 0.75, "completions/max_length": 1722.0, "completions/max_terminated_length": 1689.0, "completions/mean_length": 1617.8125, "completions/mean_terminated_length": 1305.25, "completions/min_length": 897.0, "completions/min_terminated_length": 897.0, "epoch": 0.01697045883092395, "frac_reward_zero_std": 0.25, "grad_norm": 0.11597556620836258, "kl": 0.05314599350094795, "learning_rate": 2.6111111111111113e-06, "loss": 0.0021, "num_tokens": 1362965.0, "reward": -7.8125, "reward_std": 0.661700427532196, "rewards/check_answer/mean": -2.0, "rewards/check_answer/std": 0.0, "rewards/check_numbers/mean": -0.5625, "rewards/check_numbers/std": 0.75, "rewards/format_and_language_reward_func/mean": -3.25, "rewards/format_and_language_reward_func/std": 0.6831300854682922, "rewards/match_format_approximately/mean": -2.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 0.0, "rewards/match_format_exactly/std": 0.0, "step": 54 }, { "completion_length": 1612.125, "completions/clipped_ratio": 0.8125, "completions/max_length": 1722.0, "completions/max_terminated_length": 1567.0, "completions/mean_length": 1612.125, "completions/mean_terminated_length": 1136.0, "completions/min_length": 707.0, "completions/min_terminated_length": 707.0, "epoch": 0.017284726587052168, "frac_reward_zero_std": 0.75, "grad_norm": 0.07392556965351105, "kl": 0.04997054487466812, "learning_rate": 2.5555555555555557e-06, "loss": 0.002, "num_tokens": 1393151.0, "reward": -7.46875, "reward_std": 0.1875, "rewards/check_answer/mean": -2.0, "rewards/check_answer/std": 0.0, "rewards/check_numbers/mean": -0.46875, "rewards/check_numbers/std": 0.7180703282356262, "rewards/format_and_language_reward_func/mean": -3.0, "rewards/format_and_language_reward_func/std": 0.0, "rewards/match_format_approximately/mean": -2.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 0.0, "rewards/match_format_exactly/std": 0.0, "step": 55 }, { "completion_length": 1571.625, "completions/clipped_ratio": 0.8125, "completions/max_length": 1722.0, "completions/max_terminated_length": 1313.0, "completions/mean_length": 1571.625, "completions/mean_terminated_length": 920.0, "completions/min_length": 403.0, "completions/min_terminated_length": 403.0, "epoch": 0.01759899434318039, "frac_reward_zero_std": 0.25, "grad_norm": 0.10698790848255157, "kl": 0.040371235460042953, "learning_rate": 2.5e-06, "loss": 0.0016, "num_tokens": 1422181.0, "reward": -7.9375, "reward_std": 0.5915063619613647, "rewards/check_answer/mean": -2.0, "rewards/check_answer/std": 0.0, "rewards/check_numbers/mean": -0.9375, "rewards/check_numbers/std": 0.75, "rewards/format_and_language_reward_func/mean": -3.0, "rewards/format_and_language_reward_func/std": 0.0, "rewards/match_format_approximately/mean": -2.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 0.0, "rewards/match_format_exactly/std": 0.0, "step": 56 }, { "completion_length": 1594.9375, "completions/clipped_ratio": 0.875, "completions/max_length": 1722.0, "completions/max_terminated_length": 707.0, "completions/mean_length": 1594.9375, "completions/mean_terminated_length": 705.5, "completions/min_length": 704.0, "completions/min_terminated_length": 704.0, "epoch": 0.017913262099308613, "frac_reward_zero_std": 0.25, "grad_norm": 0.6847302317619324, "kl": 0.04597615823149681, "learning_rate": 2.4444444444444447e-06, "loss": 0.0018, "num_tokens": 1451644.0, "reward": -7.59375, "reward_std": 0.6205127239227295, "rewards/check_answer/mean": -2.0, "rewards/check_answer/std": 0.0, "rewards/check_numbers/mean": -0.46875, "rewards/check_numbers/std": 0.7180703282356262, "rewards/format_and_language_reward_func/mean": -3.125, "rewards/format_and_language_reward_func/std": 0.5, "rewards/match_format_approximately/mean": -2.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 0.0, "rewards/match_format_exactly/std": 0.0, "step": 57 }, { "completion_length": 1503.375, "completions/clipped_ratio": 0.8125, "completions/max_length": 1722.0, "completions/max_terminated_length": 640.0, "completions/mean_length": 1503.375, "completions/mean_terminated_length": 556.0, "completions/min_length": 475.0, "completions/min_terminated_length": 475.0, "epoch": 0.018227529855436832, "frac_reward_zero_std": 0.25, "grad_norm": 0.09949938207864761, "kl": 0.056429892778396606, "learning_rate": 2.388888888888889e-06, "loss": 0.0023, "num_tokens": 1479182.0, "reward": -7.96875, "reward_std": 0.6540063619613647, "rewards/check_answer/mean": -2.0, "rewards/check_answer/std": 0.0, "rewards/check_numbers/mean": -0.84375, "rewards/check_numbers/std": 0.7685213088989258, "rewards/format_and_language_reward_func/mean": -3.125, "rewards/format_and_language_reward_func/std": 0.5, "rewards/match_format_approximately/mean": -2.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 0.0, "rewards/match_format_exactly/std": 0.0, "step": 58 }, { "completion_length": 1563.0, "completions/clipped_ratio": 0.6875, "completions/max_length": 1722.0, "completions/max_terminated_length": 1411.0, "completions/mean_length": 1563.0, "completions/mean_terminated_length": 1213.2000732421875, "completions/min_length": 990.0, "completions/min_terminated_length": 990.0, "epoch": 0.018541797611565054, "frac_reward_zero_std": 0.0, "grad_norm": 0.11543877422809601, "kl": 0.035633672028779984, "learning_rate": 2.3333333333333336e-06, "loss": 0.0014, "num_tokens": 1508062.0, "reward": -7.65625, "reward_std": 0.7790063619613647, "rewards/check_answer/mean": -2.0, "rewards/check_answer/std": 0.0, "rewards/check_numbers/mean": -0.65625, "rewards/check_numbers/std": 0.7685213088989258, "rewards/format_and_language_reward_func/mean": -3.0, "rewards/format_and_language_reward_func/std": 0.0, "rewards/match_format_approximately/mean": -2.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 0.0, "rewards/match_format_exactly/std": 0.0, "step": 59 }, { "completion_length": 1707.75, "completions/clipped_ratio": 0.8125, "completions/max_length": 1722.0, "completions/max_terminated_length": 1691.0, "completions/mean_length": 1707.75, "completions/mean_terminated_length": 1646.0, "completions/min_length": 1605.0, "completions/min_terminated_length": 1605.0, "epoch": 0.018856065367693273, "frac_reward_zero_std": 0.75, "grad_norm": 0.0639866441488266, "kl": 0.05725252255797386, "learning_rate": 2.277777777777778e-06, "loss": 0.0023, "num_tokens": 1539650.0, "reward": -7.25, "reward_std": 0.28867512941360474, "rewards/check_answer/mean": -2.0, "rewards/check_answer/std": 0.0, "rewards/check_numbers/mean": 0.0, "rewards/check_numbers/std": 0.0, "rewards/format_and_language_reward_func/mean": -3.25, "rewards/format_and_language_reward_func/std": 0.6831300854682922, "rewards/match_format_approximately/mean": -2.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 0.0, "rewards/match_format_exactly/std": 0.0, "step": 60 }, { "completion_length": 1535.1875, "completions/clipped_ratio": 0.75, "completions/max_length": 1722.0, "completions/max_terminated_length": 1665.0, "completions/mean_length": 1535.1875, "completions/mean_terminated_length": 974.75, "completions/min_length": 374.0, "completions/min_terminated_length": 374.0, "epoch": 0.019170333123821496, "frac_reward_zero_std": 0.0, "grad_norm": 0.2154313325881958, "kl": 0.06772614270448685, "learning_rate": 2.222222222222222e-06, "loss": 0.0027, "num_tokens": 1567677.0, "reward": -8.09375, "reward_std": 0.9407067894935608, "rewards/check_answer/mean": -2.0, "rewards/check_answer/std": 0.0, "rewards/check_numbers/mean": -0.84375, "rewards/check_numbers/std": 0.7685213088989258, "rewards/format_and_language_reward_func/mean": -3.25, "rewards/format_and_language_reward_func/std": 0.6831300854682922, "rewards/match_format_approximately/mean": -2.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 0.0, "rewards/match_format_exactly/std": 0.0, "step": 61 }, { "completion_length": 1480.75, "completions/clipped_ratio": 0.75, "completions/max_length": 1722.0, "completions/max_terminated_length": 1193.0, "completions/mean_length": 1480.75, "completions/mean_terminated_length": 757.0, "completions/min_length": 323.0, "completions/min_terminated_length": 323.0, "epoch": 0.01948460087994972, "frac_reward_zero_std": 0.0, "grad_norm": 0.18533650040626526, "kl": 0.04637778922915459, "learning_rate": 2.166666666666667e-06, "loss": 0.0019, "num_tokens": 1595629.0, "reward": -7.5625, "reward_std": 0.8080127239227295, "rewards/check_answer/mean": -2.0, "rewards/check_answer/std": 0.0, "rewards/check_numbers/mean": -0.5625, "rewards/check_numbers/std": 0.75, "rewards/format_and_language_reward_func/mean": -3.0, "rewards/format_and_language_reward_func/std": 0.0, "rewards/match_format_approximately/mean": -2.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 0.0, "rewards/match_format_exactly/std": 0.0, "step": 62 }, { "completion_length": 1604.8125, "completions/clipped_ratio": 0.8125, "completions/max_length": 1722.0, "completions/max_terminated_length": 1672.0, "completions/mean_length": 1604.8125, "completions/mean_terminated_length": 1097.0, "completions/min_length": 560.0, "completions/min_terminated_length": 560.0, "epoch": 0.019798868636077938, "frac_reward_zero_std": 0.25, "grad_norm": 0.17045044898986816, "kl": 0.05853426456451416, "learning_rate": 2.1111111111111114e-06, "loss": 0.0023, "num_tokens": 1624594.0, "reward": -8.40625, "reward_std": 0.7628755569458008, "rewards/check_answer/mean": -2.0, "rewards/check_answer/std": 0.0, "rewards/check_numbers/mean": -1.03125, "rewards/check_numbers/std": 0.7180703282356262, "rewards/format_and_language_reward_func/mean": -3.375, "rewards/format_and_language_reward_func/std": 0.8062257766723633, "rewards/match_format_approximately/mean": -2.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 0.0, "rewards/match_format_exactly/std": 0.0, "step": 63 }, { "completion_length": 1406.9375, "completions/clipped_ratio": 0.625, "completions/max_length": 1722.0, "completions/max_terminated_length": 1612.0, "completions/mean_length": 1406.9375, "completions/mean_terminated_length": 881.8333740234375, "completions/min_length": 494.0, "completions/min_terminated_length": 494.0, "epoch": 0.02011313639220616, "frac_reward_zero_std": 0.25, "grad_norm": 0.13805745542049408, "kl": 0.045212242752313614, "learning_rate": 2.0555555555555555e-06, "loss": 0.0018, "num_tokens": 1650865.0, "reward": -8.15625, "reward_std": 0.4955126941204071, "rewards/check_answer/mean": -2.0, "rewards/check_answer/std": 0.0, "rewards/check_numbers/mean": -1.03125, "rewards/check_numbers/std": 0.7180703282356262, "rewards/format_and_language_reward_func/mean": -3.125, "rewards/format_and_language_reward_func/std": 0.5, "rewards/match_format_approximately/mean": -2.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 0.0, "rewards/match_format_exactly/std": 0.0, "step": 64 }, { "completion_length": 1511.0625, "completions/clipped_ratio": 0.625, "completions/max_length": 1722.0, "completions/max_terminated_length": 1656.0, "completions/mean_length": 1511.0625, "completions/mean_terminated_length": 1159.5, "completions/min_length": 661.0, "completions/min_terminated_length": 661.0, "epoch": 0.02042740414833438, "frac_reward_zero_std": 0.0, "grad_norm": 0.10470445454120636, "kl": 0.040612928569316864, "learning_rate": 2.0000000000000003e-06, "loss": 0.0016, "num_tokens": 1679182.0, "reward": -8.0, "reward_std": 0.9917292594909668, "rewards/check_answer/mean": -2.0, "rewards/check_answer/std": 0.0, "rewards/check_numbers/mean": -0.75, "rewards/check_numbers/std": 0.7745966911315918, "rewards/format_and_language_reward_func/mean": -3.25, "rewards/format_and_language_reward_func/std": 0.6831300854682922, "rewards/match_format_approximately/mean": -2.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 0.0, "rewards/match_format_exactly/std": 0.0, "step": 65 }, { "completion_length": 1688.3125, "completions/clipped_ratio": 0.875, "completions/max_length": 1722.0, "completions/max_terminated_length": 1570.0, "completions/mean_length": 1688.3125, "completions/mean_terminated_length": 1452.5, "completions/min_length": 1335.0, "completions/min_terminated_length": 1335.0, "epoch": 0.020741671904462602, "frac_reward_zero_std": 0.25, "grad_norm": 0.0923866257071495, "kl": 0.04577171802520752, "learning_rate": 1.944444444444445e-06, "loss": 0.0018, "num_tokens": 1710331.0, "reward": -6.875, "reward_std": 0.9716878533363342, "rewards/check_answer/mean": -2.0, "rewards/check_answer/std": 0.0, "rewards/check_numbers/mean": 0.25, "rewards/check_numbers/std": 1.3662601709365845, "rewards/format_and_language_reward_func/mean": -3.125, "rewards/format_and_language_reward_func/std": 0.5, "rewards/match_format_approximately/mean": -2.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 0.0, "rewards/match_format_exactly/std": 0.0, "step": 66 }, { "completion_length": 1652.3125, "completions/clipped_ratio": 0.9375, "completions/max_length": 1722.0, "completions/max_terminated_length": 607.0, "completions/mean_length": 1652.3125, "completions/mean_terminated_length": 607.0, "completions/min_length": 607.0, "completions/min_terminated_length": 607.0, "epoch": 0.021055939660590824, "frac_reward_zero_std": 0.25, "grad_norm": 0.13661737740039825, "kl": 0.0424213632941246, "learning_rate": 1.888888888888889e-06, "loss": 0.0017, "num_tokens": 1741012.0, "reward": -7.71875, "reward_std": 0.8185844421386719, "rewards/check_answer/mean": -2.0, "rewards/check_answer/std": 0.0, "rewards/check_numbers/mean": -0.46875, "rewards/check_numbers/std": 0.7180703282356262, "rewards/format_and_language_reward_func/mean": -3.25, "rewards/format_and_language_reward_func/std": 0.6831300854682922, "rewards/match_format_approximately/mean": -2.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 0.0, "rewards/match_format_exactly/std": 0.0, "step": 67 }, { "completion_length": 1409.125, "completions/clipped_ratio": 0.625, "completions/max_length": 1722.0, "completions/max_terminated_length": 1383.0, "completions/mean_length": 1409.125, "completions/mean_terminated_length": 887.6666870117188, "completions/min_length": 636.0, "completions/min_terminated_length": 636.0, "epoch": 0.021370207416719043, "frac_reward_zero_std": 0.0, "grad_norm": 0.15749689936637878, "kl": 0.0645652636885643, "learning_rate": 1.8333333333333333e-06, "loss": 0.0026, "num_tokens": 1767746.0, "reward": -8.125, "reward_std": 1.5154354572296143, "rewards/check_answer/mean": -2.0, "rewards/check_answer/std": 0.0, "rewards/check_numbers/mean": -0.375, "rewards/check_numbers/std": 0.670820415019989, "rewards/format_and_language_reward_func/mean": -3.75, "rewards/format_and_language_reward_func/std": 1.0, "rewards/match_format_approximately/mean": -2.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 0.0, "rewards/match_format_exactly/std": 0.0, "step": 68 }, { "completion_length": 1533.5, "completions/clipped_ratio": 0.6875, "completions/max_length": 1722.0, "completions/max_terminated_length": 1719.0, "completions/mean_length": 1533.5, "completions/mean_terminated_length": 1118.800048828125, "completions/min_length": 381.0, "completions/min_terminated_length": 381.0, "epoch": 0.021684475172847266, "frac_reward_zero_std": 0.25, "grad_norm": 0.10066410154104233, "kl": 0.0493154339492321, "learning_rate": 1.777777777777778e-06, "loss": 0.002, "num_tokens": 1795630.0, "reward": -8.09375, "reward_std": 0.7028881907463074, "rewards/check_answer/mean": -2.0, "rewards/check_answer/std": 0.0, "rewards/check_numbers/mean": -0.84375, "rewards/check_numbers/std": 0.7685213088989258, "rewards/format_and_language_reward_func/mean": -3.25, "rewards/format_and_language_reward_func/std": 0.6831300854682922, "rewards/match_format_approximately/mean": -2.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 0.0, "rewards/match_format_exactly/std": 0.0, "step": 69 }, { "completion_length": 1512.1875, "completions/clipped_ratio": 0.75, "completions/max_length": 1722.0, "completions/max_terminated_length": 1365.0, "completions/mean_length": 1512.1875, "completions/mean_terminated_length": 882.75, "completions/min_length": 438.0, "completions/min_terminated_length": 438.0, "epoch": 0.02199874292897549, "frac_reward_zero_std": 0.5, "grad_norm": 0.08896202594041824, "kl": 0.054354239255189896, "learning_rate": 1.7222222222222224e-06, "loss": 0.0022, "num_tokens": 1823761.0, "reward": -7.6875, "reward_std": 0.46650636196136475, "rewards/check_answer/mean": -2.0, "rewards/check_answer/std": 0.0, "rewards/check_numbers/mean": -0.5625, "rewards/check_numbers/std": 0.75, "rewards/format_and_language_reward_func/mean": -3.125, "rewards/format_and_language_reward_func/std": 0.5, "rewards/match_format_approximately/mean": -2.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 0.0, "rewards/match_format_exactly/std": 0.0, "step": 70 }, { "completion_length": 1664.4375, "completions/clipped_ratio": 0.875, "completions/max_length": 1722.0, "completions/max_terminated_length": 1470.0, "completions/mean_length": 1664.4375, "completions/mean_terminated_length": 1261.5, "completions/min_length": 1053.0, "completions/min_terminated_length": 1053.0, "epoch": 0.022313010685103708, "frac_reward_zero_std": 0.5, "grad_norm": 0.0697084367275238, "kl": 0.031285081058740616, "learning_rate": 1.6666666666666667e-06, "loss": 0.0013, "num_tokens": 1854820.0, "reward": -7.5625, "reward_std": 0.375, "rewards/check_answer/mean": -2.0, "rewards/check_answer/std": 0.0, "rewards/check_numbers/mean": -0.5625, "rewards/check_numbers/std": 0.75, "rewards/format_and_language_reward_func/mean": -3.0, "rewards/format_and_language_reward_func/std": 0.0, "rewards/match_format_approximately/mean": -2.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 0.0, "rewards/match_format_exactly/std": 0.0, "step": 71 }, { "completion_length": 1451.5, "completions/clipped_ratio": 0.625, "completions/max_length": 1722.0, "completions/max_terminated_length": 1602.0, "completions/mean_length": 1451.5, "completions/mean_terminated_length": 1000.6666870117188, "completions/min_length": 462.0, "completions/min_terminated_length": 462.0, "epoch": 0.02262727844123193, "frac_reward_zero_std": 0.0, "grad_norm": 0.1362772285938263, "kl": 0.057278823107481, "learning_rate": 1.6111111111111113e-06, "loss": 0.0023, "num_tokens": 1882008.0, "reward": -7.875, "reward_std": 1.118110179901123, "rewards/check_answer/mean": -2.0, "rewards/check_answer/std": 0.0, "rewards/check_numbers/mean": -0.375, "rewards/check_numbers/std": 0.670820415019989, "rewards/format_and_language_reward_func/mean": -3.5, "rewards/format_and_language_reward_func/std": 0.8944272398948669, "rewards/match_format_approximately/mean": -2.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 0.0, "rewards/match_format_exactly/std": 0.0, "step": 72 }, { "completion_length": 1348.5, "completions/clipped_ratio": 0.5, "completions/max_length": 1722.0, "completions/max_terminated_length": 1501.0, "completions/mean_length": 1348.5, "completions/mean_terminated_length": 975.0, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "epoch": 0.02294154619736015, "frac_reward_zero_std": 0.25, "grad_norm": 0.11800263077020645, "kl": 0.05172204598784447, "learning_rate": 1.5555555555555558e-06, "loss": 0.0021, "num_tokens": 1907992.0, "reward": -7.78125, "reward_std": 0.4375, "rewards/check_answer/mean": -2.0, "rewards/check_answer/std": 0.0, "rewards/check_numbers/mean": -0.65625, "rewards/check_numbers/std": 0.7685213088989258, "rewards/format_and_language_reward_func/mean": -3.125, "rewards/format_and_language_reward_func/std": 0.5, "rewards/match_format_approximately/mean": -2.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 0.0, "rewards/match_format_exactly/std": 0.0, "step": 73 }, { "completion_length": 1543.5625, "completions/clipped_ratio": 0.625, "completions/max_length": 1722.0, "completions/max_terminated_length": 1672.0, "completions/mean_length": 1543.5625, "completions/mean_terminated_length": 1246.166748046875, "completions/min_length": 603.0, "completions/min_terminated_length": 603.0, "epoch": 0.023255813953488372, "frac_reward_zero_std": 0.25, "grad_norm": 0.12982743978500366, "kl": 0.064728744328022, "learning_rate": 1.5e-06, "loss": 0.0026, "num_tokens": 1936533.0, "reward": -7.53125, "reward_std": 0.6926814913749695, "rewards/check_answer/mean": -2.0, "rewards/check_answer/std": 0.0, "rewards/check_numbers/mean": -0.28125, "rewards/check_numbers/std": 0.6046693325042725, "rewards/format_and_language_reward_func/mean": -3.25, "rewards/format_and_language_reward_func/std": 0.6831300854682922, "rewards/match_format_approximately/mean": -2.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 0.0, "rewards/match_format_exactly/std": 0.0, "step": 74 }, { "completion_length": 1450.4375, "completions/clipped_ratio": 0.75, "completions/max_length": 1722.0, "completions/max_terminated_length": 1525.0, "completions/mean_length": 1450.4375, "completions/mean_terminated_length": 635.75, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "epoch": 0.023570081709616594, "frac_reward_zero_std": 0.5, "grad_norm": 0.081117182970047, "kl": 0.05186208337545395, "learning_rate": 1.4444444444444445e-06, "loss": 0.0021, "num_tokens": 1963812.0, "reward": -7.75, "reward_std": 0.5386751294136047, "rewards/check_answer/mean": -2.0, "rewards/check_answer/std": 0.0, "rewards/check_numbers/mean": -0.375, "rewards/check_numbers/std": 0.670820415019989, "rewards/format_and_language_reward_func/mean": -3.375, "rewards/format_and_language_reward_func/std": 0.8062257766723633, "rewards/match_format_approximately/mean": -2.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 0.0, "rewards/match_format_exactly/std": 0.0, "step": 75 }, { "completion_length": 1392.25, "completions/clipped_ratio": 0.6875, "completions/max_length": 1722.0, "completions/max_terminated_length": 1499.0, "completions/mean_length": 1392.25, "completions/mean_terminated_length": 666.7999877929688, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.023884349465744813, "frac_reward_zero_std": 0.25, "grad_norm": 0.37952759861946106, "kl": 0.06186853349208832, "learning_rate": 1.3888888888888892e-06, "loss": 0.0025, "num_tokens": 1990520.0, "reward": -7.65625, "reward_std": 0.6205127239227295, "rewards/check_answer/mean": -2.0, "rewards/check_answer/std": 0.0, "rewards/check_numbers/mean": -0.65625, "rewards/check_numbers/std": 0.7685213088989258, "rewards/format_and_language_reward_func/mean": -3.0, "rewards/format_and_language_reward_func/std": 0.0, "rewards/match_format_approximately/mean": -2.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 0.0, "rewards/match_format_exactly/std": 0.0, "step": 76 }, { "completion_length": 1473.9375, "completions/clipped_ratio": 0.6875, "completions/max_length": 1722.0, "completions/max_terminated_length": 1490.0, "completions/mean_length": 1473.9375, "completions/mean_terminated_length": 928.2000122070312, "completions/min_length": 420.0, "completions/min_terminated_length": 420.0, "epoch": 0.024198617221873036, "frac_reward_zero_std": 0.0, "grad_norm": 0.1677270531654358, "kl": 0.053330324590206146, "learning_rate": 1.3333333333333334e-06, "loss": 0.0021, "num_tokens": 2018063.0, "reward": -7.75, "reward_std": 0.75, "rewards/check_answer/mean": -2.0, "rewards/check_answer/std": 0.0, "rewards/check_numbers/mean": -0.75, "rewards/check_numbers/std": 0.7745966911315918, "rewards/format_and_language_reward_func/mean": -3.0, "rewards/format_and_language_reward_func/std": 0.0, "rewards/match_format_approximately/mean": -2.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 0.0, "rewards/match_format_exactly/std": 0.0, "step": 77 }, { "completion_length": 1513.625, "completions/clipped_ratio": 0.6875, "completions/max_length": 1722.0, "completions/max_terminated_length": 1547.0, "completions/mean_length": 1513.625, "completions/mean_terminated_length": 1055.2000732421875, "completions/min_length": 599.0, "completions/min_terminated_length": 599.0, "epoch": 0.02451288497800126, "frac_reward_zero_std": 0.25, "grad_norm": 0.1181650385260582, "kl": 0.05255034193396568, "learning_rate": 1.2777777777777779e-06, "loss": 0.0021, "num_tokens": 2046037.0, "reward": -7.65625, "reward_std": 0.6926814913749695, "rewards/check_answer/mean": -2.0, "rewards/check_answer/std": 0.0, "rewards/check_numbers/mean": -0.28125, "rewards/check_numbers/std": 0.6046693325042725, "rewards/format_and_language_reward_func/mean": -3.375, "rewards/format_and_language_reward_func/std": 0.8062257766723633, "rewards/match_format_approximately/mean": -2.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 0.0, "rewards/match_format_exactly/std": 0.0, "step": 78 }, { "completion_length": 1411.25, "completions/clipped_ratio": 0.5625, "completions/max_length": 1722.0, "completions/max_terminated_length": 1426.0, "completions/mean_length": 1411.25, "completions/mean_terminated_length": 1011.71435546875, "completions/min_length": 793.0, "completions/min_terminated_length": 793.0, "epoch": 0.024827152734129478, "frac_reward_zero_std": 0.25, "grad_norm": 0.1206207275390625, "kl": 0.05586665868759155, "learning_rate": 1.2222222222222223e-06, "loss": 0.0022, "num_tokens": 2072805.0, "reward": -7.78125, "reward_std": 0.8977102637290955, "rewards/check_answer/mean": -2.0, "rewards/check_answer/std": 0.0, "rewards/check_numbers/mean": -0.28125, "rewards/check_numbers/std": 0.6046693325042725, "rewards/format_and_language_reward_func/mean": -3.5, "rewards/format_and_language_reward_func/std": 0.8944272398948669, "rewards/match_format_approximately/mean": -2.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 0.0, "rewards/match_format_exactly/std": 0.0, "step": 79 }, { "completion_length": 1497.875, "completions/clipped_ratio": 0.75, "completions/max_length": 1722.0, "completions/max_terminated_length": 1400.0, "completions/mean_length": 1497.875, "completions/mean_terminated_length": 825.5, "completions/min_length": 494.0, "completions/min_terminated_length": 494.0, "epoch": 0.0251414204902577, "frac_reward_zero_std": 0.0, "grad_norm": 0.15405312180519104, "kl": 0.05234729126095772, "learning_rate": 1.1666666666666668e-06, "loss": 0.0021, "num_tokens": 2100631.0, "reward": -7.5, "reward_std": 1.9904643297195435, "rewards/check_answer/mean": -2.0, "rewards/check_answer/std": 0.0, "rewards/check_numbers/mean": -0.125, "rewards/check_numbers/std": 1.5864006280899048, "rewards/format_and_language_reward_func/mean": -3.375, "rewards/format_and_language_reward_func/std": 0.8062257766723633, "rewards/match_format_approximately/mean": -2.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 0.0, "rewards/match_format_exactly/std": 0.0, "step": 80 }, { "completion_length": 1425.0, "completions/clipped_ratio": 0.625, "completions/max_length": 1722.0, "completions/max_terminated_length": 1576.0, "completions/mean_length": 1425.0, "completions/mean_terminated_length": 930.0, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "epoch": 0.02545568824638592, "frac_reward_zero_std": 0.25, "grad_norm": 0.11791858822107315, "kl": 0.04198841378092766, "learning_rate": 1.111111111111111e-06, "loss": 0.0017, "num_tokens": 2127275.0, "reward": -7.46875, "reward_std": 0.6205127239227295, "rewards/check_answer/mean": -2.0, "rewards/check_answer/std": 0.0, "rewards/check_numbers/mean": -0.46875, "rewards/check_numbers/std": 0.7180703282356262, "rewards/format_and_language_reward_func/mean": -3.0, "rewards/format_and_language_reward_func/std": 0.0, "rewards/match_format_approximately/mean": -2.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 0.0, "rewards/match_format_exactly/std": 0.0, "step": 81 }, { "completion_length": 1682.5, "completions/clipped_ratio": 0.8125, "completions/max_length": 1722.0, "completions/max_terminated_length": 1662.0, "completions/mean_length": 1682.5, "completions/mean_terminated_length": 1511.3333740234375, "completions/min_length": 1430.0, "completions/min_terminated_length": 1430.0, "epoch": 0.025769956002514142, "frac_reward_zero_std": 0.0, "grad_norm": 0.1258944272994995, "kl": 0.03991963341832161, "learning_rate": 1.0555555555555557e-06, "loss": 0.0016, "num_tokens": 2157831.0, "reward": -7.96875, "reward_std": 0.8125, "rewards/check_answer/mean": -2.0, "rewards/check_answer/std": 0.0, "rewards/check_numbers/mean": -0.84375, "rewards/check_numbers/std": 0.7685213088989258, "rewards/format_and_language_reward_func/mean": -3.125, "rewards/format_and_language_reward_func/std": 0.5, "rewards/match_format_approximately/mean": -2.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 0.0, "rewards/match_format_exactly/std": 0.0, "step": 82 }, { "completion_length": 1560.0, "completions/clipped_ratio": 0.8125, "completions/max_length": 1722.0, "completions/max_terminated_length": 1502.0, "completions/mean_length": 1560.0, "completions/mean_terminated_length": 858.0, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "epoch": 0.026084223758642364, "frac_reward_zero_std": 0.25, "grad_norm": 0.25008276104927063, "kl": 0.04483529552817345, "learning_rate": 1.0000000000000002e-06, "loss": 0.0018, "num_tokens": 2186663.0, "reward": -7.5625, "reward_std": 0.5915063619613647, "rewards/check_answer/mean": -2.0, "rewards/check_answer/std": 0.0, "rewards/check_numbers/mean": -0.5625, "rewards/check_numbers/std": 0.75, "rewards/format_and_language_reward_func/mean": -3.0, "rewards/format_and_language_reward_func/std": 0.0, "rewards/match_format_approximately/mean": -2.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 0.0, "rewards/match_format_exactly/std": 0.0, "step": 83 }, { "completion_length": 1363.8125, "completions/clipped_ratio": 0.625, "completions/max_length": 1722.0, "completions/max_terminated_length": 890.0, "completions/mean_length": 1363.8125, "completions/mean_terminated_length": 766.8333740234375, "completions/min_length": 623.0, "completions/min_terminated_length": 623.0, "epoch": 0.026398491514770583, "frac_reward_zero_std": 0.25, "grad_norm": 0.11882040649652481, "kl": 0.04816675931215286, "learning_rate": 9.444444444444445e-07, "loss": 0.0019, "num_tokens": 2212692.0, "reward": -7.78125, "reward_std": 0.6540063619613647, "rewards/check_answer/mean": -2.0, "rewards/check_answer/std": 0.0, "rewards/check_numbers/mean": -0.65625, "rewards/check_numbers/std": 0.7685213088989258, "rewards/format_and_language_reward_func/mean": -3.125, "rewards/format_and_language_reward_func/std": 0.5, "rewards/match_format_approximately/mean": -2.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 0.0, "rewards/match_format_exactly/std": 0.0, "step": 84 }, { "completion_length": 1652.3125, "completions/clipped_ratio": 0.8125, "completions/max_length": 1722.0, "completions/max_terminated_length": 1646.0, "completions/mean_length": 1652.3125, "completions/mean_terminated_length": 1350.3333740234375, "completions/min_length": 1045.0, "completions/min_terminated_length": 1045.0, "epoch": 0.026712759270898806, "frac_reward_zero_std": 0.0, "grad_norm": 0.11614203453063965, "kl": 0.0454854890704155, "learning_rate": 8.88888888888889e-07, "loss": 0.0018, "num_tokens": 2243069.0, "reward": -8.0, "reward_std": 0.9117004871368408, "rewards/check_answer/mean": -2.0, "rewards/check_answer/std": 0.0, "rewards/check_numbers/mean": -0.75, "rewards/check_numbers/std": 0.7745966911315918, "rewards/format_and_language_reward_func/mean": -3.25, "rewards/format_and_language_reward_func/std": 0.6831300854682922, "rewards/match_format_approximately/mean": -2.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 0.0, "rewards/match_format_exactly/std": 0.0, "step": 85 }, { "completion_length": 1576.625, "completions/clipped_ratio": 0.75, "completions/max_length": 1722.0, "completions/max_terminated_length": 1560.0, "completions/mean_length": 1576.625, "completions/mean_terminated_length": 1140.5, "completions/min_length": 801.0, "completions/min_terminated_length": 801.0, "epoch": 0.02702702702702703, "frac_reward_zero_std": 0.25, "grad_norm": 0.09951747953891754, "kl": 0.04343428835272789, "learning_rate": 8.333333333333333e-07, "loss": 0.0017, "num_tokens": 2272123.0, "reward": -7.4375, "reward_std": 0.9503755569458008, "rewards/check_answer/mean": -2.0, "rewards/check_answer/std": 0.0, "rewards/check_numbers/mean": -0.3125, "rewards/check_numbers/std": 1.6520190238952637, "rewards/format_and_language_reward_func/mean": -3.125, "rewards/format_and_language_reward_func/std": 0.5, "rewards/match_format_approximately/mean": -2.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 0.0, "rewards/match_format_exactly/std": 0.0, "step": 86 }, { "completion_length": 1552.3125, "completions/clipped_ratio": 0.8125, "completions/max_length": 1722.0, "completions/max_terminated_length": 1073.0, "completions/mean_length": 1552.3125, "completions/mean_terminated_length": 817.0, "completions/min_length": 665.0, "completions/min_terminated_length": 665.0, "epoch": 0.027341294783155248, "frac_reward_zero_std": 0.0, "grad_norm": 0.1279648393392563, "kl": 0.04489857330918312, "learning_rate": 7.777777777777779e-07, "loss": 0.0018, "num_tokens": 2300748.0, "reward": -7.96875, "reward_std": 0.9505414962768555, "rewards/check_answer/mean": -2.0, "rewards/check_answer/std": 0.0, "rewards/check_numbers/mean": -0.84375, "rewards/check_numbers/std": 0.7685213088989258, "rewards/format_and_language_reward_func/mean": -3.125, "rewards/format_and_language_reward_func/std": 0.5, "rewards/match_format_approximately/mean": -2.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 0.0, "rewards/match_format_exactly/std": 0.0, "step": 87 }, { "completion_length": 1594.375, "completions/clipped_ratio": 0.8125, "completions/max_length": 1722.0, "completions/max_terminated_length": 1438.0, "completions/mean_length": 1594.375, "completions/mean_terminated_length": 1041.3333740234375, "completions/min_length": 780.0, "completions/min_terminated_length": 780.0, "epoch": 0.02765556253928347, "frac_reward_zero_std": 0.0, "grad_norm": 0.18847358226776123, "kl": 0.03859600052237511, "learning_rate": 7.222222222222222e-07, "loss": 0.0015, "num_tokens": 2330330.0, "reward": -7.96875, "reward_std": 0.8125, "rewards/check_answer/mean": -2.0, "rewards/check_answer/std": 0.0, "rewards/check_numbers/mean": -0.84375, "rewards/check_numbers/std": 0.7685213088989258, "rewards/format_and_language_reward_func/mean": -3.125, "rewards/format_and_language_reward_func/std": 0.5, "rewards/match_format_approximately/mean": -2.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 0.0, "rewards/match_format_exactly/std": 0.0, "step": 88 }, { "completion_length": 1683.1875, "completions/clipped_ratio": 0.875, "completions/max_length": 1722.0, "completions/max_terminated_length": 1547.0, "completions/mean_length": 1683.1875, "completions/mean_terminated_length": 1411.5, "completions/min_length": 1276.0, "completions/min_terminated_length": 1276.0, "epoch": 0.02796983029541169, "frac_reward_zero_std": 0.0, "grad_norm": 0.10217194259166718, "kl": 0.044257864356040955, "learning_rate": 6.666666666666667e-07, "loss": 0.0018, "num_tokens": 2361789.0, "reward": -7.6875, "reward_std": 1.0060844421386719, "rewards/check_answer/mean": -2.0, "rewards/check_answer/std": 0.0, "rewards/check_numbers/mean": -0.5625, "rewards/check_numbers/std": 0.75, "rewards/format_and_language_reward_func/mean": -3.125, "rewards/format_and_language_reward_func/std": 0.5, "rewards/match_format_approximately/mean": -2.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 0.0, "rewards/match_format_exactly/std": 0.0, "step": 89 }, { "completion_length": 1638.3125, "completions/clipped_ratio": 0.8125, "completions/max_length": 1722.0, "completions/max_terminated_length": 1547.0, "completions/mean_length": 1638.3125, "completions/mean_terminated_length": 1275.666748046875, "completions/min_length": 1068.0, "completions/min_terminated_length": 1068.0, "epoch": 0.028284098051539912, "frac_reward_zero_std": 0.0, "grad_norm": 0.6536535620689392, "kl": 0.07140226662158966, "learning_rate": 6.111111111111112e-07, "loss": 0.0029, "num_tokens": 2392042.0, "reward": -7.53125, "reward_std": 0.9040063619613647, "rewards/check_answer/mean": -2.0, "rewards/check_answer/std": 0.0, "rewards/check_numbers/mean": -0.28125, "rewards/check_numbers/std": 0.6046693325042725, "rewards/format_and_language_reward_func/mean": -3.25, "rewards/format_and_language_reward_func/std": 0.6831300854682922, "rewards/match_format_approximately/mean": -2.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 0.0, "rewards/match_format_exactly/std": 0.0, "step": 90 }, { "completion_length": 1547.75, "completions/clipped_ratio": 0.75, "completions/max_length": 1722.0, "completions/max_terminated_length": 1282.0, "completions/mean_length": 1547.75, "completions/mean_terminated_length": 1025.0, "completions/min_length": 848.0, "completions/min_terminated_length": 848.0, "epoch": 0.028598365807668134, "frac_reward_zero_std": 0.0, "grad_norm": 0.12291199713945389, "kl": 0.05366697907447815, "learning_rate": 5.555555555555555e-07, "loss": 0.0021, "num_tokens": 2420674.0, "reward": -7.78125, "reward_std": 0.8705127239227295, "rewards/check_answer/mean": -2.0, "rewards/check_answer/std": 0.0, "rewards/check_numbers/mean": -0.65625, "rewards/check_numbers/std": 0.7685213088989258, "rewards/format_and_language_reward_func/mean": -3.125, "rewards/format_and_language_reward_func/std": 0.5, "rewards/match_format_approximately/mean": -2.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 0.0, "rewards/match_format_exactly/std": 0.0, "step": 91 }, { "completion_length": 1515.5625, "completions/clipped_ratio": 0.625, "completions/max_length": 1722.0, "completions/max_terminated_length": 1680.0, "completions/mean_length": 1515.5625, "completions/mean_terminated_length": 1171.5, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "epoch": 0.028912633563796353, "frac_reward_zero_std": 0.25, "grad_norm": 0.11394577473402023, "kl": 0.04896366223692894, "learning_rate": 5.000000000000001e-07, "loss": 0.002, "num_tokens": 2448871.0, "reward": -6.71875, "reward_std": 0.8125, "rewards/check_answer/mean": -2.0, "rewards/check_answer/std": 0.0, "rewards/check_numbers/mean": 0.28125, "rewards/check_numbers/std": 1.7220990657806396, "rewards/format_and_language_reward_func/mean": -3.0, "rewards/format_and_language_reward_func/std": 0.0, "rewards/match_format_approximately/mean": -2.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 0.0, "rewards/match_format_exactly/std": 0.0, "step": 92 }, { "completion_length": 1545.1875, "completions/clipped_ratio": 0.625, "completions/max_length": 1722.0, "completions/max_terminated_length": 1656.0, "completions/mean_length": 1545.1875, "completions/mean_terminated_length": 1250.5, "completions/min_length": 537.0, "completions/min_terminated_length": 537.0, "epoch": 0.029226901319924576, "frac_reward_zero_std": 0.5, "grad_norm": 0.07623685151338577, "kl": 0.046294040977954865, "learning_rate": 4.444444444444445e-07, "loss": 0.0019, "num_tokens": 2477226.0, "reward": -7.5625, "reward_std": 0.375, "rewards/check_answer/mean": -2.0, "rewards/check_answer/std": 0.0, "rewards/check_numbers/mean": -0.5625, "rewards/check_numbers/std": 0.75, "rewards/format_and_language_reward_func/mean": -3.0, "rewards/format_and_language_reward_func/std": 0.0, "rewards/match_format_approximately/mean": -2.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 0.0, "rewards/match_format_exactly/std": 0.0, "step": 93 }, { "completion_length": 1588.75, "completions/clipped_ratio": 0.75, "completions/max_length": 1722.0, "completions/max_terminated_length": 1556.0, "completions/mean_length": 1588.75, "completions/mean_terminated_length": 1189.0, "completions/min_length": 405.0, "completions/min_terminated_length": 405.0, "epoch": 0.0295411690760528, "frac_reward_zero_std": 0.25, "grad_norm": 0.11318648606538773, "kl": 0.04230440780520439, "learning_rate": 3.8888888888888895e-07, "loss": 0.0017, "num_tokens": 2506806.0, "reward": -7.75, "reward_std": 0.5915063619613647, "rewards/check_answer/mean": -2.0, "rewards/check_answer/std": 0.0, "rewards/check_numbers/mean": -0.75, "rewards/check_numbers/std": 0.7745966911315918, "rewards/format_and_language_reward_func/mean": -3.0, "rewards/format_and_language_reward_func/std": 0.0, "rewards/match_format_approximately/mean": -2.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 0.0, "rewards/match_format_exactly/std": 0.0, "step": 94 }, { "completion_length": 1620.875, "completions/clipped_ratio": 0.875, "completions/max_length": 1722.0, "completions/max_terminated_length": 987.0, "completions/mean_length": 1620.875, "completions/mean_terminated_length": 913.0, "completions/min_length": 839.0, "completions/min_terminated_length": 839.0, "epoch": 0.029855436832181018, "frac_reward_zero_std": 0.25, "grad_norm": 0.11543525010347366, "kl": 0.0602000392973423, "learning_rate": 3.3333333333333335e-07, "loss": 0.0024, "num_tokens": 2536500.0, "reward": -7.96875, "reward_std": 0.6540063619613647, "rewards/check_answer/mean": -2.0, "rewards/check_answer/std": 0.0, "rewards/check_numbers/mean": -0.84375, "rewards/check_numbers/std": 0.7685213088989258, "rewards/format_and_language_reward_func/mean": -3.125, "rewards/format_and_language_reward_func/std": 0.5, "rewards/match_format_approximately/mean": -2.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 0.0, "rewards/match_format_exactly/std": 0.0, "step": 95 }, { "completion_length": 1623.375, "completions/clipped_ratio": 0.75, "completions/max_length": 1722.0, "completions/max_terminated_length": 1687.0, "completions/mean_length": 1623.375, "completions/mean_terminated_length": 1327.5, "completions/min_length": 743.0, "completions/min_terminated_length": 743.0, "epoch": 0.03016970458830924, "frac_reward_zero_std": 0.25, "grad_norm": 0.11837535351514816, "kl": 0.04759529232978821, "learning_rate": 2.7777777777777776e-07, "loss": 0.0019, "num_tokens": 2566598.0, "reward": -8.1875, "reward_std": 0.7165063619613647, "rewards/check_answer/mean": -2.0, "rewards/check_answer/std": 0.0, "rewards/check_numbers/mean": -0.9375, "rewards/check_numbers/std": 0.75, "rewards/format_and_language_reward_func/mean": -3.25, "rewards/format_and_language_reward_func/std": 0.6831300854682922, "rewards/match_format_approximately/mean": -2.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 0.0, "rewards/match_format_exactly/std": 0.0, "step": 96 }, { "completion_length": 1617.0625, "completions/clipped_ratio": 0.75, "completions/max_length": 1722.0, "completions/max_terminated_length": 1545.0, "completions/mean_length": 1617.0625, "completions/mean_terminated_length": 1302.25, "completions/min_length": 868.0, "completions/min_terminated_length": 868.0, "epoch": 0.03048397234443746, "frac_reward_zero_std": 0.25, "grad_norm": 0.11196030676364899, "kl": 0.04893610253930092, "learning_rate": 2.2222222222222224e-07, "loss": 0.002, "num_tokens": 2596579.0, "reward": -7.4375, "reward_std": 0.7165063619613647, "rewards/check_answer/mean": -2.0, "rewards/check_answer/std": 0.0, "rewards/check_numbers/mean": -0.1875, "rewards/check_numbers/std": 0.5123475790023804, "rewards/format_and_language_reward_func/mean": -3.25, "rewards/format_and_language_reward_func/std": 0.6831300854682922, "rewards/match_format_approximately/mean": -2.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 0.0, "rewards/match_format_exactly/std": 0.0, "step": 97 }, { "completion_length": 1362.0625, "completions/clipped_ratio": 0.5625, "completions/max_length": 1722.0, "completions/max_terminated_length": 1402.0, "completions/mean_length": 1362.0625, "completions/mean_terminated_length": 899.2857666015625, "completions/min_length": 364.0, "completions/min_terminated_length": 364.0, "epoch": 0.030798240100565682, "frac_reward_zero_std": 0.5, "grad_norm": 0.2802422046661377, "kl": 0.07202958315610886, "learning_rate": 1.6666666666666668e-07, "loss": 0.0029, "num_tokens": 2622492.0, "reward": -7.375, "reward_std": 0.4330126941204071, "rewards/check_answer/mean": -2.0, "rewards/check_answer/std": 0.0, "rewards/check_numbers/mean": -0.375, "rewards/check_numbers/std": 0.670820415019989, "rewards/format_and_language_reward_func/mean": -3.0, "rewards/format_and_language_reward_func/std": 0.0, "rewards/match_format_approximately/mean": -2.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 0.0, "rewards/match_format_exactly/std": 0.0, "step": 98 }, { "completion_length": 1529.75, "completions/clipped_ratio": 0.75, "completions/max_length": 1722.0, "completions/max_terminated_length": 1474.0, "completions/mean_length": 1529.75, "completions/mean_terminated_length": 953.0, "completions/min_length": 371.0, "completions/min_terminated_length": 371.0, "epoch": 0.031112507856693904, "frac_reward_zero_std": 0.25, "grad_norm": 0.27167099714279175, "kl": 0.0705300122499466, "learning_rate": 1.1111111111111112e-07, "loss": 0.0028, "num_tokens": 2650764.0, "reward": -7.40625, "reward_std": 0.6540063619613647, "rewards/check_answer/mean": -2.0, "rewards/check_answer/std": 0.0, "rewards/check_numbers/mean": -0.28125, "rewards/check_numbers/std": 0.6046693325042725, "rewards/format_and_language_reward_func/mean": -3.125, "rewards/format_and_language_reward_func/std": 0.5, "rewards/match_format_approximately/mean": -2.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 0.0, "rewards/match_format_exactly/std": 0.0, "step": 99 }, { "completion_length": 1459.75, "completions/clipped_ratio": 0.6875, "completions/max_length": 1722.0, "completions/max_terminated_length": 1133.0, "completions/mean_length": 1459.75, "completions/mean_terminated_length": 882.7999877929688, "completions/min_length": 436.0, "completions/min_terminated_length": 436.0, "epoch": 0.03142677561282212, "frac_reward_zero_std": 0.0, "grad_norm": 0.11008132994174957, "kl": 0.04374002292752266, "learning_rate": 5.555555555555556e-08, "loss": 0.0017, "num_tokens": 2678396.0, "reward": -7.90625, "reward_std": 1.233162522315979, "rewards/check_answer/mean": -2.0, "rewards/check_answer/std": 0.0, "rewards/check_numbers/mean": -0.65625, "rewards/check_numbers/std": 0.7685213088989258, "rewards/format_and_language_reward_func/mean": -3.25, "rewards/format_and_language_reward_func/std": 0.6831300854682922, "rewards/match_format_approximately/mean": -2.0, "rewards/match_format_approximately/std": 0.0, "rewards/match_format_exactly/mean": 0.0, "rewards/match_format_exactly/std": 0.0, "step": 100 } ], "logging_steps": 1, "max_steps": 100, "num_input_tokens_seen": 2678396, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }