{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.9905213270142181, "eval_steps": 100, "global_step": 210, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01, "grad_norm": 177.85299255533252, "learning_rate": 5e-09, "logits/chosen": 129.0, "logits/rejected": 125.5, "logps/chosen": -428.0, "logps/rejected": -470.0, "loss": 0.6914, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.09, "grad_norm": 171.69218216908772, "learning_rate": 5e-08, "logits/chosen": 125.5, "logits/rejected": 133.0, "logps/chosen": -414.0, "logps/rejected": -448.0, "loss": 0.7199, "rewards/accuracies": 0.3888888955116272, "rewards/chosen": -0.017822265625, "rewards/margins": -0.0498046875, "rewards/rejected": 0.031982421875, "step": 10 }, { "epoch": 0.19, "grad_norm": 180.59129253006714, "learning_rate": 1e-07, "logits/chosen": 136.0, "logits/rejected": 136.0, "logps/chosen": -400.0, "logps/rejected": -430.0, "loss": 0.7187, "rewards/accuracies": 0.4437499940395355, "rewards/chosen": -0.0242919921875, "rewards/margins": -0.02197265625, "rewards/rejected": -0.0023956298828125, "step": 20 }, { "epoch": 0.28, "grad_norm": 171.658251860157, "learning_rate": 1.5e-07, "logits/chosen": 130.0, "logits/rejected": 130.0, "logps/chosen": -374.0, "logps/rejected": -388.0, "loss": 0.7132, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": 0.09521484375, "rewards/margins": 0.004791259765625, "rewards/rejected": 0.09033203125, "step": 30 }, { "epoch": 0.38, "grad_norm": 170.37244697891163, "learning_rate": 2e-07, "logits/chosen": 114.5, "logits/rejected": 128.0, "logps/chosen": -336.0, "logps/rejected": -428.0, "loss": 0.659, "rewards/accuracies": 0.625, "rewards/chosen": 0.30859375, "rewards/margins": 0.23828125, "rewards/rejected": 0.0712890625, "step": 40 }, { "epoch": 0.47, "grad_norm": 156.01063486797565, "learning_rate": 2.5e-07, "logits/chosen": 124.5, "logits/rejected": 124.0, "logps/chosen": -372.0, "logps/rejected": -388.0, "loss": 0.6405, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.173828125, "rewards/margins": 0.318359375, "rewards/rejected": -0.14453125, "step": 50 }, { "epoch": 0.57, "grad_norm": 191.09116619653446, "learning_rate": 3e-07, "logits/chosen": 116.0, "logits/rejected": 124.0, "logps/chosen": -342.0, "logps/rejected": -416.0, "loss": 0.5833, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.0191650390625, "rewards/margins": 0.67578125, "rewards/rejected": -0.69140625, "step": 60 }, { "epoch": 0.66, "grad_norm": 155.04958273291422, "learning_rate": 3.5e-07, "logits/chosen": 122.0, "logits/rejected": 117.5, "logps/chosen": -386.0, "logps/rejected": -394.0, "loss": 0.5745, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.09375, "rewards/margins": 0.79296875, "rewards/rejected": -0.88671875, "step": 70 }, { "epoch": 0.76, "grad_norm": 142.1796149100647, "learning_rate": 4e-07, "logits/chosen": 116.5, "logits/rejected": 116.5, "logps/chosen": -338.0, "logps/rejected": -406.0, "loss": 0.537, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.33203125, "rewards/margins": 0.96484375, "rewards/rejected": -1.296875, "step": 80 }, { "epoch": 0.85, "grad_norm": 199.30217047660693, "learning_rate": 4.5e-07, "logits/chosen": 130.0, "logits/rejected": 132.0, "logps/chosen": -412.0, "logps/rejected": -444.0, "loss": 0.4894, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.8515625, "rewards/margins": 1.0078125, "rewards/rejected": -1.859375, "step": 90 }, { "epoch": 0.95, "grad_norm": 130.24358680507126, "learning_rate": 5e-07, "logits/chosen": 117.5, "logits/rejected": 126.5, "logps/chosen": -392.0, "logps/rejected": -490.0, "loss": 0.5324, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.98828125, "rewards/margins": 1.2578125, "rewards/rejected": -2.25, "step": 100 }, { "epoch": 0.95, "eval_logits/chosen": 92.0, "eval_logits/rejected": 94.0, "eval_logps/chosen": -376.0, "eval_logps/rejected": -456.0, "eval_loss": 0.4851927161216736, "eval_rewards/accuracies": 0.7340425252914429, "eval_rewards/chosen": -1.1640625, "eval_rewards/margins": 1.3046875, "eval_rewards/rejected": -2.46875, "eval_runtime": 130.6909, "eval_samples_per_second": 5.739, "eval_steps_per_second": 0.36, "step": 100 }, { "epoch": 1.04, "grad_norm": 73.74185148689295, "learning_rate": 4.898732434036243e-07, "logits/chosen": 119.5, "logits/rejected": 114.5, "logps/chosen": -406.0, "logps/rejected": -458.0, "loss": 0.3895, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -1.109375, "rewards/margins": 1.6953125, "rewards/rejected": -2.8125, "step": 110 }, { "epoch": 1.14, "grad_norm": 77.9102123166481, "learning_rate": 4.603133832077953e-07, "logits/chosen": 113.5, "logits/rejected": 122.0, "logps/chosen": -352.0, "logps/rejected": -470.0, "loss": 0.2308, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -0.04150390625, "rewards/margins": 2.859375, "rewards/rejected": -2.90625, "step": 120 }, { "epoch": 1.23, "grad_norm": 72.1155658512914, "learning_rate": 4.137151834863213e-07, "logits/chosen": 107.0, "logits/rejected": 108.5, "logps/chosen": -392.0, "logps/rejected": -462.0, "loss": 0.2175, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.68359375, "rewards/margins": 2.796875, "rewards/rejected": -3.46875, "step": 130 }, { "epoch": 1.33, "grad_norm": 83.89189869364462, "learning_rate": 3.5385375325047163e-07, "logits/chosen": 116.0, "logits/rejected": 123.0, "logps/chosen": -396.0, "logps/rejected": -510.0, "loss": 0.2066, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -1.6875, "rewards/margins": 2.796875, "rewards/rejected": -4.46875, "step": 140 }, { "epoch": 1.42, "grad_norm": 67.05775374408921, "learning_rate": 2.8557870956832133e-07, "logits/chosen": 104.5, "logits/rejected": 107.5, "logps/chosen": -390.0, "logps/rejected": -490.0, "loss": 0.1569, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -1.484375, "rewards/margins": 3.25, "rewards/rejected": -4.71875, "step": 150 }, { "epoch": 1.52, "grad_norm": 91.02831609820949, "learning_rate": 2.1442129043167873e-07, "logits/chosen": 101.5, "logits/rejected": 121.0, "logps/chosen": -394.0, "logps/rejected": -506.0, "loss": 0.1956, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -1.28125, "rewards/margins": 3.671875, "rewards/rejected": -4.9375, "step": 160 }, { "epoch": 1.61, "grad_norm": 78.82758233414538, "learning_rate": 1.461462467495284e-07, "logits/chosen": 109.5, "logits/rejected": 110.5, "logps/chosen": -406.0, "logps/rejected": -476.0, "loss": 0.1575, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -0.9375, "rewards/margins": 3.484375, "rewards/rejected": -4.4375, "step": 170 }, { "epoch": 1.71, "grad_norm": 95.52959889189601, "learning_rate": 8.628481651367875e-08, "logits/chosen": 105.0, "logits/rejected": 117.0, "logps/chosen": -406.0, "logps/rejected": -536.0, "loss": 0.1574, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.83203125, "rewards/margins": 3.3125, "rewards/rejected": -4.15625, "step": 180 }, { "epoch": 1.8, "grad_norm": 67.11817234106277, "learning_rate": 3.968661679220467e-08, "logits/chosen": 114.0, "logits/rejected": 115.0, "logps/chosen": -414.0, "logps/rejected": -512.0, "loss": 0.1866, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -0.8125, "rewards/margins": 3.5625, "rewards/rejected": -4.375, "step": 190 }, { "epoch": 1.9, "grad_norm": 83.00810869226768, "learning_rate": 1.0126756596375685e-08, "logits/chosen": 96.5, "logits/rejected": 104.0, "logps/chosen": -384.0, "logps/rejected": -464.0, "loss": 0.1822, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -1.265625, "rewards/margins": 3.125, "rewards/rejected": -4.375, "step": 200 }, { "epoch": 1.9, "eval_logits/chosen": 82.0, "eval_logits/rejected": 84.0, "eval_logps/chosen": -388.0, "eval_logps/rejected": -480.0, "eval_loss": 0.4446233808994293, "eval_rewards/accuracies": 0.7446808218955994, "eval_rewards/chosen": -1.796875, "eval_rewards/margins": 1.8359375, "eval_rewards/rejected": -3.640625, "eval_runtime": 133.5481, "eval_samples_per_second": 5.616, "eval_steps_per_second": 0.352, "step": 200 }, { "epoch": 1.99, "grad_norm": 47.68700617963266, "learning_rate": 0.0, "logits/chosen": 96.5, "logits/rejected": 98.5, "logps/chosen": -390.0, "logps/rejected": -446.0, "loss": 0.2176, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -1.140625, "rewards/margins": 3.125, "rewards/rejected": -4.28125, "step": 210 }, { "epoch": 1.99, "step": 210, "total_flos": 0.0, "train_loss": 0.4030106709116981, "train_runtime": 4928.2992, "train_samples_per_second": 2.739, "train_steps_per_second": 0.043 } ], "logging_steps": 10, "max_steps": 210, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }