{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9874476987447699, "eval_steps": 500, "global_step": 59, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.016736401673640166, "grad_norm": 80.91779463089912, "learning_rate": 1.6666666666666664e-08, "logits/chosen": -2.3851256370544434, "logits/rejected": -2.345982551574707, "logps/chosen": -261.423828125, "logps/pi_response": -141.3343505859375, "logps/ref_response": -141.3343505859375, "logps/rejected": -573.3737182617188, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.16736401673640167, "grad_norm": 95.19369287284003, "learning_rate": 9.860114570402053e-08, "logits/chosen": -2.3706295490264893, "logits/rejected": -2.278637170791626, "logps/chosen": -276.3284912109375, "logps/pi_response": -117.53726959228516, "logps/ref_response": -117.3475341796875, "logps/rejected": -554.368896484375, "loss": 0.6838, "rewards/accuracies": 0.5694444179534912, "rewards/chosen": -0.009727651253342628, "rewards/margins": 0.017089376226067543, "rewards/rejected": -0.02681702747941017, "step": 10 }, { "epoch": 0.33472803347280333, "grad_norm": 52.082836046752725, "learning_rate": 8.374915007591053e-08, "logits/chosen": -2.313706874847412, "logits/rejected": -2.2545723915100098, "logps/chosen": -285.1987609863281, "logps/pi_response": -125.8084487915039, "logps/ref_response": -121.66993713378906, "logps/rejected": -595.0728149414062, "loss": 0.6049, "rewards/accuracies": 0.721875011920929, "rewards/chosen": -0.19213858246803284, "rewards/margins": 0.2317531853914261, "rewards/rejected": -0.42389172315597534, "step": 20 }, { "epoch": 0.502092050209205, "grad_norm": 31.318197186196873, "learning_rate": 5.738232820012406e-08, "logits/chosen": -2.254267692565918, "logits/rejected": -2.1841464042663574, "logps/chosen": -307.58837890625, "logps/pi_response": -121.324462890625, "logps/ref_response": -112.60211181640625, "logps/rejected": -647.4771118164062, "loss": 0.5489, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.3839130699634552, "rewards/margins": 0.5251675248146057, "rewards/rejected": -0.9090806245803833, "step": 30 }, { "epoch": 0.6694560669456067, "grad_norm": 59.9245918733432, "learning_rate": 2.8496739886173992e-08, "logits/chosen": -2.241121768951416, "logits/rejected": -2.150895118713379, "logps/chosen": -325.35955810546875, "logps/pi_response": -138.71681213378906, "logps/ref_response": -126.43278503417969, "logps/rejected": -717.0718994140625, "loss": 0.4975, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -0.5242354869842529, "rewards/margins": 0.901803195476532, "rewards/rejected": -1.4260386228561401, "step": 40 }, { "epoch": 0.8368200836820083, "grad_norm": 82.26785400421726, "learning_rate": 6.947819411632222e-09, "logits/chosen": -2.2152085304260254, "logits/rejected": -2.1557888984680176, "logps/chosen": -362.69854736328125, "logps/pi_response": -137.07763671875, "logps/ref_response": -123.94889831542969, "logps/rejected": -755.274169921875, "loss": 0.5106, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.6341878771781921, "rewards/margins": 1.0056912899017334, "rewards/rejected": -1.6398792266845703, "step": 50 }, { "epoch": 0.9874476987447699, "step": 59, "total_flos": 0.0, "train_loss": 0.5600805605872202, "train_runtime": 2606.3275, "train_samples_per_second": 5.864, "train_steps_per_second": 0.023 } ], "logging_steps": 10, "max_steps": 59, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }