yiran-wang3 committed on
Commit
23a0146
1 Parent(s): 4eec409

End of training

Browse files
Files changed (6) hide show
  1. README.md +64 -0
  2. all_results.json +9 -0
  3. config.json +1 -1
  4. generation_config.json +14 -0
  5. train_results.json +9 -0
  6. trainer_state.json +1113 -0
README.md ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: transformers
3
+ license: apache-2.0
4
+ base_model: Qwen/Qwen2.5-Coder-1.5B-Instruct
5
+ tags:
6
+ - alignment-handbook
7
+ - generated_from_trainer
8
+ - trl
9
+ - dpo
10
+ datasets:
11
+ - self-generate/qwcoder2_original_cn_mining_oj_iter0-binarized
12
+ model-index:
13
+ - name: qwen2_coder_adamw_iter1
14
+ results: []
15
+ ---
16
+
17
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
18
+ should probably proofread and complete it, then remove this comment. -->
19
+
20
+ # qwen2_coder_adamw_iter1
21
+
22
+ This model is a fine-tuned version of [Qwen/Qwen2.5-Coder-1.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-Coder-1.5B-Instruct) on the self-generate/qwcoder2_original_cn_mining_oj_iter0-binarized dataset.
23
+
24
+ ## Model description
25
+
26
+ More information needed
27
+
28
+ ## Intended uses & limitations
29
+
30
+ More information needed
31
+
32
+ ## Training and evaluation data
33
+
34
+ More information needed
35
+
36
+ ## Training procedure
37
+
38
+ ### Training hyperparameters
39
+
40
+ The following hyperparameters were used during training:
41
+ - learning_rate: 1e-06
42
+ - train_batch_size: 8
43
+ - eval_batch_size: 4
44
+ - seed: 42
45
+ - distributed_type: multi-GPU
46
+ - num_devices: 8
47
+ - total_train_batch_size: 64
48
+ - total_eval_batch_size: 32
49
+ - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
50
+ - lr_scheduler_type: constant
51
+ - lr_scheduler_warmup_ratio: 0.1
52
+ - lr_scheduler_warmup_steps: 100
53
+ - num_epochs: 1.0
54
+
55
+ ### Training results
56
+
57
+
58
+
59
+ ### Framework versions
60
+
61
+ - Transformers 4.45.0
62
+ - Pytorch 2.4.0+cu121
63
+ - Datasets 2.14.6
64
+ - Tokenizers 0.20.1
all_results.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 1.0,
3
+ "total_flos": 0.0,
4
+ "train_loss": 0.26999635322421206,
5
+ "train_runtime": 166.8574,
6
+ "train_samples": 3222,
7
+ "train_samples_per_second": 19.31,
8
+ "train_steps_per_second": 0.306
9
+ }
config.json CHANGED
@@ -23,7 +23,7 @@
23
  "tie_word_embeddings": true,
24
  "torch_dtype": "bfloat16",
25
  "transformers_version": "4.45.0",
26
- "use_cache": false,
27
  "use_sliding_window": false,
28
  "vocab_size": 151936
29
  }
 
23
  "tie_word_embeddings": true,
24
  "torch_dtype": "bfloat16",
25
  "transformers_version": "4.45.0",
26
+ "use_cache": true,
27
  "use_sliding_window": false,
28
  "vocab_size": 151936
29
  }
generation_config.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 151643,
3
+ "do_sample": true,
4
+ "eos_token_id": [
5
+ 151645,
6
+ 151643
7
+ ],
8
+ "pad_token_id": 151643,
9
+ "repetition_penalty": 1.1,
10
+ "temperature": 0.7,
11
+ "top_k": 20,
12
+ "top_p": 0.8,
13
+ "transformers_version": "4.45.0"
14
+ }
train_results.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 1.0,
3
+ "total_flos": 0.0,
4
+ "train_loss": 0.26999635322421206,
5
+ "train_runtime": 166.8574,
6
+ "train_samples": 3222,
7
+ "train_samples_per_second": 19.31,
8
+ "train_steps_per_second": 0.306
9
+ }
trainer_state.json ADDED
@@ -0,0 +1,1113 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 1.0,
5
+ "eval_steps": 100,
6
+ "global_step": 51,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "debug/policy_chosen_logits": -3.131476879119873,
13
+ "debug/policy_chosen_logps": -223.49798583984375,
14
+ "debug/policy_rejected_logits": -3.0218234062194824,
15
+ "debug/policy_rejected_logps": -181.94036865234375,
16
+ "debug/reference_chosen_logps": -223.49798583984375,
17
+ "debug/reference_rejected_logps": -181.94036865234375,
18
+ "epoch": 0.0196078431372549,
19
+ "grad_norm": 9.59268936350444,
20
+ "learning_rate": 1e-06,
21
+ "logits/chosen": -3.131476879119873,
22
+ "logits/rejected": -3.0218234062194824,
23
+ "logps/chosen": -223.49798583984375,
24
+ "logps/rejected": -181.94036865234375,
25
+ "loss": 0.5,
26
+ "rewards/accuracies": 0.0,
27
+ "rewards/chosen": 0.0,
28
+ "rewards/margins": 0.0,
29
+ "rewards/rejected": 0.0,
30
+ "step": 1
31
+ },
32
+ {
33
+ "debug/policy_chosen_logits": -3.1443662643432617,
34
+ "debug/policy_chosen_logps": -209.216552734375,
35
+ "debug/policy_rejected_logits": -3.076768159866333,
36
+ "debug/policy_rejected_logps": -170.2884521484375,
37
+ "debug/reference_chosen_logps": -209.07872009277344,
38
+ "debug/reference_rejected_logps": -169.68731689453125,
39
+ "epoch": 0.0392156862745098,
40
+ "grad_norm": 8.71018223332819,
41
+ "learning_rate": 1e-06,
42
+ "logits/chosen": -3.1443662643432617,
43
+ "logits/rejected": -3.076768159866333,
44
+ "logps/chosen": -209.216552734375,
45
+ "logps/rejected": -170.2884521484375,
46
+ "loss": 0.4974,
47
+ "rewards/accuracies": 0.875,
48
+ "rewards/chosen": -0.001378459855914116,
49
+ "rewards/margins": 0.004632873460650444,
50
+ "rewards/rejected": -0.00601133331656456,
51
+ "step": 2
52
+ },
53
+ {
54
+ "debug/policy_chosen_logits": -3.135432243347168,
55
+ "debug/policy_chosen_logps": -203.98123168945312,
56
+ "debug/policy_rejected_logits": -3.058173179626465,
57
+ "debug/policy_rejected_logps": -171.8382568359375,
58
+ "debug/reference_chosen_logps": -206.17086791992188,
59
+ "debug/reference_rejected_logps": -172.7147216796875,
60
+ "epoch": 0.058823529411764705,
61
+ "grad_norm": 11.448652871780954,
62
+ "learning_rate": 1e-06,
63
+ "logits/chosen": -3.135432243347168,
64
+ "logits/rejected": -3.058173179626465,
65
+ "logps/chosen": -203.98123168945312,
66
+ "logps/rejected": -171.8382568359375,
67
+ "loss": 0.4898,
68
+ "rewards/accuracies": 1.0,
69
+ "rewards/chosen": 0.02189634181559086,
70
+ "rewards/margins": 0.013131675310432911,
71
+ "rewards/rejected": 0.008764667436480522,
72
+ "step": 3
73
+ },
74
+ {
75
+ "debug/policy_chosen_logits": -3.0565543174743652,
76
+ "debug/policy_chosen_logps": -206.8490447998047,
77
+ "debug/policy_rejected_logits": -2.9550375938415527,
78
+ "debug/policy_rejected_logps": -174.601318359375,
79
+ "debug/reference_chosen_logps": -206.09422302246094,
80
+ "debug/reference_rejected_logps": -172.06332397460938,
81
+ "epoch": 0.0784313725490196,
82
+ "grad_norm": 8.681717474563778,
83
+ "learning_rate": 1e-06,
84
+ "logits/chosen": -3.0565543174743652,
85
+ "logits/rejected": -2.9550375938415527,
86
+ "logps/chosen": -206.8490447998047,
87
+ "logps/rejected": -174.601318359375,
88
+ "loss": 0.4855,
89
+ "rewards/accuracies": 0.75,
90
+ "rewards/chosen": -0.007548102643340826,
91
+ "rewards/margins": 0.017831895500421524,
92
+ "rewards/rejected": -0.025379998609423637,
93
+ "step": 4
94
+ },
95
+ {
96
+ "debug/policy_chosen_logits": -3.0399742126464844,
97
+ "debug/policy_chosen_logps": -206.2752227783203,
98
+ "debug/policy_rejected_logits": -2.9232938289642334,
99
+ "debug/policy_rejected_logps": -156.70419311523438,
100
+ "debug/reference_chosen_logps": -205.56788635253906,
101
+ "debug/reference_rejected_logps": -150.99920654296875,
102
+ "epoch": 0.09803921568627451,
103
+ "grad_norm": 8.125347499138313,
104
+ "learning_rate": 1e-06,
105
+ "logits/chosen": -3.0399742126464844,
106
+ "logits/rejected": -2.9232938289642334,
107
+ "logps/chosen": -206.2752227783203,
108
+ "logps/rejected": -156.70419311523438,
109
+ "loss": 0.4648,
110
+ "rewards/accuracies": 1.0,
111
+ "rewards/chosen": -0.00707334466278553,
112
+ "rewards/margins": 0.04997648298740387,
113
+ "rewards/rejected": -0.05704982578754425,
114
+ "step": 5
115
+ },
116
+ {
117
+ "debug/policy_chosen_logits": -3.2029356956481934,
118
+ "debug/policy_chosen_logps": -206.69122314453125,
119
+ "debug/policy_rejected_logits": -3.122507095336914,
120
+ "debug/policy_rejected_logps": -199.9852752685547,
121
+ "debug/reference_chosen_logps": -210.3363494873047,
122
+ "debug/reference_rejected_logps": -200.11654663085938,
123
+ "epoch": 0.11764705882352941,
124
+ "grad_norm": 10.6811806285762,
125
+ "learning_rate": 1e-06,
126
+ "logits/chosen": -3.2029356956481934,
127
+ "logits/rejected": -3.122507095336914,
128
+ "logps/chosen": -206.69122314453125,
129
+ "logps/rejected": -199.9852752685547,
130
+ "loss": 0.4523,
131
+ "rewards/accuracies": 1.0,
132
+ "rewards/chosen": 0.03645110875368118,
133
+ "rewards/margins": 0.03513820841908455,
134
+ "rewards/rejected": 0.0013129040598869324,
135
+ "step": 6
136
+ },
137
+ {
138
+ "debug/policy_chosen_logits": -3.202118396759033,
139
+ "debug/policy_chosen_logps": -198.13885498046875,
140
+ "debug/policy_rejected_logits": -2.9787750244140625,
141
+ "debug/policy_rejected_logps": -169.29666137695312,
142
+ "debug/reference_chosen_logps": -199.79531860351562,
143
+ "debug/reference_rejected_logps": -164.98037719726562,
144
+ "epoch": 0.13725490196078433,
145
+ "grad_norm": 7.511392268060962,
146
+ "learning_rate": 1e-06,
147
+ "logits/chosen": -3.202118396759033,
148
+ "logits/rejected": -2.9787750244140625,
149
+ "logps/chosen": -198.13885498046875,
150
+ "logps/rejected": -169.29666137695312,
151
+ "loss": 0.4439,
152
+ "rewards/accuracies": 0.875,
153
+ "rewards/chosen": 0.01656484603881836,
154
+ "rewards/margins": 0.05972753465175629,
155
+ "rewards/rejected": -0.04316268861293793,
156
+ "step": 7
157
+ },
158
+ {
159
+ "debug/policy_chosen_logits": -3.1585052013397217,
160
+ "debug/policy_chosen_logps": -209.9014892578125,
161
+ "debug/policy_rejected_logits": -3.147646427154541,
162
+ "debug/policy_rejected_logps": -184.0531005859375,
163
+ "debug/reference_chosen_logps": -213.74742126464844,
164
+ "debug/reference_rejected_logps": -184.40182495117188,
165
+ "epoch": 0.1568627450980392,
166
+ "grad_norm": 9.384905399946895,
167
+ "learning_rate": 1e-06,
168
+ "logits/chosen": -3.1585052013397217,
169
+ "logits/rejected": -3.147646427154541,
170
+ "logps/chosen": -209.9014892578125,
171
+ "logps/rejected": -184.0531005859375,
172
+ "loss": 0.4553,
173
+ "rewards/accuracies": 0.75,
174
+ "rewards/chosen": 0.03845922276377678,
175
+ "rewards/margins": 0.03497195988893509,
176
+ "rewards/rejected": 0.00348726287484169,
177
+ "step": 8
178
+ },
179
+ {
180
+ "debug/policy_chosen_logits": -3.219722032546997,
181
+ "debug/policy_chosen_logps": -204.11474609375,
182
+ "debug/policy_rejected_logits": -3.0252864360809326,
183
+ "debug/policy_rejected_logps": -165.35894775390625,
184
+ "debug/reference_chosen_logps": -207.07034301757812,
185
+ "debug/reference_rejected_logps": -154.7605438232422,
186
+ "epoch": 0.17647058823529413,
187
+ "grad_norm": 6.128726988165846,
188
+ "learning_rate": 1e-06,
189
+ "logits/chosen": -3.219722032546997,
190
+ "logits/rejected": -3.0252864360809326,
191
+ "logps/chosen": -204.11474609375,
192
+ "logps/rejected": -165.35894775390625,
193
+ "loss": 0.4068,
194
+ "rewards/accuracies": 1.0,
195
+ "rewards/chosen": 0.02955583482980728,
196
+ "rewards/margins": 0.1355399787425995,
197
+ "rewards/rejected": -0.1059841513633728,
198
+ "step": 9
199
+ },
200
+ {
201
+ "debug/policy_chosen_logits": -3.3743577003479004,
202
+ "debug/policy_chosen_logps": -182.16360473632812,
203
+ "debug/policy_rejected_logits": -2.9649839401245117,
204
+ "debug/policy_rejected_logps": -173.23670959472656,
205
+ "debug/reference_chosen_logps": -190.21826171875,
206
+ "debug/reference_rejected_logps": -162.2075958251953,
207
+ "epoch": 0.19607843137254902,
208
+ "grad_norm": 6.364236944929035,
209
+ "learning_rate": 1e-06,
210
+ "logits/chosen": -3.3743577003479004,
211
+ "logits/rejected": -2.9649839401245117,
212
+ "logps/chosen": -182.16360473632812,
213
+ "logps/rejected": -173.23670959472656,
214
+ "loss": 0.3573,
215
+ "rewards/accuracies": 1.0,
216
+ "rewards/chosen": 0.08054651319980621,
217
+ "rewards/margins": 0.19083772599697113,
218
+ "rewards/rejected": -0.11029121279716492,
219
+ "step": 10
220
+ },
221
+ {
222
+ "debug/policy_chosen_logits": -3.173790454864502,
223
+ "debug/policy_chosen_logps": -203.52264404296875,
224
+ "debug/policy_rejected_logits": -2.949070692062378,
225
+ "debug/policy_rejected_logps": -178.64117431640625,
226
+ "debug/reference_chosen_logps": -212.055908203125,
227
+ "debug/reference_rejected_logps": -169.71719360351562,
228
+ "epoch": 0.21568627450980393,
229
+ "grad_norm": 5.353556279413172,
230
+ "learning_rate": 1e-06,
231
+ "logits/chosen": -3.173790454864502,
232
+ "logits/rejected": -2.949070692062378,
233
+ "logps/chosen": -203.52264404296875,
234
+ "logps/rejected": -178.64117431640625,
235
+ "loss": 0.3634,
236
+ "rewards/accuracies": 0.875,
237
+ "rewards/chosen": 0.08533257991075516,
238
+ "rewards/margins": 0.1745723932981491,
239
+ "rewards/rejected": -0.08923980593681335,
240
+ "step": 11
241
+ },
242
+ {
243
+ "debug/policy_chosen_logits": -3.277130126953125,
244
+ "debug/policy_chosen_logps": -199.06358337402344,
245
+ "debug/policy_rejected_logits": -3.132725954055786,
246
+ "debug/policy_rejected_logps": -179.83506774902344,
247
+ "debug/reference_chosen_logps": -208.1540985107422,
248
+ "debug/reference_rejected_logps": -177.0548095703125,
249
+ "epoch": 0.23529411764705882,
250
+ "grad_norm": 5.672030188679155,
251
+ "learning_rate": 1e-06,
252
+ "logits/chosen": -3.277130126953125,
253
+ "logits/rejected": -3.132725954055786,
254
+ "logps/chosen": -199.06358337402344,
255
+ "logps/rejected": -179.83506774902344,
256
+ "loss": 0.3502,
257
+ "rewards/accuracies": 0.875,
258
+ "rewards/chosen": 0.09090512990951538,
259
+ "rewards/margins": 0.11870768666267395,
260
+ "rewards/rejected": -0.027802541851997375,
261
+ "step": 12
262
+ },
263
+ {
264
+ "debug/policy_chosen_logits": -3.1242167949676514,
265
+ "debug/policy_chosen_logps": -191.25042724609375,
266
+ "debug/policy_rejected_logits": -2.9863691329956055,
267
+ "debug/policy_rejected_logps": -175.88369750976562,
268
+ "debug/reference_chosen_logps": -196.898193359375,
269
+ "debug/reference_rejected_logps": -166.8399658203125,
270
+ "epoch": 0.2549019607843137,
271
+ "grad_norm": 5.205320381513838,
272
+ "learning_rate": 1e-06,
273
+ "logits/chosen": -3.1242167949676514,
274
+ "logits/rejected": -2.9863691329956055,
275
+ "logps/chosen": -191.25042724609375,
276
+ "logps/rejected": -175.88369750976562,
277
+ "loss": 0.37,
278
+ "rewards/accuracies": 0.75,
279
+ "rewards/chosen": 0.056477658450603485,
280
+ "rewards/margins": 0.14691495895385742,
281
+ "rewards/rejected": -0.09043729305267334,
282
+ "step": 13
283
+ },
284
+ {
285
+ "debug/policy_chosen_logits": -3.144406318664551,
286
+ "debug/policy_chosen_logps": -218.8365478515625,
287
+ "debug/policy_rejected_logits": -2.8921687602996826,
288
+ "debug/policy_rejected_logps": -180.90426635742188,
289
+ "debug/reference_chosen_logps": -229.2056884765625,
290
+ "debug/reference_rejected_logps": -170.25051879882812,
291
+ "epoch": 0.27450980392156865,
292
+ "grad_norm": 5.1865412600469725,
293
+ "learning_rate": 1e-06,
294
+ "logits/chosen": -3.144406318664551,
295
+ "logits/rejected": -2.8921687602996826,
296
+ "logps/chosen": -218.8365478515625,
297
+ "logps/rejected": -180.90426635742188,
298
+ "loss": 0.3456,
299
+ "rewards/accuracies": 1.0,
300
+ "rewards/chosen": 0.10369150340557098,
301
+ "rewards/margins": 0.21022894978523254,
302
+ "rewards/rejected": -0.10653746128082275,
303
+ "step": 14
304
+ },
305
+ {
306
+ "debug/policy_chosen_logits": -3.2248146533966064,
307
+ "debug/policy_chosen_logps": -201.5963134765625,
308
+ "debug/policy_rejected_logits": -2.995159149169922,
309
+ "debug/policy_rejected_logps": -184.41653442382812,
310
+ "debug/reference_chosen_logps": -210.53994750976562,
311
+ "debug/reference_rejected_logps": -177.02459716796875,
312
+ "epoch": 0.29411764705882354,
313
+ "grad_norm": 5.281953187102778,
314
+ "learning_rate": 1e-06,
315
+ "logits/chosen": -3.2248146533966064,
316
+ "logits/rejected": -2.995159149169922,
317
+ "logps/chosen": -201.5963134765625,
318
+ "logps/rejected": -184.41653442382812,
319
+ "loss": 0.3298,
320
+ "rewards/accuracies": 0.75,
321
+ "rewards/chosen": 0.08943626284599304,
322
+ "rewards/margins": 0.163355752825737,
323
+ "rewards/rejected": -0.07391948252916336,
324
+ "step": 15
325
+ },
326
+ {
327
+ "debug/policy_chosen_logits": -3.152334213256836,
328
+ "debug/policy_chosen_logps": -203.98538208007812,
329
+ "debug/policy_rejected_logits": -3.0059425830841064,
330
+ "debug/policy_rejected_logps": -174.17031860351562,
331
+ "debug/reference_chosen_logps": -212.34693908691406,
332
+ "debug/reference_rejected_logps": -161.72183227539062,
333
+ "epoch": 0.3137254901960784,
334
+ "grad_norm": 4.990441768681764,
335
+ "learning_rate": 1e-06,
336
+ "logits/chosen": -3.152334213256836,
337
+ "logits/rejected": -3.0059425830841064,
338
+ "logps/chosen": -203.98538208007812,
339
+ "logps/rejected": -174.17031860351562,
340
+ "loss": 0.3315,
341
+ "rewards/accuracies": 1.0,
342
+ "rewards/chosen": 0.08361560851335526,
343
+ "rewards/margins": 0.2081003040075302,
344
+ "rewards/rejected": -0.12448470294475555,
345
+ "step": 16
346
+ },
347
+ {
348
+ "debug/policy_chosen_logits": -3.3237087726593018,
349
+ "debug/policy_chosen_logps": -197.6831512451172,
350
+ "debug/policy_rejected_logits": -3.1152756214141846,
351
+ "debug/policy_rejected_logps": -185.88690185546875,
352
+ "debug/reference_chosen_logps": -209.38027954101562,
353
+ "debug/reference_rejected_logps": -171.8429412841797,
354
+ "epoch": 0.3333333333333333,
355
+ "grad_norm": 3.847170282499073,
356
+ "learning_rate": 1e-06,
357
+ "logits/chosen": -3.3237087726593018,
358
+ "logits/rejected": -3.1152756214141846,
359
+ "logps/chosen": -197.6831512451172,
360
+ "logps/rejected": -185.88690185546875,
361
+ "loss": 0.3262,
362
+ "rewards/accuracies": 0.625,
363
+ "rewards/chosen": 0.11697111278772354,
364
+ "rewards/margins": 0.25741061568260193,
365
+ "rewards/rejected": -0.14043951034545898,
366
+ "step": 17
367
+ },
368
+ {
369
+ "debug/policy_chosen_logits": -3.423741102218628,
370
+ "debug/policy_chosen_logps": -197.50125122070312,
371
+ "debug/policy_rejected_logits": -3.0541563034057617,
372
+ "debug/policy_rejected_logps": -194.04254150390625,
373
+ "debug/reference_chosen_logps": -214.143310546875,
374
+ "debug/reference_rejected_logps": -176.38931274414062,
375
+ "epoch": 0.35294117647058826,
376
+ "grad_norm": 4.6640459530058145,
377
+ "learning_rate": 1e-06,
378
+ "logits/chosen": -3.423741102218628,
379
+ "logits/rejected": -3.0541563034057617,
380
+ "logps/chosen": -197.50125122070312,
381
+ "logps/rejected": -194.04254150390625,
382
+ "loss": 0.2745,
383
+ "rewards/accuracies": 0.875,
384
+ "rewards/chosen": 0.16642045974731445,
385
+ "rewards/margins": 0.3429526686668396,
386
+ "rewards/rejected": -0.17653217911720276,
387
+ "step": 18
388
+ },
389
+ {
390
+ "debug/policy_chosen_logits": -3.1585283279418945,
391
+ "debug/policy_chosen_logps": -205.2724151611328,
392
+ "debug/policy_rejected_logits": -2.9066219329833984,
393
+ "debug/policy_rejected_logps": -190.3876953125,
394
+ "debug/reference_chosen_logps": -227.2172088623047,
395
+ "debug/reference_rejected_logps": -159.18490600585938,
396
+ "epoch": 0.37254901960784315,
397
+ "grad_norm": 4.373436851253389,
398
+ "learning_rate": 1e-06,
399
+ "logits/chosen": -3.1585283279418945,
400
+ "logits/rejected": -2.9066219329833984,
401
+ "logps/chosen": -205.2724151611328,
402
+ "logps/rejected": -190.3876953125,
403
+ "loss": 0.2051,
404
+ "rewards/accuracies": 0.875,
405
+ "rewards/chosen": 0.2194480001926422,
406
+ "rewards/margins": 0.5314759016036987,
407
+ "rewards/rejected": -0.3120279312133789,
408
+ "step": 19
409
+ },
410
+ {
411
+ "debug/policy_chosen_logits": -3.3925397396087646,
412
+ "debug/policy_chosen_logps": -182.79222106933594,
413
+ "debug/policy_rejected_logits": -2.9614052772521973,
414
+ "debug/policy_rejected_logps": -192.5382843017578,
415
+ "debug/reference_chosen_logps": -208.93617248535156,
416
+ "debug/reference_rejected_logps": -170.37913513183594,
417
+ "epoch": 0.39215686274509803,
418
+ "grad_norm": 7.041418694876974,
419
+ "learning_rate": 1e-06,
420
+ "logits/chosen": -3.3925397396087646,
421
+ "logits/rejected": -2.9614052772521973,
422
+ "logps/chosen": -182.79222106933594,
423
+ "logps/rejected": -192.5382843017578,
424
+ "loss": 0.2688,
425
+ "rewards/accuracies": 0.75,
426
+ "rewards/chosen": 0.2614395320415497,
427
+ "rewards/margins": 0.48303085565567017,
428
+ "rewards/rejected": -0.22159132361412048,
429
+ "step": 20
430
+ },
431
+ {
432
+ "debug/policy_chosen_logits": -3.3081858158111572,
433
+ "debug/policy_chosen_logps": -170.76687622070312,
434
+ "debug/policy_rejected_logits": -2.9337339401245117,
435
+ "debug/policy_rejected_logps": -190.08706665039062,
436
+ "debug/reference_chosen_logps": -194.40255737304688,
437
+ "debug/reference_rejected_logps": -161.28668212890625,
438
+ "epoch": 0.4117647058823529,
439
+ "grad_norm": 5.913800772738065,
440
+ "learning_rate": 1e-06,
441
+ "logits/chosen": -3.3081858158111572,
442
+ "logits/rejected": -2.9337339401245117,
443
+ "logps/chosen": -170.76687622070312,
444
+ "logps/rejected": -190.08706665039062,
445
+ "loss": 0.2346,
446
+ "rewards/accuracies": 0.875,
447
+ "rewards/chosen": 0.2363569736480713,
448
+ "rewards/margins": 0.5243606567382812,
449
+ "rewards/rejected": -0.28800368309020996,
450
+ "step": 21
451
+ },
452
+ {
453
+ "debug/policy_chosen_logits": -3.262423276901245,
454
+ "debug/policy_chosen_logps": -174.46621704101562,
455
+ "debug/policy_rejected_logits": -2.9751062393188477,
456
+ "debug/policy_rejected_logps": -194.79879760742188,
457
+ "debug/reference_chosen_logps": -195.777587890625,
458
+ "debug/reference_rejected_logps": -166.50228881835938,
459
+ "epoch": 0.43137254901960786,
460
+ "grad_norm": 3.6878540395392996,
461
+ "learning_rate": 1e-06,
462
+ "logits/chosen": -3.262423276901245,
463
+ "logits/rejected": -2.9751062393188477,
464
+ "logps/chosen": -174.46621704101562,
465
+ "logps/rejected": -194.79879760742188,
466
+ "loss": 0.1804,
467
+ "rewards/accuracies": 0.75,
468
+ "rewards/chosen": 0.21311378479003906,
469
+ "rewards/margins": 0.4960786700248718,
470
+ "rewards/rejected": -0.28296488523483276,
471
+ "step": 22
472
+ },
473
+ {
474
+ "debug/policy_chosen_logits": -3.284288167953491,
475
+ "debug/policy_chosen_logps": -201.29129028320312,
476
+ "debug/policy_rejected_logits": -3.055243730545044,
477
+ "debug/policy_rejected_logps": -189.35101318359375,
478
+ "debug/reference_chosen_logps": -227.23031616210938,
479
+ "debug/reference_rejected_logps": -170.58262634277344,
480
+ "epoch": 0.45098039215686275,
481
+ "grad_norm": 3.4160494981935057,
482
+ "learning_rate": 1e-06,
483
+ "logits/chosen": -3.284288167953491,
484
+ "logits/rejected": -3.055243730545044,
485
+ "logps/chosen": -201.29129028320312,
486
+ "logps/rejected": -189.35101318359375,
487
+ "loss": 0.2467,
488
+ "rewards/accuracies": 0.75,
489
+ "rewards/chosen": 0.2593901753425598,
490
+ "rewards/margins": 0.44707390666007996,
491
+ "rewards/rejected": -0.18768377602100372,
492
+ "step": 23
493
+ },
494
+ {
495
+ "debug/policy_chosen_logits": -3.3582632541656494,
496
+ "debug/policy_chosen_logps": -180.80130004882812,
497
+ "debug/policy_rejected_logits": -2.9310052394866943,
498
+ "debug/policy_rejected_logps": -198.58038330078125,
499
+ "debug/reference_chosen_logps": -208.0858917236328,
500
+ "debug/reference_rejected_logps": -156.55642700195312,
501
+ "epoch": 0.47058823529411764,
502
+ "grad_norm": 4.3584453143571515,
503
+ "learning_rate": 1e-06,
504
+ "logits/chosen": -3.3582632541656494,
505
+ "logits/rejected": -2.9310052394866943,
506
+ "logps/chosen": -180.80130004882812,
507
+ "logps/rejected": -198.58038330078125,
508
+ "loss": 0.1894,
509
+ "rewards/accuracies": 0.875,
510
+ "rewards/chosen": 0.2728460729122162,
511
+ "rewards/margins": 0.6930855512619019,
512
+ "rewards/rejected": -0.42023950815200806,
513
+ "step": 24
514
+ },
515
+ {
516
+ "debug/policy_chosen_logits": -3.294647455215454,
517
+ "debug/policy_chosen_logps": -200.66555786132812,
518
+ "debug/policy_rejected_logits": -2.9623827934265137,
519
+ "debug/policy_rejected_logps": -209.60763549804688,
520
+ "debug/reference_chosen_logps": -226.37498474121094,
521
+ "debug/reference_rejected_logps": -175.77857971191406,
522
+ "epoch": 0.49019607843137253,
523
+ "grad_norm": 5.190786042631128,
524
+ "learning_rate": 1e-06,
525
+ "logits/chosen": -3.294647455215454,
526
+ "logits/rejected": -2.9623827934265137,
527
+ "logps/chosen": -200.66555786132812,
528
+ "logps/rejected": -209.60763549804688,
529
+ "loss": 0.2067,
530
+ "rewards/accuracies": 0.875,
531
+ "rewards/chosen": 0.257094144821167,
532
+ "rewards/margins": 0.5953845977783203,
533
+ "rewards/rejected": -0.3382904529571533,
534
+ "step": 25
535
+ },
536
+ {
537
+ "debug/policy_chosen_logits": -3.303532123565674,
538
+ "debug/policy_chosen_logps": -172.21371459960938,
539
+ "debug/policy_rejected_logits": -3.037935495376587,
540
+ "debug/policy_rejected_logps": -192.28829956054688,
541
+ "debug/reference_chosen_logps": -197.53707885742188,
542
+ "debug/reference_rejected_logps": -173.1179656982422,
543
+ "epoch": 0.5098039215686274,
544
+ "grad_norm": 3.424028522129365,
545
+ "learning_rate": 1e-06,
546
+ "logits/chosen": -3.303532123565674,
547
+ "logits/rejected": -3.037935495376587,
548
+ "logps/chosen": -172.21371459960938,
549
+ "logps/rejected": -192.28829956054688,
550
+ "loss": 0.2072,
551
+ "rewards/accuracies": 0.75,
552
+ "rewards/chosen": 0.25323352217674255,
553
+ "rewards/margins": 0.44493675231933594,
554
+ "rewards/rejected": -0.19170325994491577,
555
+ "step": 26
556
+ },
557
+ {
558
+ "debug/policy_chosen_logits": -3.2078473567962646,
559
+ "debug/policy_chosen_logps": -185.5386199951172,
560
+ "debug/policy_rejected_logits": -3.058324098587036,
561
+ "debug/policy_rejected_logps": -181.49124145507812,
562
+ "debug/reference_chosen_logps": -216.10214233398438,
563
+ "debug/reference_rejected_logps": -166.07806396484375,
564
+ "epoch": 0.5294117647058824,
565
+ "grad_norm": 3.2542715272608427,
566
+ "learning_rate": 1e-06,
567
+ "logits/chosen": -3.2078473567962646,
568
+ "logits/rejected": -3.058324098587036,
569
+ "logps/chosen": -185.5386199951172,
570
+ "logps/rejected": -181.49124145507812,
571
+ "loss": 0.1898,
572
+ "rewards/accuracies": 0.75,
573
+ "rewards/chosen": 0.3056352138519287,
574
+ "rewards/margins": 0.45976707339286804,
575
+ "rewards/rejected": -0.15413185954093933,
576
+ "step": 27
577
+ },
578
+ {
579
+ "debug/policy_chosen_logits": -3.1834847927093506,
580
+ "debug/policy_chosen_logps": -189.82064819335938,
581
+ "debug/policy_rejected_logits": -3.0259480476379395,
582
+ "debug/policy_rejected_logps": -192.5255126953125,
583
+ "debug/reference_chosen_logps": -225.47145080566406,
584
+ "debug/reference_rejected_logps": -174.45132446289062,
585
+ "epoch": 0.5490196078431373,
586
+ "grad_norm": 5.011972117551199,
587
+ "learning_rate": 1e-06,
588
+ "logits/chosen": -3.1834847927093506,
589
+ "logits/rejected": -3.0259480476379395,
590
+ "logps/chosen": -189.82064819335938,
591
+ "logps/rejected": -192.5255126953125,
592
+ "loss": 0.2206,
593
+ "rewards/accuracies": 0.75,
594
+ "rewards/chosen": 0.3565079867839813,
595
+ "rewards/margins": 0.5372498631477356,
596
+ "rewards/rejected": -0.18074187636375427,
597
+ "step": 28
598
+ },
599
+ {
600
+ "debug/policy_chosen_logits": -3.439246416091919,
601
+ "debug/policy_chosen_logps": -161.62721252441406,
602
+ "debug/policy_rejected_logits": -2.9180073738098145,
603
+ "debug/policy_rejected_logps": -201.15945434570312,
604
+ "debug/reference_chosen_logps": -197.59341430664062,
605
+ "debug/reference_rejected_logps": -162.81692504882812,
606
+ "epoch": 0.5686274509803921,
607
+ "grad_norm": 2.4188165120482044,
608
+ "learning_rate": 1e-06,
609
+ "logits/chosen": -3.439246416091919,
610
+ "logits/rejected": -2.9180073738098145,
611
+ "logps/chosen": -161.62721252441406,
612
+ "logps/rejected": -201.15945434570312,
613
+ "loss": 0.1577,
614
+ "rewards/accuracies": 0.875,
615
+ "rewards/chosen": 0.35966184735298157,
616
+ "rewards/margins": 0.7430870532989502,
617
+ "rewards/rejected": -0.38342520594596863,
618
+ "step": 29
619
+ },
620
+ {
621
+ "debug/policy_chosen_logits": -3.3921971321105957,
622
+ "debug/policy_chosen_logps": -167.86477661132812,
623
+ "debug/policy_rejected_logits": -3.1083662509918213,
624
+ "debug/policy_rejected_logps": -186.5076141357422,
625
+ "debug/reference_chosen_logps": -202.75399780273438,
626
+ "debug/reference_rejected_logps": -165.93167114257812,
627
+ "epoch": 0.5882352941176471,
628
+ "grad_norm": 5.111153930194959,
629
+ "learning_rate": 1e-06,
630
+ "logits/chosen": -3.3921971321105957,
631
+ "logits/rejected": -3.1083662509918213,
632
+ "logps/chosen": -167.86477661132812,
633
+ "logps/rejected": -186.5076141357422,
634
+ "loss": 0.2254,
635
+ "rewards/accuracies": 0.875,
636
+ "rewards/chosen": 0.3488922417163849,
637
+ "rewards/margins": 0.5546516180038452,
638
+ "rewards/rejected": -0.20575937628746033,
639
+ "step": 30
640
+ },
641
+ {
642
+ "debug/policy_chosen_logits": -3.296841621398926,
643
+ "debug/policy_chosen_logps": -186.3404541015625,
644
+ "debug/policy_rejected_logits": -3.0427982807159424,
645
+ "debug/policy_rejected_logps": -189.97247314453125,
646
+ "debug/reference_chosen_logps": -225.02894592285156,
647
+ "debug/reference_rejected_logps": -176.0540771484375,
648
+ "epoch": 0.6078431372549019,
649
+ "grad_norm": 5.753832158358882,
650
+ "learning_rate": 1e-06,
651
+ "logits/chosen": -3.296841621398926,
652
+ "logits/rejected": -3.0427982807159424,
653
+ "logps/chosen": -186.3404541015625,
654
+ "logps/rejected": -189.97247314453125,
655
+ "loss": 0.2514,
656
+ "rewards/accuracies": 1.0,
657
+ "rewards/chosen": 0.3868849575519562,
658
+ "rewards/margins": 0.5260686874389648,
659
+ "rewards/rejected": -0.13918372988700867,
660
+ "step": 31
661
+ },
662
+ {
663
+ "debug/policy_chosen_logits": -3.241013765335083,
664
+ "debug/policy_chosen_logps": -183.28062438964844,
665
+ "debug/policy_rejected_logits": -3.004754066467285,
666
+ "debug/policy_rejected_logps": -206.055419921875,
667
+ "debug/reference_chosen_logps": -215.31048583984375,
668
+ "debug/reference_rejected_logps": -162.4297637939453,
669
+ "epoch": 0.6274509803921569,
670
+ "grad_norm": 2.9239290358904837,
671
+ "learning_rate": 1e-06,
672
+ "logits/chosen": -3.241013765335083,
673
+ "logits/rejected": -3.004754066467285,
674
+ "logps/chosen": -183.28062438964844,
675
+ "logps/rejected": -206.055419921875,
676
+ "loss": 0.1757,
677
+ "rewards/accuracies": 1.0,
678
+ "rewards/chosen": 0.32029855251312256,
679
+ "rewards/margins": 0.7565551996231079,
680
+ "rewards/rejected": -0.43625661730766296,
681
+ "step": 32
682
+ },
683
+ {
684
+ "debug/policy_chosen_logits": -3.3382985591888428,
685
+ "debug/policy_chosen_logps": -180.47259521484375,
686
+ "debug/policy_rejected_logits": -3.019533395767212,
687
+ "debug/policy_rejected_logps": -206.90989685058594,
688
+ "debug/reference_chosen_logps": -214.73683166503906,
689
+ "debug/reference_rejected_logps": -155.3048095703125,
690
+ "epoch": 0.6470588235294118,
691
+ "grad_norm": 3.2393949291557003,
692
+ "learning_rate": 1e-06,
693
+ "logits/chosen": -3.3382985591888428,
694
+ "logits/rejected": -3.019533395767212,
695
+ "logps/chosen": -180.47259521484375,
696
+ "logps/rejected": -206.90989685058594,
697
+ "loss": 0.2186,
698
+ "rewards/accuracies": 1.0,
699
+ "rewards/chosen": 0.3426423668861389,
700
+ "rewards/margins": 0.8586931228637695,
701
+ "rewards/rejected": -0.5160508155822754,
702
+ "step": 33
703
+ },
704
+ {
705
+ "debug/policy_chosen_logits": -3.25339412689209,
706
+ "debug/policy_chosen_logps": -177.0988006591797,
707
+ "debug/policy_rejected_logits": -2.986337900161743,
708
+ "debug/policy_rejected_logps": -201.41714477539062,
709
+ "debug/reference_chosen_logps": -204.36630249023438,
710
+ "debug/reference_rejected_logps": -173.47714233398438,
711
+ "epoch": 0.6666666666666666,
712
+ "grad_norm": 4.241231231386345,
713
+ "learning_rate": 1e-06,
714
+ "logits/chosen": -3.25339412689209,
715
+ "logits/rejected": -2.986337900161743,
716
+ "logps/chosen": -177.0988006591797,
717
+ "logps/rejected": -201.41714477539062,
718
+ "loss": 0.1937,
719
+ "rewards/accuracies": 0.875,
720
+ "rewards/chosen": 0.2726749777793884,
721
+ "rewards/margins": 0.5520750284194946,
722
+ "rewards/rejected": -0.2794000804424286,
723
+ "step": 34
724
+ },
725
+ {
726
+ "debug/policy_chosen_logits": -3.251969337463379,
727
+ "debug/policy_chosen_logps": -190.11709594726562,
728
+ "debug/policy_rejected_logits": -3.0741655826568604,
729
+ "debug/policy_rejected_logps": -194.2032470703125,
730
+ "debug/reference_chosen_logps": -217.17962646484375,
731
+ "debug/reference_rejected_logps": -160.67848205566406,
732
+ "epoch": 0.6862745098039216,
733
+ "grad_norm": 4.744228039712321,
734
+ "learning_rate": 1e-06,
735
+ "logits/chosen": -3.251969337463379,
736
+ "logits/rejected": -3.0741655826568604,
737
+ "logps/chosen": -190.11709594726562,
738
+ "logps/rejected": -194.2032470703125,
739
+ "loss": 0.1497,
740
+ "rewards/accuracies": 0.75,
741
+ "rewards/chosen": 0.2706252336502075,
742
+ "rewards/margins": 0.6058727502822876,
743
+ "rewards/rejected": -0.3352475166320801,
744
+ "step": 35
745
+ },
746
+ {
747
+ "debug/policy_chosen_logits": -3.3483757972717285,
748
+ "debug/policy_chosen_logps": -179.4599609375,
749
+ "debug/policy_rejected_logits": -3.1370773315429688,
750
+ "debug/policy_rejected_logps": -190.57760620117188,
751
+ "debug/reference_chosen_logps": -211.63006591796875,
752
+ "debug/reference_rejected_logps": -169.59912109375,
753
+ "epoch": 0.7058823529411765,
754
+ "grad_norm": 3.186431463787634,
755
+ "learning_rate": 1e-06,
756
+ "logits/chosen": -3.3483757972717285,
757
+ "logits/rejected": -3.1370773315429688,
758
+ "logps/chosen": -179.4599609375,
759
+ "logps/rejected": -190.57760620117188,
760
+ "loss": 0.2442,
761
+ "rewards/accuracies": 0.75,
762
+ "rewards/chosen": 0.32170116901397705,
763
+ "rewards/margins": 0.531485915184021,
764
+ "rewards/rejected": -0.20978482067584991,
765
+ "step": 36
766
+ },
767
+ {
768
+ "debug/policy_chosen_logits": -3.2231926918029785,
769
+ "debug/policy_chosen_logps": -172.751220703125,
770
+ "debug/policy_rejected_logits": -3.042171001434326,
771
+ "debug/policy_rejected_logps": -192.6194610595703,
772
+ "debug/reference_chosen_logps": -204.07984924316406,
773
+ "debug/reference_rejected_logps": -162.96307373046875,
774
+ "epoch": 0.7254901960784313,
775
+ "grad_norm": 3.76917946103672,
776
+ "learning_rate": 1e-06,
777
+ "logits/chosen": -3.2231926918029785,
778
+ "logits/rejected": -3.042171001434326,
779
+ "logps/chosen": -172.751220703125,
780
+ "logps/rejected": -192.6194610595703,
781
+ "loss": 0.222,
782
+ "rewards/accuracies": 1.0,
783
+ "rewards/chosen": 0.31328636407852173,
784
+ "rewards/margins": 0.609850287437439,
785
+ "rewards/rejected": -0.29656392335891724,
786
+ "step": 37
787
+ },
788
+ {
789
+ "debug/policy_chosen_logits": -3.370556116104126,
790
+ "debug/policy_chosen_logps": -179.37933349609375,
791
+ "debug/policy_rejected_logits": -2.9038238525390625,
792
+ "debug/policy_rejected_logps": -197.517333984375,
793
+ "debug/reference_chosen_logps": -211.8248291015625,
794
+ "debug/reference_rejected_logps": -159.6433563232422,
795
+ "epoch": 0.7450980392156863,
796
+ "grad_norm": 3.7765583713304034,
797
+ "learning_rate": 1e-06,
798
+ "logits/chosen": -3.370556116104126,
799
+ "logits/rejected": -2.9038238525390625,
800
+ "logps/chosen": -179.37933349609375,
801
+ "logps/rejected": -197.517333984375,
802
+ "loss": 0.214,
803
+ "rewards/accuracies": 1.0,
804
+ "rewards/chosen": 0.3244548439979553,
805
+ "rewards/margins": 0.7031944990158081,
806
+ "rewards/rejected": -0.3787396550178528,
807
+ "step": 38
808
+ },
809
+ {
810
+ "debug/policy_chosen_logits": -3.253465414047241,
811
+ "debug/policy_chosen_logps": -188.2485809326172,
812
+ "debug/policy_rejected_logits": -3.051933526992798,
813
+ "debug/policy_rejected_logps": -186.41270446777344,
814
+ "debug/reference_chosen_logps": -218.99688720703125,
815
+ "debug/reference_rejected_logps": -167.86203002929688,
816
+ "epoch": 0.7647058823529411,
817
+ "grad_norm": 3.533915411617125,
818
+ "learning_rate": 1e-06,
819
+ "logits/chosen": -3.253465414047241,
820
+ "logits/rejected": -3.051933526992798,
821
+ "logps/chosen": -188.2485809326172,
822
+ "logps/rejected": -186.41270446777344,
823
+ "loss": 0.2102,
824
+ "rewards/accuracies": 0.875,
825
+ "rewards/chosen": 0.3074829876422882,
826
+ "rewards/margins": 0.492989718914032,
827
+ "rewards/rejected": -0.18550674617290497,
828
+ "step": 39
829
+ },
830
+ {
831
+ "debug/policy_chosen_logits": -3.397189140319824,
832
+ "debug/policy_chosen_logps": -184.84124755859375,
833
+ "debug/policy_rejected_logits": -2.951024055480957,
834
+ "debug/policy_rejected_logps": -205.2071533203125,
835
+ "debug/reference_chosen_logps": -216.28897094726562,
836
+ "debug/reference_rejected_logps": -167.5351104736328,
837
+ "epoch": 0.7843137254901961,
838
+ "grad_norm": 4.343119675678065,
839
+ "learning_rate": 1e-06,
840
+ "logits/chosen": -3.397189140319824,
841
+ "logits/rejected": -2.951024055480957,
842
+ "logps/chosen": -184.84124755859375,
843
+ "logps/rejected": -205.2071533203125,
844
+ "loss": 0.1484,
845
+ "rewards/accuracies": 0.875,
846
+ "rewards/chosen": 0.3144773840904236,
847
+ "rewards/margins": 0.6911977529525757,
848
+ "rewards/rejected": -0.3767203688621521,
849
+ "step": 40
850
+ },
851
+ {
852
+ "debug/policy_chosen_logits": -3.4680333137512207,
853
+ "debug/policy_chosen_logps": -174.45204162597656,
854
+ "debug/policy_rejected_logits": -3.177816867828369,
855
+ "debug/policy_rejected_logps": -200.5006103515625,
856
+ "debug/reference_chosen_logps": -201.8401641845703,
857
+ "debug/reference_rejected_logps": -168.33837890625,
858
+ "epoch": 0.803921568627451,
859
+ "grad_norm": 4.7398844072134,
860
+ "learning_rate": 1e-06,
861
+ "logits/chosen": -3.4680333137512207,
862
+ "logits/rejected": -3.177816867828369,
863
+ "logps/chosen": -174.45204162597656,
864
+ "logps/rejected": -200.5006103515625,
865
+ "loss": 0.1898,
866
+ "rewards/accuracies": 1.0,
867
+ "rewards/chosen": 0.273881196975708,
868
+ "rewards/margins": 0.595503568649292,
869
+ "rewards/rejected": -0.3216223418712616,
870
+ "step": 41
871
+ },
872
+ {
873
+ "debug/policy_chosen_logits": -3.3945956230163574,
874
+ "debug/policy_chosen_logps": -172.79193115234375,
875
+ "debug/policy_rejected_logits": -3.0443050861358643,
876
+ "debug/policy_rejected_logps": -194.8880615234375,
877
+ "debug/reference_chosen_logps": -198.82882690429688,
878
+ "debug/reference_rejected_logps": -170.7233123779297,
879
+ "epoch": 0.8235294117647058,
880
+ "grad_norm": 5.166045026790051,
881
+ "learning_rate": 1e-06,
882
+ "logits/chosen": -3.3945956230163574,
883
+ "logits/rejected": -3.0443050861358643,
884
+ "logps/chosen": -172.79193115234375,
885
+ "logps/rejected": -194.8880615234375,
886
+ "loss": 0.1759,
887
+ "rewards/accuracies": 1.0,
888
+ "rewards/chosen": 0.26036909222602844,
889
+ "rewards/margins": 0.5020167827606201,
890
+ "rewards/rejected": -0.24164767563343048,
891
+ "step": 42
892
+ },
893
+ {
894
+ "debug/policy_chosen_logits": -3.366682291030884,
895
+ "debug/policy_chosen_logps": -174.59375,
896
+ "debug/policy_rejected_logits": -3.141162157058716,
897
+ "debug/policy_rejected_logps": -200.96633911132812,
898
+ "debug/reference_chosen_logps": -205.5618438720703,
899
+ "debug/reference_rejected_logps": -192.6947021484375,
900
+ "epoch": 0.8431372549019608,
901
+ "grad_norm": 3.316025693224064,
902
+ "learning_rate": 1e-06,
903
+ "logits/chosen": -3.366682291030884,
904
+ "logits/rejected": -3.141162157058716,
905
+ "logps/chosen": -174.59375,
906
+ "logps/rejected": -200.96633911132812,
907
+ "loss": 0.1598,
908
+ "rewards/accuracies": 0.875,
909
+ "rewards/chosen": 0.3096809387207031,
910
+ "rewards/margins": 0.3923972249031067,
911
+ "rewards/rejected": -0.08271628618240356,
912
+ "step": 43
913
+ },
914
+ {
915
+ "debug/policy_chosen_logits": -3.4701032638549805,
916
+ "debug/policy_chosen_logps": -172.69961547851562,
917
+ "debug/policy_rejected_logits": -3.0816352367401123,
918
+ "debug/policy_rejected_logps": -192.14527893066406,
919
+ "debug/reference_chosen_logps": -203.4981231689453,
920
+ "debug/reference_rejected_logps": -165.00967407226562,
921
+ "epoch": 0.8627450980392157,
922
+ "grad_norm": 5.732457245614043,
923
+ "learning_rate": 1e-06,
924
+ "logits/chosen": -3.4701032638549805,
925
+ "logits/rejected": -3.0816352367401123,
926
+ "logps/chosen": -172.69961547851562,
927
+ "logps/rejected": -192.14527893066406,
928
+ "loss": 0.2551,
929
+ "rewards/accuracies": 0.875,
930
+ "rewards/chosen": 0.30798494815826416,
931
+ "rewards/margins": 0.5793408155441284,
932
+ "rewards/rejected": -0.27135586738586426,
933
+ "step": 44
934
+ },
935
+ {
936
+ "debug/policy_chosen_logits": -3.3291234970092773,
937
+ "debug/policy_chosen_logps": -170.3388671875,
938
+ "debug/policy_rejected_logits": -2.937793493270874,
939
+ "debug/policy_rejected_logps": -192.89581298828125,
940
+ "debug/reference_chosen_logps": -202.19802856445312,
941
+ "debug/reference_rejected_logps": -159.46893310546875,
942
+ "epoch": 0.8823529411764706,
943
+ "grad_norm": 5.771805579285295,
944
+ "learning_rate": 1e-06,
945
+ "logits/chosen": -3.3291234970092773,
946
+ "logits/rejected": -2.937793493270874,
947
+ "logps/chosen": -170.3388671875,
948
+ "logps/rejected": -192.89581298828125,
949
+ "loss": 0.2151,
950
+ "rewards/accuracies": 0.875,
951
+ "rewards/chosen": 0.31859153509140015,
952
+ "rewards/margins": 0.6528602242469788,
953
+ "rewards/rejected": -0.334268718957901,
954
+ "step": 45
955
+ },
956
+ {
957
+ "debug/policy_chosen_logits": -3.226428270339966,
958
+ "debug/policy_chosen_logps": -187.73739624023438,
959
+ "debug/policy_rejected_logits": -2.986654043197632,
960
+ "debug/policy_rejected_logps": -192.33885192871094,
961
+ "debug/reference_chosen_logps": -220.32342529296875,
962
+ "debug/reference_rejected_logps": -167.2851104736328,
963
+ "epoch": 0.9019607843137255,
964
+ "grad_norm": 2.8823875670773695,
965
+ "learning_rate": 1e-06,
966
+ "logits/chosen": -3.226428270339966,
967
+ "logits/rejected": -2.986654043197632,
968
+ "logps/chosen": -187.73739624023438,
969
+ "logps/rejected": -192.33885192871094,
970
+ "loss": 0.1605,
971
+ "rewards/accuracies": 0.875,
972
+ "rewards/chosen": 0.32586026191711426,
973
+ "rewards/margins": 0.5763977766036987,
974
+ "rewards/rejected": -0.2505374550819397,
975
+ "step": 46
976
+ },
977
+ {
978
+ "debug/policy_chosen_logits": -3.5862808227539062,
979
+ "debug/policy_chosen_logps": -182.93630981445312,
980
+ "debug/policy_rejected_logits": -3.0501527786254883,
981
+ "debug/policy_rejected_logps": -197.54100036621094,
982
+ "debug/reference_chosen_logps": -213.4707794189453,
983
+ "debug/reference_rejected_logps": -172.90322875976562,
984
+ "epoch": 0.9215686274509803,
985
+ "grad_norm": 2.720609192674643,
986
+ "learning_rate": 1e-06,
987
+ "logits/chosen": -3.5862808227539062,
988
+ "logits/rejected": -3.0501527786254883,
989
+ "logps/chosen": -182.93630981445312,
990
+ "logps/rejected": -197.54100036621094,
991
+ "loss": 0.1686,
992
+ "rewards/accuracies": 1.0,
993
+ "rewards/chosen": 0.30534470081329346,
994
+ "rewards/margins": 0.5517222881317139,
995
+ "rewards/rejected": -0.2463776171207428,
996
+ "step": 47
997
+ },
998
+ {
999
+ "debug/policy_chosen_logits": -3.3840320110321045,
1000
+ "debug/policy_chosen_logps": -175.84884643554688,
1001
+ "debug/policy_rejected_logits": -3.043017625808716,
1002
+ "debug/policy_rejected_logps": -202.57257080078125,
1003
+ "debug/reference_chosen_logps": -203.88320922851562,
1004
+ "debug/reference_rejected_logps": -172.34735107421875,
1005
+ "epoch": 0.9411764705882353,
1006
+ "grad_norm": 2.309188784503618,
1007
+ "learning_rate": 1e-06,
1008
+ "logits/chosen": -3.3840320110321045,
1009
+ "logits/rejected": -3.043017625808716,
1010
+ "logps/chosen": -175.84884643554688,
1011
+ "logps/rejected": -202.57257080078125,
1012
+ "loss": 0.1666,
1013
+ "rewards/accuracies": 1.0,
1014
+ "rewards/chosen": 0.28034350275993347,
1015
+ "rewards/margins": 0.5825955271720886,
1016
+ "rewards/rejected": -0.30225205421447754,
1017
+ "step": 48
1018
+ },
1019
+ {
1020
+ "debug/policy_chosen_logits": -3.3747286796569824,
1021
+ "debug/policy_chosen_logps": -185.90341186523438,
1022
+ "debug/policy_rejected_logits": -3.0797741413116455,
1023
+ "debug/policy_rejected_logps": -202.92413330078125,
1024
+ "debug/reference_chosen_logps": -221.17340087890625,
1025
+ "debug/reference_rejected_logps": -154.19923400878906,
1026
+ "epoch": 0.9607843137254902,
1027
+ "grad_norm": 2.7307344297698157,
1028
+ "learning_rate": 1e-06,
1029
+ "logits/chosen": -3.3747286796569824,
1030
+ "logits/rejected": -3.0797741413116455,
1031
+ "logps/chosen": -185.90341186523438,
1032
+ "logps/rejected": -202.92413330078125,
1033
+ "loss": 0.1461,
1034
+ "rewards/accuracies": 1.0,
1035
+ "rewards/chosen": 0.3527000844478607,
1036
+ "rewards/margins": 0.8399491310119629,
1037
+ "rewards/rejected": -0.48724907636642456,
1038
+ "step": 49
1039
+ },
1040
+ {
1041
+ "debug/policy_chosen_logits": -3.4781532287597656,
1042
+ "debug/policy_chosen_logps": -177.86842346191406,
1043
+ "debug/policy_rejected_logits": -3.0594165325164795,
1044
+ "debug/policy_rejected_logps": -195.89004516601562,
1045
+ "debug/reference_chosen_logps": -203.58407592773438,
1046
+ "debug/reference_rejected_logps": -161.39697265625,
1047
+ "epoch": 0.9803921568627451,
1048
+ "grad_norm": 3.3901490209041247,
1049
+ "learning_rate": 1e-06,
1050
+ "logits/chosen": -3.4781532287597656,
1051
+ "logits/rejected": -3.0594165325164795,
1052
+ "logps/chosen": -177.86842346191406,
1053
+ "logps/rejected": -195.89004516601562,
1054
+ "loss": 0.1895,
1055
+ "rewards/accuracies": 0.875,
1056
+ "rewards/chosen": 0.2571565508842468,
1057
+ "rewards/margins": 0.6020870208740234,
1058
+ "rewards/rejected": -0.3449305295944214,
1059
+ "step": 50
1060
+ },
1061
+ {
1062
+ "debug/policy_chosen_logits": -3.4821436405181885,
1063
+ "debug/policy_chosen_logps": -177.99000549316406,
1064
+ "debug/policy_rejected_logits": -3.0359578132629395,
1065
+ "debug/policy_rejected_logps": -203.94029235839844,
1066
+ "debug/reference_chosen_logps": -212.01348876953125,
1067
+ "debug/reference_rejected_logps": -166.06936645507812,
1068
+ "epoch": 1.0,
1069
+ "grad_norm": 3.0238129167739807,
1070
+ "learning_rate": 1e-06,
1071
+ "logits/chosen": -3.4821436405181885,
1072
+ "logits/rejected": -3.0359578132629395,
1073
+ "logps/chosen": -177.99000549316406,
1074
+ "logps/rejected": -203.94029235839844,
1075
+ "loss": 0.1383,
1076
+ "rewards/accuracies": 0.875,
1077
+ "rewards/chosen": 0.3402349352836609,
1078
+ "rewards/margins": 0.7189440727233887,
1079
+ "rewards/rejected": -0.37870916724205017,
1080
+ "step": 51
1081
+ },
1082
+ {
1083
+ "epoch": 1.0,
1084
+ "step": 51,
1085
+ "total_flos": 0.0,
1086
+ "train_loss": 0.26999635322421206,
1087
+ "train_runtime": 166.8574,
1088
+ "train_samples_per_second": 19.31,
1089
+ "train_steps_per_second": 0.306
1090
+ }
1091
+ ],
1092
+ "logging_steps": 1,
1093
+ "max_steps": 51,
1094
+ "num_input_tokens_seen": 0,
1095
+ "num_train_epochs": 1,
1096
+ "save_steps": 500,
1097
+ "stateful_callbacks": {
1098
+ "TrainerControl": {
1099
+ "args": {
1100
+ "should_epoch_stop": false,
1101
+ "should_evaluate": false,
1102
+ "should_log": false,
1103
+ "should_save": true,
1104
+ "should_training_stop": true
1105
+ },
1106
+ "attributes": {}
1107
+ }
1108
+ },
1109
+ "total_flos": 0.0,
1110
+ "train_batch_size": 8,
1111
+ "trial_name": null,
1112
+ "trial_params": null
1113
+ }