yiran-wang3 committed on
Commit
b167453
1 Parent(s): 4b0307a

End of training

Browse files
README.md ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: transformers
3
+ license: apache-2.0
4
+ base_model: yiran-wang3/qwen2_chat_reflct_adamw_iter2
5
+ tags:
6
+ - alignment-handbook
7
+ - generated_from_trainer
8
+ - trl
9
+ - dpo
10
+ datasets:
11
+ - self-generate/qw2_reflct_sppo_hard_new_cn_mining_oj_iter2-binarized-reflection-scored
12
+ model-index:
13
+ - name: qwen2_chat_reflct_adamw_iter3
14
+ results: []
15
+ ---
16
+
17
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
18
+ should probably proofread and complete it, then remove this comment. -->
19
+
20
+ # qwen2_chat_reflct_adamw_iter3
21
+
22
+ This model is a fine-tuned version of [yiran-wang3/qwen2_chat_reflct_adamw_iter2](https://huggingface.co/yiran-wang3/qwen2_chat_reflct_adamw_iter2) on the self-generate/qw2_reflct_sppo_hard_new_cn_mining_oj_iter2-binarized-reflection-scored dataset.
23
+
24
+ ## Model description
25
+
26
+ More information needed
27
+
28
+ ## Intended uses & limitations
29
+
30
+ More information needed
31
+
32
+ ## Training and evaluation data
33
+
34
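+ Trained for 1 epoch on the 2,811 training samples of the self-generate/qw2_reflct_sppo_hard_new_cn_mining_oj_iter2-binarized-reflection-scored dataset. No evaluation metrics were recorded for this run (see all_results.json).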
35
+
36
+ ## Training procedure
37
+
38
+ ### Training hyperparameters
39
+
40
+ The following hyperparameters were used during training:
41
+ - learning_rate: 1e-06
42
+ - train_batch_size: 8
43
+ - eval_batch_size: 4
44
+ - seed: 42
45
+ - distributed_type: multi-GPU
46
+ - num_devices: 8
47
+ - total_train_batch_size: 64
48
+ - total_eval_batch_size: 32
49
+ - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
50
+ - lr_scheduler_type: constant
51
+ - lr_scheduler_warmup_ratio: 0.1
52
+ - lr_scheduler_warmup_steps: 100
53
+ - num_epochs: 1.0
54
+
55
+ ### Training results
56
+
57
+
58
+
59
+ ### Framework versions
60
+
61
+ - Transformers 4.45.0
62
+ - Pytorch 2.4.0+cu121
63
+ - Datasets 2.14.6
64
+ - Tokenizers 0.20.2
all_results.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 1.0,
3
+ "total_flos": 0.0,
4
+ "train_loss": 0.46715006164529105,
5
+ "train_runtime": 182.0537,
6
+ "train_samples": 2811,
7
+ "train_samples_per_second": 15.44,
8
+ "train_steps_per_second": 0.242
9
+ }
config.json CHANGED
@@ -23,7 +23,7 @@
23
  "tie_word_embeddings": true,
24
  "torch_dtype": "bfloat16",
25
  "transformers_version": "4.45.0",
26
- "use_cache": false,
27
  "use_sliding_window": false,
28
  "vocab_size": 151936
29
  }
 
23
  "tie_word_embeddings": true,
24
  "torch_dtype": "bfloat16",
25
  "transformers_version": "4.45.0",
26
+ "use_cache": true,
27
  "use_sliding_window": false,
28
  "vocab_size": 151936
29
  }
generation_config.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 151643,
3
+ "do_sample": true,
4
+ "eos_token_id": [
5
+ 151645,
6
+ 151643
7
+ ],
8
+ "pad_token_id": 151643,
9
+ "repetition_penalty": 1.1,
10
+ "temperature": 0.7,
11
+ "top_k": 20,
12
+ "top_p": 0.8,
13
+ "transformers_version": "4.45.0"
14
+ }
train_results.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 1.0,
3
+ "total_flos": 0.0,
4
+ "train_loss": 0.46715006164529105,
5
+ "train_runtime": 182.0537,
6
+ "train_samples": 2811,
7
+ "train_samples_per_second": 15.44,
8
+ "train_steps_per_second": 0.242
9
+ }
trainer_state.json ADDED
@@ -0,0 +1,966 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 1.0,
5
+ "eval_steps": 100,
6
+ "global_step": 44,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "debug/policy_chosen_logits": -1.5945265293121338,
13
+ "debug/policy_chosen_logps": -248.4810028076172,
14
+ "debug/policy_rejected_logits": -1.576964020729065,
15
+ "debug/policy_rejected_logps": -333.0391540527344,
16
+ "debug/reference_chosen_logps": -248.4810028076172,
17
+ "debug/reference_rejected_logps": -333.0391540527344,
18
+ "epoch": 0.022727272727272728,
19
+ "grad_norm": 18.66563913313551,
20
+ "learning_rate": 1e-06,
21
+ "logits/chosen": -1.5945265293121338,
22
+ "logits/rejected": -1.576964020729065,
23
+ "logps/chosen": -248.4810028076172,
24
+ "logps/rejected": -333.0391540527344,
25
+ "loss": 0.5,
26
+ "rewards/accuracies": 0.0,
27
+ "rewards/chosen": 0.0,
28
+ "rewards/margins": 0.0,
29
+ "rewards/rejected": 0.0,
30
+ "step": 1
31
+ },
32
+ {
33
+ "debug/policy_chosen_logits": -1.4691928625106812,
34
+ "debug/policy_chosen_logps": -252.78114318847656,
35
+ "debug/policy_rejected_logits": -1.6123135089874268,
36
+ "debug/policy_rejected_logps": -287.5721130371094,
37
+ "debug/reference_chosen_logps": -252.46141052246094,
38
+ "debug/reference_rejected_logps": -286.8046875,
39
+ "epoch": 0.045454545454545456,
40
+ "grad_norm": 8.201360686444898,
41
+ "learning_rate": 1e-06,
42
+ "logits/chosen": -1.4691928625106812,
43
+ "logits/rejected": -1.6123135089874268,
44
+ "logps/chosen": -252.78114318847656,
45
+ "logps/rejected": -287.5721130371094,
46
+ "loss": 0.4958,
47
+ "rewards/accuracies": 0.625,
48
+ "rewards/chosen": -0.003197269281372428,
49
+ "rewards/margins": 0.004476870875805616,
50
+ "rewards/rejected": -0.007674140390008688,
51
+ "step": 2
52
+ },
53
+ {
54
+ "debug/policy_chosen_logits": -1.5802356004714966,
55
+ "debug/policy_chosen_logps": -264.4571533203125,
56
+ "debug/policy_rejected_logits": -1.5960618257522583,
57
+ "debug/policy_rejected_logps": -261.72198486328125,
58
+ "debug/reference_chosen_logps": -265.066162109375,
59
+ "debug/reference_rejected_logps": -261.2563171386719,
60
+ "epoch": 0.06818181818181818,
61
+ "grad_norm": 13.69003465257947,
62
+ "learning_rate": 1e-06,
63
+ "logits/chosen": -1.5802356004714966,
64
+ "logits/rejected": -1.5960618257522583,
65
+ "logps/chosen": -264.4571533203125,
66
+ "logps/rejected": -261.72198486328125,
67
+ "loss": 0.4938,
68
+ "rewards/accuracies": 0.75,
69
+ "rewards/chosen": 0.006090049631893635,
70
+ "rewards/margins": 0.010746955871582031,
71
+ "rewards/rejected": -0.004656905774027109,
72
+ "step": 3
73
+ },
74
+ {
75
+ "debug/policy_chosen_logits": -1.508122205734253,
76
+ "debug/policy_chosen_logps": -233.5382537841797,
77
+ "debug/policy_rejected_logits": -1.4848679304122925,
78
+ "debug/policy_rejected_logps": -340.95074462890625,
79
+ "debug/reference_chosen_logps": -233.94361877441406,
80
+ "debug/reference_rejected_logps": -341.15191650390625,
81
+ "epoch": 0.09090909090909091,
82
+ "grad_norm": 8.511003954283053,
83
+ "learning_rate": 1e-06,
84
+ "logits/chosen": -1.508122205734253,
85
+ "logits/rejected": -1.4848679304122925,
86
+ "logps/chosen": -233.5382537841797,
87
+ "logps/rejected": -340.95074462890625,
88
+ "loss": 0.4939,
89
+ "rewards/accuracies": 0.625,
90
+ "rewards/chosen": 0.004053611773997545,
91
+ "rewards/margins": 0.0020416262559592724,
92
+ "rewards/rejected": 0.002011985518038273,
93
+ "step": 4
94
+ },
95
+ {
96
+ "debug/policy_chosen_logits": -1.5614800453186035,
97
+ "debug/policy_chosen_logps": -265.24407958984375,
98
+ "debug/policy_rejected_logits": -1.5772254467010498,
99
+ "debug/policy_rejected_logps": -268.71136474609375,
100
+ "debug/reference_chosen_logps": -263.666259765625,
101
+ "debug/reference_rejected_logps": -268.02203369140625,
102
+ "epoch": 0.11363636363636363,
103
+ "grad_norm": 9.050149178031523,
104
+ "learning_rate": 1e-06,
105
+ "logits/chosen": -1.5614800453186035,
106
+ "logits/rejected": -1.5772254467010498,
107
+ "logps/chosen": -265.24407958984375,
108
+ "logps/rejected": -268.71136474609375,
109
+ "loss": 0.4907,
110
+ "rewards/accuracies": 0.375,
111
+ "rewards/chosen": -0.0157785601913929,
112
+ "rewards/margins": -0.008885439485311508,
113
+ "rewards/rejected": -0.006893118843436241,
114
+ "step": 5
115
+ },
116
+ {
117
+ "debug/policy_chosen_logits": -1.5854442119598389,
118
+ "debug/policy_chosen_logps": -246.00494384765625,
119
+ "debug/policy_rejected_logits": -1.5459868907928467,
120
+ "debug/policy_rejected_logps": -258.53924560546875,
121
+ "debug/reference_chosen_logps": -244.73007202148438,
122
+ "debug/reference_rejected_logps": -257.6000061035156,
123
+ "epoch": 0.13636363636363635,
124
+ "grad_norm": 8.222575188444678,
125
+ "learning_rate": 1e-06,
126
+ "logits/chosen": -1.5854442119598389,
127
+ "logits/rejected": -1.5459868907928467,
128
+ "logps/chosen": -246.00494384765625,
129
+ "logps/rejected": -258.53924560546875,
130
+ "loss": 0.4788,
131
+ "rewards/accuracies": 0.375,
132
+ "rewards/chosen": -0.012748966924846172,
133
+ "rewards/margins": -0.0033567622303962708,
134
+ "rewards/rejected": -0.009392204694449902,
135
+ "step": 6
136
+ },
137
+ {
138
+ "debug/policy_chosen_logits": -1.676500678062439,
139
+ "debug/policy_chosen_logps": -229.7958984375,
140
+ "debug/policy_rejected_logits": -1.6866339445114136,
141
+ "debug/policy_rejected_logps": -250.69273376464844,
142
+ "debug/reference_chosen_logps": -228.82376098632812,
143
+ "debug/reference_rejected_logps": -247.0283203125,
144
+ "epoch": 0.1590909090909091,
145
+ "grad_norm": 15.044352960137017,
146
+ "learning_rate": 1e-06,
147
+ "logits/chosen": -1.676500678062439,
148
+ "logits/rejected": -1.6866339445114136,
149
+ "logps/chosen": -229.7958984375,
150
+ "logps/rejected": -250.69273376464844,
151
+ "loss": 0.4891,
152
+ "rewards/accuracies": 0.75,
153
+ "rewards/chosen": -0.009721355512738228,
154
+ "rewards/margins": 0.026922915130853653,
155
+ "rewards/rejected": -0.03664426505565643,
156
+ "step": 7
157
+ },
158
+ {
159
+ "debug/policy_chosen_logits": -1.6314057111740112,
160
+ "debug/policy_chosen_logps": -240.93679809570312,
161
+ "debug/policy_rejected_logits": -1.511518120765686,
162
+ "debug/policy_rejected_logps": -286.17041015625,
163
+ "debug/reference_chosen_logps": -241.15689086914062,
164
+ "debug/reference_rejected_logps": -284.9034118652344,
165
+ "epoch": 0.18181818181818182,
166
+ "grad_norm": 9.208531697498636,
167
+ "learning_rate": 1e-06,
168
+ "logits/chosen": -1.6314057111740112,
169
+ "logits/rejected": -1.511518120765686,
170
+ "logps/chosen": -240.93679809570312,
171
+ "logps/rejected": -286.17041015625,
172
+ "loss": 0.4948,
173
+ "rewards/accuracies": 0.5,
174
+ "rewards/chosen": 0.0022008903324604034,
175
+ "rewards/margins": 0.014870930463075638,
176
+ "rewards/rejected": -0.012670040130615234,
177
+ "step": 8
178
+ },
179
+ {
180
+ "debug/policy_chosen_logits": -1.586374044418335,
181
+ "debug/policy_chosen_logps": -259.5894775390625,
182
+ "debug/policy_rejected_logits": -1.6641004085540771,
183
+ "debug/policy_rejected_logps": -292.8546447753906,
184
+ "debug/reference_chosen_logps": -258.95721435546875,
185
+ "debug/reference_rejected_logps": -290.0174560546875,
186
+ "epoch": 0.20454545454545456,
187
+ "grad_norm": 7.694356082960458,
188
+ "learning_rate": 1e-06,
189
+ "logits/chosen": -1.586374044418335,
190
+ "logits/rejected": -1.6641004085540771,
191
+ "logps/chosen": -259.5894775390625,
192
+ "logps/rejected": -292.8546447753906,
193
+ "loss": 0.4878,
194
+ "rewards/accuracies": 0.625,
195
+ "rewards/chosen": -0.006322898901998997,
196
+ "rewards/margins": 0.022048911079764366,
197
+ "rewards/rejected": -0.028371810913085938,
198
+ "step": 9
199
+ },
200
+ {
201
+ "debug/policy_chosen_logits": -1.6060452461242676,
202
+ "debug/policy_chosen_logps": -243.36305236816406,
203
+ "debug/policy_rejected_logits": -1.3573477268218994,
204
+ "debug/policy_rejected_logps": -231.2056427001953,
205
+ "debug/reference_chosen_logps": -243.5709686279297,
206
+ "debug/reference_rejected_logps": -233.7530517578125,
207
+ "epoch": 0.22727272727272727,
208
+ "grad_norm": 9.648028168333992,
209
+ "learning_rate": 1e-06,
210
+ "logits/chosen": -1.6060452461242676,
211
+ "logits/rejected": -1.3573477268218994,
212
+ "logps/chosen": -243.36305236816406,
213
+ "logps/rejected": -231.2056427001953,
214
+ "loss": 0.494,
215
+ "rewards/accuracies": 0.375,
216
+ "rewards/chosen": 0.0020791429560631514,
217
+ "rewards/margins": -0.02339502051472664,
218
+ "rewards/rejected": 0.025474166497588158,
219
+ "step": 10
220
+ },
221
+ {
222
+ "debug/policy_chosen_logits": -1.5926264524459839,
223
+ "debug/policy_chosen_logps": -252.65029907226562,
224
+ "debug/policy_rejected_logits": -1.53349769115448,
225
+ "debug/policy_rejected_logps": -275.5002136230469,
226
+ "debug/reference_chosen_logps": -250.59860229492188,
227
+ "debug/reference_rejected_logps": -272.654052734375,
228
+ "epoch": 0.25,
229
+ "grad_norm": 9.286445190753403,
230
+ "learning_rate": 1e-06,
231
+ "logits/chosen": -1.5926264524459839,
232
+ "logits/rejected": -1.53349769115448,
233
+ "logps/chosen": -252.65029907226562,
234
+ "logps/rejected": -275.5002136230469,
235
+ "loss": 0.4821,
236
+ "rewards/accuracies": 0.625,
237
+ "rewards/chosen": -0.02051687240600586,
238
+ "rewards/margins": 0.007944850251078606,
239
+ "rewards/rejected": -0.028461724519729614,
240
+ "step": 11
241
+ },
242
+ {
243
+ "debug/policy_chosen_logits": -1.6525866985321045,
244
+ "debug/policy_chosen_logps": -241.84744262695312,
245
+ "debug/policy_rejected_logits": -1.5179157257080078,
246
+ "debug/policy_rejected_logps": -306.1255798339844,
247
+ "debug/reference_chosen_logps": -244.6147003173828,
248
+ "debug/reference_rejected_logps": -300.14434814453125,
249
+ "epoch": 0.2727272727272727,
250
+ "grad_norm": 10.568220890948059,
251
+ "learning_rate": 1e-06,
252
+ "logits/chosen": -1.6525866985321045,
253
+ "logits/rejected": -1.5179157257080078,
254
+ "logps/chosen": -241.84744262695312,
255
+ "logps/rejected": -306.1255798339844,
256
+ "loss": 0.453,
257
+ "rewards/accuracies": 0.75,
258
+ "rewards/chosen": 0.027672480791807175,
259
+ "rewards/margins": 0.08748496323823929,
260
+ "rewards/rejected": -0.05981248617172241,
261
+ "step": 12
262
+ },
263
+ {
264
+ "debug/policy_chosen_logits": -1.5735505819320679,
265
+ "debug/policy_chosen_logps": -249.08001708984375,
266
+ "debug/policy_rejected_logits": -1.5205522775650024,
267
+ "debug/policy_rejected_logps": -316.7584228515625,
268
+ "debug/reference_chosen_logps": -248.20947265625,
269
+ "debug/reference_rejected_logps": -316.1866149902344,
270
+ "epoch": 0.29545454545454547,
271
+ "grad_norm": 13.229143544524582,
272
+ "learning_rate": 1e-06,
273
+ "logits/chosen": -1.5735505819320679,
274
+ "logits/rejected": -1.5205522775650024,
275
+ "logps/chosen": -249.08001708984375,
276
+ "logps/rejected": -316.7584228515625,
277
+ "loss": 0.4744,
278
+ "rewards/accuracies": 0.625,
279
+ "rewards/chosen": -0.008705596439540386,
280
+ "rewards/margins": -0.002987404353916645,
281
+ "rewards/rejected": -0.005718193016946316,
282
+ "step": 13
283
+ },
284
+ {
285
+ "debug/policy_chosen_logits": -1.6373316049575806,
286
+ "debug/policy_chosen_logps": -212.08322143554688,
287
+ "debug/policy_rejected_logits": -1.6967588663101196,
288
+ "debug/policy_rejected_logps": -224.3775634765625,
289
+ "debug/reference_chosen_logps": -214.68466186523438,
290
+ "debug/reference_rejected_logps": -228.5543212890625,
291
+ "epoch": 0.3181818181818182,
292
+ "grad_norm": 9.790511964324931,
293
+ "learning_rate": 1e-06,
294
+ "logits/chosen": -1.6373316049575806,
295
+ "logits/rejected": -1.6967588663101196,
296
+ "logps/chosen": -212.08322143554688,
297
+ "logps/rejected": -224.3775634765625,
298
+ "loss": 0.4798,
299
+ "rewards/accuracies": 0.375,
300
+ "rewards/chosen": 0.026014385744929314,
301
+ "rewards/margins": -0.015753211453557014,
302
+ "rewards/rejected": 0.04176759719848633,
303
+ "step": 14
304
+ },
305
+ {
306
+ "debug/policy_chosen_logits": -1.6537970304489136,
307
+ "debug/policy_chosen_logps": -229.5901336669922,
308
+ "debug/policy_rejected_logits": -1.5129016637802124,
309
+ "debug/policy_rejected_logps": -324.74163818359375,
310
+ "debug/reference_chosen_logps": -231.54010009765625,
311
+ "debug/reference_rejected_logps": -320.255859375,
312
+ "epoch": 0.3409090909090909,
313
+ "grad_norm": 10.206777571830463,
314
+ "learning_rate": 1e-06,
315
+ "logits/chosen": -1.6537970304489136,
316
+ "logits/rejected": -1.5129016637802124,
317
+ "logps/chosen": -229.5901336669922,
318
+ "logps/rejected": -324.74163818359375,
319
+ "loss": 0.4786,
320
+ "rewards/accuracies": 0.75,
321
+ "rewards/chosen": 0.019499586895108223,
322
+ "rewards/margins": 0.06435704976320267,
323
+ "rewards/rejected": -0.044857464730739594,
324
+ "step": 15
325
+ },
326
+ {
327
+ "debug/policy_chosen_logits": -1.5613651275634766,
328
+ "debug/policy_chosen_logps": -236.87420654296875,
329
+ "debug/policy_rejected_logits": -1.4253557920455933,
330
+ "debug/policy_rejected_logps": -288.8067626953125,
331
+ "debug/reference_chosen_logps": -237.55490112304688,
332
+ "debug/reference_rejected_logps": -285.34820556640625,
333
+ "epoch": 0.36363636363636365,
334
+ "grad_norm": 9.102977996005102,
335
+ "learning_rate": 1e-06,
336
+ "logits/chosen": -1.5613651275634766,
337
+ "logits/rejected": -1.4253557920455933,
338
+ "logps/chosen": -236.87420654296875,
339
+ "logps/rejected": -288.8067626953125,
340
+ "loss": 0.4795,
341
+ "rewards/accuracies": 0.75,
342
+ "rewards/chosen": 0.006807117722928524,
343
+ "rewards/margins": 0.041392937302589417,
344
+ "rewards/rejected": -0.03458581864833832,
345
+ "step": 16
346
+ },
347
+ {
348
+ "debug/policy_chosen_logits": -1.5483341217041016,
349
+ "debug/policy_chosen_logps": -263.6671447753906,
350
+ "debug/policy_rejected_logits": -1.5648168325424194,
351
+ "debug/policy_rejected_logps": -270.9856262207031,
352
+ "debug/reference_chosen_logps": -263.8393249511719,
353
+ "debug/reference_rejected_logps": -265.57177734375,
354
+ "epoch": 0.38636363636363635,
355
+ "grad_norm": 8.919951058839446,
356
+ "learning_rate": 1e-06,
357
+ "logits/chosen": -1.5483341217041016,
358
+ "logits/rejected": -1.5648168325424194,
359
+ "logps/chosen": -263.6671447753906,
360
+ "logps/rejected": -270.9856262207031,
361
+ "loss": 0.4743,
362
+ "rewards/accuracies": 0.625,
363
+ "rewards/chosen": 0.0017218790017068386,
364
+ "rewards/margins": 0.055860649794340134,
365
+ "rewards/rejected": -0.054138775914907455,
366
+ "step": 17
367
+ },
368
+ {
369
+ "debug/policy_chosen_logits": -1.5947633981704712,
370
+ "debug/policy_chosen_logps": -235.00515747070312,
371
+ "debug/policy_rejected_logits": -1.6195746660232544,
372
+ "debug/policy_rejected_logps": -274.81976318359375,
373
+ "debug/reference_chosen_logps": -233.3927001953125,
374
+ "debug/reference_rejected_logps": -270.100830078125,
375
+ "epoch": 0.4090909090909091,
376
+ "grad_norm": 8.403923001724355,
377
+ "learning_rate": 1e-06,
378
+ "logits/chosen": -1.5947633981704712,
379
+ "logits/rejected": -1.6195746660232544,
380
+ "logps/chosen": -235.00515747070312,
381
+ "logps/rejected": -274.81976318359375,
382
+ "loss": 0.503,
383
+ "rewards/accuracies": 0.5,
384
+ "rewards/chosen": -0.01612449437379837,
385
+ "rewards/margins": 0.03106483817100525,
386
+ "rewards/rejected": -0.04718932881951332,
387
+ "step": 18
388
+ },
389
+ {
390
+ "debug/policy_chosen_logits": -1.621622920036316,
391
+ "debug/policy_chosen_logps": -232.43492126464844,
392
+ "debug/policy_rejected_logits": -1.6612340211868286,
393
+ "debug/policy_rejected_logps": -258.89154052734375,
394
+ "debug/reference_chosen_logps": -229.03573608398438,
395
+ "debug/reference_rejected_logps": -256.68316650390625,
396
+ "epoch": 0.4318181818181818,
397
+ "grad_norm": 8.489598917138654,
398
+ "learning_rate": 1e-06,
399
+ "logits/chosen": -1.621622920036316,
400
+ "logits/rejected": -1.6612340211868286,
401
+ "logps/chosen": -232.43492126464844,
402
+ "logps/rejected": -258.89154052734375,
403
+ "loss": 0.4798,
404
+ "rewards/accuracies": 0.625,
405
+ "rewards/chosen": -0.03399191051721573,
406
+ "rewards/margins": -0.011908477172255516,
407
+ "rewards/rejected": -0.022083435207605362,
408
+ "step": 19
409
+ },
410
+ {
411
+ "debug/policy_chosen_logits": -1.582741141319275,
412
+ "debug/policy_chosen_logps": -225.0340118408203,
413
+ "debug/policy_rejected_logits": -1.40219247341156,
414
+ "debug/policy_rejected_logps": -290.873046875,
415
+ "debug/reference_chosen_logps": -227.0716552734375,
416
+ "debug/reference_rejected_logps": -287.50537109375,
417
+ "epoch": 0.45454545454545453,
418
+ "grad_norm": 14.964543325135429,
419
+ "learning_rate": 1e-06,
420
+ "logits/chosen": -1.582741141319275,
421
+ "logits/rejected": -1.40219247341156,
422
+ "logps/chosen": -225.0340118408203,
423
+ "logps/rejected": -290.873046875,
424
+ "loss": 0.4544,
425
+ "rewards/accuracies": 0.625,
426
+ "rewards/chosen": 0.020376453176140785,
427
+ "rewards/margins": 0.05405297875404358,
428
+ "rewards/rejected": -0.03367652744054794,
429
+ "step": 20
430
+ },
431
+ {
432
+ "debug/policy_chosen_logits": -1.6128125190734863,
433
+ "debug/policy_chosen_logps": -246.7266845703125,
434
+ "debug/policy_rejected_logits": -1.5290520191192627,
435
+ "debug/policy_rejected_logps": -311.530029296875,
436
+ "debug/reference_chosen_logps": -244.32289123535156,
437
+ "debug/reference_rejected_logps": -304.60919189453125,
438
+ "epoch": 0.4772727272727273,
439
+ "grad_norm": 7.778047811002451,
440
+ "learning_rate": 1e-06,
441
+ "logits/chosen": -1.6128125190734863,
442
+ "logits/rejected": -1.5290520191192627,
443
+ "logps/chosen": -246.7266845703125,
444
+ "logps/rejected": -311.530029296875,
445
+ "loss": 0.4494,
446
+ "rewards/accuracies": 0.75,
447
+ "rewards/chosen": -0.024038048461079597,
448
+ "rewards/margins": 0.04517022892832756,
449
+ "rewards/rejected": -0.06920827925205231,
450
+ "step": 21
451
+ },
452
+ {
453
+ "debug/policy_chosen_logits": -1.5806679725646973,
454
+ "debug/policy_chosen_logps": -243.53294372558594,
455
+ "debug/policy_rejected_logits": -1.4768867492675781,
456
+ "debug/policy_rejected_logps": -298.703369140625,
457
+ "debug/reference_chosen_logps": -249.50653076171875,
458
+ "debug/reference_rejected_logps": -297.78338623046875,
459
+ "epoch": 0.5,
460
+ "grad_norm": 14.495393655090531,
461
+ "learning_rate": 1e-06,
462
+ "logits/chosen": -1.5806679725646973,
463
+ "logits/rejected": -1.4768867492675781,
464
+ "logps/chosen": -243.53294372558594,
465
+ "logps/rejected": -298.703369140625,
466
+ "loss": 0.4588,
467
+ "rewards/accuracies": 0.875,
468
+ "rewards/chosen": 0.05973583087325096,
469
+ "rewards/margins": 0.0689353495836258,
470
+ "rewards/rejected": -0.00919952243566513,
471
+ "step": 22
472
+ },
473
+ {
474
+ "debug/policy_chosen_logits": -1.544007658958435,
475
+ "debug/policy_chosen_logps": -237.45614624023438,
476
+ "debug/policy_rejected_logits": -1.5286664962768555,
477
+ "debug/policy_rejected_logps": -277.1255187988281,
478
+ "debug/reference_chosen_logps": -240.9324188232422,
479
+ "debug/reference_rejected_logps": -278.2364196777344,
480
+ "epoch": 0.5227272727272727,
481
+ "grad_norm": 9.777674269402368,
482
+ "learning_rate": 1e-06,
483
+ "logits/chosen": -1.544007658958435,
484
+ "logits/rejected": -1.5286664962768555,
485
+ "logps/chosen": -237.45614624023438,
486
+ "logps/rejected": -277.1255187988281,
487
+ "loss": 0.4841,
488
+ "rewards/accuracies": 0.5,
489
+ "rewards/chosen": 0.03476274386048317,
490
+ "rewards/margins": 0.023653697222471237,
491
+ "rewards/rejected": 0.011109047569334507,
492
+ "step": 23
493
+ },
494
+ {
495
+ "debug/policy_chosen_logits": -1.7358722686767578,
496
+ "debug/policy_chosen_logps": -225.25177001953125,
497
+ "debug/policy_rejected_logits": -1.6812310218811035,
498
+ "debug/policy_rejected_logps": -280.5274658203125,
499
+ "debug/reference_chosen_logps": -234.92703247070312,
500
+ "debug/reference_rejected_logps": -281.25482177734375,
501
+ "epoch": 0.5454545454545454,
502
+ "grad_norm": 13.467530720556999,
503
+ "learning_rate": 1e-06,
504
+ "logits/chosen": -1.7358722686767578,
505
+ "logits/rejected": -1.6812310218811035,
506
+ "logps/chosen": -225.25177001953125,
507
+ "logps/rejected": -280.5274658203125,
508
+ "loss": 0.4623,
509
+ "rewards/accuracies": 0.625,
510
+ "rewards/chosen": 0.09675268083810806,
511
+ "rewards/margins": 0.08947925269603729,
512
+ "rewards/rejected": 0.00727342814207077,
513
+ "step": 24
514
+ },
515
+ {
516
+ "debug/policy_chosen_logits": -1.6739790439605713,
517
+ "debug/policy_chosen_logps": -236.86959838867188,
518
+ "debug/policy_rejected_logits": -1.6139239072799683,
519
+ "debug/policy_rejected_logps": -299.10101318359375,
520
+ "debug/reference_chosen_logps": -242.93692016601562,
521
+ "debug/reference_rejected_logps": -300.6515197753906,
522
+ "epoch": 0.5681818181818182,
523
+ "grad_norm": 17.285734295822873,
524
+ "learning_rate": 1e-06,
525
+ "logits/chosen": -1.6739790439605713,
526
+ "logits/rejected": -1.6139239072799683,
527
+ "logps/chosen": -236.86959838867188,
528
+ "logps/rejected": -299.10101318359375,
529
+ "loss": 0.4723,
530
+ "rewards/accuracies": 0.875,
531
+ "rewards/chosen": 0.060673218220472336,
532
+ "rewards/margins": 0.045168302953243256,
533
+ "rewards/rejected": 0.01550491526722908,
534
+ "step": 25
535
+ },
536
+ {
537
+ "debug/policy_chosen_logits": -1.6005784273147583,
538
+ "debug/policy_chosen_logps": -259.8432312011719,
539
+ "debug/policy_rejected_logits": -1.6029760837554932,
540
+ "debug/policy_rejected_logps": -321.1543273925781,
541
+ "debug/reference_chosen_logps": -260.79193115234375,
542
+ "debug/reference_rejected_logps": -315.56201171875,
543
+ "epoch": 0.5909090909090909,
544
+ "grad_norm": 8.691201102596596,
545
+ "learning_rate": 1e-06,
546
+ "logits/chosen": -1.6005784273147583,
547
+ "logits/rejected": -1.6029760837554932,
548
+ "logps/chosen": -259.8432312011719,
549
+ "logps/rejected": -321.1543273925781,
550
+ "loss": 0.4433,
551
+ "rewards/accuracies": 0.625,
552
+ "rewards/chosen": 0.009487076662480831,
553
+ "rewards/margins": 0.06540995091199875,
554
+ "rewards/rejected": -0.055922869592905045,
555
+ "step": 26
556
+ },
557
+ {
558
+ "debug/policy_chosen_logits": -1.4657841920852661,
559
+ "debug/policy_chosen_logps": -224.77310180664062,
560
+ "debug/policy_rejected_logits": -1.356827974319458,
561
+ "debug/policy_rejected_logps": -251.26858520507812,
562
+ "debug/reference_chosen_logps": -233.67315673828125,
563
+ "debug/reference_rejected_logps": -253.70326232910156,
564
+ "epoch": 0.6136363636363636,
565
+ "grad_norm": 17.374104001690984,
566
+ "learning_rate": 1e-06,
567
+ "logits/chosen": -1.4657841920852661,
568
+ "logits/rejected": -1.356827974319458,
569
+ "logps/chosen": -224.77310180664062,
570
+ "logps/rejected": -251.26858520507812,
571
+ "loss": 0.4737,
572
+ "rewards/accuracies": 0.75,
573
+ "rewards/chosen": 0.08900048583745956,
574
+ "rewards/margins": 0.06465375423431396,
575
+ "rewards/rejected": 0.0243467316031456,
576
+ "step": 27
577
+ },
578
+ {
579
+ "debug/policy_chosen_logits": -1.553269863128662,
580
+ "debug/policy_chosen_logps": -263.990478515625,
581
+ "debug/policy_rejected_logits": -1.5638726949691772,
582
+ "debug/policy_rejected_logps": -269.4271545410156,
583
+ "debug/reference_chosen_logps": -269.1437683105469,
584
+ "debug/reference_rejected_logps": -266.83984375,
585
+ "epoch": 0.6363636363636364,
586
+ "grad_norm": 13.189942958136367,
587
+ "learning_rate": 1e-06,
588
+ "logits/chosen": -1.553269863128662,
589
+ "logits/rejected": -1.5638726949691772,
590
+ "logps/chosen": -263.990478515625,
591
+ "logps/rejected": -269.4271545410156,
592
+ "loss": 0.456,
593
+ "rewards/accuracies": 0.75,
594
+ "rewards/chosen": 0.051532648503780365,
595
+ "rewards/margins": 0.0774059072136879,
596
+ "rewards/rejected": -0.02587326057255268,
597
+ "step": 28
598
+ },
599
+ {
600
+ "debug/policy_chosen_logits": -1.6705049276351929,
601
+ "debug/policy_chosen_logps": -209.72177124023438,
602
+ "debug/policy_rejected_logits": -1.5858540534973145,
603
+ "debug/policy_rejected_logps": -255.63687133789062,
604
+ "debug/reference_chosen_logps": -215.01263427734375,
605
+ "debug/reference_rejected_logps": -254.82583618164062,
606
+ "epoch": 0.6590909090909091,
607
+ "grad_norm": 9.026294915195177,
608
+ "learning_rate": 1e-06,
609
+ "logits/chosen": -1.6705049276351929,
610
+ "logits/rejected": -1.5858540534973145,
611
+ "logps/chosen": -209.72177124023438,
612
+ "logps/rejected": -255.63687133789062,
613
+ "loss": 0.4122,
614
+ "rewards/accuracies": 0.5,
615
+ "rewards/chosen": 0.05290856957435608,
616
+ "rewards/margins": 0.0610189251601696,
617
+ "rewards/rejected": -0.008110351860523224,
618
+ "step": 29
619
+ },
620
+ {
621
+ "debug/policy_chosen_logits": -1.4666748046875,
622
+ "debug/policy_chosen_logps": -205.2412109375,
623
+ "debug/policy_rejected_logits": -1.454092264175415,
624
+ "debug/policy_rejected_logps": -251.50238037109375,
625
+ "debug/reference_chosen_logps": -201.99588012695312,
626
+ "debug/reference_rejected_logps": -243.31808471679688,
627
+ "epoch": 0.6818181818181818,
628
+ "grad_norm": 30.79651895186043,
629
+ "learning_rate": 1e-06,
630
+ "logits/chosen": -1.4666748046875,
631
+ "logits/rejected": -1.454092264175415,
632
+ "logps/chosen": -205.2412109375,
633
+ "logps/rejected": -251.50238037109375,
634
+ "loss": 0.4794,
635
+ "rewards/accuracies": 0.625,
636
+ "rewards/chosen": -0.0324535071849823,
637
+ "rewards/margins": 0.049389228224754333,
638
+ "rewards/rejected": -0.08184273540973663,
639
+ "step": 30
640
+ },
641
+ {
642
+ "debug/policy_chosen_logits": -1.6505978107452393,
643
+ "debug/policy_chosen_logps": -214.40921020507812,
644
+ "debug/policy_rejected_logits": -1.487164855003357,
645
+ "debug/policy_rejected_logps": -263.3447570800781,
646
+ "debug/reference_chosen_logps": -216.8272247314453,
647
+ "debug/reference_rejected_logps": -259.3106689453125,
648
+ "epoch": 0.7045454545454546,
649
+ "grad_norm": 22.79786121885546,
650
+ "learning_rate": 1e-06,
651
+ "logits/chosen": -1.6505978107452393,
652
+ "logits/rejected": -1.487164855003357,
653
+ "logps/chosen": -214.40921020507812,
654
+ "logps/rejected": -263.3447570800781,
655
+ "loss": 0.47,
656
+ "rewards/accuracies": 0.625,
657
+ "rewards/chosen": 0.024180222302675247,
658
+ "rewards/margins": 0.06452129781246185,
659
+ "rewards/rejected": -0.04034107178449631,
660
+ "step": 31
661
+ },
662
+ {
663
+ "debug/policy_chosen_logits": -1.5119901895523071,
664
+ "debug/policy_chosen_logps": -211.55398559570312,
665
+ "debug/policy_rejected_logits": -1.435441493988037,
666
+ "debug/policy_rejected_logps": -212.512451171875,
667
+ "debug/reference_chosen_logps": -216.06100463867188,
668
+ "debug/reference_rejected_logps": -215.74588012695312,
669
+ "epoch": 0.7272727272727273,
670
+ "grad_norm": 13.807319731580307,
671
+ "learning_rate": 1e-06,
672
+ "logits/chosen": -1.5119901895523071,
673
+ "logits/rejected": -1.435441493988037,
674
+ "logps/chosen": -211.55398559570312,
675
+ "logps/rejected": -212.512451171875,
676
+ "loss": 0.4313,
677
+ "rewards/accuracies": 0.5,
678
+ "rewards/chosen": 0.045070286840200424,
679
+ "rewards/margins": 0.012735961005091667,
680
+ "rewards/rejected": 0.032334327697753906,
681
+ "step": 32
682
+ },
683
+ {
684
+ "debug/policy_chosen_logits": -1.5955944061279297,
685
+ "debug/policy_chosen_logps": -233.2834014892578,
686
+ "debug/policy_rejected_logits": -1.5012489557266235,
687
+ "debug/policy_rejected_logps": -314.1435546875,
688
+ "debug/reference_chosen_logps": -234.11337280273438,
689
+ "debug/reference_rejected_logps": -305.76068115234375,
690
+ "epoch": 0.75,
691
+ "grad_norm": 13.254851372269933,
692
+ "learning_rate": 1e-06,
693
+ "logits/chosen": -1.5955944061279297,
694
+ "logits/rejected": -1.5012489557266235,
695
+ "logps/chosen": -233.2834014892578,
696
+ "logps/rejected": -314.1435546875,
697
+ "loss": 0.4447,
698
+ "rewards/accuracies": 0.625,
699
+ "rewards/chosen": 0.008299730718135834,
700
+ "rewards/margins": 0.09212875366210938,
701
+ "rewards/rejected": -0.08382902294397354,
702
+ "step": 33
703
+ },
704
+ {
705
+ "debug/policy_chosen_logits": -1.6708979606628418,
706
+ "debug/policy_chosen_logps": -233.26309204101562,
707
+ "debug/policy_rejected_logits": -1.615405797958374,
708
+ "debug/policy_rejected_logps": -270.6951904296875,
709
+ "debug/reference_chosen_logps": -235.64840698242188,
710
+ "debug/reference_rejected_logps": -270.08343505859375,
711
+ "epoch": 0.7727272727272727,
712
+ "grad_norm": 23.473338325941466,
713
+ "learning_rate": 1e-06,
714
+ "logits/chosen": -1.6708979606628418,
715
+ "logits/rejected": -1.615405797958374,
716
+ "logps/chosen": -233.26309204101562,
717
+ "logps/rejected": -270.6951904296875,
718
+ "loss": 0.459,
719
+ "rewards/accuracies": 0.5,
720
+ "rewards/chosen": 0.02385326474905014,
721
+ "rewards/margins": 0.02997133508324623,
722
+ "rewards/rejected": -0.00611807219684124,
723
+ "step": 34
724
+ },
725
+ {
726
+ "debug/policy_chosen_logits": -1.6434283256530762,
727
+ "debug/policy_chosen_logps": -223.66957092285156,
728
+ "debug/policy_rejected_logits": -1.710732340812683,
729
+ "debug/policy_rejected_logps": -259.24407958984375,
730
+ "debug/reference_chosen_logps": -227.90109252929688,
731
+ "debug/reference_rejected_logps": -260.5845031738281,
732
+ "epoch": 0.7954545454545454,
733
+ "grad_norm": 8.241099124072319,
734
+ "learning_rate": 1e-06,
735
+ "logits/chosen": -1.6434283256530762,
736
+ "logits/rejected": -1.710732340812683,
737
+ "logps/chosen": -223.66957092285156,
738
+ "logps/rejected": -259.24407958984375,
739
+ "loss": 0.4578,
740
+ "rewards/accuracies": 0.5,
741
+ "rewards/chosen": 0.04231514036655426,
742
+ "rewards/margins": 0.028910866007208824,
743
+ "rewards/rejected": 0.013404272496700287,
744
+ "step": 35
745
+ },
746
+ {
747
+ "debug/policy_chosen_logits": -1.6134809255599976,
748
+ "debug/policy_chosen_logps": -236.91664123535156,
749
+ "debug/policy_rejected_logits": -1.330764889717102,
750
+ "debug/policy_rejected_logps": -294.82269287109375,
751
+ "debug/reference_chosen_logps": -239.8042755126953,
752
+ "debug/reference_rejected_logps": -286.74749755859375,
753
+ "epoch": 0.8181818181818182,
754
+ "grad_norm": 13.808704186127128,
755
+ "learning_rate": 1e-06,
756
+ "logits/chosen": -1.6134809255599976,
757
+ "logits/rejected": -1.330764889717102,
758
+ "logps/chosen": -236.91664123535156,
759
+ "logps/rejected": -294.82269287109375,
760
+ "loss": 0.432,
761
+ "rewards/accuracies": 0.75,
762
+ "rewards/chosen": 0.02887626551091671,
763
+ "rewards/margins": 0.10962820053100586,
764
+ "rewards/rejected": -0.080751933157444,
765
+ "step": 36
766
+ },
767
+ {
768
+ "debug/policy_chosen_logits": -1.649086594581604,
769
+ "debug/policy_chosen_logps": -271.196533203125,
770
+ "debug/policy_rejected_logits": -1.5742888450622559,
771
+ "debug/policy_rejected_logps": -250.44020080566406,
772
+ "debug/reference_chosen_logps": -270.84820556640625,
773
+ "debug/reference_rejected_logps": -250.24411010742188,
774
+ "epoch": 0.8409090909090909,
775
+ "grad_norm": 12.080523316758347,
776
+ "learning_rate": 1e-06,
777
+ "logits/chosen": -1.649086594581604,
778
+ "logits/rejected": -1.5742888450622559,
779
+ "logps/chosen": -271.196533203125,
780
+ "logps/rejected": -250.44020080566406,
781
+ "loss": 0.4533,
782
+ "rewards/accuracies": 0.375,
783
+ "rewards/chosen": -0.003483029082417488,
784
+ "rewards/margins": -0.0015221424400806427,
785
+ "rewards/rejected": -0.00196088757365942,
786
+ "step": 37
787
+ },
788
+ {
789
+ "debug/policy_chosen_logits": -1.4421508312225342,
790
+ "debug/policy_chosen_logps": -252.74948120117188,
791
+ "debug/policy_rejected_logits": -1.4143319129943848,
792
+ "debug/policy_rejected_logps": -235.70460510253906,
793
+ "debug/reference_chosen_logps": -255.84397888183594,
794
+ "debug/reference_rejected_logps": -241.24551391601562,
795
+ "epoch": 0.8636363636363636,
796
+ "grad_norm": 10.462034516091515,
797
+ "learning_rate": 1e-06,
798
+ "logits/chosen": -1.4421508312225342,
799
+ "logits/rejected": -1.4143319129943848,
800
+ "logps/chosen": -252.74948120117188,
801
+ "logps/rejected": -235.70460510253906,
802
+ "loss": 0.492,
803
+ "rewards/accuracies": 0.375,
804
+ "rewards/chosen": 0.03094497323036194,
805
+ "rewards/margins": -0.02446414716541767,
806
+ "rewards/rejected": 0.05540912598371506,
807
+ "step": 38
808
+ },
809
+ {
810
+ "debug/policy_chosen_logits": -1.6667423248291016,
811
+ "debug/policy_chosen_logps": -300.1979064941406,
812
+ "debug/policy_rejected_logits": -1.5922452211380005,
813
+ "debug/policy_rejected_logps": -303.7471923828125,
814
+ "debug/reference_chosen_logps": -299.5347900390625,
815
+ "debug/reference_rejected_logps": -300.3440246582031,
816
+ "epoch": 0.8863636363636364,
817
+ "grad_norm": 30.0142964105923,
818
+ "learning_rate": 1e-06,
819
+ "logits/chosen": -1.6667423248291016,
820
+ "logits/rejected": -1.5922452211380005,
821
+ "logps/chosen": -300.1979064941406,
822
+ "logps/rejected": -303.7471923828125,
823
+ "loss": 0.4455,
824
+ "rewards/accuracies": 0.625,
825
+ "rewards/chosen": -0.0066313184797763824,
826
+ "rewards/margins": 0.027400527149438858,
827
+ "rewards/rejected": -0.03403184935450554,
828
+ "step": 39
829
+ },
830
+ {
831
+ "debug/policy_chosen_logits": -1.4706634283065796,
832
+ "debug/policy_chosen_logps": -262.039306640625,
833
+ "debug/policy_rejected_logits": -1.4372996091842651,
834
+ "debug/policy_rejected_logps": -312.3645324707031,
835
+ "debug/reference_chosen_logps": -265.0177917480469,
836
+ "debug/reference_rejected_logps": -305.94091796875,
837
+ "epoch": 0.9090909090909091,
838
+ "grad_norm": 11.477109959061801,
839
+ "learning_rate": 1e-06,
840
+ "logits/chosen": -1.4706634283065796,
841
+ "logits/rejected": -1.4372996091842651,
842
+ "logps/chosen": -262.039306640625,
843
+ "logps/rejected": -312.3645324707031,
844
+ "loss": 0.4015,
845
+ "rewards/accuracies": 0.625,
846
+ "rewards/chosen": 0.029784508049488068,
847
+ "rewards/margins": 0.09402050077915192,
848
+ "rewards/rejected": -0.06423598527908325,
849
+ "step": 40
850
+ },
851
+ {
852
+ "debug/policy_chosen_logits": -1.5731853246688843,
853
+ "debug/policy_chosen_logps": -211.87860107421875,
854
+ "debug/policy_rejected_logits": -1.4196109771728516,
855
+ "debug/policy_rejected_logps": -305.4735107421875,
856
+ "debug/reference_chosen_logps": -213.5313720703125,
857
+ "debug/reference_rejected_logps": -291.21087646484375,
858
+ "epoch": 0.9318181818181818,
859
+ "grad_norm": 11.839736690859825,
860
+ "learning_rate": 1e-06,
861
+ "logits/chosen": -1.5731853246688843,
862
+ "logits/rejected": -1.4196109771728516,
863
+ "logps/chosen": -211.87860107421875,
864
+ "logps/rejected": -305.4735107421875,
865
+ "loss": 0.4282,
866
+ "rewards/accuracies": 0.875,
867
+ "rewards/chosen": 0.0165276937186718,
868
+ "rewards/margins": 0.15915431082248688,
869
+ "rewards/rejected": -0.14262662827968597,
870
+ "step": 41
871
+ },
872
+ {
873
+ "debug/policy_chosen_logits": -1.5689445734024048,
874
+ "debug/policy_chosen_logps": -220.84030151367188,
875
+ "debug/policy_rejected_logits": -1.505204200744629,
876
+ "debug/policy_rejected_logps": -280.9881896972656,
877
+ "debug/reference_chosen_logps": -227.65293884277344,
878
+ "debug/reference_rejected_logps": -277.71539306640625,
879
+ "epoch": 0.9545454545454546,
880
+ "grad_norm": 17.621932069350258,
881
+ "learning_rate": 1e-06,
882
+ "logits/chosen": -1.5689445734024048,
883
+ "logits/rejected": -1.505204200744629,
884
+ "logps/chosen": -220.84030151367188,
885
+ "logps/rejected": -280.9881896972656,
886
+ "loss": 0.4531,
887
+ "rewards/accuracies": 0.875,
888
+ "rewards/chosen": 0.0681263729929924,
889
+ "rewards/margins": 0.10085420310497284,
890
+ "rewards/rejected": -0.03272783011198044,
891
+ "step": 42
892
+ },
893
+ {
894
+ "debug/policy_chosen_logits": -1.573068618774414,
895
+ "debug/policy_chosen_logps": -226.22598266601562,
896
+ "debug/policy_rejected_logits": -1.4944108724594116,
897
+ "debug/policy_rejected_logps": -252.19342041015625,
898
+ "debug/reference_chosen_logps": -227.93409729003906,
899
+ "debug/reference_rejected_logps": -250.0861358642578,
900
+ "epoch": 0.9772727272727273,
901
+ "grad_norm": 15.558548118353103,
902
+ "learning_rate": 1e-06,
903
+ "logits/chosen": -1.573068618774414,
904
+ "logits/rejected": -1.4944108724594116,
905
+ "logps/chosen": -226.22598266601562,
906
+ "logps/rejected": -252.19342041015625,
907
+ "loss": 0.4693,
908
+ "rewards/accuracies": 0.625,
909
+ "rewards/chosen": 0.0170811265707016,
910
+ "rewards/margins": 0.03815402835607529,
911
+ "rewards/rejected": -0.021072901785373688,
912
+ "step": 43
913
+ },
914
+ {
915
+ "debug/policy_chosen_logits": -1.5688749551773071,
916
+ "debug/policy_chosen_logps": -247.65240478515625,
917
+ "debug/policy_rejected_logits": -1.5015504360198975,
918
+ "debug/policy_rejected_logps": -276.2064208984375,
919
+ "debug/reference_chosen_logps": -250.34555053710938,
920
+ "debug/reference_rejected_logps": -271.73583984375,
921
+ "epoch": 1.0,
922
+ "grad_norm": 27.463260049975947,
923
+ "learning_rate": 1e-06,
924
+ "logits/chosen": -1.5688749551773071,
925
+ "logits/rejected": -1.5015504360198975,
926
+ "logps/chosen": -247.65240478515625,
927
+ "logps/rejected": -276.2064208984375,
928
+ "loss": 0.4479,
929
+ "rewards/accuracies": 0.625,
930
+ "rewards/chosen": 0.0269315168261528,
931
+ "rewards/margins": 0.07163721323013306,
932
+ "rewards/rejected": -0.044705700129270554,
933
+ "step": 44
934
+ },
935
+ {
936
+ "epoch": 1.0,
937
+ "step": 44,
938
+ "total_flos": 0.0,
939
+ "train_loss": 0.46715006164529105,
940
+ "train_runtime": 182.0537,
941
+ "train_samples_per_second": 15.44,
942
+ "train_steps_per_second": 0.242
943
+ }
944
+ ],
945
+ "logging_steps": 1,
946
+ "max_steps": 44,
947
+ "num_input_tokens_seen": 0,
948
+ "num_train_epochs": 1,
949
+ "save_steps": 500,
950
+ "stateful_callbacks": {
951
+ "TrainerControl": {
952
+ "args": {
953
+ "should_epoch_stop": false,
954
+ "should_evaluate": false,
955
+ "should_log": false,
956
+ "should_save": true,
957
+ "should_training_stop": true
958
+ },
959
+ "attributes": {}
960
+ }
961
+ },
962
+ "total_flos": 0.0,
963
+ "train_batch_size": 8,
964
+ "trial_name": null,
965
+ "trial_params": null
966
+ }