yiran-wang3 commited on
Commit
262d5d5
1 Parent(s): 3545b66

End of training

Browse files
Files changed (5) hide show
  1. README.md +1 -1
  2. all_results.json +4 -4
  3. config.json +1 -1
  4. train_results.json +4 -4
  5. trainer_state.json +805 -805
README.md CHANGED
@@ -61,4 +61,4 @@ The following hyperparameters were used during training:
61
  - Transformers 4.45.0
62
  - Pytorch 2.4.0+cu121
63
  - Datasets 2.14.6
64
- - Tokenizers 0.20.2
 
61
  - Transformers 4.45.0
62
  - Pytorch 2.4.0+cu121
63
  - Datasets 2.14.6
64
+ - Tokenizers 0.20.3
all_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "epoch": 1.0,
3
  "total_flos": 0.0,
4
- "train_loss": 0.44354424568322987,
5
- "train_runtime": 174.5843,
6
  "train_samples": 3272,
7
- "train_samples_per_second": 18.742,
8
- "train_steps_per_second": 0.298
9
  }
 
1
  {
2
  "epoch": 1.0,
3
  "total_flos": 0.0,
4
+ "train_loss": 0.45137535952604735,
5
+ "train_runtime": 176.3389,
6
  "train_samples": 3272,
7
+ "train_samples_per_second": 18.555,
8
+ "train_steps_per_second": 0.295
9
  }
config.json CHANGED
@@ -23,7 +23,7 @@
23
  "tie_word_embeddings": true,
24
  "torch_dtype": "bfloat16",
25
  "transformers_version": "4.45.0",
26
- "use_cache": false,
27
  "use_sliding_window": false,
28
  "vocab_size": 151936
29
  }
 
23
  "tie_word_embeddings": true,
24
  "torch_dtype": "bfloat16",
25
  "transformers_version": "4.45.0",
26
+ "use_cache": true,
27
  "use_sliding_window": false,
28
  "vocab_size": 151936
29
  }
train_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "epoch": 1.0,
3
  "total_flos": 0.0,
4
- "train_loss": 0.44354424568322987,
5
- "train_runtime": 174.5843,
6
  "train_samples": 3272,
7
- "train_samples_per_second": 18.742,
8
- "train_steps_per_second": 0.298
9
  }
 
1
  {
2
  "epoch": 1.0,
3
  "total_flos": 0.0,
4
+ "train_loss": 0.45137535952604735,
5
+ "train_runtime": 176.3389,
6
  "train_samples": 3272,
7
+ "train_samples_per_second": 18.555,
8
+ "train_steps_per_second": 0.295
9
  }
trainer_state.json CHANGED
@@ -9,19 +9,19 @@
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "debug/policy_chosen_logits": -1.2316170930862427,
13
- "debug/policy_chosen_logps": -132.73162841796875,
14
- "debug/policy_rejected_logits": -1.1728769540786743,
15
- "debug/policy_rejected_logps": -162.23696899414062,
16
- "debug/reference_chosen_logps": -132.73162841796875,
17
- "debug/reference_rejected_logps": -162.23696899414062,
18
  "epoch": 0.019230769230769232,
19
- "grad_norm": 21.661978971839787,
20
  "learning_rate": 1e-06,
21
- "logits/chosen": -1.2316170930862427,
22
- "logits/rejected": -1.1728769540786743,
23
- "logps/chosen": -132.73162841796875,
24
- "logps/rejected": -162.23696899414062,
25
  "loss": 0.5,
26
  "rewards/accuracies": 0.0,
27
  "rewards/chosen": 0.0,
@@ -30,1084 +30,1084 @@
30
  "step": 1
31
  },
32
  {
33
- "debug/policy_chosen_logits": -1.2594889402389526,
34
- "debug/policy_chosen_logps": -179.831298828125,
35
- "debug/policy_rejected_logits": -1.1996418237686157,
36
- "debug/policy_rejected_logps": -205.6752471923828,
37
- "debug/reference_chosen_logps": -178.32009887695312,
38
- "debug/reference_rejected_logps": -204.66696166992188,
39
  "epoch": 0.038461538461538464,
40
- "grad_norm": 38.56925064024332,
41
  "learning_rate": 1e-06,
42
- "logits/chosen": -1.2594889402389526,
43
- "logits/rejected": -1.1996418237686157,
44
- "logps/chosen": -179.831298828125,
45
- "logps/rejected": -205.6752471923828,
46
- "loss": 0.4876,
47
  "rewards/accuracies": 0.375,
48
- "rewards/chosen": -0.01511194184422493,
49
- "rewards/margins": -0.005029010586440563,
50
- "rewards/rejected": -0.010082930326461792,
51
  "step": 2
52
  },
53
  {
54
- "debug/policy_chosen_logits": -1.2830009460449219,
55
- "debug/policy_chosen_logps": -160.35537719726562,
56
- "debug/policy_rejected_logits": -1.2497457265853882,
57
- "debug/policy_rejected_logps": -240.45254516601562,
58
- "debug/reference_chosen_logps": -153.9058837890625,
59
- "debug/reference_rejected_logps": -233.42526245117188,
60
  "epoch": 0.057692307692307696,
61
- "grad_norm": 74.62803863506201,
62
  "learning_rate": 1e-06,
63
- "logits/chosen": -1.2830009460449219,
64
- "logits/rejected": -1.2497457265853882,
65
- "logps/chosen": -160.35537719726562,
66
- "logps/rejected": -240.45254516601562,
67
- "loss": 0.518,
68
  "rewards/accuracies": 0.625,
69
- "rewards/chosen": -0.0644947960972786,
70
- "rewards/margins": 0.005777954123914242,
71
- "rewards/rejected": -0.07027274370193481,
72
  "step": 3
73
  },
74
  {
75
- "debug/policy_chosen_logits": -1.1099666357040405,
76
- "debug/policy_chosen_logps": -96.8184814453125,
77
- "debug/policy_rejected_logits": -1.05377995967865,
78
- "debug/policy_rejected_logps": -202.26805114746094,
79
- "debug/reference_chosen_logps": -95.6224365234375,
80
- "debug/reference_rejected_logps": -194.42694091796875,
81
  "epoch": 0.07692307692307693,
82
- "grad_norm": 90.61433067087236,
83
  "learning_rate": 1e-06,
84
- "logits/chosen": -1.1099666357040405,
85
- "logits/rejected": -1.05377995967865,
86
- "logps/chosen": -96.8184814453125,
87
- "logps/rejected": -202.26805114746094,
88
- "loss": 0.5302,
89
  "rewards/accuracies": 0.875,
90
- "rewards/chosen": -0.011960487812757492,
91
- "rewards/margins": 0.06645052880048752,
92
- "rewards/rejected": -0.07841102033853531,
93
  "step": 4
94
  },
95
  {
96
- "debug/policy_chosen_logits": -1.109797716140747,
97
- "debug/policy_chosen_logps": -120.20796203613281,
98
- "debug/policy_rejected_logits": -1.1077115535736084,
99
- "debug/policy_rejected_logps": -216.43023681640625,
100
- "debug/reference_chosen_logps": -118.03280639648438,
101
- "debug/reference_rejected_logps": -212.2555694580078,
102
  "epoch": 0.09615384615384616,
103
- "grad_norm": 59.296344658626325,
104
  "learning_rate": 1e-06,
105
- "logits/chosen": -1.109797716140747,
106
- "logits/rejected": -1.1077115535736084,
107
- "logps/chosen": -120.20796203613281,
108
- "logps/rejected": -216.43023681640625,
109
- "loss": 0.4996,
110
- "rewards/accuracies": 0.75,
111
- "rewards/chosen": -0.02175154723227024,
112
- "rewards/margins": 0.01999506726861,
113
- "rewards/rejected": -0.04174661636352539,
114
  "step": 5
115
  },
116
  {
117
- "debug/policy_chosen_logits": -1.3628556728363037,
118
- "debug/policy_chosen_logps": -166.7021484375,
119
- "debug/policy_rejected_logits": -1.273955225944519,
120
- "debug/policy_rejected_logps": -198.93994140625,
121
- "debug/reference_chosen_logps": -166.29620361328125,
122
- "debug/reference_rejected_logps": -197.29959106445312,
123
  "epoch": 0.11538461538461539,
124
- "grad_norm": 19.083922077983136,
125
  "learning_rate": 1e-06,
126
- "logits/chosen": -1.3628556728363037,
127
- "logits/rejected": -1.273955225944519,
128
- "logps/chosen": -166.7021484375,
129
- "logps/rejected": -198.93994140625,
130
- "loss": 0.4889,
131
- "rewards/accuracies": 0.5,
132
- "rewards/chosen": -0.004059524275362492,
133
- "rewards/margins": 0.012344006448984146,
134
- "rewards/rejected": -0.016403531655669212,
135
  "step": 6
136
  },
137
  {
138
- "debug/policy_chosen_logits": -1.2453309297561646,
139
- "debug/policy_chosen_logps": -135.94847106933594,
140
- "debug/policy_rejected_logits": -1.2954285144805908,
141
- "debug/policy_rejected_logps": -176.56109619140625,
142
- "debug/reference_chosen_logps": -136.97216796875,
143
- "debug/reference_rejected_logps": -176.8516082763672,
144
  "epoch": 0.1346153846153846,
145
- "grad_norm": 30.257774152774047,
146
  "learning_rate": 1e-06,
147
- "logits/chosen": -1.2453309297561646,
148
- "logits/rejected": -1.2954285144805908,
149
- "logps/chosen": -135.94847106933594,
150
- "logps/rejected": -176.56109619140625,
151
- "loss": 0.4773,
152
- "rewards/accuracies": 0.625,
153
- "rewards/chosen": 0.010236978530883789,
154
- "rewards/margins": 0.0073319049552083015,
155
- "rewards/rejected": 0.002905072644352913,
156
  "step": 7
157
  },
158
  {
159
- "debug/policy_chosen_logits": -1.1769282817840576,
160
- "debug/policy_chosen_logps": -130.25563049316406,
161
- "debug/policy_rejected_logits": -1.251535177230835,
162
- "debug/policy_rejected_logps": -211.3385009765625,
163
- "debug/reference_chosen_logps": -135.37171936035156,
164
- "debug/reference_rejected_logps": -218.43051147460938,
165
  "epoch": 0.15384615384615385,
166
- "grad_norm": 53.6538734909917,
167
  "learning_rate": 1e-06,
168
- "logits/chosen": -1.1769282817840576,
169
- "logits/rejected": -1.251535177230835,
170
- "logps/chosen": -130.25563049316406,
171
- "logps/rejected": -211.3385009765625,
172
- "loss": 0.4852,
173
  "rewards/accuracies": 0.625,
174
- "rewards/chosen": 0.05116088315844536,
175
- "rewards/margins": -0.019759180024266243,
176
- "rewards/rejected": 0.07092006504535675,
177
  "step": 8
178
  },
179
  {
180
- "debug/policy_chosen_logits": -1.1520813703536987,
181
- "debug/policy_chosen_logps": -130.01480102539062,
182
- "debug/policy_rejected_logits": -1.0665879249572754,
183
- "debug/policy_rejected_logps": -164.55233764648438,
184
- "debug/reference_chosen_logps": -131.8277130126953,
185
- "debug/reference_rejected_logps": -168.2589569091797,
186
  "epoch": 0.17307692307692307,
187
- "grad_norm": 68.56866910167538,
188
  "learning_rate": 1e-06,
189
- "logits/chosen": -1.1520813703536987,
190
- "logits/rejected": -1.0665879249572754,
191
- "logps/chosen": -130.01480102539062,
192
- "logps/rejected": -164.55233764648438,
193
- "loss": 0.4662,
194
- "rewards/accuracies": 0.375,
195
- "rewards/chosen": 0.018129272386431694,
196
- "rewards/margins": -0.018936950713396072,
197
- "rewards/rejected": 0.03706622123718262,
198
  "step": 9
199
  },
200
  {
201
- "debug/policy_chosen_logits": -1.2677048444747925,
202
- "debug/policy_chosen_logps": -108.82756042480469,
203
- "debug/policy_rejected_logits": -1.1619057655334473,
204
- "debug/policy_rejected_logps": -212.99595642089844,
205
- "debug/reference_chosen_logps": -115.01768493652344,
206
- "debug/reference_rejected_logps": -210.9394073486328,
207
  "epoch": 0.19230769230769232,
208
- "grad_norm": 29.961533234795134,
209
  "learning_rate": 1e-06,
210
- "logits/chosen": -1.2677048444747925,
211
- "logits/rejected": -1.1619057655334473,
212
- "logps/chosen": -108.82756042480469,
213
- "logps/rejected": -212.99595642089844,
214
- "loss": 0.4517,
215
  "rewards/accuracies": 0.75,
216
- "rewards/chosen": 0.061901286244392395,
217
- "rewards/margins": 0.0824667438864708,
218
- "rewards/rejected": -0.020565450191497803,
219
  "step": 10
220
  },
221
  {
222
- "debug/policy_chosen_logits": -1.229325532913208,
223
- "debug/policy_chosen_logps": -152.08340454101562,
224
- "debug/policy_rejected_logits": -1.1806657314300537,
225
- "debug/policy_rejected_logps": -284.9648132324219,
226
- "debug/reference_chosen_logps": -145.8704376220703,
227
- "debug/reference_rejected_logps": -278.4148254394531,
228
  "epoch": 0.21153846153846154,
229
- "grad_norm": 42.197936495884626,
230
  "learning_rate": 1e-06,
231
- "logits/chosen": -1.229325532913208,
232
- "logits/rejected": -1.1806657314300537,
233
- "logps/chosen": -152.08340454101562,
234
- "logps/rejected": -284.9648132324219,
235
- "loss": 0.4869,
236
  "rewards/accuracies": 0.75,
237
- "rewards/chosen": -0.0621296763420105,
238
- "rewards/margins": 0.0033701807260513306,
239
- "rewards/rejected": -0.06549985706806183,
240
  "step": 11
241
  },
242
  {
243
- "debug/policy_chosen_logits": -0.962820291519165,
244
- "debug/policy_chosen_logps": -134.88470458984375,
245
- "debug/policy_rejected_logits": -1.1076884269714355,
246
- "debug/policy_rejected_logps": -276.01092529296875,
247
- "debug/reference_chosen_logps": -137.8263397216797,
248
- "debug/reference_rejected_logps": -271.51300048828125,
249
  "epoch": 0.23076923076923078,
250
- "grad_norm": 19.480574052456717,
251
  "learning_rate": 1e-06,
252
- "logits/chosen": -0.962820291519165,
253
- "logits/rejected": -1.1076884269714355,
254
- "logps/chosen": -134.88470458984375,
255
- "logps/rejected": -276.01092529296875,
256
- "loss": 0.4631,
257
  "rewards/accuracies": 0.75,
258
- "rewards/chosen": 0.029416313394904137,
259
- "rewards/margins": 0.07439534366130829,
260
- "rewards/rejected": -0.0449790358543396,
261
  "step": 12
262
  },
263
  {
264
- "debug/policy_chosen_logits": -1.187856912612915,
265
- "debug/policy_chosen_logps": -136.24522399902344,
266
- "debug/policy_rejected_logits": -1.2246490716934204,
267
- "debug/policy_rejected_logps": -233.98336791992188,
268
- "debug/reference_chosen_logps": -139.45712280273438,
269
- "debug/reference_rejected_logps": -229.1366729736328,
270
  "epoch": 0.25,
271
- "grad_norm": 56.94222632531445,
272
  "learning_rate": 1e-06,
273
- "logits/chosen": -1.187856912612915,
274
- "logits/rejected": -1.2246490716934204,
275
- "logps/chosen": -136.24522399902344,
276
- "logps/rejected": -233.98336791992188,
277
- "loss": 0.4664,
278
- "rewards/accuracies": 0.75,
279
- "rewards/chosen": 0.03211888298392296,
280
- "rewards/margins": 0.08058580756187439,
281
- "rewards/rejected": -0.04846692830324173,
282
  "step": 13
283
  },
284
  {
285
- "debug/policy_chosen_logits": -1.2769882678985596,
286
- "debug/policy_chosen_logps": -104.90899658203125,
287
- "debug/policy_rejected_logits": -1.2562098503112793,
288
- "debug/policy_rejected_logps": -237.41644287109375,
289
- "debug/reference_chosen_logps": -109.57907104492188,
290
- "debug/reference_rejected_logps": -236.04598999023438,
291
  "epoch": 0.2692307692307692,
292
- "grad_norm": 24.76649234851808,
293
  "learning_rate": 1e-06,
294
- "logits/chosen": -1.2769882678985596,
295
- "logits/rejected": -1.2562098503112793,
296
- "logps/chosen": -104.90899658203125,
297
- "logps/rejected": -237.41644287109375,
298
- "loss": 0.4384,
299
- "rewards/accuracies": 0.875,
300
- "rewards/chosen": 0.04670079052448273,
301
- "rewards/margins": 0.06040526181459427,
302
- "rewards/rejected": -0.013704471290111542,
303
  "step": 14
304
  },
305
  {
306
- "debug/policy_chosen_logits": -1.2544196844100952,
307
- "debug/policy_chosen_logps": -109.65789031982422,
308
- "debug/policy_rejected_logits": -1.310359001159668,
309
- "debug/policy_rejected_logps": -199.24005126953125,
310
- "debug/reference_chosen_logps": -108.41796875,
311
- "debug/reference_rejected_logps": -197.53604125976562,
312
  "epoch": 0.28846153846153844,
313
- "grad_norm": 59.48873508478175,
314
  "learning_rate": 1e-06,
315
- "logits/chosen": -1.2544196844100952,
316
- "logits/rejected": -1.310359001159668,
317
- "logps/chosen": -109.65789031982422,
318
- "logps/rejected": -199.24005126953125,
319
- "loss": 0.4714,
320
  "rewards/accuracies": 0.625,
321
- "rewards/chosen": -0.012399187311530113,
322
- "rewards/margins": 0.004640864208340645,
323
- "rewards/rejected": -0.017040051519870758,
324
  "step": 15
325
  },
326
  {
327
- "debug/policy_chosen_logits": -1.2345539331436157,
328
- "debug/policy_chosen_logps": -149.3842010498047,
329
- "debug/policy_rejected_logits": -1.1627442836761475,
330
- "debug/policy_rejected_logps": -241.727294921875,
331
- "debug/reference_chosen_logps": -145.96755981445312,
332
- "debug/reference_rejected_logps": -233.75701904296875,
333
  "epoch": 0.3076923076923077,
334
- "grad_norm": 54.175150299747976,
335
  "learning_rate": 1e-06,
336
- "logits/chosen": -1.2345539331436157,
337
- "logits/rejected": -1.1627442836761475,
338
- "logps/chosen": -149.3842010498047,
339
- "logps/rejected": -241.727294921875,
340
- "loss": 0.4362,
341
  "rewards/accuracies": 0.75,
342
- "rewards/chosen": -0.034166477620601654,
343
- "rewards/margins": 0.0455363467335701,
344
- "rewards/rejected": -0.07970283180475235,
345
  "step": 16
346
  },
347
  {
348
- "debug/policy_chosen_logits": -1.2490094900131226,
349
- "debug/policy_chosen_logps": -146.42208862304688,
350
- "debug/policy_rejected_logits": -1.1850720643997192,
351
- "debug/policy_rejected_logps": -166.25057983398438,
352
- "debug/reference_chosen_logps": -150.5763397216797,
353
- "debug/reference_rejected_logps": -161.2530975341797,
354
  "epoch": 0.3269230769230769,
355
- "grad_norm": 33.4444193325989,
356
  "learning_rate": 1e-06,
357
- "logits/chosen": -1.2490094900131226,
358
- "logits/rejected": -1.1850720643997192,
359
- "logps/chosen": -146.42208862304688,
360
- "logps/rejected": -166.25057983398438,
361
- "loss": 0.4438,
362
- "rewards/accuracies": 0.875,
363
- "rewards/chosen": 0.04154255986213684,
364
- "rewards/margins": 0.09151734411716461,
365
- "rewards/rejected": -0.04997478425502777,
366
  "step": 17
367
  },
368
  {
369
- "debug/policy_chosen_logits": -1.2004398107528687,
370
- "debug/policy_chosen_logps": -163.50543212890625,
371
- "debug/policy_rejected_logits": -1.204341173171997,
372
- "debug/policy_rejected_logps": -288.0180358886719,
373
- "debug/reference_chosen_logps": -163.77049255371094,
374
- "debug/reference_rejected_logps": -263.1712646484375,
375
  "epoch": 0.34615384615384615,
376
- "grad_norm": 61.98069348299531,
377
  "learning_rate": 1e-06,
378
- "logits/chosen": -1.2004398107528687,
379
- "logits/rejected": -1.204341173171997,
380
- "logps/chosen": -163.50543212890625,
381
- "logps/rejected": -288.0180358886719,
382
- "loss": 0.4592,
383
- "rewards/accuracies": 0.75,
384
- "rewards/chosen": 0.002650529146194458,
385
- "rewards/margins": 0.251118004322052,
386
- "rewards/rejected": -0.24846749007701874,
387
  "step": 18
388
  },
389
  {
390
- "debug/policy_chosen_logits": -1.3331176042556763,
391
- "debug/policy_chosen_logps": -110.75132751464844,
392
- "debug/policy_rejected_logits": -1.2217522859573364,
393
- "debug/policy_rejected_logps": -207.45452880859375,
394
- "debug/reference_chosen_logps": -115.902587890625,
395
- "debug/reference_rejected_logps": -205.65411376953125,
396
  "epoch": 0.36538461538461536,
397
- "grad_norm": 20.11730700354406,
398
  "learning_rate": 1e-06,
399
- "logits/chosen": -1.3331176042556763,
400
- "logits/rejected": -1.2217522859573364,
401
- "logps/chosen": -110.75132751464844,
402
- "logps/rejected": -207.45452880859375,
403
- "loss": 0.4491,
404
- "rewards/accuracies": 1.0,
405
- "rewards/chosen": 0.05151257663965225,
406
- "rewards/margins": 0.06951689720153809,
407
- "rewards/rejected": -0.018004322424530983,
408
  "step": 19
409
  },
410
  {
411
- "debug/policy_chosen_logits": -1.338209629058838,
412
- "debug/policy_chosen_logps": -144.75852966308594,
413
- "debug/policy_rejected_logits": -1.3235102891921997,
414
- "debug/policy_rejected_logps": -262.59710693359375,
415
- "debug/reference_chosen_logps": -141.7485809326172,
416
- "debug/reference_rejected_logps": -245.33224487304688,
417
  "epoch": 0.38461538461538464,
418
- "grad_norm": 40.754402513433696,
419
  "learning_rate": 1e-06,
420
- "logits/chosen": -1.338209629058838,
421
- "logits/rejected": -1.3235102891921997,
422
- "logps/chosen": -144.75852966308594,
423
- "logps/rejected": -262.59710693359375,
424
- "loss": 0.4624,
425
- "rewards/accuracies": 0.75,
426
- "rewards/chosen": -0.03009958378970623,
427
- "rewards/margins": 0.14254869520664215,
428
- "rewards/rejected": -0.17264828085899353,
429
  "step": 20
430
  },
431
  {
432
- "debug/policy_chosen_logits": -1.2065143585205078,
433
- "debug/policy_chosen_logps": -146.51068115234375,
434
- "debug/policy_rejected_logits": -1.2265596389770508,
435
- "debug/policy_rejected_logps": -175.40896606445312,
436
- "debug/reference_chosen_logps": -140.78436279296875,
437
- "debug/reference_rejected_logps": -168.80673217773438,
438
  "epoch": 0.40384615384615385,
439
- "grad_norm": 22.110741992294734,
440
  "learning_rate": 1e-06,
441
- "logits/chosen": -1.2065143585205078,
442
- "logits/rejected": -1.2265596389770508,
443
- "logps/chosen": -146.51068115234375,
444
- "logps/rejected": -175.40896606445312,
445
- "loss": 0.4701,
446
  "rewards/accuracies": 0.75,
447
- "rewards/chosen": -0.05726320296525955,
448
- "rewards/margins": 0.008759044110774994,
449
- "rewards/rejected": -0.06602225452661514,
450
  "step": 21
451
  },
452
  {
453
- "debug/policy_chosen_logits": -1.2565746307373047,
454
- "debug/policy_chosen_logps": -158.69158935546875,
455
- "debug/policy_rejected_logits": -1.1477868556976318,
456
- "debug/policy_rejected_logps": -176.73370361328125,
457
- "debug/reference_chosen_logps": -156.03787231445312,
458
- "debug/reference_rejected_logps": -172.10971069335938,
459
  "epoch": 0.4230769230769231,
460
- "grad_norm": 44.62287437323654,
461
  "learning_rate": 1e-06,
462
- "logits/chosen": -1.2565746307373047,
463
- "logits/rejected": -1.1477868556976318,
464
- "logps/chosen": -158.69158935546875,
465
- "logps/rejected": -176.73370361328125,
466
- "loss": 0.4407,
467
- "rewards/accuracies": 0.5,
468
- "rewards/chosen": -0.026537198573350906,
469
- "rewards/margins": 0.019702596589922905,
470
- "rewards/rejected": -0.04623979702591896,
471
  "step": 22
472
  },
473
  {
474
- "debug/policy_chosen_logits": -1.1006157398223877,
475
- "debug/policy_chosen_logps": -100.4822998046875,
476
- "debug/policy_rejected_logits": -1.035710096359253,
477
- "debug/policy_rejected_logps": -333.0516357421875,
478
- "debug/reference_chosen_logps": -104.65483093261719,
479
- "debug/reference_rejected_logps": -310.935302734375,
480
  "epoch": 0.4423076923076923,
481
- "grad_norm": 24.118006233865607,
482
  "learning_rate": 1e-06,
483
- "logits/chosen": -1.1006157398223877,
484
- "logits/rejected": -1.035710096359253,
485
- "logps/chosen": -100.4822998046875,
486
- "logps/rejected": -333.0516357421875,
487
- "loss": 0.4535,
488
- "rewards/accuracies": 0.75,
489
- "rewards/chosen": 0.04172533005475998,
490
- "rewards/margins": 0.26288849115371704,
491
- "rewards/rejected": -0.22116313874721527,
492
  "step": 23
493
  },
494
  {
495
- "debug/policy_chosen_logits": -1.350253701210022,
496
- "debug/policy_chosen_logps": -149.94215393066406,
497
- "debug/policy_rejected_logits": -1.2977714538574219,
498
- "debug/policy_rejected_logps": -326.00872802734375,
499
- "debug/reference_chosen_logps": -150.9017333984375,
500
- "debug/reference_rejected_logps": -303.9064636230469,
501
  "epoch": 0.46153846153846156,
502
- "grad_norm": 31.59608650044171,
503
  "learning_rate": 1e-06,
504
- "logits/chosen": -1.350253701210022,
505
- "logits/rejected": -1.2977714538574219,
506
- "logps/chosen": -149.94215393066406,
507
- "logps/rejected": -326.00872802734375,
508
- "loss": 0.4305,
509
- "rewards/accuracies": 0.875,
510
- "rewards/chosen": 0.00959576666355133,
511
- "rewards/margins": 0.23061853647232056,
512
- "rewards/rejected": -0.22102276980876923,
513
  "step": 24
514
  },
515
  {
516
- "debug/policy_chosen_logits": -1.2466541528701782,
517
- "debug/policy_chosen_logps": -179.1575164794922,
518
- "debug/policy_rejected_logits": -1.2147475481033325,
519
- "debug/policy_rejected_logps": -295.2662658691406,
520
- "debug/reference_chosen_logps": -180.40638732910156,
521
- "debug/reference_rejected_logps": -281.09112548828125,
522
  "epoch": 0.4807692307692308,
523
- "grad_norm": 47.73909960510058,
524
  "learning_rate": 1e-06,
525
- "logits/chosen": -1.2466541528701782,
526
- "logits/rejected": -1.2147475481033325,
527
- "logps/chosen": -179.1575164794922,
528
- "logps/rejected": -295.2662658691406,
529
- "loss": 0.4171,
530
  "rewards/accuracies": 0.75,
531
- "rewards/chosen": 0.012488747015595436,
532
- "rewards/margins": 0.1542397141456604,
533
- "rewards/rejected": -0.14175096154212952,
534
  "step": 25
535
  },
536
  {
537
- "debug/policy_chosen_logits": -1.204865574836731,
538
- "debug/policy_chosen_logps": -137.5517578125,
539
- "debug/policy_rejected_logits": -1.3782544136047363,
540
- "debug/policy_rejected_logps": -208.717041015625,
541
- "debug/reference_chosen_logps": -143.9291229248047,
542
- "debug/reference_rejected_logps": -212.34486389160156,
543
  "epoch": 0.5,
544
- "grad_norm": 41.22809898372289,
545
  "learning_rate": 1e-06,
546
- "logits/chosen": -1.204865574836731,
547
- "logits/rejected": -1.3782544136047363,
548
- "logps/chosen": -137.5517578125,
549
- "logps/rejected": -208.717041015625,
550
- "loss": 0.4209,
551
- "rewards/accuracies": 0.875,
552
- "rewards/chosen": 0.06377358734607697,
553
- "rewards/margins": 0.027495335787534714,
554
- "rewards/rejected": 0.03627825155854225,
555
  "step": 26
556
  },
557
  {
558
- "debug/policy_chosen_logits": -1.2046425342559814,
559
- "debug/policy_chosen_logps": -140.90965270996094,
560
- "debug/policy_rejected_logits": -1.2170807123184204,
561
- "debug/policy_rejected_logps": -170.78704833984375,
562
- "debug/reference_chosen_logps": -140.62596130371094,
563
- "debug/reference_rejected_logps": -174.9475860595703,
564
  "epoch": 0.5192307692307693,
565
- "grad_norm": 15.283751437103401,
566
  "learning_rate": 1e-06,
567
- "logits/chosen": -1.2046425342559814,
568
- "logits/rejected": -1.2170807123184204,
569
- "logps/chosen": -140.90965270996094,
570
- "logps/rejected": -170.78704833984375,
571
- "loss": 0.4228,
572
- "rewards/accuracies": 0.125,
573
- "rewards/chosen": -0.002836771309375763,
574
- "rewards/margins": -0.04444221407175064,
575
- "rewards/rejected": 0.04160544276237488,
576
  "step": 27
577
  },
578
  {
579
- "debug/policy_chosen_logits": -1.4107595682144165,
580
- "debug/policy_chosen_logps": -130.7558135986328,
581
- "debug/policy_rejected_logits": -1.4000462293624878,
582
- "debug/policy_rejected_logps": -164.4125213623047,
583
- "debug/reference_chosen_logps": -137.77110290527344,
584
- "debug/reference_rejected_logps": -167.27540588378906,
585
  "epoch": 0.5384615384615384,
586
- "grad_norm": 38.338385484834546,
587
  "learning_rate": 1e-06,
588
- "logits/chosen": -1.4107595682144165,
589
- "logits/rejected": -1.4000462293624878,
590
- "logps/chosen": -130.7558135986328,
591
- "logps/rejected": -164.4125213623047,
592
- "loss": 0.4467,
593
- "rewards/accuracies": 0.25,
594
- "rewards/chosen": 0.07015287131071091,
595
- "rewards/margins": 0.04152403771877289,
596
- "rewards/rejected": 0.02862883359193802,
597
  "step": 28
598
  },
599
  {
600
- "debug/policy_chosen_logits": -1.398485779762268,
601
- "debug/policy_chosen_logps": -150.90823364257812,
602
- "debug/policy_rejected_logits": -1.174936294555664,
603
- "debug/policy_rejected_logps": -291.80194091796875,
604
- "debug/reference_chosen_logps": -158.24673461914062,
605
- "debug/reference_rejected_logps": -276.33837890625,
606
  "epoch": 0.5576923076923077,
607
- "grad_norm": 15.468395775938875,
608
  "learning_rate": 1e-06,
609
- "logits/chosen": -1.398485779762268,
610
- "logits/rejected": -1.174936294555664,
611
- "logps/chosen": -150.90823364257812,
612
- "logps/rejected": -291.80194091796875,
613
- "loss": 0.4396,
614
  "rewards/accuracies": 0.625,
615
- "rewards/chosen": 0.07338497042655945,
616
- "rewards/margins": 0.22802035510540009,
617
- "rewards/rejected": -0.15463536977767944,
618
  "step": 29
619
  },
620
  {
621
- "debug/policy_chosen_logits": -1.336907982826233,
622
- "debug/policy_chosen_logps": -133.89599609375,
623
- "debug/policy_rejected_logits": -1.2603211402893066,
624
- "debug/policy_rejected_logps": -196.19960021972656,
625
- "debug/reference_chosen_logps": -142.37545776367188,
626
- "debug/reference_rejected_logps": -191.63192749023438,
627
  "epoch": 0.5769230769230769,
628
- "grad_norm": 29.826267277718433,
629
  "learning_rate": 1e-06,
630
- "logits/chosen": -1.336907982826233,
631
- "logits/rejected": -1.2603211402893066,
632
- "logps/chosen": -133.89599609375,
633
- "logps/rejected": -196.19960021972656,
634
- "loss": 0.4405,
635
  "rewards/accuracies": 0.875,
636
- "rewards/chosen": 0.08479461818933487,
637
- "rewards/margins": 0.1304713636636734,
638
- "rewards/rejected": -0.04567674547433853,
639
  "step": 30
640
  },
641
  {
642
- "debug/policy_chosen_logits": -1.3420963287353516,
643
- "debug/policy_chosen_logps": -101.8618392944336,
644
- "debug/policy_rejected_logits": -1.1649795770645142,
645
- "debug/policy_rejected_logps": -241.93505859375,
646
- "debug/reference_chosen_logps": -110.1383056640625,
647
- "debug/reference_rejected_logps": -240.0742950439453,
648
  "epoch": 0.5961538461538461,
649
- "grad_norm": 27.431269846469586,
650
  "learning_rate": 1e-06,
651
- "logits/chosen": -1.3420963287353516,
652
- "logits/rejected": -1.1649795770645142,
653
- "logps/chosen": -101.8618392944336,
654
- "logps/rejected": -241.93505859375,
655
- "loss": 0.468,
656
- "rewards/accuracies": 0.75,
657
- "rewards/chosen": 0.08276471495628357,
658
- "rewards/margins": 0.10137245059013367,
659
- "rewards/rejected": -0.0186077281832695,
660
  "step": 31
661
  },
662
  {
663
- "debug/policy_chosen_logits": -1.2078020572662354,
664
- "debug/policy_chosen_logps": -119.91991424560547,
665
- "debug/policy_rejected_logits": -0.9945322871208191,
666
- "debug/policy_rejected_logps": -330.7967529296875,
667
- "debug/reference_chosen_logps": -139.70213317871094,
668
- "debug/reference_rejected_logps": -319.4359130859375,
669
  "epoch": 0.6153846153846154,
670
- "grad_norm": 14.75813816004449,
671
  "learning_rate": 1e-06,
672
- "logits/chosen": -1.2078020572662354,
673
- "logits/rejected": -0.9945322871208191,
674
- "logps/chosen": -119.91991424560547,
675
- "logps/rejected": -330.7967529296875,
676
- "loss": 0.4,
677
  "rewards/accuracies": 1.0,
678
- "rewards/chosen": 0.1978221833705902,
679
- "rewards/margins": 0.3114301562309265,
680
- "rewards/rejected": -0.11360795795917511,
681
  "step": 32
682
  },
683
  {
684
- "debug/policy_chosen_logits": -1.1920416355133057,
685
- "debug/policy_chosen_logps": -98.884765625,
686
- "debug/policy_rejected_logits": -1.2546271085739136,
687
- "debug/policy_rejected_logps": -283.1683654785156,
688
- "debug/reference_chosen_logps": -110.47108459472656,
689
- "debug/reference_rejected_logps": -274.7044677734375,
690
  "epoch": 0.6346153846153846,
691
- "grad_norm": 17.792135410719844,
692
  "learning_rate": 1e-06,
693
- "logits/chosen": -1.1920416355133057,
694
- "logits/rejected": -1.2546271085739136,
695
- "logps/chosen": -98.884765625,
696
- "logps/rejected": -283.1683654785156,
697
- "loss": 0.404,
698
  "rewards/accuracies": 0.875,
699
- "rewards/chosen": 0.11586315184831619,
700
- "rewards/margins": 0.20050224661827087,
701
- "rewards/rejected": -0.08463907986879349,
702
  "step": 33
703
  },
704
  {
705
- "debug/policy_chosen_logits": -1.32485830783844,
706
- "debug/policy_chosen_logps": -133.64706420898438,
707
- "debug/policy_rejected_logits": -1.2567251920700073,
708
- "debug/policy_rejected_logps": -243.16220092773438,
709
- "debug/reference_chosen_logps": -137.5300750732422,
710
- "debug/reference_rejected_logps": -242.2387237548828,
711
  "epoch": 0.6538461538461539,
712
- "grad_norm": 76.76926374768559,
713
  "learning_rate": 1e-06,
714
- "logits/chosen": -1.32485830783844,
715
- "logits/rejected": -1.2567251920700073,
716
- "logps/chosen": -133.64706420898438,
717
- "logps/rejected": -243.16220092773438,
718
- "loss": 0.4264,
719
  "rewards/accuracies": 0.625,
720
- "rewards/chosen": 0.03883013129234314,
721
- "rewards/margins": 0.04806497320532799,
722
- "rewards/rejected": -0.009234847500920296,
723
  "step": 34
724
  },
725
  {
726
- "debug/policy_chosen_logits": -1.3281021118164062,
727
- "debug/policy_chosen_logps": -114.21416473388672,
728
- "debug/policy_rejected_logits": -1.1842998266220093,
729
- "debug/policy_rejected_logps": -230.96640014648438,
730
- "debug/reference_chosen_logps": -126.98857116699219,
731
- "debug/reference_rejected_logps": -227.5498809814453,
732
  "epoch": 0.6730769230769231,
733
- "grad_norm": 16.48513566801007,
734
  "learning_rate": 1e-06,
735
- "logits/chosen": -1.3281021118164062,
736
- "logits/rejected": -1.1842998266220093,
737
- "logps/chosen": -114.21416473388672,
738
- "logps/rejected": -230.96640014648438,
739
- "loss": 0.4124,
740
- "rewards/accuracies": 0.875,
741
- "rewards/chosen": 0.12774410843849182,
742
- "rewards/margins": 0.1619093120098114,
743
- "rewards/rejected": -0.03416522219777107,
744
  "step": 35
745
  },
746
  {
747
- "debug/policy_chosen_logits": -1.1566346883773804,
748
- "debug/policy_chosen_logps": -117.29952239990234,
749
- "debug/policy_rejected_logits": -1.0660278797149658,
750
- "debug/policy_rejected_logps": -323.2428894042969,
751
- "debug/reference_chosen_logps": -125.93283081054688,
752
- "debug/reference_rejected_logps": -305.2673645019531,
753
  "epoch": 0.6923076923076923,
754
- "grad_norm": 17.26263202281334,
755
  "learning_rate": 1e-06,
756
- "logits/chosen": -1.1566346883773804,
757
- "logits/rejected": -1.0660278797149658,
758
- "logps/chosen": -117.29952239990234,
759
- "logps/rejected": -323.2428894042969,
760
- "loss": 0.4024,
761
  "rewards/accuracies": 1.0,
762
- "rewards/chosen": 0.08633305877447128,
763
- "rewards/margins": 0.2660883069038391,
764
- "rewards/rejected": -0.17975522577762604,
765
  "step": 36
766
  },
767
  {
768
- "debug/policy_chosen_logits": -1.2661734819412231,
769
- "debug/policy_chosen_logps": -139.73275756835938,
770
- "debug/policy_rejected_logits": -1.219040036201477,
771
- "debug/policy_rejected_logps": -192.42286682128906,
772
- "debug/reference_chosen_logps": -141.48370361328125,
773
- "debug/reference_rejected_logps": -179.03509521484375,
774
  "epoch": 0.7115384615384616,
775
- "grad_norm": 31.112423756254753,
776
  "learning_rate": 1e-06,
777
- "logits/chosen": -1.2661734819412231,
778
- "logits/rejected": -1.219040036201477,
779
- "logps/chosen": -139.73275756835938,
780
- "logps/rejected": -192.42286682128906,
781
- "loss": 0.4352,
782
  "rewards/accuracies": 0.75,
783
- "rewards/chosen": 0.017509642988443375,
784
- "rewards/margins": 0.15138748288154602,
785
- "rewards/rejected": -0.13387782871723175,
786
  "step": 37
787
  },
788
  {
789
- "debug/policy_chosen_logits": -1.4617388248443604,
790
- "debug/policy_chosen_logps": -119.26033020019531,
791
- "debug/policy_rejected_logits": -1.4288504123687744,
792
- "debug/policy_rejected_logps": -181.7017364501953,
793
- "debug/reference_chosen_logps": -127.11656188964844,
794
- "debug/reference_rejected_logps": -176.86419677734375,
795
  "epoch": 0.7307692307692307,
796
- "grad_norm": 42.01899889081456,
797
  "learning_rate": 1e-06,
798
- "logits/chosen": -1.4617388248443604,
799
- "logits/rejected": -1.4288504123687744,
800
- "logps/chosen": -119.26033020019531,
801
- "logps/rejected": -181.7017364501953,
802
- "loss": 0.3721,
803
- "rewards/accuracies": 0.875,
804
- "rewards/chosen": 0.07856231927871704,
805
- "rewards/margins": 0.12693758308887482,
806
- "rewards/rejected": -0.04837527871131897,
807
  "step": 38
808
  },
809
  {
810
- "debug/policy_chosen_logits": -1.2514652013778687,
811
- "debug/policy_chosen_logps": -214.69508361816406,
812
- "debug/policy_rejected_logits": -1.1772401332855225,
813
- "debug/policy_rejected_logps": -197.77090454101562,
814
- "debug/reference_chosen_logps": -205.87307739257812,
815
- "debug/reference_rejected_logps": -191.47265625,
816
  "epoch": 0.75,
817
- "grad_norm": 22.111384375060176,
818
  "learning_rate": 1e-06,
819
- "logits/chosen": -1.2514652013778687,
820
- "logits/rejected": -1.1772401332855225,
821
- "logps/chosen": -214.69508361816406,
822
- "logps/rejected": -197.77090454101562,
823
- "loss": 0.4443,
824
- "rewards/accuracies": 0.5,
825
- "rewards/chosen": -0.08822001516819,
826
- "rewards/margins": -0.02523757889866829,
827
- "rewards/rejected": -0.06298243254423141,
828
  "step": 39
829
  },
830
  {
831
- "debug/policy_chosen_logits": -1.4284347295761108,
832
- "debug/policy_chosen_logps": -147.825927734375,
833
- "debug/policy_rejected_logits": -1.3076362609863281,
834
- "debug/policy_rejected_logps": -254.9888153076172,
835
- "debug/reference_chosen_logps": -146.136962890625,
836
- "debug/reference_rejected_logps": -221.61865234375,
837
  "epoch": 0.7692307692307693,
838
- "grad_norm": 78.31350758335499,
839
  "learning_rate": 1e-06,
840
- "logits/chosen": -1.4284347295761108,
841
- "logits/rejected": -1.3076362609863281,
842
- "logps/chosen": -147.825927734375,
843
- "logps/rejected": -254.9888153076172,
844
- "loss": 0.4668,
845
  "rewards/accuracies": 0.875,
846
- "rewards/chosen": -0.016889560967683792,
847
- "rewards/margins": 0.316812127828598,
848
- "rewards/rejected": -0.3337016999721527,
849
  "step": 40
850
  },
851
  {
852
- "debug/policy_chosen_logits": -1.2104414701461792,
853
- "debug/policy_chosen_logps": -180.59503173828125,
854
- "debug/policy_rejected_logits": -1.1626763343811035,
855
- "debug/policy_rejected_logps": -299.04248046875,
856
- "debug/reference_chosen_logps": -183.44879150390625,
857
- "debug/reference_rejected_logps": -268.26422119140625,
858
  "epoch": 0.7884615384615384,
859
- "grad_norm": 50.194451097738806,
860
  "learning_rate": 1e-06,
861
- "logits/chosen": -1.2104414701461792,
862
- "logits/rejected": -1.1626763343811035,
863
- "logps/chosen": -180.59503173828125,
864
- "logps/rejected": -299.04248046875,
865
- "loss": 0.3985,
866
  "rewards/accuracies": 0.75,
867
- "rewards/chosen": 0.02853771299123764,
868
- "rewards/margins": 0.3363204598426819,
869
- "rewards/rejected": -0.30778273940086365,
870
  "step": 41
871
  },
872
  {
873
- "debug/policy_chosen_logits": -1.2823656797409058,
874
- "debug/policy_chosen_logps": -105.43904876708984,
875
- "debug/policy_rejected_logits": -1.2949018478393555,
876
- "debug/policy_rejected_logps": -217.46287536621094,
877
- "debug/reference_chosen_logps": -116.0798568725586,
878
- "debug/reference_rejected_logps": -202.04270935058594,
879
  "epoch": 0.8076923076923077,
880
- "grad_norm": 71.44356233819985,
881
  "learning_rate": 1e-06,
882
- "logits/chosen": -1.2823656797409058,
883
- "logits/rejected": -1.2949018478393555,
884
- "logps/chosen": -105.43904876708984,
885
- "logps/rejected": -217.46287536621094,
886
- "loss": 0.4176,
887
  "rewards/accuracies": 1.0,
888
- "rewards/chosen": 0.10640807449817657,
889
- "rewards/margins": 0.26060980558395386,
890
- "rewards/rejected": -0.1542017161846161,
891
  "step": 42
892
  },
893
  {
894
- "debug/policy_chosen_logits": -1.2382934093475342,
895
- "debug/policy_chosen_logps": -112.60398864746094,
896
- "debug/policy_rejected_logits": -1.0149949789047241,
897
- "debug/policy_rejected_logps": -285.73968505859375,
898
- "debug/reference_chosen_logps": -115.7171630859375,
899
- "debug/reference_rejected_logps": -250.61151123046875,
900
  "epoch": 0.8269230769230769,
901
- "grad_norm": 36.616872187513835,
902
  "learning_rate": 1e-06,
903
- "logits/chosen": -1.2382934093475342,
904
- "logits/rejected": -1.0149949789047241,
905
- "logps/chosen": -112.60398864746094,
906
- "logps/rejected": -285.73968505859375,
907
- "loss": 0.4429,
908
  "rewards/accuracies": 1.0,
909
- "rewards/chosen": 0.031131763011217117,
910
- "rewards/margins": 0.382413387298584,
911
- "rewards/rejected": -0.35128161311149597,
912
  "step": 43
913
  },
914
  {
915
- "debug/policy_chosen_logits": -1.4913691282272339,
916
- "debug/policy_chosen_logps": -89.32835388183594,
917
- "debug/policy_rejected_logits": -1.456650972366333,
918
- "debug/policy_rejected_logps": -236.55682373046875,
919
- "debug/reference_chosen_logps": -100.08968353271484,
920
- "debug/reference_rejected_logps": -225.67787170410156,
921
  "epoch": 0.8461538461538461,
922
- "grad_norm": 15.043363687225893,
923
  "learning_rate": 1e-06,
924
- "logits/chosen": -1.4913691282272339,
925
- "logits/rejected": -1.456650972366333,
926
- "logps/chosen": -89.32835388183594,
927
- "logps/rejected": -236.55682373046875,
928
- "loss": 0.3994,
929
  "rewards/accuracies": 0.75,
930
- "rewards/chosen": 0.10761332511901855,
931
- "rewards/margins": 0.21640273928642273,
932
- "rewards/rejected": -0.10878939926624298,
933
  "step": 44
934
  },
935
  {
936
- "debug/policy_chosen_logits": -1.3964580297470093,
937
- "debug/policy_chosen_logps": -127.14812469482422,
938
- "debug/policy_rejected_logits": -1.4482215642929077,
939
- "debug/policy_rejected_logps": -165.46240234375,
940
- "debug/reference_chosen_logps": -140.458984375,
941
- "debug/reference_rejected_logps": -171.06005859375,
942
  "epoch": 0.8653846153846154,
943
- "grad_norm": 49.59204781141965,
944
  "learning_rate": 1e-06,
945
- "logits/chosen": -1.3964580297470093,
946
- "logits/rejected": -1.4482215642929077,
947
- "logps/chosen": -127.14812469482422,
948
- "logps/rejected": -165.46240234375,
949
- "loss": 0.4419,
950
- "rewards/accuracies": 0.5,
951
- "rewards/chosen": 0.13310852646827698,
952
- "rewards/margins": 0.07713213562965393,
953
- "rewards/rejected": 0.05597639083862305,
954
  "step": 45
955
  },
956
  {
957
- "debug/policy_chosen_logits": -1.2927504777908325,
958
- "debug/policy_chosen_logps": -114.67044830322266,
959
- "debug/policy_rejected_logits": -1.279125452041626,
960
- "debug/policy_rejected_logps": -271.4466247558594,
961
- "debug/reference_chosen_logps": -128.33384704589844,
962
- "debug/reference_rejected_logps": -258.589599609375,
963
  "epoch": 0.8846153846153846,
964
- "grad_norm": 20.796125210807904,
965
  "learning_rate": 1e-06,
966
- "logits/chosen": -1.2927504777908325,
967
- "logits/rejected": -1.279125452041626,
968
- "logps/chosen": -114.67044830322266,
969
- "logps/rejected": -271.4466247558594,
970
- "loss": 0.4084,
971
- "rewards/accuracies": 1.0,
972
- "rewards/chosen": 0.136633962392807,
973
- "rewards/margins": 0.2652040123939514,
974
- "rewards/rejected": -0.1285700500011444,
975
  "step": 46
976
  },
977
  {
978
- "debug/policy_chosen_logits": -1.203310251235962,
979
- "debug/policy_chosen_logps": -116.76630401611328,
980
- "debug/policy_rejected_logits": -1.2802678346633911,
981
- "debug/policy_rejected_logps": -212.71511840820312,
982
- "debug/reference_chosen_logps": -124.21476745605469,
983
- "debug/reference_rejected_logps": -208.848388671875,
984
  "epoch": 0.9038461538461539,
985
- "grad_norm": 22.549023272823217,
986
  "learning_rate": 1e-06,
987
- "logits/chosen": -1.203310251235962,
988
- "logits/rejected": -1.2802678346633911,
989
- "logps/chosen": -116.76630401611328,
990
- "logps/rejected": -212.71511840820312,
991
- "loss": 0.4288,
992
  "rewards/accuracies": 0.625,
993
- "rewards/chosen": 0.07448464632034302,
994
- "rewards/margins": 0.11315208673477173,
995
- "rewards/rejected": -0.03866744041442871,
996
  "step": 47
997
  },
998
  {
999
- "debug/policy_chosen_logits": -1.329450011253357,
1000
- "debug/policy_chosen_logps": -141.68182373046875,
1001
- "debug/policy_rejected_logits": -1.164079189300537,
1002
- "debug/policy_rejected_logps": -292.22705078125,
1003
- "debug/reference_chosen_logps": -148.44650268554688,
1004
- "debug/reference_rejected_logps": -282.9841003417969,
1005
  "epoch": 0.9230769230769231,
1006
- "grad_norm": 38.21519514395376,
1007
  "learning_rate": 1e-06,
1008
- "logits/chosen": -1.329450011253357,
1009
- "logits/rejected": -1.164079189300537,
1010
- "logps/chosen": -141.68182373046875,
1011
- "logps/rejected": -292.22705078125,
1012
- "loss": 0.4126,
1013
  "rewards/accuracies": 0.875,
1014
- "rewards/chosen": 0.06764666736125946,
1015
- "rewards/margins": 0.16007646918296814,
1016
- "rewards/rejected": -0.09242980927228928,
1017
  "step": 48
1018
  },
1019
  {
1020
- "debug/policy_chosen_logits": -1.1785225868225098,
1021
- "debug/policy_chosen_logps": -123.70880889892578,
1022
- "debug/policy_rejected_logits": -1.090317964553833,
1023
- "debug/policy_rejected_logps": -169.21560668945312,
1024
- "debug/reference_chosen_logps": -136.04171752929688,
1025
- "debug/reference_rejected_logps": -169.77383422851562,
1026
  "epoch": 0.9423076923076923,
1027
- "grad_norm": 35.07581270707382,
1028
  "learning_rate": 1e-06,
1029
- "logits/chosen": -1.1785225868225098,
1030
- "logits/rejected": -1.090317964553833,
1031
- "logps/chosen": -123.70880889892578,
1032
- "logps/rejected": -169.21560668945312,
1033
- "loss": 0.4397,
1034
- "rewards/accuracies": 0.75,
1035
- "rewards/chosen": 0.12332899868488312,
1036
- "rewards/margins": 0.11774662882089615,
1037
- "rewards/rejected": 0.005582377314567566,
1038
  "step": 49
1039
  },
1040
  {
1041
- "debug/policy_chosen_logits": -1.2230808734893799,
1042
- "debug/policy_chosen_logps": -157.45596313476562,
1043
- "debug/policy_rejected_logits": -1.170206904411316,
1044
- "debug/policy_rejected_logps": -291.5883483886719,
1045
- "debug/reference_chosen_logps": -161.05508422851562,
1046
- "debug/reference_rejected_logps": -282.8891296386719,
1047
  "epoch": 0.9615384615384616,
1048
- "grad_norm": 15.529898113956751,
1049
  "learning_rate": 1e-06,
1050
- "logits/chosen": -1.2230808734893799,
1051
- "logits/rejected": -1.170206904411316,
1052
- "logps/chosen": -157.45596313476562,
1053
- "logps/rejected": -291.5883483886719,
1054
- "loss": 0.3907,
1055
  "rewards/accuracies": 0.875,
1056
- "rewards/chosen": 0.03599133342504501,
1057
- "rewards/margins": 0.12298347800970078,
1058
- "rewards/rejected": -0.08699213713407516,
1059
  "step": 50
1060
  },
1061
  {
1062
- "debug/policy_chosen_logits": -1.3883893489837646,
1063
- "debug/policy_chosen_logps": -146.48504638671875,
1064
- "debug/policy_rejected_logits": -1.2246520519256592,
1065
- "debug/policy_rejected_logps": -147.98838806152344,
1066
- "debug/reference_chosen_logps": -149.21646118164062,
1067
- "debug/reference_rejected_logps": -146.49032592773438,
1068
  "epoch": 0.9807692307692307,
1069
- "grad_norm": 36.66752216762139,
1070
  "learning_rate": 1e-06,
1071
- "logits/chosen": -1.3883893489837646,
1072
- "logits/rejected": -1.2246520519256592,
1073
- "logps/chosen": -146.48504638671875,
1074
- "logps/rejected": -147.98838806152344,
1075
- "loss": 0.4076,
1076
- "rewards/accuracies": 0.625,
1077
- "rewards/chosen": 0.02731417492032051,
1078
- "rewards/margins": 0.04229472205042839,
1079
- "rewards/rejected": -0.01498054713010788,
1080
  "step": 51
1081
  },
1082
  {
1083
- "debug/policy_chosen_logits": -1.2958321571350098,
1084
- "debug/policy_chosen_logps": -112.06967163085938,
1085
- "debug/policy_rejected_logits": -1.363411545753479,
1086
- "debug/policy_rejected_logps": -210.20330810546875,
1087
- "debug/reference_chosen_logps": -118.75313568115234,
1088
- "debug/reference_rejected_logps": -207.29669189453125,
1089
  "epoch": 1.0,
1090
- "grad_norm": 25.501005146613775,
1091
  "learning_rate": 1e-06,
1092
- "logits/chosen": -1.2958321571350098,
1093
- "logits/rejected": -1.363411545753479,
1094
- "logps/chosen": -112.06967163085938,
1095
- "logps/rejected": -210.20330810546875,
1096
- "loss": 0.3806,
1097
- "rewards/accuracies": 0.75,
1098
- "rewards/chosen": 0.06683465838432312,
1099
- "rewards/margins": 0.09590078890323639,
1100
- "rewards/rejected": -0.02906613051891327,
1101
  "step": 52
1102
  },
1103
  {
1104
  "epoch": 1.0,
1105
  "step": 52,
1106
  "total_flos": 0.0,
1107
- "train_loss": 0.44354424568322987,
1108
- "train_runtime": 174.5843,
1109
- "train_samples_per_second": 18.742,
1110
- "train_steps_per_second": 0.298
1111
  }
1112
  ],
1113
  "logging_steps": 1,
 
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "debug/policy_chosen_logits": -1.2348219156265259,
13
+ "debug/policy_chosen_logps": -132.92245483398438,
14
+ "debug/policy_rejected_logits": -1.1747448444366455,
15
+ "debug/policy_rejected_logps": -163.80661010742188,
16
+ "debug/reference_chosen_logps": -132.92245483398438,
17
+ "debug/reference_rejected_logps": -163.80661010742188,
18
  "epoch": 0.019230769230769232,
19
+ "grad_norm": 21.739564681695516,
20
  "learning_rate": 1e-06,
21
+ "logits/chosen": -1.2348219156265259,
22
+ "logits/rejected": -1.1747448444366455,
23
+ "logps/chosen": -132.92245483398438,
24
+ "logps/rejected": -163.80661010742188,
25
  "loss": 0.5,
26
  "rewards/accuracies": 0.0,
27
  "rewards/chosen": 0.0,
 
30
  "step": 1
31
  },
32
  {
33
+ "debug/policy_chosen_logits": -1.2617549896240234,
34
+ "debug/policy_chosen_logps": -182.63986206054688,
35
+ "debug/policy_rejected_logits": -1.2036871910095215,
36
+ "debug/policy_rejected_logps": -207.30313110351562,
37
+ "debug/reference_chosen_logps": -180.3394012451172,
38
+ "debug/reference_rejected_logps": -207.17822265625,
39
  "epoch": 0.038461538461538464,
40
+ "grad_norm": 38.38047434038335,
41
  "learning_rate": 1e-06,
42
+ "logits/chosen": -1.2617549896240234,
43
+ "logits/rejected": -1.2036871910095215,
44
+ "logps/chosen": -182.63986206054688,
45
+ "logps/rejected": -207.30313110351562,
46
+ "loss": 0.4956,
47
  "rewards/accuracies": 0.375,
48
+ "rewards/chosen": -0.02300471067428589,
49
+ "rewards/margins": -0.021755550056695938,
50
+ "rewards/rejected": -0.0012491607340052724,
51
  "step": 2
52
  },
53
  {
54
+ "debug/policy_chosen_logits": -1.2853220701217651,
55
+ "debug/policy_chosen_logps": -163.3322296142578,
56
+ "debug/policy_rejected_logits": -1.2501897811889648,
57
+ "debug/policy_rejected_logps": -241.1845703125,
58
+ "debug/reference_chosen_logps": -157.95523071289062,
59
+ "debug/reference_rejected_logps": -234.6351776123047,
60
  "epoch": 0.057692307692307696,
61
+ "grad_norm": 71.93148801287602,
62
  "learning_rate": 1e-06,
63
+ "logits/chosen": -1.2853220701217651,
64
+ "logits/rejected": -1.2501897811889648,
65
+ "logps/chosen": -163.3322296142578,
66
+ "logps/rejected": -241.1845703125,
67
+ "loss": 0.5122,
68
  "rewards/accuracies": 0.625,
69
+ "rewards/chosen": -0.05377005785703659,
70
+ "rewards/margins": 0.011723717674612999,
71
+ "rewards/rejected": -0.06549377739429474,
72
  "step": 3
73
  },
74
  {
75
+ "debug/policy_chosen_logits": -1.1130611896514893,
76
+ "debug/policy_chosen_logps": -97.92842102050781,
77
+ "debug/policy_rejected_logits": -1.056685209274292,
78
+ "debug/policy_rejected_logps": -202.48904418945312,
79
+ "debug/reference_chosen_logps": -96.54649353027344,
80
+ "debug/reference_rejected_logps": -196.40328979492188,
81
  "epoch": 0.07692307692307693,
82
+ "grad_norm": 94.04471405781264,
83
  "learning_rate": 1e-06,
84
+ "logits/chosen": -1.1130611896514893,
85
+ "logits/rejected": -1.056685209274292,
86
+ "logps/chosen": -97.92842102050781,
87
+ "logps/rejected": -202.48904418945312,
88
+ "loss": 0.5269,
89
  "rewards/accuracies": 0.875,
90
+ "rewards/chosen": -0.013819340616464615,
91
+ "rewards/margins": 0.047038186341524124,
92
+ "rewards/rejected": -0.06085752695798874,
93
  "step": 4
94
  },
95
  {
96
+ "debug/policy_chosen_logits": -1.1152414083480835,
97
+ "debug/policy_chosen_logps": -123.81487274169922,
98
+ "debug/policy_rejected_logits": -1.1132621765136719,
99
+ "debug/policy_rejected_logps": -219.38116455078125,
100
+ "debug/reference_chosen_logps": -121.18955993652344,
101
+ "debug/reference_rejected_logps": -214.46588134765625,
102
  "epoch": 0.09615384615384616,
103
+ "grad_norm": 74.87758823721254,
104
  "learning_rate": 1e-06,
105
+ "logits/chosen": -1.1152414083480835,
106
+ "logits/rejected": -1.1132621765136719,
107
+ "logps/chosen": -123.81487274169922,
108
+ "logps/rejected": -219.38116455078125,
109
+ "loss": 0.5125,
110
+ "rewards/accuracies": 0.875,
111
+ "rewards/chosen": -0.02625308930873871,
112
+ "rewards/margins": 0.022899730131030083,
113
+ "rewards/rejected": -0.04915282130241394,
114
  "step": 5
115
  },
116
  {
117
+ "debug/policy_chosen_logits": -1.3609522581100464,
118
+ "debug/policy_chosen_logps": -170.71603393554688,
119
+ "debug/policy_rejected_logits": -1.2694591283798218,
120
+ "debug/policy_rejected_logps": -202.8367156982422,
121
+ "debug/reference_chosen_logps": -169.4237823486328,
122
+ "debug/reference_rejected_logps": -198.68704223632812,
123
  "epoch": 0.11538461538461539,
124
+ "grad_norm": 19.390219879650054,
125
  "learning_rate": 1e-06,
126
+ "logits/chosen": -1.3609522581100464,
127
+ "logits/rejected": -1.2694591283798218,
128
+ "logps/chosen": -170.71603393554688,
129
+ "logps/rejected": -202.8367156982422,
130
+ "loss": 0.4924,
131
+ "rewards/accuracies": 0.625,
132
+ "rewards/chosen": -0.012922583147883415,
133
+ "rewards/margins": 0.028574256226420403,
134
+ "rewards/rejected": -0.04149683937430382,
135
  "step": 6
136
  },
137
  {
138
+ "debug/policy_chosen_logits": -1.247850775718689,
139
+ "debug/policy_chosen_logps": -137.57997131347656,
140
+ "debug/policy_rejected_logits": -1.2975486516952515,
141
+ "debug/policy_rejected_logps": -177.66110229492188,
142
+ "debug/reference_chosen_logps": -139.8856658935547,
143
+ "debug/reference_rejected_logps": -178.00439453125,
144
  "epoch": 0.1346153846153846,
145
+ "grad_norm": 26.09557066418132,
146
  "learning_rate": 1e-06,
147
+ "logits/chosen": -1.247850775718689,
148
+ "logits/rejected": -1.2975486516952515,
149
+ "logps/chosen": -137.57997131347656,
150
+ "logps/rejected": -177.66110229492188,
151
+ "loss": 0.4741,
152
+ "rewards/accuracies": 0.5,
153
+ "rewards/chosen": 0.023056859150528908,
154
+ "rewards/margins": 0.019623905420303345,
155
+ "rewards/rejected": 0.003432951867580414,
156
  "step": 7
157
  },
158
  {
159
+ "debug/policy_chosen_logits": -1.1798572540283203,
160
+ "debug/policy_chosen_logps": -131.80767822265625,
161
+ "debug/policy_rejected_logits": -1.2539470195770264,
162
+ "debug/policy_rejected_logps": -215.11492919921875,
163
+ "debug/reference_chosen_logps": -136.596435546875,
164
+ "debug/reference_rejected_logps": -221.34326171875,
165
  "epoch": 0.15384615384615385,
166
+ "grad_norm": 47.26951009616528,
167
  "learning_rate": 1e-06,
168
+ "logits/chosen": -1.1798572540283203,
169
+ "logits/rejected": -1.2539470195770264,
170
+ "logps/chosen": -131.80767822265625,
171
+ "logps/rejected": -215.11492919921875,
172
+ "loss": 0.4814,
173
  "rewards/accuracies": 0.625,
174
+ "rewards/chosen": 0.047887563705444336,
175
+ "rewards/margins": -0.014395834878087044,
176
+ "rewards/rejected": 0.06228340044617653,
177
  "step": 8
178
  },
179
  {
180
+ "debug/policy_chosen_logits": -1.153782844543457,
181
+ "debug/policy_chosen_logps": -130.36990356445312,
182
+ "debug/policy_rejected_logits": -1.068179965019226,
183
+ "debug/policy_rejected_logps": -165.0654754638672,
184
+ "debug/reference_chosen_logps": -133.1195068359375,
185
+ "debug/reference_rejected_logps": -170.12216186523438,
186
  "epoch": 0.17307692307692307,
187
+ "grad_norm": 71.68724756772349,
188
  "learning_rate": 1e-06,
189
+ "logits/chosen": -1.153782844543457,
190
+ "logits/rejected": -1.068179965019226,
191
+ "logps/chosen": -130.36990356445312,
192
+ "logps/rejected": -165.0654754638672,
193
+ "loss": 0.4776,
194
+ "rewards/accuracies": 0.625,
195
+ "rewards/chosen": 0.027496041730046272,
196
+ "rewards/margins": -0.02307066321372986,
197
+ "rewards/rejected": 0.05056670308113098,
198
  "step": 9
199
  },
200
  {
201
+ "debug/policy_chosen_logits": -1.2667584419250488,
202
+ "debug/policy_chosen_logps": -111.53315734863281,
203
+ "debug/policy_rejected_logits": -1.161044955253601,
204
+ "debug/policy_rejected_logps": -214.258056640625,
205
+ "debug/reference_chosen_logps": -117.96336364746094,
206
+ "debug/reference_rejected_logps": -211.9200439453125,
207
  "epoch": 0.19230769230769232,
208
+ "grad_norm": 31.608202127314065,
209
  "learning_rate": 1e-06,
210
+ "logits/chosen": -1.2667584419250488,
211
+ "logits/rejected": -1.161044955253601,
212
+ "logps/chosen": -111.53315734863281,
213
+ "logps/rejected": -214.258056640625,
214
+ "loss": 0.4554,
215
  "rewards/accuracies": 0.75,
216
+ "rewards/chosen": 0.06430201232433319,
217
+ "rewards/margins": 0.08768201619386673,
218
+ "rewards/rejected": -0.02338000386953354,
219
  "step": 10
220
  },
221
  {
222
+ "debug/policy_chosen_logits": -1.2289499044418335,
223
+ "debug/policy_chosen_logps": -148.73593139648438,
224
+ "debug/policy_rejected_logits": -1.1797515153884888,
225
+ "debug/policy_rejected_logps": -285.1352844238281,
226
+ "debug/reference_chosen_logps": -146.48251342773438,
227
+ "debug/reference_rejected_logps": -279.6494140625,
228
  "epoch": 0.21153846153846154,
229
+ "grad_norm": 16.687527654352305,
230
  "learning_rate": 1e-06,
231
+ "logits/chosen": -1.2289499044418335,
232
+ "logits/rejected": -1.1797515153884888,
233
+ "logps/chosen": -148.73593139648438,
234
+ "logps/rejected": -285.1352844238281,
235
+ "loss": 0.4788,
236
  "rewards/accuracies": 0.75,
237
+ "rewards/chosen": -0.022534340620040894,
238
+ "rewards/margins": 0.03232429176568985,
239
+ "rewards/rejected": -0.05485863611102104,
240
  "step": 11
241
  },
242
  {
243
+ "debug/policy_chosen_logits": -0.9578278064727783,
244
+ "debug/policy_chosen_logps": -136.0295867919922,
245
+ "debug/policy_rejected_logits": -1.1070349216461182,
246
+ "debug/policy_rejected_logps": -274.4041748046875,
247
+ "debug/reference_chosen_logps": -139.7628173828125,
248
+ "debug/reference_rejected_logps": -272.2436828613281,
249
  "epoch": 0.23076923076923078,
250
+ "grad_norm": 23.74518292076916,
251
  "learning_rate": 1e-06,
252
+ "logits/chosen": -0.9578278064727783,
253
+ "logits/rejected": -1.1070349216461182,
254
+ "logps/chosen": -136.0295867919922,
255
+ "logps/rejected": -274.4041748046875,
256
+ "loss": 0.4588,
257
  "rewards/accuracies": 0.75,
258
+ "rewards/chosen": 0.03733229637145996,
259
+ "rewards/margins": 0.05893722176551819,
260
+ "rewards/rejected": -0.021604929119348526,
261
  "step": 12
262
  },
263
  {
264
+ "debug/policy_chosen_logits": -1.1884337663650513,
265
+ "debug/policy_chosen_logps": -138.00079345703125,
266
+ "debug/policy_rejected_logits": -1.2273510694503784,
267
+ "debug/policy_rejected_logps": -236.30642700195312,
268
+ "debug/reference_chosen_logps": -140.82666015625,
269
+ "debug/reference_rejected_logps": -231.79005432128906,
270
  "epoch": 0.25,
271
+ "grad_norm": 52.574191625685216,
272
  "learning_rate": 1e-06,
273
+ "logits/chosen": -1.1884337663650513,
274
+ "logits/rejected": -1.2273510694503784,
275
+ "logps/chosen": -138.00079345703125,
276
+ "logps/rejected": -236.30642700195312,
277
+ "loss": 0.4658,
278
+ "rewards/accuracies": 0.625,
279
+ "rewards/chosen": 0.0282585509121418,
280
+ "rewards/margins": 0.07342230528593063,
281
+ "rewards/rejected": -0.04516375809907913,
282
  "step": 13
283
  },
284
  {
285
+ "debug/policy_chosen_logits": -1.2762030363082886,
286
+ "debug/policy_chosen_logps": -109.97171020507812,
287
+ "debug/policy_rejected_logits": -1.2517166137695312,
288
+ "debug/policy_rejected_logps": -240.42059326171875,
289
+ "debug/reference_chosen_logps": -110.38009643554688,
290
+ "debug/reference_rejected_logps": -237.3839111328125,
291
  "epoch": 0.2692307692307692,
292
+ "grad_norm": 79.48353423155464,
293
  "learning_rate": 1e-06,
294
+ "logits/chosen": -1.2762030363082886,
295
+ "logits/rejected": -1.2517166137695312,
296
+ "logps/chosen": -109.97171020507812,
297
+ "logps/rejected": -240.42059326171875,
298
+ "loss": 0.4585,
299
+ "rewards/accuracies": 0.75,
300
+ "rewards/chosen": 0.004083747044205666,
301
+ "rewards/margins": 0.03445054590702057,
302
+ "rewards/rejected": -0.030366800725460052,
303
  "step": 14
304
  },
305
  {
306
+ "debug/policy_chosen_logits": -1.258913516998291,
307
+ "debug/policy_chosen_logps": -119.00064086914062,
308
+ "debug/policy_rejected_logits": -1.3117754459381104,
309
+ "debug/policy_rejected_logps": -206.278076171875,
310
+ "debug/reference_chosen_logps": -109.17572021484375,
311
+ "debug/reference_rejected_logps": -200.13653564453125,
312
  "epoch": 0.28846153846153844,
313
+ "grad_norm": 112.2398036104226,
314
  "learning_rate": 1e-06,
315
+ "logits/chosen": -1.258913516998291,
316
+ "logits/rejected": -1.3117754459381104,
317
+ "logps/chosen": -119.00064086914062,
318
+ "logps/rejected": -206.278076171875,
319
+ "loss": 0.5113,
320
  "rewards/accuracies": 0.625,
321
+ "rewards/chosen": -0.098249152302742,
322
+ "rewards/margins": -0.036833763122558594,
323
+ "rewards/rejected": -0.06141539663076401,
324
  "step": 15
325
  },
326
  {
327
+ "debug/policy_chosen_logits": -1.2379214763641357,
328
+ "debug/policy_chosen_logps": -154.35321044921875,
329
+ "debug/policy_rejected_logits": -1.1635435819625854,
330
+ "debug/policy_rejected_logps": -249.5124969482422,
331
+ "debug/reference_chosen_logps": -147.48687744140625,
332
+ "debug/reference_rejected_logps": -231.68386840820312,
333
  "epoch": 0.3076923076923077,
334
+ "grad_norm": 124.12252031892565,
335
  "learning_rate": 1e-06,
336
+ "logits/chosen": -1.2379214763641357,
337
+ "logits/rejected": -1.1635435819625854,
338
+ "logps/chosen": -154.35321044921875,
339
+ "logps/rejected": -249.5124969482422,
340
+ "loss": 0.4921,
341
  "rewards/accuracies": 0.75,
342
+ "rewards/chosen": -0.06866332143545151,
343
+ "rewards/margins": 0.1096230149269104,
344
+ "rewards/rejected": -0.1782863438129425,
345
  "step": 16
346
  },
347
  {
348
+ "debug/policy_chosen_logits": -1.2502650022506714,
349
+ "debug/policy_chosen_logps": -152.2196807861328,
350
+ "debug/policy_rejected_logits": -1.1885775327682495,
351
+ "debug/policy_rejected_logps": -176.0025177001953,
352
+ "debug/reference_chosen_logps": -151.514892578125,
353
+ "debug/reference_rejected_logps": -162.71047973632812,
354
  "epoch": 0.3269230769230769,
355
+ "grad_norm": 91.49358092625756,
356
  "learning_rate": 1e-06,
357
+ "logits/chosen": -1.2502650022506714,
358
+ "logits/rejected": -1.1885775327682495,
359
+ "logps/chosen": -152.2196807861328,
360
+ "logps/rejected": -176.0025177001953,
361
+ "loss": 0.4641,
362
+ "rewards/accuracies": 1.0,
363
+ "rewards/chosen": -0.007047949358820915,
364
+ "rewards/margins": 0.12587252259254456,
365
+ "rewards/rejected": -0.13292045891284943,
366
  "step": 17
367
  },
368
  {
369
+ "debug/policy_chosen_logits": -1.2029650211334229,
370
+ "debug/policy_chosen_logps": -168.9429473876953,
371
+ "debug/policy_rejected_logits": -1.211535930633545,
372
+ "debug/policy_rejected_logps": -307.27362060546875,
373
+ "debug/reference_chosen_logps": -166.69866943359375,
374
+ "debug/reference_rejected_logps": -267.7721252441406,
375
  "epoch": 0.34615384615384615,
376
+ "grad_norm": 30.34097826458164,
377
  "learning_rate": 1e-06,
378
+ "logits/chosen": -1.2029650211334229,
379
+ "logits/rejected": -1.211535930633545,
380
+ "logps/chosen": -168.9429473876953,
381
+ "logps/rejected": -307.27362060546875,
382
+ "loss": 0.4407,
383
+ "rewards/accuracies": 0.875,
384
+ "rewards/chosen": -0.02244272269308567,
385
+ "rewards/margins": 0.37257200479507446,
386
+ "rewards/rejected": -0.39501476287841797,
387
  "step": 18
388
  },
389
  {
390
+ "debug/policy_chosen_logits": -1.3290777206420898,
391
+ "debug/policy_chosen_logps": -113.09046936035156,
392
+ "debug/policy_rejected_logits": -1.2192254066467285,
393
+ "debug/policy_rejected_logps": -211.23617553710938,
394
+ "debug/reference_chosen_logps": -116.71571350097656,
395
+ "debug/reference_rejected_logps": -208.08888244628906,
396
  "epoch": 0.36538461538461536,
397
+ "grad_norm": 15.641150451814267,
398
  "learning_rate": 1e-06,
399
+ "logits/chosen": -1.3290777206420898,
400
+ "logits/rejected": -1.2192254066467285,
401
+ "logps/chosen": -113.09046936035156,
402
+ "logps/rejected": -211.23617553710938,
403
+ "loss": 0.4401,
404
+ "rewards/accuracies": 0.875,
405
+ "rewards/chosen": 0.036252379417419434,
406
+ "rewards/margins": 0.06772524118423462,
407
+ "rewards/rejected": -0.031472865492105484,
408
  "step": 19
409
  },
410
  {
411
+ "debug/policy_chosen_logits": -1.3350298404693604,
412
+ "debug/policy_chosen_logps": -139.8019561767578,
413
+ "debug/policy_rejected_logits": -1.3234355449676514,
414
+ "debug/policy_rejected_logps": -252.89657592773438,
415
+ "debug/reference_chosen_logps": -143.83834838867188,
416
+ "debug/reference_rejected_logps": -245.260986328125,
417
  "epoch": 0.38461538461538464,
418
+ "grad_norm": 26.92984184652646,
419
  "learning_rate": 1e-06,
420
+ "logits/chosen": -1.3350298404693604,
421
+ "logits/rejected": -1.3234355449676514,
422
+ "logps/chosen": -139.8019561767578,
423
+ "logps/rejected": -252.89657592773438,
424
+ "loss": 0.4678,
425
+ "rewards/accuracies": 0.625,
426
+ "rewards/chosen": 0.04036390781402588,
427
+ "rewards/margins": 0.11671990901231766,
428
+ "rewards/rejected": -0.07635599374771118,
429
  "step": 20
430
  },
431
  {
432
+ "debug/policy_chosen_logits": -1.2050725221633911,
433
+ "debug/policy_chosen_logps": -141.1668243408203,
434
+ "debug/policy_rejected_logits": -1.222265601158142,
435
+ "debug/policy_rejected_logps": -172.55345153808594,
436
+ "debug/reference_chosen_logps": -141.07467651367188,
437
+ "debug/reference_rejected_logps": -171.04931640625,
438
  "epoch": 0.40384615384615385,
439
+ "grad_norm": 53.89373486771968,
440
  "learning_rate": 1e-06,
441
+ "logits/chosen": -1.2050725221633911,
442
+ "logits/rejected": -1.222265601158142,
443
+ "logps/chosen": -141.1668243408203,
444
+ "logps/rejected": -172.55345153808594,
445
+ "loss": 0.4892,
446
  "rewards/accuracies": 0.75,
447
+ "rewards/chosen": -0.0009214691817760468,
448
+ "rewards/margins": 0.014119969680905342,
449
+ "rewards/rejected": -0.01504143700003624,
450
  "step": 21
451
  },
452
  {
453
+ "debug/policy_chosen_logits": -1.2559270858764648,
454
+ "debug/policy_chosen_logps": -154.6681365966797,
455
+ "debug/policy_rejected_logits": -1.1459161043167114,
456
+ "debug/policy_rejected_logps": -176.13983154296875,
457
+ "debug/reference_chosen_logps": -158.22708129882812,
458
+ "debug/reference_rejected_logps": -175.68319702148438,
459
  "epoch": 0.4230769230769231,
460
+ "grad_norm": 20.393196406563664,
461
  "learning_rate": 1e-06,
462
+ "logits/chosen": -1.2559270858764648,
463
+ "logits/rejected": -1.1459161043167114,
464
+ "logps/chosen": -154.6681365966797,
465
+ "logps/rejected": -176.13983154296875,
466
+ "loss": 0.4368,
467
+ "rewards/accuracies": 0.75,
468
+ "rewards/chosen": 0.03558942675590515,
469
+ "rewards/margins": 0.04015577584505081,
470
+ "rewards/rejected": -0.004566345363855362,
471
  "step": 22
472
  },
473
  {
474
+ "debug/policy_chosen_logits": -1.099912166595459,
475
+ "debug/policy_chosen_logps": -99.30662536621094,
476
+ "debug/policy_rejected_logits": -1.038980484008789,
477
+ "debug/policy_rejected_logps": -330.9873046875,
478
+ "debug/reference_chosen_logps": -104.75813293457031,
479
+ "debug/reference_rejected_logps": -311.56396484375,
480
  "epoch": 0.4423076923076923,
481
+ "grad_norm": 15.504920171360686,
482
  "learning_rate": 1e-06,
483
+ "logits/chosen": -1.099912166595459,
484
+ "logits/rejected": -1.038980484008789,
485
+ "logps/chosen": -99.30662536621094,
486
+ "logps/rejected": -330.9873046875,
487
+ "loss": 0.4496,
488
+ "rewards/accuracies": 0.625,
489
+ "rewards/chosen": 0.05451509356498718,
490
+ "rewards/margins": 0.24874866008758545,
491
+ "rewards/rejected": -0.19423356652259827,
492
  "step": 23
493
  },
494
  {
495
+ "debug/policy_chosen_logits": -1.353391170501709,
496
+ "debug/policy_chosen_logps": -152.0869598388672,
497
+ "debug/policy_rejected_logits": -1.306003451347351,
498
+ "debug/policy_rejected_logps": -324.70758056640625,
499
+ "debug/reference_chosen_logps": -152.76806640625,
500
+ "debug/reference_rejected_logps": -304.7029724121094,
501
  "epoch": 0.46153846153846156,
502
+ "grad_norm": 37.30857181130373,
503
  "learning_rate": 1e-06,
504
+ "logits/chosen": -1.353391170501709,
505
+ "logits/rejected": -1.306003451347351,
506
+ "logps/chosen": -152.0869598388672,
507
+ "logps/rejected": -324.70758056640625,
508
+ "loss": 0.4288,
509
+ "rewards/accuracies": 0.75,
510
+ "rewards/chosen": 0.006811168976128101,
511
+ "rewards/margins": 0.2068571001291275,
512
+ "rewards/rejected": -0.20004592835903168,
513
  "step": 24
514
  },
515
  {
516
+ "debug/policy_chosen_logits": -1.2506132125854492,
517
+ "debug/policy_chosen_logps": -182.32362365722656,
518
+ "debug/policy_rejected_logits": -1.2176698446273804,
519
+ "debug/policy_rejected_logps": -295.3228454589844,
520
+ "debug/reference_chosen_logps": -182.55697631835938,
521
+ "debug/reference_rejected_logps": -282.0664978027344,
522
  "epoch": 0.4807692307692308,
523
+ "grad_norm": 57.662326320570806,
524
  "learning_rate": 1e-06,
525
+ "logits/chosen": -1.2506132125854492,
526
+ "logits/rejected": -1.2176698446273804,
527
+ "logps/chosen": -182.32362365722656,
528
+ "logps/rejected": -295.3228454589844,
529
+ "loss": 0.4189,
530
  "rewards/accuracies": 0.75,
531
+ "rewards/chosen": 0.002333402633666992,
532
+ "rewards/margins": 0.13489675521850586,
533
+ "rewards/rejected": -0.13256335258483887,
534
  "step": 25
535
  },
536
  {
537
+ "debug/policy_chosen_logits": -1.2043673992156982,
538
+ "debug/policy_chosen_logps": -139.78085327148438,
539
+ "debug/policy_rejected_logits": -1.3776921033859253,
540
+ "debug/policy_rejected_logps": -216.3494873046875,
541
+ "debug/reference_chosen_logps": -146.04248046875,
542
+ "debug/reference_rejected_logps": -214.1402587890625,
543
  "epoch": 0.5,
544
+ "grad_norm": 22.330670420413643,
545
  "learning_rate": 1e-06,
546
+ "logits/chosen": -1.2043673992156982,
547
+ "logits/rejected": -1.3776921033859253,
548
+ "logps/chosen": -139.78085327148438,
549
+ "logps/rejected": -216.3494873046875,
550
+ "loss": 0.4117,
551
+ "rewards/accuracies": 0.75,
552
+ "rewards/chosen": 0.06261642277240753,
553
+ "rewards/margins": 0.0847088098526001,
554
+ "rewards/rejected": -0.02209237962961197,
555
  "step": 26
556
  },
557
  {
558
+ "debug/policy_chosen_logits": -1.211961030960083,
559
+ "debug/policy_chosen_logps": -144.8836669921875,
560
+ "debug/policy_rejected_logits": -1.2210055589675903,
561
+ "debug/policy_rejected_logps": -174.15650939941406,
562
+ "debug/reference_chosen_logps": -143.75244140625,
563
+ "debug/reference_rejected_logps": -176.13311767578125,
564
  "epoch": 0.5192307692307693,
565
+ "grad_norm": 43.60544149312096,
566
  "learning_rate": 1e-06,
567
+ "logits/chosen": -1.211961030960083,
568
+ "logits/rejected": -1.2210055589675903,
569
+ "logps/chosen": -144.8836669921875,
570
+ "logps/rejected": -174.15650939941406,
571
+ "loss": 0.4296,
572
+ "rewards/accuracies": 0.25,
573
+ "rewards/chosen": -0.011312179267406464,
574
+ "rewards/margins": -0.031078338623046875,
575
+ "rewards/rejected": 0.01976615935564041,
576
  "step": 27
577
  },
578
  {
579
+ "debug/policy_chosen_logits": -1.4086847305297852,
580
+ "debug/policy_chosen_logps": -136.67764282226562,
581
+ "debug/policy_rejected_logits": -1.4001612663269043,
582
+ "debug/policy_rejected_logps": -173.57293701171875,
583
+ "debug/reference_chosen_logps": -139.71961975097656,
584
+ "debug/reference_rejected_logps": -167.02276611328125,
585
  "epoch": 0.5384615384615384,
586
+ "grad_norm": 41.878595172037,
587
  "learning_rate": 1e-06,
588
+ "logits/chosen": -1.4086847305297852,
589
+ "logits/rejected": -1.4001612663269043,
590
+ "logps/chosen": -136.67764282226562,
591
+ "logps/rejected": -173.57293701171875,
592
+ "loss": 0.437,
593
+ "rewards/accuracies": 0.75,
594
+ "rewards/chosen": 0.030419737100601196,
595
+ "rewards/margins": 0.09592136740684509,
596
+ "rewards/rejected": -0.0655016228556633,
597
  "step": 28
598
  },
599
  {
600
+ "debug/policy_chosen_logits": -1.394635796546936,
601
+ "debug/policy_chosen_logps": -155.0882568359375,
602
+ "debug/policy_rejected_logits": -1.1727089881896973,
603
+ "debug/policy_rejected_logps": -295.13409423828125,
604
+ "debug/reference_chosen_logps": -160.80213928222656,
605
+ "debug/reference_rejected_logps": -277.72723388671875,
606
  "epoch": 0.5576923076923077,
607
+ "grad_norm": 25.614929675079054,
608
  "learning_rate": 1e-06,
609
+ "logits/chosen": -1.394635796546936,
610
+ "logits/rejected": -1.1727089881896973,
611
+ "logps/chosen": -155.0882568359375,
612
+ "logps/rejected": -295.13409423828125,
613
+ "loss": 0.4438,
614
  "rewards/accuracies": 0.625,
615
+ "rewards/chosen": 0.057138726115226746,
616
+ "rewards/margins": 0.2312072068452835,
617
+ "rewards/rejected": -0.17406848073005676,
618
  "step": 29
619
  },
620
  {
621
+ "debug/policy_chosen_logits": -1.3357499837875366,
622
+ "debug/policy_chosen_logps": -135.82276916503906,
623
+ "debug/policy_rejected_logits": -1.2643619775772095,
624
+ "debug/policy_rejected_logps": -198.8779296875,
625
+ "debug/reference_chosen_logps": -144.52224731445312,
626
+ "debug/reference_rejected_logps": -193.8353271484375,
627
  "epoch": 0.5769230769230769,
628
+ "grad_norm": 35.0050669992775,
629
  "learning_rate": 1e-06,
630
+ "logits/chosen": -1.3357499837875366,
631
+ "logits/rejected": -1.2643619775772095,
632
+ "logps/chosen": -135.82276916503906,
633
+ "logps/rejected": -198.8779296875,
634
+ "loss": 0.4463,
635
  "rewards/accuracies": 0.875,
636
+ "rewards/chosen": 0.08699464797973633,
637
+ "rewards/margins": 0.1374206840991974,
638
+ "rewards/rejected": -0.050426043570041656,
639
  "step": 30
640
  },
641
  {
642
+ "debug/policy_chosen_logits": -1.3438653945922852,
643
+ "debug/policy_chosen_logps": -102.52204132080078,
644
+ "debug/policy_rejected_logits": -1.1623094081878662,
645
+ "debug/policy_rejected_logps": -243.15985107421875,
646
+ "debug/reference_chosen_logps": -112.55838012695312,
647
+ "debug/reference_rejected_logps": -241.82275390625,
648
  "epoch": 0.5961538461538461,
649
+ "grad_norm": 17.73778958561842,
650
  "learning_rate": 1e-06,
651
+ "logits/chosen": -1.3438653945922852,
652
+ "logits/rejected": -1.1623094081878662,
653
+ "logps/chosen": -102.52204132080078,
654
+ "logps/rejected": -243.15985107421875,
655
+ "loss": 0.4711,
656
+ "rewards/accuracies": 0.875,
657
+ "rewards/chosen": 0.10036339610815048,
658
+ "rewards/margins": 0.11373443156480789,
659
+ "rewards/rejected": -0.013371038250625134,
660
  "step": 31
661
  },
662
  {
663
+ "debug/policy_chosen_logits": -1.2000988721847534,
664
+ "debug/policy_chosen_logps": -114.93376159667969,
665
+ "debug/policy_rejected_logits": -0.9910339117050171,
666
+ "debug/policy_rejected_logps": -330.4576416015625,
667
+ "debug/reference_chosen_logps": -141.095703125,
668
+ "debug/reference_rejected_logps": -320.2563171386719,
669
  "epoch": 0.6153846153846154,
670
+ "grad_norm": 43.15694706560031,
671
  "learning_rate": 1e-06,
672
+ "logits/chosen": -1.2000988721847534,
673
+ "logits/rejected": -0.9910339117050171,
674
+ "logps/chosen": -114.93376159667969,
675
+ "logps/rejected": -330.4576416015625,
676
+ "loss": 0.4226,
677
  "rewards/accuracies": 1.0,
678
+ "rewards/chosen": 0.2616194188594818,
679
+ "rewards/margins": 0.3636327385902405,
680
+ "rewards/rejected": -0.10201331228017807,
681
  "step": 32
682
  },
683
  {
684
+ "debug/policy_chosen_logits": -1.1911633014678955,
685
+ "debug/policy_chosen_logps": -96.82029724121094,
686
+ "debug/policy_rejected_logits": -1.2555269002914429,
687
+ "debug/policy_rejected_logps": -276.76422119140625,
688
+ "debug/reference_chosen_logps": -113.63371276855469,
689
+ "debug/reference_rejected_logps": -275.29156494140625,
690
  "epoch": 0.6346153846153846,
691
+ "grad_norm": 44.04898042405896,
692
  "learning_rate": 1e-06,
693
+ "logits/chosen": -1.1911633014678955,
694
+ "logits/rejected": -1.2555269002914429,
695
+ "logps/chosen": -96.82029724121094,
696
+ "logps/rejected": -276.76422119140625,
697
+ "loss": 0.3996,
698
  "rewards/accuracies": 0.875,
699
+ "rewards/chosen": 0.16813413798809052,
700
+ "rewards/margins": 0.18286080658435822,
701
+ "rewards/rejected": -0.014726676046848297,
702
  "step": 33
703
  },
704
  {
705
+ "debug/policy_chosen_logits": -1.3251279592514038,
706
+ "debug/policy_chosen_logps": -133.66395568847656,
707
+ "debug/policy_rejected_logits": -1.258199691772461,
708
+ "debug/policy_rejected_logps": -242.9014892578125,
709
+ "debug/reference_chosen_logps": -143.2382049560547,
710
+ "debug/reference_rejected_logps": -244.9378662109375,
711
  "epoch": 0.6538461538461539,
712
+ "grad_norm": 115.38013119786002,
713
  "learning_rate": 1e-06,
714
+ "logits/chosen": -1.3251279592514038,
715
+ "logits/rejected": -1.258199691772461,
716
+ "logps/chosen": -133.66395568847656,
717
+ "logps/rejected": -242.9014892578125,
718
+ "loss": 0.4549,
719
  "rewards/accuracies": 0.625,
720
+ "rewards/chosen": 0.0957425907254219,
721
+ "rewards/margins": 0.07537883520126343,
722
+ "rewards/rejected": 0.02036374993622303,
723
  "step": 34
724
  },
725
  {
726
+ "debug/policy_chosen_logits": -1.3226226568222046,
727
+ "debug/policy_chosen_logps": -112.6985855102539,
728
+ "debug/policy_rejected_logits": -1.1881129741668701,
729
+ "debug/policy_rejected_logps": -230.60443115234375,
730
+ "debug/reference_chosen_logps": -130.98614501953125,
731
+ "debug/reference_rejected_logps": -228.2262725830078,
732
  "epoch": 0.6730769230769231,
733
+ "grad_norm": 40.4402369628288,
734
  "learning_rate": 1e-06,
735
+ "logits/chosen": -1.3226226568222046,
736
+ "logits/rejected": -1.1881129741668701,
737
+ "logps/chosen": -112.6985855102539,
738
+ "logps/rejected": -230.60443115234375,
739
+ "loss": 0.4216,
740
+ "rewards/accuracies": 1.0,
741
+ "rewards/chosen": 0.18287554383277893,
742
+ "rewards/margins": 0.20665717124938965,
743
+ "rewards/rejected": -0.023781631141901016,
744
  "step": 35
745
  },
746
  {
747
+ "debug/policy_chosen_logits": -1.1547788381576538,
748
+ "debug/policy_chosen_logps": -118.73101806640625,
749
+ "debug/policy_rejected_logits": -1.0697778463363647,
750
+ "debug/policy_rejected_logps": -324.0631103515625,
751
+ "debug/reference_chosen_logps": -127.16613006591797,
752
+ "debug/reference_rejected_logps": -306.434326171875,
753
  "epoch": 0.6923076923076923,
754
+ "grad_norm": 18.889421126870324,
755
  "learning_rate": 1e-06,
756
+ "logits/chosen": -1.1547788381576538,
757
+ "logits/rejected": -1.0697778463363647,
758
+ "logps/chosen": -118.73101806640625,
759
+ "logps/rejected": -324.0631103515625,
760
+ "loss": 0.4124,
761
  "rewards/accuracies": 1.0,
762
+ "rewards/chosen": 0.08435116708278656,
763
+ "rewards/margins": 0.2606390118598938,
764
+ "rewards/rejected": -0.17628784477710724,
765
  "step": 36
766
  },
767
  {
768
+ "debug/policy_chosen_logits": -1.2630081176757812,
769
+ "debug/policy_chosen_logps": -141.00460815429688,
770
+ "debug/policy_rejected_logits": -1.2191274166107178,
771
+ "debug/policy_rejected_logps": -196.04946899414062,
772
+ "debug/reference_chosen_logps": -141.60374450683594,
773
+ "debug/reference_rejected_logps": -181.31646728515625,
774
  "epoch": 0.7115384615384616,
775
+ "grad_norm": 29.84262997736804,
776
  "learning_rate": 1e-06,
777
+ "logits/chosen": -1.2630081176757812,
778
+ "logits/rejected": -1.2191274166107178,
779
+ "logps/chosen": -141.00460815429688,
780
+ "logps/rejected": -196.04946899414062,
781
+ "loss": 0.4429,
782
  "rewards/accuracies": 0.75,
783
+ "rewards/chosen": 0.0059913452714681625,
784
+ "rewards/margins": 0.15332148969173431,
785
+ "rewards/rejected": -0.1473301351070404,
786
  "step": 37
787
  },
788
  {
789
+ "debug/policy_chosen_logits": -1.4539029598236084,
790
+ "debug/policy_chosen_logps": -123.56523895263672,
791
+ "debug/policy_rejected_logits": -1.4236867427825928,
792
+ "debug/policy_rejected_logps": -185.32632446289062,
793
+ "debug/reference_chosen_logps": -129.4361114501953,
794
+ "debug/reference_rejected_logps": -179.44058227539062,
795
  "epoch": 0.7307692307692307,
796
+ "grad_norm": 17.23655033742978,
797
  "learning_rate": 1e-06,
798
+ "logits/chosen": -1.4539029598236084,
799
+ "logits/rejected": -1.4236867427825928,
800
+ "logps/chosen": -123.56523895263672,
801
+ "logps/rejected": -185.32632446289062,
802
+ "loss": 0.3743,
803
+ "rewards/accuracies": 0.75,
804
+ "rewards/chosen": 0.05870867520570755,
805
+ "rewards/margins": 0.11756613105535507,
806
+ "rewards/rejected": -0.05885745957493782,
807
  "step": 38
808
  },
809
  {
810
+ "debug/policy_chosen_logits": -1.248382806777954,
811
+ "debug/policy_chosen_logps": -220.80645751953125,
812
+ "debug/policy_rejected_logits": -1.1765296459197998,
813
+ "debug/policy_rejected_logps": -198.0468292236328,
814
+ "debug/reference_chosen_logps": -208.3310546875,
815
+ "debug/reference_rejected_logps": -192.87469482421875,
816
  "epoch": 0.75,
817
+ "grad_norm": 42.21364154787503,
818
  "learning_rate": 1e-06,
819
+ "logits/chosen": -1.248382806777954,
820
+ "logits/rejected": -1.1765296459197998,
821
+ "logps/chosen": -220.80645751953125,
822
+ "logps/rejected": -198.0468292236328,
823
+ "loss": 0.456,
824
+ "rewards/accuracies": 0.375,
825
+ "rewards/chosen": -0.12475401908159256,
826
+ "rewards/margins": -0.07303276658058167,
827
+ "rewards/rejected": -0.051721252501010895,
828
  "step": 39
829
  },
830
  {
831
+ "debug/policy_chosen_logits": -1.420503854751587,
832
+ "debug/policy_chosen_logps": -150.28591918945312,
833
+ "debug/policy_rejected_logits": -1.3057647943496704,
834
+ "debug/policy_rejected_logps": -263.63275146484375,
835
+ "debug/reference_chosen_logps": -148.029052734375,
836
+ "debug/reference_rejected_logps": -224.35903930664062,
837
  "epoch": 0.7692307692307693,
838
+ "grad_norm": 100.89795058616748,
839
  "learning_rate": 1e-06,
840
+ "logits/chosen": -1.420503854751587,
841
+ "logits/rejected": -1.3057647943496704,
842
+ "logps/chosen": -150.28591918945312,
843
+ "logps/rejected": -263.63275146484375,
844
+ "loss": 0.4926,
845
  "rewards/accuracies": 0.875,
846
+ "rewards/chosen": -0.02256855182349682,
847
+ "rewards/margins": 0.3701684772968292,
848
+ "rewards/rejected": -0.3927370607852936,
849
  "step": 40
850
  },
851
  {
852
+ "debug/policy_chosen_logits": -1.2056798934936523,
853
+ "debug/policy_chosen_logps": -183.70773315429688,
854
+ "debug/policy_rejected_logits": -1.1638095378875732,
855
+ "debug/policy_rejected_logps": -303.64556884765625,
856
+ "debug/reference_chosen_logps": -184.8590087890625,
857
+ "debug/reference_rejected_logps": -268.26873779296875,
858
  "epoch": 0.7884615384615384,
859
+ "grad_norm": 87.52835684552551,
860
  "learning_rate": 1e-06,
861
+ "logits/chosen": -1.2056798934936523,
862
+ "logits/rejected": -1.1638095378875732,
863
+ "logps/chosen": -183.70773315429688,
864
+ "logps/rejected": -303.64556884765625,
865
+ "loss": 0.4298,
866
  "rewards/accuracies": 0.75,
867
+ "rewards/chosen": 0.011512821540236473,
868
+ "rewards/margins": 0.36528119444847107,
869
+ "rewards/rejected": -0.35376837849617004,
870
  "step": 41
871
  },
872
  {
873
+ "debug/policy_chosen_logits": -1.274660587310791,
874
+ "debug/policy_chosen_logps": -109.7809066772461,
875
+ "debug/policy_rejected_logits": -1.2929130792617798,
876
+ "debug/policy_rejected_logps": -221.91757202148438,
877
+ "debug/reference_chosen_logps": -117.5357437133789,
878
+ "debug/reference_rejected_logps": -203.26840209960938,
879
  "epoch": 0.8076923076923077,
880
+ "grad_norm": 133.93511541568418,
881
  "learning_rate": 1e-06,
882
+ "logits/chosen": -1.274660587310791,
883
+ "logits/rejected": -1.2929130792617798,
884
+ "logps/chosen": -109.7809066772461,
885
+ "logps/rejected": -221.91757202148438,
886
+ "loss": 0.4672,
887
  "rewards/accuracies": 1.0,
888
+ "rewards/chosen": 0.07754837721586227,
889
+ "rewards/margins": 0.26404017210006714,
890
+ "rewards/rejected": -0.18649178743362427,
891
  "step": 42
892
  },
893
  {
894
+ "debug/policy_chosen_logits": -1.229216456413269,
895
+ "debug/policy_chosen_logps": -115.21796417236328,
896
+ "debug/policy_rejected_logits": -1.0166616439819336,
897
+ "debug/policy_rejected_logps": -288.5059814453125,
898
+ "debug/reference_chosen_logps": -118.31993103027344,
899
+ "debug/reference_rejected_logps": -250.616943359375,
900
  "epoch": 0.8269230769230769,
901
+ "grad_norm": 77.32000216633205,
902
  "learning_rate": 1e-06,
903
+ "logits/chosen": -1.229216456413269,
904
+ "logits/rejected": -1.0166616439819336,
905
+ "logps/chosen": -115.21796417236328,
906
+ "logps/rejected": -288.5059814453125,
907
+ "loss": 0.4625,
908
  "rewards/accuracies": 1.0,
909
+ "rewards/chosen": 0.031019629910588264,
910
+ "rewards/margins": 0.4099102020263672,
911
+ "rewards/rejected": -0.3788905739784241,
912
  "step": 43
913
  },
914
  {
915
+ "debug/policy_chosen_logits": -1.4798791408538818,
916
+ "debug/policy_chosen_logps": -89.95073699951172,
917
+ "debug/policy_rejected_logits": -1.4541199207305908,
918
+ "debug/policy_rejected_logps": -239.7828826904297,
919
+ "debug/reference_chosen_logps": -101.15357971191406,
920
+ "debug/reference_rejected_logps": -227.21592712402344,
921
  "epoch": 0.8461538461538461,
922
+ "grad_norm": 40.276665507330875,
923
  "learning_rate": 1e-06,
924
+ "logits/chosen": -1.4798791408538818,
925
+ "logits/rejected": -1.4541199207305908,
926
+ "logps/chosen": -89.95073699951172,
927
+ "logps/rejected": -239.7828826904297,
928
+ "loss": 0.4063,
929
  "rewards/accuracies": 0.75,
930
+ "rewards/chosen": 0.11202842742204666,
931
+ "rewards/margins": 0.2376978099346161,
932
+ "rewards/rejected": -0.12566937506198883,
933
  "step": 44
934
  },
935
  {
936
+ "debug/policy_chosen_logits": -1.385978102684021,
937
+ "debug/policy_chosen_logps": -129.73675537109375,
938
+ "debug/policy_rejected_logits": -1.4437228441238403,
939
+ "debug/policy_rejected_logps": -166.5294189453125,
940
+ "debug/reference_chosen_logps": -142.4840850830078,
941
+ "debug/reference_rejected_logps": -174.26443481445312,
942
  "epoch": 0.8653846153846154,
943
+ "grad_norm": 16.81392684345585,
944
  "learning_rate": 1e-06,
945
+ "logits/chosen": -1.385978102684021,
946
+ "logits/rejected": -1.4437228441238403,
947
+ "logps/chosen": -129.73675537109375,
948
+ "logps/rejected": -166.5294189453125,
949
+ "loss": 0.4461,
950
+ "rewards/accuracies": 0.375,
951
+ "rewards/chosen": 0.12747323513031006,
952
+ "rewards/margins": 0.050123024731874466,
953
+ "rewards/rejected": 0.0773501992225647,
954
  "step": 45
955
  },
956
  {
957
+ "debug/policy_chosen_logits": -1.2822701930999756,
958
+ "debug/policy_chosen_logps": -117.16873168945312,
959
+ "debug/policy_rejected_logits": -1.2780365943908691,
960
+ "debug/policy_rejected_logps": -270.9895324707031,
961
+ "debug/reference_chosen_logps": -130.97329711914062,
962
+ "debug/reference_rejected_logps": -259.4662170410156,
963
  "epoch": 0.8846153846153846,
964
+ "grad_norm": 20.14742752061246,
965
  "learning_rate": 1e-06,
966
+ "logits/chosen": -1.2822701930999756,
967
+ "logits/rejected": -1.2780365943908691,
968
+ "logps/chosen": -117.16873168945312,
969
+ "logps/rejected": -270.9895324707031,
970
+ "loss": 0.4036,
971
+ "rewards/accuracies": 0.875,
972
+ "rewards/chosen": 0.13804557919502258,
973
+ "rewards/margins": 0.2532787024974823,
974
+ "rewards/rejected": -0.11523310840129852,
975
  "step": 46
976
  },
977
  {
978
+ "debug/policy_chosen_logits": -1.1911503076553345,
979
+ "debug/policy_chosen_logps": -116.65675354003906,
980
+ "debug/policy_rejected_logits": -1.2724329233169556,
981
+ "debug/policy_rejected_logps": -212.22393798828125,
982
+ "debug/reference_chosen_logps": -127.14996337890625,
983
+ "debug/reference_rejected_logps": -210.3226776123047,
984
  "epoch": 0.9038461538461539,
985
+ "grad_norm": 36.81625781029569,
986
  "learning_rate": 1e-06,
987
+ "logits/chosen": -1.1911503076553345,
988
+ "logits/rejected": -1.2724329233169556,
989
+ "logps/chosen": -116.65675354003906,
990
+ "logps/rejected": -212.22393798828125,
991
+ "loss": 0.4372,
992
  "rewards/accuracies": 0.625,
993
+ "rewards/chosen": 0.10493214428424835,
994
+ "rewards/margins": 0.12394469976425171,
995
+ "rewards/rejected": -0.01901254430413246,
996
  "step": 47
997
  },
998
  {
999
+ "debug/policy_chosen_logits": -1.3235502243041992,
1000
+ "debug/policy_chosen_logps": -142.3896942138672,
1001
+ "debug/policy_rejected_logits": -1.16089928150177,
1002
+ "debug/policy_rejected_logps": -294.1658935546875,
1003
+ "debug/reference_chosen_logps": -148.7461395263672,
1004
+ "debug/reference_rejected_logps": -284.9529113769531,
1005
  "epoch": 0.9230769230769231,
1006
+ "grad_norm": 55.59751943354035,
1007
  "learning_rate": 1e-06,
1008
+ "logits/chosen": -1.3235502243041992,
1009
+ "logits/rejected": -1.16089928150177,
1010
+ "logps/chosen": -142.3896942138672,
1011
+ "logps/rejected": -294.1658935546875,
1012
+ "loss": 0.4307,
1013
  "rewards/accuracies": 0.875,
1014
+ "rewards/chosen": 0.06356436759233475,
1015
+ "rewards/margins": 0.15569432079792023,
1016
+ "rewards/rejected": -0.09212995320558548,
1017
  "step": 48
1018
  },
1019
  {
1020
+ "debug/policy_chosen_logits": -1.1583129167556763,
1021
+ "debug/policy_chosen_logps": -124.10246276855469,
1022
+ "debug/policy_rejected_logits": -1.0836573839187622,
1023
+ "debug/policy_rejected_logps": -168.42910766601562,
1024
+ "debug/reference_chosen_logps": -138.41485595703125,
1025
+ "debug/reference_rejected_logps": -172.309326171875,
1026
  "epoch": 0.9423076923076923,
1027
+ "grad_norm": 63.48037385177918,
1028
  "learning_rate": 1e-06,
1029
+ "logits/chosen": -1.1583129167556763,
1030
+ "logits/rejected": -1.0836573839187622,
1031
+ "logps/chosen": -124.10246276855469,
1032
+ "logps/rejected": -168.42910766601562,
1033
+ "loss": 0.4456,
1034
+ "rewards/accuracies": 0.625,
1035
+ "rewards/chosen": 0.1431237757205963,
1036
+ "rewards/margins": 0.10432147234678268,
1037
+ "rewards/rejected": 0.038802310824394226,
1038
  "step": 49
1039
  },
1040
  {
1041
+ "debug/policy_chosen_logits": -1.2059780359268188,
1042
+ "debug/policy_chosen_logps": -157.7839813232422,
1043
+ "debug/policy_rejected_logits": -1.1662678718566895,
1044
+ "debug/policy_rejected_logps": -291.1665954589844,
1045
+ "debug/reference_chosen_logps": -163.0614013671875,
1046
+ "debug/reference_rejected_logps": -284.3927307128906,
1047
  "epoch": 0.9615384615384616,
1048
+ "grad_norm": 25.37123962222348,
1049
  "learning_rate": 1e-06,
1050
+ "logits/chosen": -1.2059780359268188,
1051
+ "logits/rejected": -1.1662678718566895,
1052
+ "logps/chosen": -157.7839813232422,
1053
+ "logps/rejected": -291.1665954589844,
1054
+ "loss": 0.3946,
1055
  "rewards/accuracies": 0.875,
1056
+ "rewards/chosen": 0.052774280309677124,
1057
+ "rewards/margins": 0.1205129325389862,
1058
+ "rewards/rejected": -0.06773865222930908,
1059
  "step": 50
1060
  },
1061
  {
1062
+ "debug/policy_chosen_logits": -1.3792681694030762,
1063
+ "debug/policy_chosen_logps": -146.66757202148438,
1064
+ "debug/policy_rejected_logits": -1.214746117591858,
1065
+ "debug/policy_rejected_logps": -145.07708740234375,
1066
+ "debug/reference_chosen_logps": -151.08615112304688,
1067
+ "debug/reference_rejected_logps": -149.86410522460938,
1068
  "epoch": 0.9807692307692307,
1069
+ "grad_norm": 18.070723141611428,
1070
  "learning_rate": 1e-06,
1071
+ "logits/chosen": -1.3792681694030762,
1072
+ "logits/rejected": -1.214746117591858,
1073
+ "logps/chosen": -146.66757202148438,
1074
+ "logps/rejected": -145.07708740234375,
1075
+ "loss": 0.4108,
1076
+ "rewards/accuracies": 0.375,
1077
+ "rewards/chosen": 0.04418577253818512,
1078
+ "rewards/margins": -0.0036845793947577477,
1079
+ "rewards/rejected": 0.047870345413684845,
1080
  "step": 51
1081
  },
1082
  {
1083
+ "debug/policy_chosen_logits": -1.2889589071273804,
1084
+ "debug/policy_chosen_logps": -117.15675354003906,
1085
+ "debug/policy_rejected_logits": -1.353893518447876,
1086
+ "debug/policy_rejected_logps": -214.70697021484375,
1087
+ "debug/reference_chosen_logps": -121.83930969238281,
1088
+ "debug/reference_rejected_logps": -210.53515625,
1089
  "epoch": 1.0,
1090
+ "grad_norm": 37.32087719041123,
1091
  "learning_rate": 1e-06,
1092
+ "logits/chosen": -1.2889589071273804,
1093
+ "logits/rejected": -1.353893518447876,
1094
+ "logps/chosen": -117.15675354003906,
1095
+ "logps/rejected": -214.70697021484375,
1096
+ "loss": 0.3909,
1097
+ "rewards/accuracies": 0.625,
1098
+ "rewards/chosen": 0.0468255952000618,
1099
+ "rewards/margins": 0.08854364603757858,
1100
+ "rewards/rejected": -0.04171804338693619,
1101
  "step": 52
1102
  },
1103
  {
1104
  "epoch": 1.0,
1105
  "step": 52,
1106
  "total_flos": 0.0,
1107
+ "train_loss": 0.45137535952604735,
1108
+ "train_runtime": 176.3389,
1109
+ "train_samples_per_second": 18.555,
1110
+ "train_steps_per_second": 0.295
1111
  }
1112
  ],
1113
  "logging_steps": 1,