NicholasCorrado commited on
Commit
328542f
1 Parent(s): a0b90b2

Model save

Browse files
README.md CHANGED
@@ -3,15 +3,10 @@ library_name: transformers
3
  license: apache-2.0
4
  base_model: alignment-handbook/zephyr-7b-sft-full
5
  tags:
6
- - alignment-handbook
7
- - trl
8
- - dpo
9
- - generated_from_trainer
10
  - trl
11
  - dpo
 
12
  - generated_from_trainer
13
- datasets:
14
- - data/rlced_conifer
15
  model-index:
16
  - name: rlced-conifer-zephyr-7b-dpo-2e
17
  results: []
@@ -22,17 +17,17 @@ should probably proofread and complete it, then remove this comment. -->
22
 
23
  # rlced-conifer-zephyr-7b-dpo-2e
24
 
25
- This model is a fine-tuned version of [alignment-handbook/zephyr-7b-sft-full](https://huggingface.co/alignment-handbook/zephyr-7b-sft-full) on the data/rlced_conifer dataset.
26
  It achieves the following results on the evaluation set:
27
- - Loss: 0.6931
28
- - Rewards/chosen: 0.0
29
- - Rewards/rejected: 0.0
30
- - Rewards/accuracies: 0.0
31
- - Rewards/margins: 0.0
32
- - Logps/rejected: -351.6677
33
- - Logps/chosen: -401.2567
34
- - Logits/rejected: -2.6879
35
- - Logits/chosen: -2.8249
36
 
37
  ## Model description
38
 
@@ -67,6 +62,12 @@ The following hyperparameters were used during training:
67
 
68
  ### Training results
69
 
 
 
 
 
 
 
70
 
71
 
72
  ### Framework versions
 
3
  license: apache-2.0
4
  base_model: alignment-handbook/zephyr-7b-sft-full
5
  tags:
 
 
 
 
6
  - trl
7
  - dpo
8
+ - alignment-handbook
9
  - generated_from_trainer
 
 
10
  model-index:
11
  - name: rlced-conifer-zephyr-7b-dpo-2e
12
  results: []
 
17
 
18
  # rlced-conifer-zephyr-7b-dpo-2e
19
 
20
+ This model is a fine-tuned version of [alignment-handbook/zephyr-7b-sft-full](https://huggingface.co/alignment-handbook/zephyr-7b-sft-full) on an unknown dataset.
21
  It achieves the following results on the evaluation set:
22
+ - Loss: 0.1578
23
+ - Rewards/chosen: -8.8688
24
+ - Rewards/rejected: -21.7065
25
+ - Rewards/accuracies: 0.9203
26
+ - Rewards/margins: 12.8376
27
+ - Logps/rejected: -2616.5852
28
+ - Logps/chosen: -1310.2896
29
+ - Logits/rejected: 3.0165
30
+ - Logits/chosen: -0.1724
31
 
32
  ## Model description
33
 
 
62
 
63
  ### Training results
64
 
65
+ | Training Loss | Epoch | Step | Validation Loss | Rewards/chosen | Rewards/rejected | Rewards/accuracies | Rewards/margins | Logps/rejected | Logps/chosen | Logits/rejected | Logits/chosen |
66
+ |:-------------:|:------:|:----:|:---------------:|:--------------:|:----------------:|:------------------:|:---------------:|:--------------:|:------------:|:---------------:|:-------------:|
67
+ | 0.1586 | 0.4982 | 240 | 0.1689 | -2.6868 | -8.8615 | 0.9252 | 6.1746 | -1332.0890 | -692.0928 | 1.5975 | -0.1975 |
68
+ | 0.1408 | 0.9964 | 480 | 0.1482 | -6.5979 | -15.9265 | 0.9326 | 9.3286 | -2038.5934 | -1083.2030 | 3.3424 | 0.5015 |
69
+ | 0.0852 | 1.4946 | 720 | 0.1644 | -10.0141 | -23.7065 | 0.9228 | 13.6924 | -2816.5886 | -1424.8193 | 3.4873 | 0.0636 |
70
+ | 0.0743 | 1.9927 | 960 | 0.1578 | -8.8688 | -21.7065 | 0.9203 | 12.8376 | -2616.5852 | -1310.2896 | 3.0165 | -0.1724 |
71
 
72
 
73
  ### Framework versions
all_results.json CHANGED
@@ -1,5 +1,5 @@
1
  {
2
- "epoch": 2.0,
3
  "eval_logits/chosen": -2.8249309062957764,
4
  "eval_logits/rejected": -2.6878879070281982,
5
  "eval_logps/chosen": -401.2567138671875,
@@ -14,9 +14,9 @@
14
  "eval_samples_per_second": 17.099,
15
  "eval_steps_per_second": 0.342,
16
  "total_flos": 0.0,
17
- "train_loss": 0.1732867956161499,
18
- "train_runtime": 97.0028,
19
- "train_samples": 50,
20
- "train_samples_per_second": 1.031,
21
- "train_steps_per_second": 0.021
22
  }
 
1
  {
2
+ "epoch": 1.996886351842242,
3
  "eval_logits/chosen": -2.8249309062957764,
4
  "eval_logits/rejected": -2.6878879070281982,
5
  "eval_logps/chosen": -401.2567138671875,
 
14
  "eval_samples_per_second": 17.099,
15
  "eval_steps_per_second": 0.342,
16
  "total_flos": 0.0,
17
+ "train_loss": 0.1585804910288655,
18
+ "train_runtime": 28225.2793,
19
+ "train_samples": 123309,
20
+ "train_samples_per_second": 8.737,
21
+ "train_steps_per_second": 0.034
22
  }
model-00001-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ef524f92367d1777527739b52b7b12a82f76d559c915e0a77ce2017802648f4a
3
  size 4943162336
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:29a7362c07defb3a6b1f6273dbade7b018c7b81675e8d05f6479514991110160
3
  size 4943162336
model-00002-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:63d56363d4009b7206414bcbf7aeaec22897348c62edc2b5b3c2977fb3d53cb0
3
  size 4999819336
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:58b78cf610b7fd116f39cfadc175aa49f426caf39910c0bb462854aed97c8ad7
3
  size 4999819336
model-00003-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:631f8d5b2eaa23a669996716a42717b0fd77f974c67938de438c66634ea39712
3
  size 4540516344
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8117a1f752f54256f6f0a694ad3a933fe782666b10bada44a18eefee6b2d2f7f
3
  size 4540516344
train_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
- "epoch": 2.0,
3
  "total_flos": 0.0,
4
- "train_loss": 0.1732867956161499,
5
- "train_runtime": 97.0028,
6
- "train_samples": 50,
7
- "train_samples_per_second": 1.031,
8
- "train_steps_per_second": 0.021
9
  }
 
1
  {
2
+ "epoch": 1.996886351842242,
3
  "total_flos": 0.0,
4
+ "train_loss": 0.1585804910288655,
5
+ "train_runtime": 28225.2793,
6
+ "train_samples": 123309,
7
+ "train_samples_per_second": 8.737,
8
+ "train_steps_per_second": 0.034
9
  }
trainer_state.json CHANGED
@@ -1,21 +1,22 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 2.0,
5
  "eval_steps": 240,
6
- "global_step": 2,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 1.0,
13
- "learning_rate": 5e-07,
14
- "logits/chosen": -2.766833543777466,
15
- "logits/rejected": -2.7548677921295166,
16
- "logps/chosen": -492.5103759765625,
17
- "logps/rejected": -501.75994873046875,
18
- "loss": 0.1733,
 
19
  "rewards/accuracies": 0.0,
20
  "rewards/chosen": 0.0,
21
  "rewards/margins": 0.0,
@@ -23,17 +24,1521 @@
23
  "step": 1
24
  },
25
  {
26
- "epoch": 2.0,
27
- "step": 2,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
28
  "total_flos": 0.0,
29
- "train_loss": 0.1732867956161499,
30
- "train_runtime": 97.0028,
31
- "train_samples_per_second": 1.031,
32
- "train_steps_per_second": 0.021
33
  }
34
  ],
35
  "logging_steps": 10,
36
- "max_steps": 2,
37
  "num_input_tokens_seen": 0,
38
  "num_train_epochs": 2,
39
  "save_steps": 240,
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 1.996886351842242,
5
  "eval_steps": 240,
6
+ "global_step": 962,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.0020757654385054488,
13
+ "grad_norm": 7.452356109448157,
14
+ "learning_rate": 5.154639175257731e-09,
15
+ "logits/chosen": -2.730942726135254,
16
+ "logits/rejected": -2.654609203338623,
17
+ "logps/chosen": -350.489990234375,
18
+ "logps/rejected": -325.546875,
19
+ "loss": 0.6931,
20
  "rewards/accuracies": 0.0,
21
  "rewards/chosen": 0.0,
22
  "rewards/margins": 0.0,
 
24
  "step": 1
25
  },
26
  {
27
+ "epoch": 0.02075765438505449,
28
+ "grad_norm": 7.866099026608714,
29
+ "learning_rate": 5.154639175257731e-08,
30
+ "logits/chosen": -2.7327775955200195,
31
+ "logits/rejected": -2.734964609146118,
32
+ "logps/chosen": -366.4884033203125,
33
+ "logps/rejected": -412.2764892578125,
34
+ "loss": 0.6931,
35
+ "rewards/accuracies": 0.4826388955116272,
36
+ "rewards/chosen": 6.40038269921206e-05,
37
+ "rewards/margins": 0.00040091515984386206,
38
+ "rewards/rejected": -0.00033691132557578385,
39
+ "step": 10
40
+ },
41
+ {
42
+ "epoch": 0.04151530877010898,
43
+ "grad_norm": 7.559166611883559,
44
+ "learning_rate": 1.0309278350515462e-07,
45
+ "logits/chosen": -2.717351198196411,
46
+ "logits/rejected": -2.69411563873291,
47
+ "logps/chosen": -378.9507141113281,
48
+ "logps/rejected": -404.0965270996094,
49
+ "loss": 0.6921,
50
+ "rewards/accuracies": 0.59375,
51
+ "rewards/chosen": 0.00041852169670164585,
52
+ "rewards/margins": 0.0022167633287608624,
53
+ "rewards/rejected": -0.0017982417484745383,
54
+ "step": 20
55
+ },
56
+ {
57
+ "epoch": 0.062272963155163466,
58
+ "grad_norm": 7.615638482493831,
59
+ "learning_rate": 1.5463917525773197e-07,
60
+ "logits/chosen": -2.719243288040161,
61
+ "logits/rejected": -2.7044143676757812,
62
+ "logps/chosen": -365.67889404296875,
63
+ "logps/rejected": -388.8330993652344,
64
+ "loss": 0.6864,
65
+ "rewards/accuracies": 0.793749988079071,
66
+ "rewards/chosen": 0.004637600388377905,
67
+ "rewards/margins": 0.013907420448958874,
68
+ "rewards/rejected": -0.009269820526242256,
69
+ "step": 30
70
+ },
71
+ {
72
+ "epoch": 0.08303061754021795,
73
+ "grad_norm": 7.7707957616247505,
74
+ "learning_rate": 2.0618556701030925e-07,
75
+ "logits/chosen": -2.7188639640808105,
76
+ "logits/rejected": -2.6840219497680664,
77
+ "logps/chosen": -350.5911865234375,
78
+ "logps/rejected": -365.0216979980469,
79
+ "loss": 0.6693,
80
+ "rewards/accuracies": 0.871874988079071,
81
+ "rewards/chosen": 0.027326706796884537,
82
+ "rewards/margins": 0.049114055931568146,
83
+ "rewards/rejected": -0.02178734540939331,
84
+ "step": 40
85
+ },
86
+ {
87
+ "epoch": 0.10378827192527244,
88
+ "grad_norm": 8.90458099154434,
89
+ "learning_rate": 2.5773195876288655e-07,
90
+ "logits/chosen": -2.7294702529907227,
91
+ "logits/rejected": -2.7243564128875732,
92
+ "logps/chosen": -358.9928894042969,
93
+ "logps/rejected": -397.3983459472656,
94
+ "loss": 0.63,
95
+ "rewards/accuracies": 0.862500011920929,
96
+ "rewards/chosen": 0.07257182896137238,
97
+ "rewards/margins": 0.14314065873622894,
98
+ "rewards/rejected": -0.07056883722543716,
99
+ "step": 50
100
+ },
101
+ {
102
+ "epoch": 0.12454592631032693,
103
+ "grad_norm": 9.98989696476554,
104
+ "learning_rate": 3.0927835051546394e-07,
105
+ "logits/chosen": -2.6951613426208496,
106
+ "logits/rejected": -2.678609848022461,
107
+ "logps/chosen": -348.3409729003906,
108
+ "logps/rejected": -426.9418029785156,
109
+ "loss": 0.5482,
110
+ "rewards/accuracies": 0.878125011920929,
111
+ "rewards/chosen": -0.011659199371933937,
112
+ "rewards/margins": 0.36892449855804443,
113
+ "rewards/rejected": -0.3805837035179138,
114
+ "step": 60
115
+ },
116
+ {
117
+ "epoch": 0.14530358069538143,
118
+ "grad_norm": 14.516753369132225,
119
+ "learning_rate": 3.608247422680412e-07,
120
+ "logits/chosen": -2.7372078895568848,
121
+ "logits/rejected": -2.6939454078674316,
122
+ "logps/chosen": -451.07293701171875,
123
+ "logps/rejected": -535.2464599609375,
124
+ "loss": 0.4282,
125
+ "rewards/accuracies": 0.856249988079071,
126
+ "rewards/chosen": -0.5447245836257935,
127
+ "rewards/margins": 0.7660267353057861,
128
+ "rewards/rejected": -1.3107513189315796,
129
+ "step": 70
130
+ },
131
+ {
132
+ "epoch": 0.1660612350804359,
133
+ "grad_norm": 15.352248807358325,
134
+ "learning_rate": 4.123711340206185e-07,
135
+ "logits/chosen": -2.7028908729553223,
136
+ "logits/rejected": -2.6827831268310547,
137
+ "logps/chosen": -547.4126586914062,
138
+ "logps/rejected": -697.9911499023438,
139
+ "loss": 0.3596,
140
+ "rewards/accuracies": 0.8687499761581421,
141
+ "rewards/chosen": -1.5639097690582275,
142
+ "rewards/margins": 1.3926727771759033,
143
+ "rewards/rejected": -2.9565823078155518,
144
+ "step": 80
145
+ },
146
+ {
147
+ "epoch": 0.1868188894654904,
148
+ "grad_norm": 17.157617837316515,
149
+ "learning_rate": 4.639175257731959e-07,
150
+ "logits/chosen": -2.664567708969116,
151
+ "logits/rejected": -2.664062023162842,
152
+ "logps/chosen": -561.7244262695312,
153
+ "logps/rejected": -796.8342895507812,
154
+ "loss": 0.3035,
155
+ "rewards/accuracies": 0.878125011920929,
156
+ "rewards/chosen": -1.9646422863006592,
157
+ "rewards/margins": 2.149301052093506,
158
+ "rewards/rejected": -4.113943576812744,
159
+ "step": 90
160
+ },
161
+ {
162
+ "epoch": 0.2075765438505449,
163
+ "grad_norm": 15.724313068127296,
164
+ "learning_rate": 4.999851606199919e-07,
165
+ "logits/chosen": -2.607901096343994,
166
+ "logits/rejected": -2.587402582168579,
167
+ "logps/chosen": -576.0076904296875,
168
+ "logps/rejected": -909.0657348632812,
169
+ "loss": 0.263,
170
+ "rewards/accuracies": 0.903124988079071,
171
+ "rewards/chosen": -2.0050268173217773,
172
+ "rewards/margins": 3.0768344402313232,
173
+ "rewards/rejected": -5.0818610191345215,
174
+ "step": 100
175
+ },
176
+ {
177
+ "epoch": 0.2283341982355994,
178
+ "grad_norm": 22.849604961849657,
179
+ "learning_rate": 4.997213984244138e-07,
180
+ "logits/chosen": -2.399500608444214,
181
+ "logits/rejected": -2.2552199363708496,
182
+ "logps/chosen": -620.6517333984375,
183
+ "logps/rejected": -1053.1802978515625,
184
+ "loss": 0.2333,
185
+ "rewards/accuracies": 0.9312499761581421,
186
+ "rewards/chosen": -2.491640329360962,
187
+ "rewards/margins": 4.089051246643066,
188
+ "rewards/rejected": -6.580691337585449,
189
+ "step": 110
190
+ },
191
+ {
192
+ "epoch": 0.24909185262065386,
193
+ "grad_norm": 15.418275685296619,
194
+ "learning_rate": 4.991282726678214e-07,
195
+ "logits/chosen": -2.1379497051239014,
196
+ "logits/rejected": -1.6612409353256226,
197
+ "logps/chosen": -636.1170654296875,
198
+ "logps/rejected": -1038.722412109375,
199
+ "loss": 0.2114,
200
+ "rewards/accuracies": 0.9125000238418579,
201
+ "rewards/chosen": -2.4115161895751953,
202
+ "rewards/margins": 3.842604875564575,
203
+ "rewards/rejected": -6.254120826721191,
204
+ "step": 120
205
+ },
206
+ {
207
+ "epoch": 0.26984950700570837,
208
+ "grad_norm": 22.122903405439125,
209
+ "learning_rate": 4.982065656380468e-07,
210
+ "logits/chosen": -1.8730089664459229,
211
+ "logits/rejected": -1.1086432933807373,
212
+ "logps/chosen": -624.3699951171875,
213
+ "logps/rejected": -1083.5306396484375,
214
+ "loss": 0.1978,
215
+ "rewards/accuracies": 0.8999999761581421,
216
+ "rewards/chosen": -2.4477767944335938,
217
+ "rewards/margins": 4.317242622375488,
218
+ "rewards/rejected": -6.76501989364624,
219
+ "step": 130
220
+ },
221
+ {
222
+ "epoch": 0.29060716139076287,
223
+ "grad_norm": 13.724914658926766,
224
+ "learning_rate": 4.969574929966689e-07,
225
+ "logits/chosen": -1.063720941543579,
226
+ "logits/rejected": 0.11310062557458878,
227
+ "logps/chosen": -654.7889404296875,
228
+ "logps/rejected": -1111.530517578125,
229
+ "loss": 0.2171,
230
+ "rewards/accuracies": 0.893750011920929,
231
+ "rewards/chosen": -2.8432438373565674,
232
+ "rewards/margins": 4.339225769042969,
233
+ "rewards/rejected": -7.182469844818115,
234
+ "step": 140
235
+ },
236
+ {
237
+ "epoch": 0.3113648157758173,
238
+ "grad_norm": 20.111379920464472,
239
+ "learning_rate": 4.953827021756488e-07,
240
+ "logits/chosen": -1.147434949874878,
241
+ "logits/rejected": 0.20881839096546173,
242
+ "logps/chosen": -609.7511596679688,
243
+ "logps/rejected": -1106.64990234375,
244
+ "loss": 0.1719,
245
+ "rewards/accuracies": 0.921875,
246
+ "rewards/chosen": -2.4953980445861816,
247
+ "rewards/margins": 4.772416591644287,
248
+ "rewards/rejected": -7.267814636230469,
249
+ "step": 150
250
+ },
251
+ {
252
+ "epoch": 0.3321224701608718,
253
+ "grad_norm": 20.712950851230556,
254
+ "learning_rate": 4.93484270204492e-07,
255
+ "logits/chosen": -0.3996439576148987,
256
+ "logits/rejected": 0.9938270449638367,
257
+ "logps/chosen": -663.2901000976562,
258
+ "logps/rejected": -1266.5684814453125,
259
+ "loss": 0.1789,
260
+ "rewards/accuracies": 0.921875,
261
+ "rewards/chosen": -2.7938601970672607,
262
+ "rewards/margins": 5.7776288986206055,
263
+ "rewards/rejected": -8.571489334106445,
264
+ "step": 160
265
+ },
266
+ {
267
+ "epoch": 0.3528801245459263,
268
+ "grad_norm": 13.955815229008401,
269
+ "learning_rate": 4.91264700970804e-07,
270
+ "logits/chosen": -0.3857128620147705,
271
+ "logits/rejected": 1.0256187915802002,
272
+ "logps/chosen": -606.496337890625,
273
+ "logps/rejected": -1150.380126953125,
274
+ "loss": 0.1822,
275
+ "rewards/accuracies": 0.9156249761581421,
276
+ "rewards/chosen": -2.4765748977661133,
277
+ "rewards/margins": 5.2469162940979,
278
+ "rewards/rejected": -7.723490238189697,
279
+ "step": 170
280
+ },
281
+ {
282
+ "epoch": 0.3736377789309808,
283
+ "grad_norm": 19.91766661692759,
284
+ "learning_rate": 4.88726921917853e-07,
285
+ "logits/chosen": -0.7232545614242554,
286
+ "logits/rejected": 0.9214665293693542,
287
+ "logps/chosen": -608.4518432617188,
288
+ "logps/rejected": -1221.78955078125,
289
+ "loss": 0.1832,
290
+ "rewards/accuracies": 0.909375011920929,
291
+ "rewards/chosen": -2.418121337890625,
292
+ "rewards/margins": 5.937131881713867,
293
+ "rewards/rejected": -8.355253219604492,
294
+ "step": 180
295
+ },
296
+ {
297
+ "epoch": 0.39439543331603527,
298
+ "grad_norm": 16.024239064339824,
299
+ "learning_rate": 4.858742801834942e-07,
300
+ "logits/chosen": -0.32018381357192993,
301
+ "logits/rejected": 1.0246493816375732,
302
+ "logps/chosen": -667.6340942382812,
303
+ "logps/rejected": -1306.54833984375,
304
+ "loss": 0.1733,
305
+ "rewards/accuracies": 0.934374988079071,
306
+ "rewards/chosen": -2.8495688438415527,
307
+ "rewards/margins": 6.024672985076904,
308
+ "rewards/rejected": -8.874241828918457,
309
+ "step": 190
310
+ },
311
+ {
312
+ "epoch": 0.4151530877010898,
313
+ "grad_norm": 12.349261100548174,
314
+ "learning_rate": 4.827105381855496e-07,
315
+ "logits/chosen": 0.25890278816223145,
316
+ "logits/rejected": 1.507102370262146,
317
+ "logps/chosen": -618.0540161132812,
318
+ "logps/rejected": -1187.555419921875,
319
+ "loss": 0.1662,
320
+ "rewards/accuracies": 0.918749988079071,
321
+ "rewards/chosen": -2.504819393157959,
322
+ "rewards/margins": 5.406851291656494,
323
+ "rewards/rejected": -7.911670684814453,
324
+ "step": 200
325
+ },
326
+ {
327
+ "epoch": 0.4359107420861443,
328
+ "grad_norm": 13.66810823452425,
329
+ "learning_rate": 4.79239868659464e-07,
330
+ "logits/chosen": -0.016678806394338608,
331
+ "logits/rejected": 1.5381004810333252,
332
+ "logps/chosen": -641.5460815429688,
333
+ "logps/rejected": -1252.699951171875,
334
+ "loss": 0.1584,
335
+ "rewards/accuracies": 0.934374988079071,
336
+ "rewards/chosen": -2.724513530731201,
337
+ "rewards/margins": 5.9185709953308105,
338
+ "rewards/rejected": -8.643084526062012,
339
+ "step": 210
340
+ },
341
+ {
342
+ "epoch": 0.4566683964711988,
343
+ "grad_norm": 22.723399659159355,
344
+ "learning_rate": 4.7546684915478443e-07,
345
+ "logits/chosen": -0.5056034326553345,
346
+ "logits/rejected": 1.3583004474639893,
347
+ "logps/chosen": -627.7324829101562,
348
+ "logps/rejected": -1324.6092529296875,
349
+ "loss": 0.1696,
350
+ "rewards/accuracies": 0.9156249761581421,
351
+ "rewards/chosen": -2.6251590251922607,
352
+ "rewards/margins": 6.788819789886475,
353
+ "rewards/rejected": -9.413978576660156,
354
+ "step": 220
355
+ },
356
+ {
357
+ "epoch": 0.4774260508562532,
358
+ "grad_norm": 25.512034717506292,
359
+ "learning_rate": 4.7139645599771953e-07,
360
+ "logits/chosen": -0.05699120834469795,
361
+ "logits/rejected": 1.986132025718689,
362
+ "logps/chosen": -660.3882446289062,
363
+ "logps/rejected": -1403.30322265625,
364
+ "loss": 0.1656,
365
+ "rewards/accuracies": 0.9125000238418579,
366
+ "rewards/chosen": -2.9933114051818848,
367
+ "rewards/margins": 7.2729973793029785,
368
+ "rewards/rejected": -10.266307830810547,
369
+ "step": 230
370
+ },
371
+ {
372
+ "epoch": 0.49818370524130773,
373
+ "grad_norm": 19.374014963474004,
374
+ "learning_rate": 4.6703405772774325e-07,
375
+ "logits/chosen": -0.2160125970840454,
376
+ "logits/rejected": 2.073151111602783,
377
+ "logps/chosen": -610.6514892578125,
378
+ "logps/rejected": -1350.219482421875,
379
+ "loss": 0.1586,
380
+ "rewards/accuracies": 0.9468749761581421,
381
+ "rewards/chosen": -2.5249383449554443,
382
+ "rewards/margins": 7.157525539398193,
383
+ "rewards/rejected": -9.682464599609375,
384
+ "step": 240
385
+ },
386
+ {
387
+ "epoch": 0.49818370524130773,
388
+ "eval_logits/chosen": -0.19751745462417603,
389
+ "eval_logits/rejected": 1.597546935081482,
390
+ "eval_logps/chosen": -692.0928344726562,
391
+ "eval_logps/rejected": -1332.0889892578125,
392
+ "eval_loss": 0.1689271330833435,
393
+ "eval_rewards/accuracies": 0.9252451062202454,
394
+ "eval_rewards/chosen": -2.6868460178375244,
395
+ "eval_rewards/margins": 6.174641132354736,
396
+ "eval_rewards/rejected": -8.861486434936523,
397
+ "eval_runtime": 297.8773,
398
+ "eval_samples_per_second": 21.791,
399
+ "eval_steps_per_second": 0.342,
400
+ "step": 240
401
+ },
402
+ {
403
+ "epoch": 0.5189413596263622,
404
+ "grad_norm": 12.036861238820864,
405
+ "learning_rate": 4.6238540801689896e-07,
406
+ "logits/chosen": -0.4214223325252533,
407
+ "logits/rejected": 1.8403441905975342,
408
+ "logps/chosen": -610.5186767578125,
409
+ "logps/rejected": -1277.1844482421875,
410
+ "loss": 0.1604,
411
+ "rewards/accuracies": 0.9437500238418579,
412
+ "rewards/chosen": -2.307227373123169,
413
+ "rewards/margins": 6.4557318687438965,
414
+ "rewards/rejected": -8.762959480285645,
415
+ "step": 250
416
+ },
417
+ {
418
+ "epoch": 0.5396990140114167,
419
+ "grad_norm": 20.090786603415435,
420
+ "learning_rate": 4.5745663808114316e-07,
421
+ "logits/chosen": -0.743881106376648,
422
+ "logits/rejected": 1.5020240545272827,
423
+ "logps/chosen": -657.1211547851562,
424
+ "logps/rejected": -1360.107666015625,
425
+ "loss": 0.1638,
426
+ "rewards/accuracies": 0.9312499761581421,
427
+ "rewards/chosen": -2.839501142501831,
428
+ "rewards/margins": 6.807010650634766,
429
+ "rewards/rejected": -9.646512031555176,
430
+ "step": 260
431
+ },
432
+ {
433
+ "epoch": 0.5604566683964712,
434
+ "grad_norm": 14.890524731187542,
435
+ "learning_rate": 4.5225424859373684e-07,
436
+ "logits/chosen": -0.38103678822517395,
437
+ "logits/rejected": 1.8405994176864624,
438
+ "logps/chosen": -600.785888671875,
439
+ "logps/rejected": -1403.572998046875,
440
+ "loss": 0.1493,
441
+ "rewards/accuracies": 0.9468749761581421,
442
+ "rewards/chosen": -2.4715404510498047,
443
+ "rewards/margins": 7.713797569274902,
444
+ "rewards/rejected": -10.185338020324707,
445
+ "step": 270
446
+ },
447
+ {
448
+ "epoch": 0.5812143227815257,
449
+ "grad_norm": 15.737222130316104,
450
+ "learning_rate": 4.467851011113515e-07,
451
+ "logits/chosen": 0.3215886056423187,
452
+ "logits/rejected": 2.3104662895202637,
453
+ "logps/chosen": -629.4906005859375,
454
+ "logps/rejected": -1383.751953125,
455
+ "loss": 0.1571,
456
+ "rewards/accuracies": 0.925000011920929,
457
+ "rewards/chosen": -2.5350632667541504,
458
+ "rewards/margins": 7.314475059509277,
459
+ "rewards/rejected": -9.849536895751953,
460
+ "step": 280
461
+ },
462
+ {
463
+ "epoch": 0.6019719771665801,
464
+ "grad_norm": 23.328559288837457,
465
+ "learning_rate": 4.410564090241966e-07,
466
+ "logits/chosen": 0.4208109378814697,
467
+ "logits/rejected": 2.2348339557647705,
468
+ "logps/chosen": -675.2015991210938,
469
+ "logps/rejected": -1364.4970703125,
470
+ "loss": 0.1635,
471
+ "rewards/accuracies": 0.9156249761581421,
472
+ "rewards/chosen": -3.0044052600860596,
473
+ "rewards/margins": 6.818602085113525,
474
+ "rewards/rejected": -9.823007583618164,
475
+ "step": 290
476
+ },
477
+ {
478
+ "epoch": 0.6227296315516346,
479
+ "grad_norm": 14.02162805247057,
480
+ "learning_rate": 4.35075728042106e-07,
481
+ "logits/chosen": -0.0718420147895813,
482
+ "logits/rejected": 1.6788543462753296,
483
+ "logps/chosen": -612.244873046875,
484
+ "logps/rejected": -1253.4449462890625,
485
+ "loss": 0.1528,
486
+ "rewards/accuracies": 0.949999988079071,
487
+ "rewards/chosen": -2.3619751930236816,
488
+ "rewards/margins": 6.16156530380249,
489
+ "rewards/rejected": -8.523540496826172,
490
+ "step": 300
491
+ },
492
+ {
493
+ "epoch": 0.6434872859366891,
494
+ "grad_norm": 13.851968293163813,
495
+ "learning_rate": 4.2885094622913016e-07,
496
+ "logits/chosen": -0.06367097049951553,
497
+ "logits/rejected": 1.8669401407241821,
498
+ "logps/chosen": -630.0944213867188,
499
+ "logps/rejected": -1326.2972412109375,
500
+ "loss": 0.1541,
501
+ "rewards/accuracies": 0.90625,
502
+ "rewards/chosen": -2.6027626991271973,
503
+ "rewards/margins": 6.887024879455566,
504
+ "rewards/rejected": -9.489786148071289,
505
+ "step": 310
506
+ },
507
+ {
508
+ "epoch": 0.6642449403217436,
509
+ "grad_norm": 22.056173099963235,
510
+ "learning_rate": 4.223902735997788e-07,
511
+ "logits/chosen": -0.3920370638370514,
512
+ "logits/rejected": 1.7357165813446045,
513
+ "logps/chosen": -582.3004150390625,
514
+ "logps/rejected": -1314.4351806640625,
515
+ "loss": 0.1538,
516
+ "rewards/accuracies": 0.9125000238418579,
517
+ "rewards/chosen": -2.171776294708252,
518
+ "rewards/margins": 7.103475093841553,
519
+ "rewards/rejected": -9.275251388549805,
520
+ "step": 320
521
+ },
522
+ {
523
+ "epoch": 0.6850025947067981,
524
+ "grad_norm": 10.33520679821666,
525
+ "learning_rate": 4.157022312906352e-07,
526
+ "logits/chosen": -1.2140147686004639,
527
+ "logits/rejected": 1.2578856945037842,
528
+ "logps/chosen": -645.0560302734375,
529
+ "logps/rejected": -1458.661376953125,
530
+ "loss": 0.1492,
531
+ "rewards/accuracies": 0.9125000238418579,
532
+ "rewards/chosen": -2.7255163192749023,
533
+ "rewards/margins": 7.840676784515381,
534
+ "rewards/rejected": -10.566194534301758,
535
+ "step": 330
536
+ },
537
+ {
538
+ "epoch": 0.7057602490918526,
539
+ "grad_norm": 12.698623571384912,
540
+ "learning_rate": 4.0879564032162425e-07,
541
+ "logits/chosen": -1.3428138494491577,
542
+ "logits/rejected": 1.8070862293243408,
543
+ "logps/chosen": -719.0316162109375,
544
+ "logps/rejected": -1663.4603271484375,
545
+ "loss": 0.143,
546
+ "rewards/accuracies": 0.9125000238418579,
547
+ "rewards/chosen": -3.4557037353515625,
548
+ "rewards/margins": 9.146575927734375,
549
+ "rewards/rejected": -12.602280616760254,
550
+ "step": 340
551
+ },
552
+ {
553
+ "epoch": 0.7265179034769071,
554
+ "grad_norm": 16.68939070839139,
555
+ "learning_rate": 4.016796099617569e-07,
556
+ "logits/chosen": -1.3726019859313965,
557
+ "logits/rejected": 1.8845252990722656,
558
+ "logps/chosen": -672.85693359375,
559
+ "logps/rejected": -1494.5582275390625,
560
+ "loss": 0.1538,
561
+ "rewards/accuracies": 0.893750011920929,
562
+ "rewards/chosen": -3.1178722381591797,
563
+ "rewards/margins": 7.914468288421631,
564
+ "rewards/rejected": -11.032341003417969,
565
+ "step": 350
566
+ },
567
+ {
568
+ "epoch": 0.7472755578619616,
569
+ "grad_norm": 14.783186894273188,
570
+ "learning_rate": 3.9436352571469577e-07,
571
+ "logits/chosen": -0.5545839071273804,
572
+ "logits/rejected": 3.833822727203369,
573
+ "logps/chosen": -830.2039184570312,
574
+ "logps/rejected": -1744.631103515625,
575
+ "loss": 0.1444,
576
+ "rewards/accuracies": 0.934374988079071,
577
+ "rewards/chosen": -4.657293796539307,
578
+ "rewards/margins": 9.05556583404541,
579
+ "rewards/rejected": -13.712858200073242,
580
+ "step": 360
581
+ },
582
+ {
583
+ "epoch": 0.768033212247016,
584
+ "grad_norm": 15.755226269776646,
585
+ "learning_rate": 3.868570369399893e-07,
586
+ "logits/chosen": -0.6274186372756958,
587
+ "logits/rejected": 3.4528141021728516,
588
+ "logps/chosen": -799.2054443359375,
589
+ "logps/rejected": -1723.1851806640625,
590
+ "loss": 0.1458,
591
+ "rewards/accuracies": 0.921875,
592
+ "rewards/chosen": -4.323949337005615,
593
+ "rewards/margins": 9.110595703125,
594
+ "rewards/rejected": -13.434545516967773,
595
+ "step": 370
596
+ },
597
+ {
598
+ "epoch": 0.7887908666320705,
599
+ "grad_norm": 11.452118871998977,
600
+ "learning_rate": 3.791700441262987e-07,
601
+ "logits/chosen": 0.8511954545974731,
602
+ "logits/rejected": 4.691292762756348,
603
+ "logps/chosen": -939.5437622070312,
604
+ "logps/rejected": -2012.4154052734375,
605
+ "loss": 0.1417,
606
+ "rewards/accuracies": 0.925000011920929,
607
+ "rewards/chosen": -5.871208667755127,
608
+ "rewards/margins": 10.141061782836914,
609
+ "rewards/rejected": -16.012271881103516,
610
+ "step": 380
611
+ },
612
+ {
613
+ "epoch": 0.809548521017125,
614
+ "grad_norm": 13.459005174890807,
615
+ "learning_rate": 3.7131268583340515e-07,
616
+ "logits/chosen": 0.3982798457145691,
617
+ "logits/rejected": 4.274272441864014,
618
+ "logps/chosen": -845.7362060546875,
619
+ "logps/rejected": -1642.904541015625,
620
+ "loss": 0.1523,
621
+ "rewards/accuracies": 0.9437500238418579,
622
+ "rewards/chosen": -4.7007269859313965,
623
+ "rewards/margins": 7.76672887802124,
624
+ "rewards/rejected": -12.467455863952637,
625
+ "step": 390
626
+ },
627
+ {
628
+ "epoch": 0.8303061754021795,
629
+ "grad_norm": 13.869817366585716,
630
+ "learning_rate": 3.632953253202198e-07,
631
+ "logits/chosen": 1.5343601703643799,
632
+ "logits/rejected": 4.5011491775512695,
633
+ "logps/chosen": -881.2744140625,
634
+ "logps/rejected": -1823.2939453125,
635
+ "loss": 0.1449,
636
+ "rewards/accuracies": 0.9312499761581421,
637
+ "rewards/chosen": -5.160914421081543,
638
+ "rewards/margins": 9.128256797790527,
639
+ "rewards/rejected": -14.289172172546387,
640
+ "step": 400
641
+ },
642
+ {
643
+ "epoch": 0.851063829787234,
644
+ "grad_norm": 14.502067608546383,
645
+ "learning_rate": 3.551285368764321e-07,
646
+ "logits/chosen": 1.5195695161819458,
647
+ "logits/rejected": 4.168100357055664,
648
+ "logps/chosen": -846.6906127929688,
649
+ "logps/rejected": -1635.856689453125,
650
+ "loss": 0.1445,
651
+ "rewards/accuracies": 0.918749988079071,
652
+ "rewards/chosen": -4.812326431274414,
653
+ "rewards/margins": 7.744412422180176,
654
+ "rewards/rejected": -12.556737899780273,
655
+ "step": 410
656
+ },
657
+ {
658
+ "epoch": 0.8718214841722886,
659
+ "grad_norm": 30.730295160965017,
660
+ "learning_rate": 3.468230918758242e-07,
661
+ "logits/chosen": -0.11532606929540634,
662
+ "logits/rejected": 3.6724467277526855,
663
+ "logps/chosen": -856.9993286132812,
664
+ "logps/rejected": -1789.368408203125,
665
+ "loss": 0.1521,
666
+ "rewards/accuracies": 0.9281250238418579,
667
+ "rewards/chosen": -4.82938289642334,
668
+ "rewards/margins": 9.0027494430542,
669
+ "rewards/rejected": -13.832133293151855,
670
+ "step": 420
671
+ },
672
+ {
673
+ "epoch": 0.892579138557343,
674
+ "grad_norm": 15.361401417475852,
675
+ "learning_rate": 3.383899445696477e-07,
676
+ "logits/chosen": -0.19606542587280273,
677
+ "logits/rejected": 3.2190608978271484,
678
+ "logps/chosen": -914.5608520507812,
679
+ "logps/rejected": -1852.5074462890625,
680
+ "loss": 0.1317,
681
+ "rewards/accuracies": 0.9281250238418579,
682
+ "rewards/chosen": -5.41568660736084,
683
+ "rewards/margins": 9.153519630432129,
684
+ "rewards/rejected": -14.569204330444336,
685
+ "step": 430
686
+ },
687
+ {
688
+ "epoch": 0.9133367929423976,
689
+ "grad_norm": 15.988866292668206,
690
+ "learning_rate": 3.2984021763879756e-07,
691
+ "logits/chosen": 0.24461348354816437,
692
+ "logits/rejected": 3.7112534046173096,
693
+ "logps/chosen": -1049.1207275390625,
694
+ "logps/rejected": -2157.442138671875,
695
+ "loss": 0.1472,
696
+ "rewards/accuracies": 0.9312499761581421,
697
+ "rewards/chosen": -6.739071846008301,
698
+ "rewards/margins": 10.936814308166504,
699
+ "rewards/rejected": -17.675886154174805,
700
+ "step": 440
701
+ },
702
+ {
703
+ "epoch": 0.934094447327452,
704
+ "grad_norm": 16.98533210114085,
705
+ "learning_rate": 3.211851875238408e-07,
706
+ "logits/chosen": 0.16562503576278687,
707
+ "logits/rejected": 4.111753940582275,
708
+ "logps/chosen": -1058.0418701171875,
709
+ "logps/rejected": -2127.619873046875,
710
+ "loss": 0.1444,
711
+ "rewards/accuracies": 0.9375,
712
+ "rewards/chosen": -6.909078121185303,
713
+ "rewards/margins": 10.602731704711914,
714
+ "rewards/rejected": -17.511810302734375,
715
+ "step": 450
716
+ },
717
+ {
718
+ "epoch": 0.9548521017125065,
719
+ "grad_norm": 14.967379090940785,
720
+ "learning_rate": 3.124362695522476e-07,
721
+ "logits/chosen": 1.7813737392425537,
722
+ "logits/rejected": 4.8395795822143555,
723
+ "logps/chosen": -1155.03125,
724
+ "logps/rejected": -2117.0537109375,
725
+ "loss": 0.1396,
726
+ "rewards/accuracies": 0.949999988079071,
727
+ "rewards/chosen": -7.852016448974609,
728
+ "rewards/margins": 9.481895446777344,
729
+ "rewards/rejected": -17.333911895751953,
730
+ "step": 460
731
+ },
732
+ {
733
+ "epoch": 0.975609756097561,
734
+ "grad_norm": 14.307026095079875,
735
+ "learning_rate": 3.036050028824415e-07,
736
+ "logits/chosen": 1.5212550163269043,
737
+ "logits/rejected": 4.2941412925720215,
738
+ "logps/chosen": -1053.684814453125,
739
+ "logps/rejected": -1923.693359375,
740
+ "loss": 0.125,
741
+ "rewards/accuracies": 0.9281250238418579,
742
+ "rewards/chosen": -6.825259208679199,
743
+ "rewards/margins": 8.460533142089844,
744
+ "rewards/rejected": -15.285791397094727,
745
+ "step": 470
746
+ },
747
+ {
748
+ "epoch": 0.9963674104826155,
749
+ "grad_norm": 11.916307943626432,
750
+ "learning_rate": 2.9470303528452547e-07,
751
+ "logits/chosen": 1.5992648601531982,
752
+ "logits/rejected": 4.383261680603027,
753
+ "logps/chosen": -1080.980224609375,
754
+ "logps/rejected": -2100.404541015625,
755
+ "loss": 0.1408,
756
+ "rewards/accuracies": 0.9437500238418579,
757
+ "rewards/chosen": -7.290544033050537,
758
+ "rewards/margins": 9.89597225189209,
759
+ "rewards/rejected": -17.1865177154541,
760
+ "step": 480
761
+ },
762
+ {
763
+ "epoch": 0.9963674104826155,
764
+ "eval_logits/chosen": 0.5014842748641968,
765
+ "eval_logits/rejected": 3.342353582382202,
766
+ "eval_logps/chosen": -1083.2030029296875,
767
+ "eval_logps/rejected": -2038.5933837890625,
768
+ "eval_loss": 0.14824749529361725,
769
+ "eval_rewards/accuracies": 0.9325980544090271,
770
+ "eval_rewards/chosen": -6.597947597503662,
771
+ "eval_rewards/margins": 9.328583717346191,
772
+ "eval_rewards/rejected": -15.926531791687012,
773
+ "eval_runtime": 296.0381,
774
+ "eval_samples_per_second": 21.926,
775
+ "eval_steps_per_second": 0.345,
776
+ "step": 480
777
+ },
778
+ {
779
+ "epoch": 1.01712506486767,
780
+ "grad_norm": 14.456304387791999,
781
+ "learning_rate": 2.8574210777775755e-07,
782
+ "logits/chosen": 0.9456971287727356,
783
+ "logits/rejected": 4.39624547958374,
784
+ "logps/chosen": -1059.8154296875,
785
+ "logps/rejected": -2144.65576171875,
786
+ "loss": 0.0951,
787
+ "rewards/accuracies": 0.953125,
788
+ "rewards/chosen": -6.945745944976807,
789
+ "rewards/margins": 10.675562858581543,
790
+ "rewards/rejected": -17.621309280395508,
791
+ "step": 490
792
+ },
793
+ {
794
+ "epoch": 1.0378827192527245,
795
+ "grad_norm": 17.681966171705884,
796
+ "learning_rate": 2.767340391450384e-07,
797
+ "logits/chosen": 1.6467583179473877,
798
+ "logits/rejected": 5.070549011230469,
799
+ "logps/chosen": -1184.4014892578125,
800
+ "logps/rejected": -2638.23291015625,
801
+ "loss": 0.0919,
802
+ "rewards/accuracies": 0.9468749761581421,
803
+ "rewards/chosen": -8.20818042755127,
804
+ "rewards/margins": 14.303576469421387,
805
+ "rewards/rejected": -22.511754989624023,
806
+ "step": 500
807
+ },
808
+ {
809
+ "epoch": 1.058640373637779,
810
+ "grad_norm": 13.802679882016555,
811
+ "learning_rate": 2.6769071034483407e-07,
812
+ "logits/chosen": 1.1824378967285156,
813
+ "logits/rejected": 4.564583778381348,
814
+ "logps/chosen": -947.0963745117188,
815
+ "logps/rejected": -2079.13916015625,
816
+ "loss": 0.0879,
817
+ "rewards/accuracies": 0.9781249761581421,
818
+ "rewards/chosen": -5.809744358062744,
819
+ "rewards/margins": 11.030488967895508,
820
+ "rewards/rejected": -16.840232849121094,
821
+ "step": 510
822
+ },
823
+ {
824
+ "epoch": 1.0793980280228335,
825
+ "grad_norm": 22.098742924019817,
826
+ "learning_rate": 2.5862404884109365e-07,
827
+ "logits/chosen": 1.4323368072509766,
828
+ "logits/rejected": 4.83809757232666,
829
+ "logps/chosen": -1047.9368896484375,
830
+ "logps/rejected": -2401.321533203125,
831
+ "loss": 0.0826,
832
+ "rewards/accuracies": 0.965624988079071,
833
+ "rewards/chosen": -6.900972843170166,
834
+ "rewards/margins": 13.123873710632324,
835
+ "rewards/rejected": -20.02484703063965,
836
+ "step": 520
837
+ },
838
+ {
839
+ "epoch": 1.100155682407888,
840
+ "grad_norm": 14.888695199480608,
841
+ "learning_rate": 2.495460128718305e-07,
842
+ "logits/chosen": 0.8169673681259155,
843
+ "logits/rejected": 4.492055416107178,
844
+ "logps/chosen": -1088.3392333984375,
845
+ "logps/rejected": -2334.029052734375,
846
+ "loss": 0.0775,
847
+ "rewards/accuracies": 0.956250011920929,
848
+ "rewards/chosen": -7.015921592712402,
849
+ "rewards/margins": 12.27374267578125,
850
+ "rewards/rejected": -19.289663314819336,
851
+ "step": 530
852
+ },
853
+ {
854
+ "epoch": 1.1209133367929425,
855
+ "grad_norm": 13.711286256989611,
856
+ "learning_rate": 2.404685756771143e-07,
857
+ "logits/chosen": 0.3249141275882721,
858
+ "logits/rejected": 4.28688383102417,
859
+ "logps/chosen": -1050.6568603515625,
860
+ "logps/rejected": -2440.43212890625,
861
+ "loss": 0.0831,
862
+ "rewards/accuracies": 0.956250011920929,
863
+ "rewards/chosen": -6.925228118896484,
864
+ "rewards/margins": 13.674964904785156,
865
+ "rewards/rejected": -20.60019302368164,
866
+ "step": 540
867
+ },
868
+ {
869
+ "epoch": 1.141670991177997,
870
+ "grad_norm": 12.310015587533016,
871
+ "learning_rate": 2.314037097072764e-07,
872
+ "logits/chosen": 0.325296014547348,
873
+ "logits/rejected": 4.037835121154785,
874
+ "logps/chosen": -1070.4940185546875,
875
+ "logps/rejected": -2429.653076171875,
876
+ "loss": 0.0774,
877
+ "rewards/accuracies": 0.9750000238418579,
878
+ "rewards/chosen": -7.072981357574463,
879
+ "rewards/margins": 13.413686752319336,
880
+ "rewards/rejected": -20.48666763305664,
881
+ "step": 550
882
+ },
883
+ {
884
+ "epoch": 1.1624286455630513,
885
+ "grad_norm": 18.024933264685323,
886
+ "learning_rate": 2.2236337083215723e-07,
887
+ "logits/chosen": 0.973385214805603,
888
+ "logits/rejected": 4.8588714599609375,
889
+ "logps/chosen": -1165.290283203125,
890
+ "logps/rejected": -2533.037841796875,
891
+ "loss": 0.0806,
892
+ "rewards/accuracies": 0.965624988079071,
893
+ "rewards/chosen": -7.950223445892334,
894
+ "rewards/margins": 13.607198715209961,
895
+ "rewards/rejected": -21.557422637939453,
896
+ "step": 560
897
+ },
898
+ {
899
+ "epoch": 1.183186299948106,
900
+ "grad_norm": 39.199036754410095,
901
+ "learning_rate": 2.13359482572222e-07,
902
+ "logits/chosen": 0.9793283343315125,
903
+ "logits/rejected": 4.486080169677734,
904
+ "logps/chosen": -1073.2510986328125,
905
+ "logps/rejected": -2294.072509765625,
906
+ "loss": 0.0855,
907
+ "rewards/accuracies": 0.9468749761581421,
908
+ "rewards/chosen": -7.108448028564453,
909
+ "rewards/margins": 11.980100631713867,
910
+ "rewards/rejected": -19.088550567626953,
911
+ "step": 570
912
+ },
913
+ {
914
+ "epoch": 1.2039439543331603,
915
+ "grad_norm": 13.491261159834455,
916
+ "learning_rate": 2.044039203723423e-07,
917
+ "logits/chosen": 0.23347489535808563,
918
+ "logits/rejected": 3.7263119220733643,
919
+ "logps/chosen": -997.2879028320312,
920
+ "logps/rejected": -2139.56298828125,
921
+ "loss": 0.0886,
922
+ "rewards/accuracies": 0.9437500238418579,
923
+ "rewards/chosen": -6.343294143676758,
924
+ "rewards/margins": 11.091756820678711,
925
+ "rewards/rejected": -17.435049057006836,
926
+ "step": 580
927
+ },
928
+ {
929
+ "epoch": 1.2247016087182148,
930
+ "grad_norm": 22.750928538479595,
931
+ "learning_rate": 1.955084959389864e-07,
932
+ "logits/chosen": 0.4886883795261383,
933
+ "logits/rejected": 4.1630377769470215,
934
+ "logps/chosen": -1106.54052734375,
935
+ "logps/rejected": -2491.51416015625,
936
+ "loss": 0.0749,
937
+ "rewards/accuracies": 0.9781249761581421,
938
+ "rewards/chosen": -7.559281826019287,
939
+ "rewards/margins": 13.47680377960205,
940
+ "rewards/rejected": -21.036083221435547,
941
+ "step": 590
942
+ },
943
+ {
944
+ "epoch": 1.2454592631032693,
945
+ "grad_norm": 20.593372405978258,
946
+ "learning_rate": 1.866849416614753e-07,
947
+ "logits/chosen": 0.24794098734855652,
948
+ "logits/rejected": 4.261725425720215,
949
+ "logps/chosen": -1190.895751953125,
950
+ "logps/rejected": -2684.16943359375,
951
+ "loss": 0.0762,
952
+ "rewards/accuracies": 0.9781249761581421,
953
+ "rewards/chosen": -8.191116333007812,
954
+ "rewards/margins": 14.663787841796875,
955
+ "rewards/rejected": -22.854902267456055,
956
+ "step": 600
957
+ },
958
+ {
959
+ "epoch": 1.2662169174883238,
960
+ "grad_norm": 23.08708475257412,
961
+ "learning_rate": 1.7794489513785227e-07,
962
+ "logits/chosen": -0.35908642411231995,
963
+ "logits/rejected": 3.719221830368042,
964
+ "logps/chosen": -1067.587158203125,
965
+ "logps/rejected": -2360.556396484375,
966
+ "loss": 0.0824,
967
+ "rewards/accuracies": 0.96875,
968
+ "rewards/chosen": -6.811041355133057,
969
+ "rewards/margins": 12.751296997070312,
970
+ "rewards/rejected": -19.562339782714844,
971
+ "step": 610
972
+ },
973
+ {
974
+ "epoch": 1.2869745718733783,
975
+ "grad_norm": 11.158342147450194,
976
+ "learning_rate": 1.692998838257744e-07,
977
+ "logits/chosen": -0.18085989356040955,
978
+ "logits/rejected": 3.5346386432647705,
979
+ "logps/chosen": -1038.219970703125,
980
+ "logps/rejected": -2255.41064453125,
981
+ "loss": 0.0769,
982
+ "rewards/accuracies": 0.981249988079071,
983
+ "rewards/chosen": -6.670246124267578,
984
+ "rewards/margins": 11.845807075500488,
985
+ "rewards/rejected": -18.51605224609375,
986
+ "step": 620
987
+ },
988
+ {
989
+ "epoch": 1.3077322262584328,
990
+ "grad_norm": 14.33248304938314,
991
+ "learning_rate": 1.6076130983867191e-07,
992
+ "logits/chosen": -0.1000831350684166,
993
+ "logits/rejected": 4.0322370529174805,
994
+ "logps/chosen": -1037.5277099609375,
995
+ "logps/rejected": -2512.465087890625,
996
+ "loss": 0.0774,
997
+ "rewards/accuracies": 0.9750000238418579,
998
+ "rewards/chosen": -6.664424896240234,
999
+ "rewards/margins": 14.430699348449707,
1000
+ "rewards/rejected": -21.095125198364258,
1001
+ "step": 630
1002
+ },
1003
+ {
1004
+ "epoch": 1.3284898806434873,
1005
+ "grad_norm": 18.5366504084505,
1006
+ "learning_rate": 1.5234043490722587e-07,
1007
+ "logits/chosen": -0.20596277713775635,
1008
+ "logits/rejected": 3.613173246383667,
1009
+ "logps/chosen": -1079.144287109375,
1010
+ "logps/rejected": -2312.60791015625,
1011
+ "loss": 0.0821,
1012
+ "rewards/accuracies": 0.965624988079071,
1013
+ "rewards/chosen": -7.167913913726807,
1014
+ "rewards/margins": 12.191060066223145,
1015
+ "rewards/rejected": -19.35897445678711,
1016
+ "step": 640
1017
+ },
1018
+ {
1019
+ "epoch": 1.3492475350285418,
1020
+ "grad_norm": 18.228054814234156,
1021
+ "learning_rate": 1.44048365526001e-07,
1022
+ "logits/chosen": 0.0638003945350647,
1023
+ "logits/rejected": 3.9387619495391846,
1024
+ "logps/chosen": -1077.5164794921875,
1025
+ "logps/rejected": -2451.49462890625,
1026
+ "loss": 0.0867,
1027
+ "rewards/accuracies": 0.96875,
1028
+ "rewards/chosen": -7.139754295349121,
1029
+ "rewards/margins": 13.3494873046875,
1030
+ "rewards/rejected": -20.489240646362305,
1031
+ "step": 650
1032
+ },
1033
+ {
1034
+ "epoch": 1.3700051894135963,
1035
+ "grad_norm": 16.31608491567421,
1036
+ "learning_rate": 1.3589603830482243e-07,
1037
+ "logits/chosen": -0.08248420059680939,
1038
+ "logits/rejected": 4.198277950286865,
1039
+ "logps/chosen": -1142.9498291015625,
1040
+ "logps/rejected": -2611.378662109375,
1041
+ "loss": 0.0703,
1042
+ "rewards/accuracies": 0.96875,
1043
+ "rewards/chosen": -7.643826484680176,
1044
+ "rewards/margins": 14.516764640808105,
1045
+ "rewards/rejected": -22.160587310791016,
1046
+ "step": 660
1047
+ },
1048
+ {
1049
+ "epoch": 1.3907628437986508,
1050
+ "grad_norm": 21.850286306048936,
1051
+ "learning_rate": 1.2789420554421821e-07,
1052
+ "logits/chosen": -0.33774855732917786,
1053
+ "logits/rejected": 3.5582516193389893,
1054
+ "logps/chosen": -1178.3541259765625,
1055
+ "logps/rejected": -2523.93701171875,
1056
+ "loss": 0.0845,
1057
+ "rewards/accuracies": 0.956250011920929,
1058
+ "rewards/chosen": -8.179405212402344,
1059
+ "rewards/margins": 13.182377815246582,
1060
+ "rewards/rejected": -21.36178207397461,
1061
+ "step": 670
1062
+ },
1063
+ {
1064
+ "epoch": 1.4115204981837053,
1065
+ "grad_norm": 15.996775283562163,
1066
+ "learning_rate": 1.200534210539509e-07,
1067
+ "logits/chosen": -0.7213211059570312,
1068
+ "logits/rejected": 3.0649337768554688,
1069
+ "logps/chosen": -1151.722412109375,
1070
+ "logps/rejected": -2552.525634765625,
1071
+ "loss": 0.0892,
1072
+ "rewards/accuracies": 0.9750000238418579,
1073
+ "rewards/chosen": -7.825603485107422,
1074
+ "rewards/margins": 13.686927795410156,
1075
+ "rewards/rejected": -21.512531280517578,
1076
+ "step": 680
1077
+ },
1078
+ {
1079
+ "epoch": 1.4322781525687598,
1080
+ "grad_norm": 16.75949934654318,
1081
+ "learning_rate": 1.1238402623334492e-07,
1082
+ "logits/chosen": -0.9597622752189636,
1083
+ "logits/rejected": 2.867584228515625,
1084
+ "logps/chosen": -1066.507568359375,
1085
+ "logps/rejected": -2423.34423828125,
1086
+ "loss": 0.0898,
1087
+ "rewards/accuracies": 0.953125,
1088
+ "rewards/chosen": -6.92177677154541,
1089
+ "rewards/margins": 13.205133438110352,
1090
+ "rewards/rejected": -20.126911163330078,
1091
+ "step": 690
1092
+ },
1093
+ {
1094
+ "epoch": 1.4530358069538143,
1095
+ "grad_norm": 11.338012307523448,
1096
+ "learning_rate": 1.0489613643176479e-07,
1097
+ "logits/chosen": -0.6678389310836792,
1098
+ "logits/rejected": 3.2197043895721436,
1099
+ "logps/chosen": -1113.594482421875,
1100
+ "logps/rejected": -2410.258544921875,
1101
+ "loss": 0.0722,
1102
+ "rewards/accuracies": 0.956250011920929,
1103
+ "rewards/chosen": -7.322343349456787,
1104
+ "rewards/margins": 12.825822830200195,
1105
+ "rewards/rejected": -20.14816665649414,
1106
+ "step": 700
1107
+ },
1108
+ {
1109
+ "epoch": 1.4737934613388686,
1110
+ "grad_norm": 15.856495888079317,
1111
+ "learning_rate": 9.759962760723855e-08,
1112
+ "logits/chosen": -0.16049222648143768,
1113
+ "logits/rejected": 4.048083305358887,
1114
+ "logps/chosen": -1173.0849609375,
1115
+ "logps/rejected": -2468.91845703125,
1116
+ "loss": 0.0662,
1117
+ "rewards/accuracies": 0.96875,
1118
+ "rewards/chosen": -8.210199356079102,
1119
+ "rewards/margins": 12.80036735534668,
1120
+ "rewards/rejected": -21.01056671142578,
1121
+ "step": 710
1122
+ },
1123
+ {
1124
+ "epoch": 1.4945511157239233,
1125
+ "grad_norm": 31.07426456754591,
1126
+ "learning_rate": 9.050412330081883e-08,
1127
+ "logits/chosen": -0.20700784027576447,
1128
+ "logits/rejected": 3.9535317420959473,
1129
+ "logps/chosen": -1277.49755859375,
1130
+ "logps/rejected": -2737.533447265625,
1131
+ "loss": 0.0852,
1132
+ "rewards/accuracies": 0.9750000238418579,
1133
+ "rewards/chosen": -9.044589042663574,
1134
+ "rewards/margins": 14.331390380859375,
1135
+ "rewards/rejected": -23.375978469848633,
1136
+ "step": 720
1137
+ },
1138
+ {
1139
+ "epoch": 1.4945511157239233,
1140
+ "eval_logits/chosen": 0.06359091401100159,
1141
+ "eval_logits/rejected": 3.48725962638855,
1142
+ "eval_logps/chosen": -1424.8193359375,
1143
+ "eval_logps/rejected": -2816.588623046875,
1144
+ "eval_loss": 0.16444814205169678,
1145
+ "eval_rewards/accuracies": 0.9227941036224365,
1146
+ "eval_rewards/chosen": -10.01410961151123,
1147
+ "eval_rewards/margins": 13.69237232208252,
1148
+ "eval_rewards/rejected": -23.706483840942383,
1149
+ "eval_runtime": 297.3735,
1150
+ "eval_samples_per_second": 21.828,
1151
+ "eval_steps_per_second": 0.343,
1152
+ "step": 720
1153
+ },
1154
+ {
1155
+ "epoch": 1.5153087701089776,
1156
+ "grad_norm": 21.935108549149014,
1157
+ "learning_rate": 8.36189819438625e-08,
1158
+ "logits/chosen": -0.2683785557746887,
1159
+ "logits/rejected": 3.702204465866089,
1160
+ "logps/chosen": -1242.103759765625,
1161
+ "logps/rejected": -2603.80615234375,
1162
+ "loss": 0.0822,
1163
+ "rewards/accuracies": 0.965624988079071,
1164
+ "rewards/chosen": -8.634117126464844,
1165
+ "rewards/margins": 13.532336235046387,
1166
+ "rewards/rejected": -22.166454315185547,
1167
+ "step": 730
1168
+ },
1169
+ {
1170
+ "epoch": 1.5360664244940323,
1171
+ "grad_norm": 23.902586381058878,
1172
+ "learning_rate": 7.69532845149711e-08,
1173
+ "logits/chosen": -0.5570476055145264,
1174
+ "logits/rejected": 3.649244785308838,
1175
+ "logps/chosen": -1223.75439453125,
1176
+ "logps/rejected": -2637.007080078125,
1177
+ "loss": 0.0769,
1178
+ "rewards/accuracies": 0.9624999761581421,
1179
+ "rewards/chosen": -8.374210357666016,
1180
+ "rewards/margins": 13.947656631469727,
1181
+ "rewards/rejected": -22.321866989135742,
1182
+ "step": 740
1183
+ },
1184
+ {
1185
+ "epoch": 1.5568240788790866,
1186
+ "grad_norm": 20.338315184147856,
1187
+ "learning_rate": 7.051582256286929e-08,
1188
+ "logits/chosen": -0.5207056999206543,
1189
+ "logits/rejected": 3.2942442893981934,
1190
+ "logps/chosen": -1216.895263671875,
1191
+ "logps/rejected": -2651.10107421875,
1192
+ "loss": 0.0706,
1193
+ "rewards/accuracies": 0.9624999761581421,
1194
+ "rewards/chosen": -8.260024070739746,
1195
+ "rewards/margins": 14.131828308105469,
1196
+ "rewards/rejected": -22.39185333251953,
1197
+ "step": 750
1198
+ },
1199
+ {
1200
+ "epoch": 1.5775817332641413,
1201
+ "grad_norm": 19.826108952738736,
1202
+ "learning_rate": 6.431508661101954e-08,
1203
+ "logits/chosen": -0.35470911860466003,
1204
+ "logits/rejected": 3.6188712120056152,
1205
+ "logps/chosen": -1201.515625,
1206
+ "logps/rejected": -2452.108642578125,
1207
+ "loss": 0.0906,
1208
+ "rewards/accuracies": 0.9437500238418579,
1209
+ "rewards/chosen": -8.28996467590332,
1210
+ "rewards/margins": 12.43807315826416,
1211
+ "rewards/rejected": -20.728038787841797,
1212
+ "step": 760
1213
+ },
1214
+ {
1215
+ "epoch": 1.5983393876491956,
1216
+ "grad_norm": 18.731154290431807,
1217
+ "learning_rate": 5.8359254959266826e-08,
1218
+ "logits/chosen": -0.3598397374153137,
1219
+ "logits/rejected": 3.461977481842041,
1220
+ "logps/chosen": -1123.501220703125,
1221
+ "logps/rejected": -2401.59423828125,
1222
+ "loss": 0.0757,
1223
+ "rewards/accuracies": 0.956250011920929,
1224
+ "rewards/chosen": -7.732508659362793,
1225
+ "rewards/margins": 12.514566421508789,
1226
+ "rewards/rejected": -20.247074127197266,
1227
+ "step": 770
1228
+ },
1229
+ {
1230
+ "epoch": 1.61909704203425,
1231
+ "grad_norm": 14.021094730035166,
1232
+ "learning_rate": 5.265618289728199e-08,
1233
+ "logits/chosen": -0.2899594306945801,
1234
+ "logits/rejected": 3.6835436820983887,
1235
+ "logps/chosen": -1112.9346923828125,
1236
+ "logps/rejected": -2395.99560546875,
1237
+ "loss": 0.0748,
1238
+ "rewards/accuracies": 0.965624988079071,
1239
+ "rewards/chosen": -7.564324378967285,
1240
+ "rewards/margins": 12.517146110534668,
1241
+ "rewards/rejected": -20.081470489501953,
1242
+ "step": 780
1243
+ },
1244
+ {
1245
+ "epoch": 1.6398546964193046,
1246
+ "grad_norm": 15.755848122162748,
1247
+ "learning_rate": 4.721339234403121e-08,
1248
+ "logits/chosen": -0.4539187550544739,
1249
+ "logits/rejected": 3.4312031269073486,
1250
+ "logps/chosen": -1102.984375,
1251
+ "logps/rejected": -2424.76904296875,
1252
+ "loss": 0.0889,
1253
+ "rewards/accuracies": 0.9750000238418579,
1254
+ "rewards/chosen": -7.416151523590088,
1255
+ "rewards/margins": 12.970865249633789,
1256
+ "rewards/rejected": -20.387014389038086,
1257
+ "step": 790
1258
+ },
1259
+ {
1260
+ "epoch": 1.660612350804359,
1261
+ "grad_norm": 10.116183369840204,
1262
+ "learning_rate": 4.203806192693587e-08,
1263
+ "logits/chosen": -0.16934530436992645,
1264
+ "logits/rejected": 3.3178775310516357,
1265
+ "logps/chosen": -1117.904541015625,
1266
+ "logps/rejected": -2234.350341796875,
1267
+ "loss": 0.0806,
1268
+ "rewards/accuracies": 0.971875011920929,
1269
+ "rewards/chosen": -7.489673614501953,
1270
+ "rewards/margins": 10.942729949951172,
1271
+ "rewards/rejected": -18.432403564453125,
1272
+ "step": 800
1273
+ },
1274
+ {
1275
+ "epoch": 1.6813700051894136,
1276
+ "grad_norm": 12.314909116730863,
1277
+ "learning_rate": 3.7137017513808544e-08,
1278
+ "logits/chosen": -0.2919425368309021,
1279
+ "logits/rejected": 3.503552198410034,
1280
+ "logps/chosen": -1133.6949462890625,
1281
+ "logps/rejected": -2370.295166015625,
1282
+ "loss": 0.0813,
1283
+ "rewards/accuracies": 0.9593750238418579,
1284
+ "rewards/chosen": -7.621180057525635,
1285
+ "rewards/margins": 12.205830574035645,
1286
+ "rewards/rejected": -19.827011108398438,
1287
+ "step": 810
1288
+ },
1289
+ {
1290
+ "epoch": 1.702127659574468,
1291
+ "grad_norm": 13.100920531092996,
1292
+ "learning_rate": 3.251672321005147e-08,
1293
+ "logits/chosen": -0.3816925585269928,
1294
+ "logits/rejected": 3.223738193511963,
1295
+ "logps/chosen": -1128.720458984375,
1296
+ "logps/rejected": -2432.7451171875,
1297
+ "loss": 0.0799,
1298
+ "rewards/accuracies": 0.96875,
1299
+ "rewards/chosen": -7.558645725250244,
1300
+ "rewards/margins": 12.71537971496582,
1301
+ "rewards/rejected": -20.274024963378906,
1302
+ "step": 820
1303
+ },
1304
+ {
1305
+ "epoch": 1.7228853139595226,
1306
+ "grad_norm": 13.542985324247502,
1307
+ "learning_rate": 2.8183272832992267e-08,
1308
+ "logits/chosen": -0.4070394039154053,
1309
+ "logits/rejected": 3.100729465484619,
1310
+ "logps/chosen": -1098.7197265625,
1311
+ "logps/rejected": -2404.990234375,
1312
+ "loss": 0.0769,
1313
+ "rewards/accuracies": 0.9624999761581421,
1314
+ "rewards/chosen": -7.4235520362854,
1315
+ "rewards/margins": 12.707655906677246,
1316
+ "rewards/rejected": -20.131206512451172,
1317
+ "step": 830
1318
+ },
1319
+ {
1320
+ "epoch": 1.743642968344577,
1321
+ "grad_norm": 19.414761044152183,
1322
+ "learning_rate": 2.414238187460191e-08,
1323
+ "logits/chosen": -0.3680698275566101,
1324
+ "logits/rejected": 3.6575589179992676,
1325
+ "logps/chosen": -1149.003662109375,
1326
+ "logps/rejected": -2435.11474609375,
1327
+ "loss": 0.0801,
1328
+ "rewards/accuracies": 0.971875011920929,
1329
+ "rewards/chosen": -7.7628068923950195,
1330
+ "rewards/margins": 12.7561674118042,
1331
+ "rewards/rejected": -20.51897621154785,
1332
+ "step": 840
1333
+ },
1334
+ {
1335
+ "epoch": 1.7644006227296316,
1336
+ "grad_norm": 18.249543754589453,
1337
+ "learning_rate": 2.0399379963194713e-08,
1338
+ "logits/chosen": -0.15194618701934814,
1339
+ "logits/rejected": 3.823453903198242,
1340
+ "logps/chosen": -1178.7374267578125,
1341
+ "logps/rejected": -2647.753173828125,
1342
+ "loss": 0.0742,
1343
+ "rewards/accuracies": 0.9750000238418579,
1344
+ "rewards/chosen": -8.07176685333252,
1345
+ "rewards/margins": 14.458259582519531,
1346
+ "rewards/rejected": -22.530025482177734,
1347
+ "step": 850
1348
+ },
1349
+ {
1350
+ "epoch": 1.7851582771146859,
1351
+ "grad_norm": 13.687621613308757,
1352
+ "learning_rate": 1.695920383405322e-08,
1353
+ "logits/chosen": -0.1904297024011612,
1354
+ "logits/rejected": 3.487968921661377,
1355
+ "logps/chosen": -1222.4801025390625,
1356
+ "logps/rejected": -2662.45654296875,
1357
+ "loss": 0.0862,
1358
+ "rewards/accuracies": 0.981249988079071,
1359
+ "rewards/chosen": -8.289772033691406,
1360
+ "rewards/margins": 14.194554328918457,
1361
+ "rewards/rejected": -22.484325408935547,
1362
+ "step": 860
1363
+ },
1364
+ {
1365
+ "epoch": 1.8059159314997406,
1366
+ "grad_norm": 20.833924751057065,
1367
+ "learning_rate": 1.3826390818249434e-08,
1368
+ "logits/chosen": -0.10482398420572281,
1369
+ "logits/rejected": 3.5555343627929688,
1370
+ "logps/chosen": -1184.642822265625,
1371
+ "logps/rejected": -2578.11376953125,
1372
+ "loss": 0.0847,
1373
+ "rewards/accuracies": 0.965624988079071,
1374
+ "rewards/chosen": -8.188472747802734,
1375
+ "rewards/margins": 13.555729866027832,
1376
+ "rewards/rejected": -21.74420166015625,
1377
+ "step": 870
1378
+ },
1379
+ {
1380
+ "epoch": 1.826673585884795,
1381
+ "grad_norm": 12.098887002832749,
1382
+ "learning_rate": 1.1005072858249614e-08,
1383
+ "logits/chosen": -0.21788470447063446,
1384
+ "logits/rejected": 3.596818447113037,
1385
+ "logps/chosen": -1198.203125,
1386
+ "logps/rejected": -2579.16455078125,
1387
+ "loss": 0.0742,
1388
+ "rewards/accuracies": 0.9781249761581421,
1389
+ "rewards/chosen": -8.193432807922363,
1390
+ "rewards/margins": 13.623662948608398,
1391
+ "rewards/rejected": -21.817094802856445,
1392
+ "step": 880
1393
+ },
1394
+ {
1395
+ "epoch": 1.8474312402698496,
1396
+ "grad_norm": 22.649648963902354,
1397
+ "learning_rate": 8.498971058195886e-09,
1398
+ "logits/chosen": -0.2010045051574707,
1399
+ "logits/rejected": 3.6617989540100098,
1400
+ "logps/chosen": -1205.736328125,
1401
+ "logps/rejected": -2611.108642578125,
1402
+ "loss": 0.0685,
1403
+ "rewards/accuracies": 0.9781249761581421,
1404
+ "rewards/chosen": -8.295499801635742,
1405
+ "rewards/margins": 13.880526542663574,
1406
+ "rewards/rejected": -22.176025390625,
1407
+ "step": 890
1408
+ },
1409
+ {
1410
+ "epoch": 1.868188894654904,
1411
+ "grad_norm": 14.001123095428785,
1412
+ "learning_rate": 6.311390776052527e-09,
1413
+ "logits/chosen": -0.26746782660484314,
1414
+ "logits/rejected": 3.480095386505127,
1415
+ "logps/chosen": -1162.368408203125,
1416
+ "logps/rejected": -2528.66650390625,
1417
+ "loss": 0.0735,
1418
+ "rewards/accuracies": 0.949999988079071,
1419
+ "rewards/chosen": -7.984139442443848,
1420
+ "rewards/margins": 13.350987434387207,
1421
+ "rewards/rejected": -21.335126876831055,
1422
+ "step": 900
1423
+ },
1424
+ {
1425
+ "epoch": 1.8889465490399586,
1426
+ "grad_norm": 17.75181453766059,
1427
+ "learning_rate": 4.445217264089751e-09,
1428
+ "logits/chosen": -0.3570159077644348,
1429
+ "logits/rejected": 3.7746682167053223,
1430
+ "logps/chosen": -1153.1602783203125,
1431
+ "logps/rejected": -2572.643798828125,
1432
+ "loss": 0.0662,
1433
+ "rewards/accuracies": 0.96875,
1434
+ "rewards/chosen": -7.910887718200684,
1435
+ "rewards/margins": 13.993339538574219,
1436
+ "rewards/rejected": -21.904226303100586,
1437
+ "step": 910
1438
+ },
1439
+ {
1440
+ "epoch": 1.909704203425013,
1441
+ "grad_norm": 19.779224657698173,
1442
+ "learning_rate": 2.902911863455121e-09,
1443
+ "logits/chosen": -0.23463687300682068,
1444
+ "logits/rejected": 3.8247523307800293,
1445
+ "logps/chosen": -1181.158447265625,
1446
+ "logps/rejected": -2677.92041015625,
1447
+ "loss": 0.0761,
1448
+ "rewards/accuracies": 0.9781249761581421,
1449
+ "rewards/chosen": -8.19698429107666,
1450
+ "rewards/margins": 14.724853515625,
1451
+ "rewards/rejected": -22.92184066772461,
1452
+ "step": 920
1453
+ },
1454
+ {
1455
+ "epoch": 1.9304618578100676,
1456
+ "grad_norm": 22.12956855667942,
1457
+ "learning_rate": 1.686508757851507e-09,
1458
+ "logits/chosen": -0.2650212049484253,
1459
+ "logits/rejected": 3.5063083171844482,
1460
+ "logps/chosen": -1194.435791015625,
1461
+ "logps/rejected": -2615.01708984375,
1462
+ "loss": 0.0854,
1463
+ "rewards/accuracies": 0.956250011920929,
1464
+ "rewards/chosen": -8.247761726379395,
1465
+ "rewards/margins": 13.95422077178955,
1466
+ "rewards/rejected": -22.201980590820312,
1467
+ "step": 930
1468
+ },
1469
+ {
1470
+ "epoch": 1.951219512195122,
1471
+ "grad_norm": 20.722879655487088,
1472
+ "learning_rate": 7.976122906031557e-10,
1473
+ "logits/chosen": -0.23710790276527405,
1474
+ "logits/rejected": 3.5924439430236816,
1475
+ "logps/chosen": -1181.043212890625,
1476
+ "logps/rejected": -2523.64794921875,
1477
+ "loss": 0.088,
1478
+ "rewards/accuracies": 0.9437500238418579,
1479
+ "rewards/chosen": -8.123555183410645,
1480
+ "rewards/margins": 13.19268798828125,
1481
+ "rewards/rejected": -21.31624412536621,
1482
+ "step": 940
1483
+ },
1484
+ {
1485
+ "epoch": 1.9719771665801764,
1486
+ "grad_norm": 12.897702624327897,
1487
+ "learning_rate": 2.37394848648792e-10,
1488
+ "logits/chosen": -0.3340440094470978,
1489
+ "logits/rejected": 3.253843307495117,
1490
+ "logps/chosen": -1149.8099365234375,
1491
+ "logps/rejected": -2490.43994140625,
1492
+ "loss": 0.0801,
1493
+ "rewards/accuracies": 0.949999988079071,
1494
+ "rewards/chosen": -7.880959510803223,
1495
+ "rewards/margins": 13.208969116210938,
1496
+ "rewards/rejected": -21.089927673339844,
1497
+ "step": 950
1498
+ },
1499
+ {
1500
+ "epoch": 1.992734820965231,
1501
+ "grad_norm": 17.31058461781814,
1502
+ "learning_rate": 6.5953162521614755e-12,
1503
+ "logits/chosen": -0.3153567910194397,
1504
+ "logits/rejected": 3.3712565898895264,
1505
+ "logps/chosen": -1199.875244140625,
1506
+ "logps/rejected": -2640.38525390625,
1507
+ "loss": 0.0743,
1508
+ "rewards/accuracies": 0.981249988079071,
1509
+ "rewards/chosen": -8.392365455627441,
1510
+ "rewards/margins": 14.206585884094238,
1511
+ "rewards/rejected": -22.598949432373047,
1512
+ "step": 960
1513
+ },
1514
+ {
1515
+ "epoch": 1.992734820965231,
1516
+ "eval_logits/chosen": -0.1724339723587036,
1517
+ "eval_logits/rejected": 3.016493320465088,
1518
+ "eval_logps/chosen": -1310.28955078125,
1519
+ "eval_logps/rejected": -2616.585205078125,
1520
+ "eval_loss": 0.15784408152103424,
1521
+ "eval_rewards/accuracies": 0.9203431606292725,
1522
+ "eval_rewards/chosen": -8.868812561035156,
1523
+ "eval_rewards/margins": 12.837637901306152,
1524
+ "eval_rewards/rejected": -21.706451416015625,
1525
+ "eval_runtime": 296.2637,
1526
+ "eval_samples_per_second": 21.91,
1527
+ "eval_steps_per_second": 0.344,
1528
+ "step": 960
1529
+ },
1530
+ {
1531
+ "epoch": 1.996886351842242,
1532
+ "step": 962,
1533
  "total_flos": 0.0,
1534
+ "train_loss": 0.1585804910288655,
1535
+ "train_runtime": 28225.2793,
1536
+ "train_samples_per_second": 8.737,
1537
+ "train_steps_per_second": 0.034
1538
  }
1539
  ],
1540
  "logging_steps": 10,
1541
+ "max_steps": 962,
1542
  "num_input_tokens_seen": 0,
1543
  "num_train_epochs": 2,
1544
  "save_steps": 240,