just1nseo commited on
Commit
efba803
·
verified ·
1 Parent(s): f564b1d

Model save

Browse files
README.md ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: apache-2.0
3
+ library_name: peft
4
+ tags:
5
+ - trl
6
+ - dpo
7
+ - generated_from_trainer
8
+ base_model: openbmb/Eurus-7b-sft
9
+ model-index:
10
+ - name: eurus-dpo-qlora-uf-ours-5e-6
11
+ results: []
12
+ ---
13
+
14
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
15
+ should probably proofread and complete it, then remove this comment. -->
16
+
17
+ # eurus-dpo-qlora-uf-ours-5e-6
18
+
19
+ This model is a fine-tuned version of [openbmb/Eurus-7b-sft](https://huggingface.co/openbmb/Eurus-7b-sft) on a dataset that was not recorded by the Trainer.
20
+ It achieves the following results on the evaluation set (final evaluation, at step 1000 / epoch 2.82):
21
+ - Loss: 6.0584
22
+ - Rewards/chosen: -23.3438
23
+ - Rewards/rejected: -32.4235
24
+ - Rewards/accuracies: 0.6280
25
+ - Rewards/margins: 9.0798
26
+ - Rewards/margins Max: 58.3224
27
+ - Rewards/margins Min: -32.8664
28
+ - Rewards/margins Std: 29.5381
29
+ - Logps/rejected: -3499.8743
30
+ - Logps/chosen: -2609.2573
31
+ - Logits/rejected: -0.9160
32
+ - Logits/chosen: -1.0810
33
+
34
+ ## Model description
35
+
36
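+ More information needed. According to this card's metadata, the repository holds a PEFT adapter for [openbmb/Eurus-7b-sft](https://huggingface.co/openbmb/Eurus-7b-sft) trained with DPO using TRL.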
37
+
38
+ ## Intended uses & limitations
39
+
40
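+ More information needed. Note that in the training results below, validation loss rises steadily (0.8163 at step 100 to 6.0584 at step 1000) while training loss falls and evaluation reward accuracy stays near 0.62, so the final checkpoint may be overfit to the training preferences.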
41
+
42
+ ## Training and evaluation data
43
+
44
+ More information needed
45
+
46
+ ## Training procedure
47
+
48
+ ### Training hyperparameters
49
+
50
+ The following hyperparameters were used during training:
51
+ - learning_rate: 5e-06
52
+ - train_batch_size: 4
53
+ - eval_batch_size: 8
54
+ - seed: 42
55
+ - distributed_type: multi-GPU
56
+ - num_devices: 2
57
+ - gradient_accumulation_steps: 2
58
+ - total_train_batch_size: 16
59
+ - total_eval_batch_size: 16
60
+ - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
61
+ - lr_scheduler_type: cosine
62
+ - lr_scheduler_warmup_ratio: 0.1
63
+ - num_epochs: 3
64
+
65
+ ### Training results
66
+
67
+ | Training Loss | Epoch | Step | Validation Loss | Rewards/chosen | Rewards/rejected | Rewards/accuracies | Rewards/margins | Rewards/margins Max | Rewards/margins Min | Rewards/margins Std | Logps/rejected | Logps/chosen | Logits/rejected | Logits/chosen |
68
+ |:-------------:|:-----:|:----:|:---------------:|:--------------:|:----------------:|:------------------:|:---------------:|:-------------------:|:-------------------:|:-------------------:|:--------------:|:------------:|:---------------:|:-------------:|
69
+ | 0.4256 | 0.28 | 100 | 0.8163 | -1.8022 | -1.9583 | 0.5610 | 0.1561 | 2.2049 | -1.8191 | 1.3259 | -453.3455 | -455.0959 | -1.9771 | -2.0751 |
70
+ | 0.1591 | 0.56 | 200 | 1.2122 | -5.0976 | -6.6216 | 0.6050 | 1.5239 | 9.9971 | -4.8753 | 4.8268 | -919.6762 | -784.6454 | -1.3460 | -1.4469 |
71
+ | 0.1126 | 0.85 | 300 | 1.7230 | -6.1628 | -8.5878 | 0.6090 | 2.4250 | 18.9102 | -8.2202 | 8.7236 | -1116.3019 | -891.1599 | -1.2133 | -1.3142 |
72
+ | 0.074 | 1.13 | 400 | 2.0005 | -8.7127 | -11.9396 | 0.6220 | 3.2269 | 20.1537 | -9.9867 | 9.6878 | -1451.4778 | -1146.1495 | -1.3244 | -1.4370 |
73
+ | 0.0551 | 1.41 | 500 | 2.6568 | -10.4325 | -15.1571 | 0.6260 | 4.7246 | 28.6045 | -13.6975 | 13.8040 | -1773.2283 | -1318.1323 | -1.2958 | -1.4257 |
74
+ | 0.169 | 1.69 | 600 | 3.7089 | -14.9797 | -20.5965 | 0.6160 | 5.6168 | 36.0405 | -19.8931 | 18.0728 | -2317.1677 | -1772.8466 | -1.0370 | -1.1529 |
75
+ | 0.0661 | 1.97 | 700 | 4.1957 | -15.9319 | -22.6457 | 0.6220 | 6.7138 | 41.9072 | -22.6906 | 20.9609 | -2522.0879 | -1868.0721 | -1.1163 | -1.2633 |
76
+ | 0.0044 | 2.25 | 800 | 5.9108 | -22.7617 | -31.4584 | 0.6230 | 8.6967 | 56.6380 | -31.9336 | 28.6036 | -3403.3569 | -2551.0461 | -0.9371 | -1.0936 |
77
+ | 0.011 | 2.54 | 900 | 5.9213 | -23.0839 | -32.0567 | 0.6230 | 8.9728 | 56.9548 | -32.0980 | 28.8598 | -3463.1873 | -2583.2671 | -0.9208 | -1.0846 |
78
+ | 0.0138 | 2.82 | 1000 | 6.0584 | -23.3438 | -32.4235 | 0.6280 | 9.0798 | 58.3224 | -32.8664 | 29.5381 | -3499.8743 | -2609.2573 | -0.9160 | -1.0810 |
79
+
80
+
81
+ ### Framework versions
82
+
83
+ - PEFT 0.7.1
84
+ - Transformers 4.39.0.dev0
85
+ - Pytorch 2.1.2+cu121
86
+ - Datasets 2.14.6
87
+ - Tokenizers 0.15.2
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5bcdb8e126239991dacc1aa8932afeccae2ec7e6eb5d7cfae5b1c4b2d67c7f83
3
  size 671150064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:13bc4c2b1223adb76c88da4dcb6efecd358fb7f4fa4f0432fdc9f710f5eccce8
3
  size 671150064
all_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 3.0,
3
+ "train_loss": 0.1413031851636692,
4
+ "train_runtime": 20921.2576,
5
+ "train_samples": 5678,
6
+ "train_samples_per_second": 0.814,
7
+ "train_steps_per_second": 0.051
8
+ }
runs/Jul16_14-58-15_node25/events.out.tfevents.1721109533.node25.1416524.0 CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:cf2b29f68c959241281244038c47b343ab769d1c2a303eccc3b32e4f5fc62fd3
3
- size 103256
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d99ebb9122cec5932be8a593510cda0a6b2e0143e2d56b031f7b939f76503053
3
+ size 108890
train_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 3.0,
3
+ "train_loss": 0.1413031851636692,
4
+ "train_runtime": 20921.2576,
5
+ "train_samples": 5678,
6
+ "train_samples_per_second": 0.814,
7
+ "train_steps_per_second": 0.051
8
+ }
trainer_state.json ADDED
@@ -0,0 +1,2146 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 3.0,
5
+ "eval_steps": 100,
6
+ "global_step": 1065,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.0,
13
+ "grad_norm": 2.3852074765983984,
14
+ "learning_rate": 4.672897196261682e-08,
15
+ "logits/chosen": -2.4213736057281494,
16
+ "logits/rejected": -2.1724228858947754,
17
+ "logps/chosen": -311.7572021484375,
18
+ "logps/rejected": -242.86618041992188,
19
+ "loss": 0.6931,
20
+ "rewards/accuracies": 0.0,
21
+ "rewards/chosen": 0.0,
22
+ "rewards/margins": 0.0,
23
+ "rewards/margins_max": 0.0,
24
+ "rewards/margins_min": 0.0,
25
+ "rewards/margins_std": 0.0,
26
+ "rewards/rejected": 0.0,
27
+ "step": 1
28
+ },
29
+ {
30
+ "epoch": 0.03,
31
+ "grad_norm": 2.3445286792409266,
32
+ "learning_rate": 4.6728971962616824e-07,
33
+ "logits/chosen": -2.4368515014648438,
34
+ "logits/rejected": -2.2642922401428223,
35
+ "logps/chosen": -307.2416076660156,
36
+ "logps/rejected": -312.71978759765625,
37
+ "loss": 0.6929,
38
+ "rewards/accuracies": 0.5555555820465088,
39
+ "rewards/chosen": 0.00042119898716919124,
40
+ "rewards/margins": 0.0008334853337146342,
41
+ "rewards/margins_max": 0.00408088369295001,
42
+ "rewards/margins_min": -0.001809651032090187,
43
+ "rewards/margins_std": 0.0026596880052238703,
44
+ "rewards/rejected": -0.0004122863756492734,
45
+ "step": 10
46
+ },
47
+ {
48
+ "epoch": 0.06,
49
+ "grad_norm": 2.711998547730859,
50
+ "learning_rate": 9.345794392523365e-07,
51
+ "logits/chosen": -2.3758203983306885,
52
+ "logits/rejected": -2.2394745349884033,
53
+ "logps/chosen": -266.3940734863281,
54
+ "logps/rejected": -250.22067260742188,
55
+ "loss": 0.691,
56
+ "rewards/accuracies": 0.7749999761581421,
57
+ "rewards/chosen": 0.00042475899681448936,
58
+ "rewards/margins": 0.004263690672814846,
59
+ "rewards/margins_max": 0.010423297993838787,
60
+ "rewards/margins_min": -0.000582061184104532,
61
+ "rewards/margins_std": 0.004974424839019775,
62
+ "rewards/rejected": -0.003838931443169713,
63
+ "step": 20
64
+ },
65
+ {
66
+ "epoch": 0.08,
67
+ "grad_norm": 3.331840625958437,
68
+ "learning_rate": 1.4018691588785047e-06,
69
+ "logits/chosen": -2.4345130920410156,
70
+ "logits/rejected": -2.2301573753356934,
71
+ "logps/chosen": -277.0610046386719,
72
+ "logps/rejected": -270.2976989746094,
73
+ "loss": 0.6848,
74
+ "rewards/accuracies": 0.9750000238418579,
75
+ "rewards/chosen": 0.0029614921659231186,
76
+ "rewards/margins": 0.016091803088784218,
77
+ "rewards/margins_max": 0.030258700251579285,
78
+ "rewards/margins_min": 0.004859076347202063,
79
+ "rewards/margins_std": 0.011583611369132996,
80
+ "rewards/rejected": -0.01313030906021595,
81
+ "step": 30
82
+ },
83
+ {
84
+ "epoch": 0.11,
85
+ "grad_norm": 2.5069076474331458,
86
+ "learning_rate": 1.869158878504673e-06,
87
+ "logits/chosen": -2.5057766437530518,
88
+ "logits/rejected": -2.2820587158203125,
89
+ "logps/chosen": -263.85906982421875,
90
+ "logps/rejected": -258.41522216796875,
91
+ "loss": 0.6746,
92
+ "rewards/accuracies": 0.925000011920929,
93
+ "rewards/chosen": 0.0037085427902638912,
94
+ "rewards/margins": 0.030518993735313416,
95
+ "rewards/margins_max": 0.0725535899400711,
96
+ "rewards/margins_min": 0.0008986728498712182,
97
+ "rewards/margins_std": 0.03197382763028145,
98
+ "rewards/rejected": -0.026810448616743088,
99
+ "step": 40
100
+ },
101
+ {
102
+ "epoch": 0.14,
103
+ "grad_norm": 3.2621789299254584,
104
+ "learning_rate": 2.3364485981308413e-06,
105
+ "logits/chosen": -2.432992935180664,
106
+ "logits/rejected": -2.222877025604248,
107
+ "logps/chosen": -256.1089782714844,
108
+ "logps/rejected": -280.87469482421875,
109
+ "loss": 0.6526,
110
+ "rewards/accuracies": 0.8999999761581421,
111
+ "rewards/chosen": 0.0002926398010458797,
112
+ "rewards/margins": 0.09236637502908707,
113
+ "rewards/margins_max": 0.2024260014295578,
114
+ "rewards/margins_min": 0.017648588865995407,
115
+ "rewards/margins_std": 0.08838540315628052,
116
+ "rewards/rejected": -0.09207373112440109,
117
+ "step": 50
118
+ },
119
+ {
120
+ "epoch": 0.17,
121
+ "grad_norm": 3.2693868625712255,
122
+ "learning_rate": 2.8037383177570094e-06,
123
+ "logits/chosen": -2.3796916007995605,
124
+ "logits/rejected": -2.216627836227417,
125
+ "logps/chosen": -292.62139892578125,
126
+ "logps/rejected": -324.89324951171875,
127
+ "loss": 0.6222,
128
+ "rewards/accuracies": 0.9375,
129
+ "rewards/chosen": -0.0356663353741169,
130
+ "rewards/margins": 0.15710246562957764,
131
+ "rewards/margins_max": 0.3139253556728363,
132
+ "rewards/margins_min": 0.024519650265574455,
133
+ "rewards/margins_std": 0.131367027759552,
134
+ "rewards/rejected": -0.19276879727840424,
135
+ "step": 60
136
+ },
137
+ {
138
+ "epoch": 0.2,
139
+ "grad_norm": 2.595351862049113,
140
+ "learning_rate": 3.2710280373831774e-06,
141
+ "logits/chosen": -2.4088263511657715,
142
+ "logits/rejected": -2.2108654975891113,
143
+ "logps/chosen": -296.26385498046875,
144
+ "logps/rejected": -306.9451599121094,
145
+ "loss": 0.5849,
146
+ "rewards/accuracies": 0.9375,
147
+ "rewards/chosen": -0.10243145376443863,
148
+ "rewards/margins": 0.22524826228618622,
149
+ "rewards/margins_max": 0.517895519733429,
150
+ "rewards/margins_min": 0.032067470252513885,
151
+ "rewards/margins_std": 0.21916556358337402,
152
+ "rewards/rejected": -0.32767972350120544,
153
+ "step": 70
154
+ },
155
+ {
156
+ "epoch": 0.23,
157
+ "grad_norm": 3.072039945720203,
158
+ "learning_rate": 3.738317757009346e-06,
159
+ "logits/chosen": -2.4255895614624023,
160
+ "logits/rejected": -2.304298162460327,
161
+ "logps/chosen": -329.9145202636719,
162
+ "logps/rejected": -401.10516357421875,
163
+ "loss": 0.5333,
164
+ "rewards/accuracies": 0.9125000238418579,
165
+ "rewards/chosen": -0.2832830548286438,
166
+ "rewards/margins": 0.4214704632759094,
167
+ "rewards/margins_max": 0.8972963094711304,
168
+ "rewards/margins_min": 0.05010233446955681,
169
+ "rewards/margins_std": 0.3841872811317444,
170
+ "rewards/rejected": -0.7047535181045532,
171
+ "step": 80
172
+ },
173
+ {
174
+ "epoch": 0.25,
175
+ "grad_norm": 3.642567259744184,
176
+ "learning_rate": 4.205607476635514e-06,
177
+ "logits/chosen": -2.224909782409668,
178
+ "logits/rejected": -2.080477476119995,
179
+ "logps/chosen": -290.53643798828125,
180
+ "logps/rejected": -332.0419616699219,
181
+ "loss": 0.4756,
182
+ "rewards/accuracies": 0.875,
183
+ "rewards/chosen": -0.42501896619796753,
184
+ "rewards/margins": 0.5945996046066284,
185
+ "rewards/margins_max": 1.4964139461517334,
186
+ "rewards/margins_min": 0.0675990879535675,
187
+ "rewards/margins_std": 0.6498464941978455,
188
+ "rewards/rejected": -1.0196186304092407,
189
+ "step": 90
190
+ },
191
+ {
192
+ "epoch": 0.28,
193
+ "grad_norm": 3.345534089333945,
194
+ "learning_rate": 4.6728971962616825e-06,
195
+ "logits/chosen": -2.17991042137146,
196
+ "logits/rejected": -2.0627541542053223,
197
+ "logps/chosen": -351.88580322265625,
198
+ "logps/rejected": -475.9768981933594,
199
+ "loss": 0.4256,
200
+ "rewards/accuracies": 0.8999999761581421,
201
+ "rewards/chosen": -0.875613808631897,
202
+ "rewards/margins": 0.8630256652832031,
203
+ "rewards/margins_max": 2.0646145343780518,
204
+ "rewards/margins_min": 0.07761440426111221,
205
+ "rewards/margins_std": 0.921543300151825,
206
+ "rewards/rejected": -1.7386394739151,
207
+ "step": 100
208
+ },
209
+ {
210
+ "epoch": 0.28,
211
+ "eval_logits/chosen": -2.0750787258148193,
212
+ "eval_logits/rejected": -1.9770790338516235,
213
+ "eval_logps/chosen": -455.095947265625,
214
+ "eval_logps/rejected": -453.3454895019531,
215
+ "eval_loss": 0.8162721991539001,
216
+ "eval_rewards/accuracies": 0.5609999895095825,
217
+ "eval_rewards/chosen": -1.802152395248413,
218
+ "eval_rewards/margins": 0.15609782934188843,
219
+ "eval_rewards/margins_max": 2.204866886138916,
220
+ "eval_rewards/margins_min": -1.819094181060791,
221
+ "eval_rewards/margins_std": 1.3259263038635254,
222
+ "eval_rewards/rejected": -1.9582501649856567,
223
+ "eval_runtime": 738.2434,
224
+ "eval_samples_per_second": 2.709,
225
+ "eval_steps_per_second": 0.169,
226
+ "step": 100
227
+ },
228
+ {
229
+ "epoch": 0.31,
230
+ "grad_norm": 3.790495737109387,
231
+ "learning_rate": 4.999879018839288e-06,
232
+ "logits/chosen": -2.319441318511963,
233
+ "logits/rejected": -2.0694375038146973,
234
+ "logps/chosen": -491.3214416503906,
235
+ "logps/rejected": -595.2050170898438,
236
+ "loss": 0.3726,
237
+ "rewards/accuracies": 0.8999999761581421,
238
+ "rewards/chosen": -1.4289052486419678,
239
+ "rewards/margins": 1.2257483005523682,
240
+ "rewards/margins_max": 2.6363110542297363,
241
+ "rewards/margins_min": -0.0019395649433135986,
242
+ "rewards/margins_std": 1.1995513439178467,
243
+ "rewards/rejected": -2.654653787612915,
244
+ "step": 110
245
+ },
246
+ {
247
+ "epoch": 0.34,
248
+ "grad_norm": 6.4022099851607335,
249
+ "learning_rate": 4.99772856836941e-06,
250
+ "logits/chosen": -2.0878264904022217,
251
+ "logits/rejected": -1.9656604528427124,
252
+ "logps/chosen": -540.1173095703125,
253
+ "logps/rejected": -716.8549194335938,
254
+ "loss": 0.3017,
255
+ "rewards/accuracies": 0.925000011920929,
256
+ "rewards/chosen": -2.161752462387085,
257
+ "rewards/margins": 1.9279537200927734,
258
+ "rewards/margins_max": 3.7465851306915283,
259
+ "rewards/margins_min": 0.2842390835285187,
260
+ "rewards/margins_std": 1.603005051612854,
261
+ "rewards/rejected": -4.089705944061279,
262
+ "step": 120
263
+ },
264
+ {
265
+ "epoch": 0.37,
266
+ "grad_norm": 3.8141114628331887,
267
+ "learning_rate": 4.992892309373227e-06,
268
+ "logits/chosen": -1.9616358280181885,
269
+ "logits/rejected": -1.7907485961914062,
270
+ "logps/chosen": -607.4080810546875,
271
+ "logps/rejected": -861.5751953125,
272
+ "loss": 0.2862,
273
+ "rewards/accuracies": 0.925000011920929,
274
+ "rewards/chosen": -2.5392794609069824,
275
+ "rewards/margins": 2.649423122406006,
276
+ "rewards/margins_max": 5.628636360168457,
277
+ "rewards/margins_min": 0.41630443930625916,
278
+ "rewards/margins_std": 2.4051852226257324,
279
+ "rewards/rejected": -5.188702583312988,
280
+ "step": 130
281
+ },
282
+ {
283
+ "epoch": 0.39,
284
+ "grad_norm": 18.15292585345447,
285
+ "learning_rate": 4.985375442281969e-06,
286
+ "logits/chosen": -1.8700624704360962,
287
+ "logits/rejected": -1.7391315698623657,
288
+ "logps/chosen": -530.605224609375,
289
+ "logps/rejected": -826.2467651367188,
290
+ "loss": 0.261,
291
+ "rewards/accuracies": 0.9375,
292
+ "rewards/chosen": -2.36662220954895,
293
+ "rewards/margins": 2.824148654937744,
294
+ "rewards/margins_max": 6.096582889556885,
295
+ "rewards/margins_min": 0.574691653251648,
296
+ "rewards/margins_std": 2.5261497497558594,
297
+ "rewards/rejected": -5.190770626068115,
298
+ "step": 140
299
+ },
300
+ {
301
+ "epoch": 0.42,
302
+ "grad_norm": 4.549040288633133,
303
+ "learning_rate": 4.9751860499858175e-06,
304
+ "logits/chosen": -1.6621425151824951,
305
+ "logits/rejected": -1.5595060586929321,
306
+ "logps/chosen": -543.5266723632812,
307
+ "logps/rejected": -745.6972045898438,
308
+ "loss": 0.2821,
309
+ "rewards/accuracies": 0.862500011920929,
310
+ "rewards/chosen": -2.752904176712036,
311
+ "rewards/margins": 2.2729761600494385,
312
+ "rewards/margins_max": 4.967066764831543,
313
+ "rewards/margins_min": 0.14136430621147156,
314
+ "rewards/margins_std": 2.177089214324951,
315
+ "rewards/rejected": -5.025879859924316,
316
+ "step": 150
317
+ },
318
+ {
319
+ "epoch": 0.45,
320
+ "grad_norm": 5.567321059696331,
321
+ "learning_rate": 4.962335089142376e-06,
322
+ "logits/chosen": -1.684842824935913,
323
+ "logits/rejected": -1.5501009225845337,
324
+ "logps/chosen": -474.55108642578125,
325
+ "logps/rejected": -770.0437622070312,
326
+ "loss": 0.2122,
327
+ "rewards/accuracies": 0.875,
328
+ "rewards/chosen": -2.0399696826934814,
329
+ "rewards/margins": 2.976529359817505,
330
+ "rewards/margins_max": 6.136769771575928,
331
+ "rewards/margins_min": 0.4981708526611328,
332
+ "rewards/margins_std": 2.5306599140167236,
333
+ "rewards/rejected": -5.016499042510986,
334
+ "step": 160
335
+ },
336
+ {
337
+ "epoch": 0.48,
338
+ "grad_norm": 10.10370874579834,
339
+ "learning_rate": 4.946836378394967e-06,
340
+ "logits/chosen": -1.719364881515503,
341
+ "logits/rejected": -1.5284518003463745,
342
+ "logps/chosen": -491.3282775878906,
343
+ "logps/rejected": -959.0675659179688,
344
+ "loss": 0.2013,
345
+ "rewards/accuracies": 0.9750000238418579,
346
+ "rewards/chosen": -1.9167028665542603,
347
+ "rewards/margins": 4.33759069442749,
348
+ "rewards/margins_max": 8.84797191619873,
349
+ "rewards/margins_min": 1.2430771589279175,
350
+ "rewards/margins_std": 3.5163204669952393,
351
+ "rewards/rejected": -6.254293918609619,
352
+ "step": 170
353
+ },
354
+ {
355
+ "epoch": 0.51,
356
+ "grad_norm": 5.681771212296982,
357
+ "learning_rate": 4.928706583513441e-06,
358
+ "logits/chosen": -1.5217533111572266,
359
+ "logits/rejected": -1.2059122323989868,
360
+ "logps/chosen": -738.7340698242188,
361
+ "logps/rejected": -1276.347900390625,
362
+ "loss": 0.2247,
363
+ "rewards/accuracies": 0.925000011920929,
364
+ "rewards/chosen": -4.31705904006958,
365
+ "rewards/margins": 5.126463890075684,
366
+ "rewards/margins_max": 10.595057487487793,
367
+ "rewards/margins_min": 0.7047984004020691,
368
+ "rewards/margins_std": 4.3886613845825195,
369
+ "rewards/rejected": -9.443523406982422,
370
+ "step": 180
371
+ },
372
+ {
373
+ "epoch": 0.54,
374
+ "grad_norm": 11.72688194414167,
375
+ "learning_rate": 4.907965199473471e-06,
376
+ "logits/chosen": -1.5132646560668945,
377
+ "logits/rejected": -1.1687037944793701,
378
+ "logps/chosen": -706.4207153320312,
379
+ "logps/rejected": -1510.3458251953125,
380
+ "loss": 0.1628,
381
+ "rewards/accuracies": 0.9750000238418579,
382
+ "rewards/chosen": -4.151159286499023,
383
+ "rewards/margins": 8.210009574890137,
384
+ "rewards/margins_max": 16.38364028930664,
385
+ "rewards/margins_min": 2.5097246170043945,
386
+ "rewards/margins_std": 6.4036359786987305,
387
+ "rewards/rejected": -12.361169815063477,
388
+ "step": 190
389
+ },
390
+ {
391
+ "epoch": 0.56,
392
+ "grad_norm": 27.957964269833404,
393
+ "learning_rate": 4.884634529493591e-06,
394
+ "logits/chosen": -1.494742751121521,
395
+ "logits/rejected": -1.0926433801651,
396
+ "logps/chosen": -993.9827270507812,
397
+ "logps/rejected": -2114.66748046875,
398
+ "loss": 0.1591,
399
+ "rewards/accuracies": 0.9125000238418579,
400
+ "rewards/chosen": -6.776390075683594,
401
+ "rewards/margins": 11.342796325683594,
402
+ "rewards/margins_max": 26.83356285095215,
403
+ "rewards/margins_min": 1.5193722248077393,
404
+ "rewards/margins_std": 11.783136367797852,
405
+ "rewards/rejected": -18.11918830871582,
406
+ "step": 200
407
+ },
408
+ {
409
+ "epoch": 0.56,
410
+ "eval_logits/chosen": -1.446925401687622,
411
+ "eval_logits/rejected": -1.3460043668746948,
412
+ "eval_logps/chosen": -784.6454467773438,
413
+ "eval_logps/rejected": -919.6762084960938,
414
+ "eval_loss": 1.212246060371399,
415
+ "eval_rewards/accuracies": 0.6050000190734863,
416
+ "eval_rewards/chosen": -5.097646713256836,
417
+ "eval_rewards/margins": 1.5239101648330688,
418
+ "eval_rewards/margins_max": 9.997103691101074,
419
+ "eval_rewards/margins_min": -4.875259876251221,
420
+ "eval_rewards/margins_std": 4.826797008514404,
421
+ "eval_rewards/rejected": -6.621557235717773,
422
+ "eval_runtime": 739.1046,
423
+ "eval_samples_per_second": 2.706,
424
+ "eval_steps_per_second": 0.169,
425
+ "step": 200
426
+ },
427
+ {
428
+ "epoch": 0.59,
429
+ "grad_norm": 4.4515073816349,
430
+ "learning_rate": 4.858739661052539e-06,
431
+ "logits/chosen": -1.375280499458313,
432
+ "logits/rejected": -0.9940829277038574,
433
+ "logps/chosen": -801.5595703125,
434
+ "logps/rejected": -1996.09375,
435
+ "loss": 0.1097,
436
+ "rewards/accuracies": 0.987500011920929,
437
+ "rewards/chosen": -5.175868034362793,
438
+ "rewards/margins": 11.802214622497559,
439
+ "rewards/margins_max": 24.37476921081543,
440
+ "rewards/margins_min": 3.159264087677002,
441
+ "rewards/margins_std": 9.701603889465332,
442
+ "rewards/rejected": -16.97808074951172,
443
+ "step": 210
444
+ },
445
+ {
446
+ "epoch": 0.62,
447
+ "grad_norm": 5.583399739036604,
448
+ "learning_rate": 4.830308438912687e-06,
449
+ "logits/chosen": -1.3399736881256104,
450
+ "logits/rejected": -0.8271608352661133,
451
+ "logps/chosen": -1234.1002197265625,
452
+ "logps/rejected": -2999.682861328125,
453
+ "loss": 0.425,
454
+ "rewards/accuracies": 0.9375,
455
+ "rewards/chosen": -9.106898307800293,
456
+ "rewards/margins": 17.739770889282227,
457
+ "rewards/margins_max": 37.61690139770508,
458
+ "rewards/margins_min": 2.8995840549468994,
459
+ "rewards/margins_std": 16.034954071044922,
460
+ "rewards/rejected": -26.846668243408203,
461
+ "step": 220
462
+ },
463
+ {
464
+ "epoch": 0.65,
465
+ "grad_norm": 5.236021050652021,
466
+ "learning_rate": 4.799371435178544e-06,
467
+ "logits/chosen": -1.6026887893676758,
468
+ "logits/rejected": -1.2132550477981567,
469
+ "logps/chosen": -873.904296875,
470
+ "logps/rejected": -1824.0445556640625,
471
+ "loss": 0.1169,
472
+ "rewards/accuracies": 0.9624999761581421,
473
+ "rewards/chosen": -5.2920122146606445,
474
+ "rewards/margins": 9.946649551391602,
475
+ "rewards/margins_max": 21.537805557250977,
476
+ "rewards/margins_min": 1.2446839809417725,
477
+ "rewards/margins_std": 9.166532516479492,
478
+ "rewards/rejected": -15.238659858703613,
479
+ "step": 230
480
+ },
481
+ {
482
+ "epoch": 0.68,
483
+ "grad_norm": 6.9079353036860525,
484
+ "learning_rate": 4.765961916422575e-06,
485
+ "logits/chosen": -1.378777027130127,
486
+ "logits/rejected": -0.9224980473518372,
487
+ "logps/chosen": -1299.450927734375,
488
+ "logps/rejected": -2845.23095703125,
489
+ "loss": 0.1933,
490
+ "rewards/accuracies": 0.949999988079071,
491
+ "rewards/chosen": -9.927515983581543,
492
+ "rewards/margins": 15.248102188110352,
493
+ "rewards/margins_max": 35.37622833251953,
494
+ "rewards/margins_min": 2.7304091453552246,
495
+ "rewards/margins_std": 14.676897048950195,
496
+ "rewards/rejected": -25.175617218017578,
497
+ "step": 240
498
+ },
499
+ {
500
+ "epoch": 0.7,
501
+ "grad_norm": 3.479040193067339,
502
+ "learning_rate": 4.730115807913627e-06,
503
+ "logits/chosen": -1.3043248653411865,
504
+ "logits/rejected": -0.9514943957328796,
505
+ "logps/chosen": -1031.6715087890625,
506
+ "logps/rejected": -2111.33447265625,
507
+ "loss": 0.1542,
508
+ "rewards/accuracies": 0.925000011920929,
509
+ "rewards/chosen": -7.449245452880859,
510
+ "rewards/margins": 10.80849552154541,
511
+ "rewards/margins_max": 24.898605346679688,
512
+ "rewards/margins_min": 2.4293816089630127,
513
+ "rewards/margins_std": 10.350214958190918,
514
+ "rewards/rejected": -18.257740020751953,
515
+ "step": 250
516
+ },
517
+ {
518
+ "epoch": 0.73,
519
+ "grad_norm": 0.6636821106137679,
520
+ "learning_rate": 4.691871654986485e-06,
521
+ "logits/chosen": -1.3028302192687988,
522
+ "logits/rejected": -0.9591180086135864,
523
+ "logps/chosen": -1015.4215087890625,
524
+ "logps/rejected": -2004.902587890625,
525
+ "loss": 0.1454,
526
+ "rewards/accuracies": 0.9125000238418579,
527
+ "rewards/chosen": -7.468132019042969,
528
+ "rewards/margins": 9.605308532714844,
529
+ "rewards/margins_max": 19.181324005126953,
530
+ "rewards/margins_min": 1.7056152820587158,
531
+ "rewards/margins_std": 7.867035865783691,
532
+ "rewards/rejected": -17.073440551757812,
533
+ "step": 260
534
+ },
535
+ {
536
+ "epoch": 0.76,
537
+ "grad_norm": 16.019493625748307,
538
+ "learning_rate": 4.651270581594054e-06,
539
+ "logits/chosen": -1.0485166311264038,
540
+ "logits/rejected": -0.6178771257400513,
541
+ "logps/chosen": -1578.3748779296875,
542
+ "logps/rejected": -3275.0478515625,
543
+ "loss": 0.2713,
544
+ "rewards/accuracies": 0.925000011920929,
545
+ "rewards/chosen": -12.956689834594727,
546
+ "rewards/margins": 16.732524871826172,
547
+ "rewards/margins_max": 34.99681854248047,
548
+ "rewards/margins_min": 4.456276893615723,
549
+ "rewards/margins_std": 13.886381149291992,
550
+ "rewards/rejected": -29.689218521118164,
551
+ "step": 270
552
+ },
553
+ {
554
+ "epoch": 0.79,
555
+ "grad_norm": 17.992430896909905,
556
+ "learning_rate": 4.6083562460867545e-06,
557
+ "logits/chosen": -1.2608745098114014,
558
+ "logits/rejected": -0.790873646736145,
559
+ "logps/chosen": -1115.44775390625,
560
+ "logps/rejected": -2799.135986328125,
561
+ "loss": 0.0779,
562
+ "rewards/accuracies": 0.949999988079071,
563
+ "rewards/chosen": -8.298356056213379,
564
+ "rewards/margins": 16.56003189086914,
565
+ "rewards/margins_max": 32.726654052734375,
566
+ "rewards/margins_min": 4.10027551651001,
567
+ "rewards/margins_std": 13.312190055847168,
568
+ "rewards/rejected": -24.858386993408203,
569
+ "step": 280
570
+ },
571
+ {
572
+ "epoch": 0.82,
573
+ "grad_norm": 20.054208913991488,
574
+ "learning_rate": 4.563174794266684e-06,
575
+ "logits/chosen": -1.0551059246063232,
576
+ "logits/rejected": -0.4062129557132721,
577
+ "logps/chosen": -1836.141845703125,
578
+ "logps/rejected": -4272.1552734375,
579
+ "loss": 0.3014,
580
+ "rewards/accuracies": 0.8999999761581421,
581
+ "rewards/chosen": -15.701252937316895,
582
+ "rewards/margins": 24.028852462768555,
583
+ "rewards/margins_max": 54.14502716064453,
584
+ "rewards/margins_min": 5.250021934509277,
585
+ "rewards/margins_std": 22.093597412109375,
586
+ "rewards/rejected": -39.73011016845703,
587
+ "step": 290
588
+ },
589
+ {
590
+ "epoch": 0.85,
591
+ "grad_norm": 2.439806702571055,
592
+ "learning_rate": 4.5157748097670125e-06,
593
+ "logits/chosen": -1.5465186834335327,
594
+ "logits/rejected": -1.0866758823394775,
595
+ "logps/chosen": -846.8386840820312,
596
+ "logps/rejected": -2151.494873046875,
597
+ "loss": 0.1126,
598
+ "rewards/accuracies": 0.987500011920929,
599
+ "rewards/chosen": -5.326035499572754,
600
+ "rewards/margins": 12.93223762512207,
601
+ "rewards/margins_max": 28.242706298828125,
602
+ "rewards/margins_min": 3.0544486045837402,
603
+ "rewards/margins_std": 11.579388618469238,
604
+ "rewards/rejected": -18.25827407836914,
605
+ "step": 300
606
+ },
607
+ {
608
+ "epoch": 0.85,
609
+ "eval_logits/chosen": -1.3141957521438599,
610
+ "eval_logits/rejected": -1.213302731513977,
611
+ "eval_logps/chosen": -891.159912109375,
612
+ "eval_logps/rejected": -1116.3018798828125,
613
+ "eval_loss": 1.723042607307434,
614
+ "eval_rewards/accuracies": 0.609000027179718,
615
+ "eval_rewards/chosen": -6.162792205810547,
616
+ "eval_rewards/margins": 2.4250221252441406,
617
+ "eval_rewards/margins_max": 18.91021156311035,
618
+ "eval_rewards/margins_min": -8.22018814086914,
619
+ "eval_rewards/margins_std": 8.723590850830078,
620
+ "eval_rewards/rejected": -8.587814331054688,
621
+ "eval_runtime": 739.1854,
622
+ "eval_samples_per_second": 2.706,
623
+ "eval_steps_per_second": 0.169,
624
+ "step": 300
625
+ },
626
+ {
627
+ "epoch": 0.87,
628
+ "grad_norm": 3.8120942447056025,
629
+ "learning_rate": 4.466207261809989e-06,
630
+ "logits/chosen": -1.319267749786377,
631
+ "logits/rejected": -0.8222479820251465,
632
+ "logps/chosen": -1014.4478759765625,
633
+ "logps/rejected": -2260.364501953125,
634
+ "loss": 0.2071,
635
+ "rewards/accuracies": 0.9375,
636
+ "rewards/chosen": -7.412560939788818,
637
+ "rewards/margins": 12.69267749786377,
638
+ "rewards/margins_max": 31.255285263061523,
639
+ "rewards/margins_min": 1.447127103805542,
640
+ "rewards/margins_std": 13.759561538696289,
641
+ "rewards/rejected": -20.105239868164062,
642
+ "step": 310
643
+ },
644
+ {
645
+ "epoch": 0.9,
646
+ "grad_norm": 47.6233390458397,
647
+ "learning_rate": 4.414525450399713e-06,
648
+ "logits/chosen": -1.2297580242156982,
649
+ "logits/rejected": -0.662652850151062,
650
+ "logps/chosen": -1187.2958984375,
651
+ "logps/rejected": -3130.074951171875,
652
+ "loss": 0.1808,
653
+ "rewards/accuracies": 0.8999999761581421,
654
+ "rewards/chosen": -8.963447570800781,
655
+ "rewards/margins": 19.364063262939453,
656
+ "rewards/margins_max": 49.05697250366211,
657
+ "rewards/margins_min": 0.8736963272094727,
658
+ "rewards/margins_std": 21.952783584594727,
659
+ "rewards/rejected": -28.327510833740234,
660
+ "step": 320
661
+ },
662
+ {
663
+ "epoch": 0.93,
664
+ "grad_norm": 8.112447579189505,
665
+ "learning_rate": 4.360784949008615e-06,
666
+ "logits/chosen": -1.1926701068878174,
667
+ "logits/rejected": -0.5003105401992798,
668
+ "logps/chosen": -1699.5732421875,
669
+ "logps/rejected": -4102.2255859375,
670
+ "loss": 0.1992,
671
+ "rewards/accuracies": 0.949999988079071,
672
+ "rewards/chosen": -13.697128295898438,
673
+ "rewards/margins": 24.271343231201172,
674
+ "rewards/margins_max": 48.1143798828125,
675
+ "rewards/margins_min": 6.217949390411377,
676
+ "rewards/margins_std": 18.776830673217773,
677
+ "rewards/rejected": -37.96847152709961,
678
+ "step": 330
679
+ },
680
+ {
681
+ "epoch": 0.96,
682
+ "grad_norm": 2.6961010650119905,
683
+ "learning_rate": 4.30504354481929e-06,
684
+ "logits/chosen": -1.4151222705841064,
685
+ "logits/rejected": -0.9203524589538574,
686
+ "logps/chosen": -864.4846801757812,
687
+ "logps/rejected": -2527.864013671875,
688
+ "loss": 0.1551,
689
+ "rewards/accuracies": 0.949999988079071,
690
+ "rewards/chosen": -5.877382755279541,
691
+ "rewards/margins": 16.530858993530273,
692
+ "rewards/margins_max": 33.9772834777832,
693
+ "rewards/margins_min": 3.8386902809143066,
694
+ "rewards/margins_std": 14.032522201538086,
695
+ "rewards/rejected": -22.408239364624023,
696
+ "step": 340
697
+ },
698
+ {
699
+ "epoch": 0.99,
700
+ "grad_norm": 6.911698207606949,
701
+ "learning_rate": 4.247361176585904e-06,
702
+ "logits/chosen": -1.2823737859725952,
703
+ "logits/rejected": -0.8435856103897095,
704
+ "logps/chosen": -1273.652587890625,
705
+ "logps/rejected": -3208.216064453125,
706
+ "loss": 0.2927,
707
+ "rewards/accuracies": 0.9750000238418579,
708
+ "rewards/chosen": -9.91613483428955,
709
+ "rewards/margins": 18.9328670501709,
710
+ "rewards/margins_max": 38.30847930908203,
711
+ "rewards/margins_min": 6.44967794418335,
712
+ "rewards/margins_std": 14.676486015319824,
713
+ "rewards/rejected": -28.849002838134766,
714
+ "step": 350
715
+ },
716
+ {
717
+ "epoch": 1.01,
718
+ "grad_norm": 0.20907754692817052,
719
+ "learning_rate": 4.187799870182038e-06,
720
+ "logits/chosen": -1.4417529106140137,
721
+ "logits/rejected": -0.9101096391677856,
722
+ "logps/chosen": -1253.6064453125,
723
+ "logps/rejected": -3580.76416015625,
724
+ "loss": 0.0852,
725
+ "rewards/accuracies": 0.9624999761581421,
726
+ "rewards/chosen": -9.736591339111328,
727
+ "rewards/margins": 23.14864730834961,
728
+ "rewards/margins_max": 51.2208251953125,
729
+ "rewards/margins_min": 5.839755058288574,
730
+ "rewards/margins_std": 21.0888671875,
731
+ "rewards/rejected": -32.88523483276367,
732
+ "step": 360
733
+ },
734
+ {
735
+ "epoch": 1.04,
736
+ "grad_norm": 38.435126006464614,
737
+ "learning_rate": 4.1264236719042365e-06,
738
+ "logits/chosen": -1.4028445482254028,
739
+ "logits/rejected": -0.8279297947883606,
740
+ "logps/chosen": -1447.45263671875,
741
+ "logps/rejected": -3875.93994140625,
742
+ "loss": 0.1371,
743
+ "rewards/accuracies": 0.949999988079071,
744
+ "rewards/chosen": -11.246275901794434,
745
+ "rewards/margins": 24.722837448120117,
746
+ "rewards/margins_max": 49.426544189453125,
747
+ "rewards/margins_min": 3.9899165630340576,
748
+ "rewards/margins_std": 21.102214813232422,
749
+ "rewards/rejected": -35.9691162109375,
750
+ "step": 370
751
+ },
752
+ {
753
+ "epoch": 1.07,
754
+ "grad_norm": 5.26142385176066,
755
+ "learning_rate": 4.063298579603001e-06,
756
+ "logits/chosen": -1.4487351179122925,
757
+ "logits/rejected": -0.8309275507926941,
758
+ "logps/chosen": -997.6652221679688,
759
+ "logps/rejected": -3492.02880859375,
760
+ "loss": 0.029,
761
+ "rewards/accuracies": 1.0,
762
+ "rewards/chosen": -7.381462097167969,
763
+ "rewards/margins": 24.83234977722168,
764
+ "rewards/margins_max": 55.610748291015625,
765
+ "rewards/margins_min": 7.033749580383301,
766
+ "rewards/margins_std": 22.912935256958008,
767
+ "rewards/rejected": -32.21381378173828,
768
+ "step": 380
769
+ },
770
+ {
771
+ "epoch": 1.1,
772
+ "grad_norm": 2.2541383197386446,
773
+ "learning_rate": 3.998492471715272e-06,
774
+ "logits/chosen": -1.2715747356414795,
775
+ "logits/rejected": -0.8856738805770874,
776
+ "logps/chosen": -1461.220703125,
777
+ "logps/rejected": -3670.775390625,
778
+ "loss": 0.1251,
779
+ "rewards/accuracies": 1.0,
780
+ "rewards/chosen": -11.813475608825684,
781
+ "rewards/margins": 21.913679122924805,
782
+ "rewards/margins_max": 46.8076057434082,
783
+ "rewards/margins_min": 5.384150505065918,
784
+ "rewards/margins_std": 18.93104362487793,
785
+ "rewards/rejected": -33.72715759277344,
786
+ "step": 390
787
+ },
788
+ {
789
+ "epoch": 1.13,
790
+ "grad_norm": 1.9849281441200701,
791
+ "learning_rate": 3.932075034274723e-06,
792
+ "logits/chosen": -1.3581750392913818,
793
+ "logits/rejected": -0.8970950245857239,
794
+ "logps/chosen": -1317.798095703125,
795
+ "logps/rejected": -3307.11083984375,
796
+ "loss": 0.074,
797
+ "rewards/accuracies": 0.9750000238418579,
798
+ "rewards/chosen": -10.455536842346191,
799
+ "rewards/margins": 19.807851791381836,
800
+ "rewards/margins_max": 41.626365661621094,
801
+ "rewards/margins_min": 5.861770153045654,
802
+ "rewards/margins_std": 16.298381805419922,
803
+ "rewards/rejected": -30.263391494750977,
804
+ "step": 400
805
+ },
806
+ {
807
+ "epoch": 1.13,
808
+ "eval_logits/chosen": -1.4369679689407349,
809
+ "eval_logits/rejected": -1.3244106769561768,
810
+ "eval_logps/chosen": -1146.1495361328125,
811
+ "eval_logps/rejected": -1451.477783203125,
812
+ "eval_loss": 2.0005135536193848,
813
+ "eval_rewards/accuracies": 0.621999979019165,
814
+ "eval_rewards/chosen": -8.712687492370605,
815
+ "eval_rewards/margins": 3.22688364982605,
816
+ "eval_rewards/margins_max": 20.153732299804688,
817
+ "eval_rewards/margins_min": -9.986676216125488,
818
+ "eval_rewards/margins_std": 9.687753677368164,
819
+ "eval_rewards/rejected": -11.93957233428955,
820
+ "eval_runtime": 739.9676,
821
+ "eval_samples_per_second": 2.703,
822
+ "eval_steps_per_second": 0.169,
823
+ "step": 400
824
+ },
825
+ {
826
+ "epoch": 1.15,
827
+ "grad_norm": 2.538517914103785,
828
+ "learning_rate": 3.864117685978339e-06,
829
+ "logits/chosen": -1.3837534189224243,
830
+ "logits/rejected": -0.8808004260063171,
831
+ "logps/chosen": -1536.3201904296875,
832
+ "logps/rejected": -3880.190185546875,
833
+ "loss": 0.041,
834
+ "rewards/accuracies": 1.0,
835
+ "rewards/chosen": -12.580427169799805,
836
+ "rewards/margins": 23.37277603149414,
837
+ "rewards/margins_max": 50.48382568359375,
838
+ "rewards/margins_min": 4.341352939605713,
839
+ "rewards/margins_std": 21.75898551940918,
840
+ "rewards/rejected": -35.95320129394531,
841
+ "step": 410
842
+ },
843
+ {
844
+ "epoch": 1.18,
845
+ "grad_norm": 5.339854465744551,
846
+ "learning_rate": 3.794693501389861e-06,
847
+ "logits/chosen": -1.0723979473114014,
848
+ "logits/rejected": -0.3059498071670532,
849
+ "logps/chosen": -2542.86083984375,
850
+ "logps/rejected": -6684.94140625,
851
+ "loss": 0.0319,
852
+ "rewards/accuracies": 0.9750000238418579,
853
+ "rewards/chosen": -22.507823944091797,
854
+ "rewards/margins": 41.29346466064453,
855
+ "rewards/margins_max": 83.78596496582031,
856
+ "rewards/margins_min": 9.751798629760742,
857
+ "rewards/margins_std": 34.60697937011719,
858
+ "rewards/rejected": -63.80128860473633,
859
+ "step": 420
860
+ },
861
+ {
862
+ "epoch": 1.21,
863
+ "grad_norm": 4.806290621145595,
864
+ "learning_rate": 3.7238771323626822e-06,
865
+ "logits/chosen": -1.1698893308639526,
866
+ "logits/rejected": -0.48589786887168884,
867
+ "logps/chosen": -2241.239013671875,
868
+ "logps/rejected": -5464.1240234375,
869
+ "loss": 0.1424,
870
+ "rewards/accuracies": 0.9624999761581421,
871
+ "rewards/chosen": -19.22347068786621,
872
+ "rewards/margins": 32.403419494628906,
873
+ "rewards/margins_max": 70.16567993164062,
874
+ "rewards/margins_min": 5.948617935180664,
875
+ "rewards/margins_std": 29.28656578063965,
876
+ "rewards/rejected": -51.62689208984375,
877
+ "step": 430
878
+ },
879
+ {
880
+ "epoch": 1.24,
881
+ "grad_norm": 16.40920174339595,
882
+ "learning_rate": 3.651744727766676e-06,
883
+ "logits/chosen": -1.252957820892334,
884
+ "logits/rejected": -0.6666532158851624,
885
+ "logps/chosen": -1763.9290771484375,
886
+ "logps/rejected": -4610.29296875,
887
+ "loss": 0.1017,
888
+ "rewards/accuracies": 0.987500011920929,
889
+ "rewards/chosen": -14.94475269317627,
890
+ "rewards/margins": 28.329538345336914,
891
+ "rewards/margins_max": 58.97295379638672,
892
+ "rewards/margins_min": 7.963846683502197,
893
+ "rewards/margins_std": 23.118122100830078,
894
+ "rewards/rejected": -43.274295806884766,
895
+ "step": 440
896
+ },
897
+ {
898
+ "epoch": 1.27,
899
+ "grad_norm": 8.291804889185014,
900
+ "learning_rate": 3.57837385160529e-06,
901
+ "logits/chosen": -1.4508641958236694,
902
+ "logits/rejected": -0.9649600982666016,
903
+ "logps/chosen": -1113.8978271484375,
904
+ "logps/rejected": -2990.294921875,
905
+ "loss": 0.0341,
906
+ "rewards/accuracies": 0.987500011920929,
907
+ "rewards/chosen": -8.290533065795898,
908
+ "rewards/margins": 18.671770095825195,
909
+ "rewards/margins_max": 35.884986877441406,
910
+ "rewards/margins_min": 5.625349998474121,
911
+ "rewards/margins_std": 13.8485689163208,
912
+ "rewards/rejected": -26.96230125427246,
913
+ "step": 450
914
+ },
915
+ {
916
+ "epoch": 1.3,
917
+ "grad_norm": 5.207329228568341,
918
+ "learning_rate": 3.503843399610941e-06,
919
+ "logits/chosen": -1.5748833417892456,
920
+ "logits/rejected": -1.0820248126983643,
921
+ "logps/chosen": -1375.6676025390625,
922
+ "logps/rejected": -3950.961669921875,
923
+ "loss": 0.1747,
924
+ "rewards/accuracies": 0.987500011920929,
925
+ "rewards/chosen": -10.502653121948242,
926
+ "rewards/margins": 25.67014503479004,
927
+ "rewards/margins_max": 56.58990478515625,
928
+ "rewards/margins_min": 4.733575344085693,
929
+ "rewards/margins_std": 23.528743743896484,
930
+ "rewards/rejected": -36.17279052734375,
931
+ "step": 460
932
+ },
933
+ {
934
+ "epoch": 1.32,
935
+ "grad_norm": 1.3112038161123905,
936
+ "learning_rate": 3.4282335144083985e-06,
937
+ "logits/chosen": -1.4795830249786377,
938
+ "logits/rejected": -0.9518693089485168,
939
+ "logps/chosen": -1163.8775634765625,
940
+ "logps/rejected": -3689.42919921875,
941
+ "loss": 0.0479,
942
+ "rewards/accuracies": 0.987500011920929,
943
+ "rewards/chosen": -8.739542007446289,
944
+ "rewards/margins": 25.273595809936523,
945
+ "rewards/margins_max": 51.39923858642578,
946
+ "rewards/margins_min": 7.340193271636963,
947
+ "rewards/margins_std": 19.96113395690918,
948
+ "rewards/rejected": -34.01313400268555,
949
+ "step": 470
950
+ },
951
+ {
952
+ "epoch": 1.35,
953
+ "grad_norm": 1.5548232896248495,
954
+ "learning_rate": 3.351625499337395e-06,
955
+ "logits/chosen": -1.408975601196289,
956
+ "logits/rejected": -0.8387428522109985,
957
+ "logps/chosen": -1590.7392578125,
958
+ "logps/rejected": -4186.083984375,
959
+ "loss": 0.1211,
960
+ "rewards/accuracies": 0.9750000238418579,
961
+ "rewards/chosen": -12.902124404907227,
962
+ "rewards/margins": 26.1358699798584,
963
+ "rewards/margins_max": 52.00028610229492,
964
+ "rewards/margins_min": 7.108595848083496,
965
+ "rewards/margins_std": 20.017724990844727,
966
+ "rewards/rejected": -39.037994384765625,
967
+ "step": 480
968
+ },
969
+ {
970
+ "epoch": 1.38,
971
+ "grad_norm": 11.945131109643063,
972
+ "learning_rate": 3.2741017310271056e-06,
973
+ "logits/chosen": -1.4146662950515747,
974
+ "logits/rejected": -0.797744631767273,
975
+ "logps/chosen": -1408.4781494140625,
976
+ "logps/rejected": -4716.1142578125,
977
+ "loss": 0.1193,
978
+ "rewards/accuracies": 0.9624999761581421,
979
+ "rewards/chosen": -11.056276321411133,
980
+ "rewards/margins": 32.81745147705078,
981
+ "rewards/margins_max": 74.43333435058594,
982
+ "rewards/margins_min": 6.130241394042969,
983
+ "rewards/margins_std": 30.70904541015625,
984
+ "rewards/rejected": -43.87372589111328,
985
+ "step": 490
986
+ },
987
+ {
988
+ "epoch": 1.41,
989
+ "grad_norm": 8.866730611852605,
990
+ "learning_rate": 3.195745570816532e-06,
991
+ "logits/chosen": -1.5317000150680542,
992
+ "logits/rejected": -1.1303898096084595,
993
+ "logps/chosen": -1008.9549560546875,
994
+ "logps/rejected": -2925.121337890625,
995
+ "loss": 0.0551,
996
+ "rewards/accuracies": 0.987500011920929,
997
+ "rewards/chosen": -7.364511013031006,
998
+ "rewards/margins": 19.03972625732422,
999
+ "rewards/margins_max": 39.17045211791992,
1000
+ "rewards/margins_min": 5.060760974884033,
1001
+ "rewards/margins_std": 15.347724914550781,
1002
+ "rewards/rejected": -26.40423583984375,
1003
+ "step": 500
1004
+ },
1005
+ {
1006
+ "epoch": 1.41,
1007
+ "eval_logits/chosen": -1.4257258176803589,
1008
+ "eval_logits/rejected": -1.2957897186279297,
1009
+ "eval_logps/chosen": -1318.13232421875,
1010
+ "eval_logps/rejected": -1773.228271484375,
1011
+ "eval_loss": 2.656811475753784,
1012
+ "eval_rewards/accuracies": 0.6259999871253967,
1013
+ "eval_rewards/chosen": -10.432517051696777,
1014
+ "eval_rewards/margins": 4.7245612144470215,
1015
+ "eval_rewards/margins_max": 28.604450225830078,
1016
+ "eval_rewards/margins_min": -13.697463989257812,
1017
+ "eval_rewards/margins_std": 13.803962707519531,
1018
+ "eval_rewards/rejected": -15.15707778930664,
1019
+ "eval_runtime": 740.0178,
1020
+ "eval_samples_per_second": 2.703,
1021
+ "eval_steps_per_second": 0.169,
1022
+ "step": 500
1023
+ },
1024
+ {
1025
+ "epoch": 1.44,
1026
+ "grad_norm": 4.416696704306515,
1027
+ "learning_rate": 3.116641275116018e-06,
1028
+ "logits/chosen": -1.3493144512176514,
1029
+ "logits/rejected": -0.7629820704460144,
1030
+ "logps/chosen": -1329.723388671875,
1031
+ "logps/rejected": -4176.525390625,
1032
+ "loss": 0.079,
1033
+ "rewards/accuracies": 0.9624999761581421,
1034
+ "rewards/chosen": -10.125879287719727,
1035
+ "rewards/margins": 28.669696807861328,
1036
+ "rewards/margins_max": 62.80031204223633,
1037
+ "rewards/margins_min": 7.63208532333374,
1038
+ "rewards/margins_std": 25.432636260986328,
1039
+ "rewards/rejected": -38.79557418823242,
1040
+ "step": 510
1041
+ },
1042
+ {
1043
+ "epoch": 1.46,
1044
+ "grad_norm": 16.45913444533014,
1045
+ "learning_rate": 3.0368739048062956e-06,
1046
+ "logits/chosen": -1.283140778541565,
1047
+ "logits/rejected": -0.6157187819480896,
1048
+ "logps/chosen": -1691.1859130859375,
1049
+ "logps/rejected": -5241.02587890625,
1050
+ "loss": 0.0912,
1051
+ "rewards/accuracies": 0.9750000238418579,
1052
+ "rewards/chosen": -14.164215087890625,
1053
+ "rewards/margins": 35.53845977783203,
1054
+ "rewards/margins_max": 76.7809066772461,
1055
+ "rewards/margins_min": 7.163271903991699,
1056
+ "rewards/margins_std": 31.75979995727539,
1057
+ "rewards/rejected": -49.702667236328125,
1058
+ "step": 520
1059
+ },
1060
+ {
1061
+ "epoch": 1.49,
1062
+ "grad_norm": 0.20699893951201953,
1063
+ "learning_rate": 2.956529233772492e-06,
1064
+ "logits/chosen": -1.337871789932251,
1065
+ "logits/rejected": -0.6666404008865356,
1066
+ "logps/chosen": -1663.0726318359375,
1067
+ "logps/rejected": -5488.6357421875,
1068
+ "loss": 0.0491,
1069
+ "rewards/accuracies": 0.9624999761581421,
1070
+ "rewards/chosen": -13.510289192199707,
1071
+ "rewards/margins": 38.012184143066406,
1072
+ "rewards/margins_max": 82.3269271850586,
1073
+ "rewards/margins_min": 9.285993576049805,
1074
+ "rewards/margins_std": 33.284690856933594,
1075
+ "rewards/rejected": -51.52248001098633,
1076
+ "step": 530
1077
+ },
1078
+ {
1079
+ "epoch": 1.52,
1080
+ "grad_norm": 5.250766698166954,
1081
+ "learning_rate": 2.8756936566714317e-06,
1082
+ "logits/chosen": -1.2444673776626587,
1083
+ "logits/rejected": -0.6022458672523499,
1084
+ "logps/chosen": -2035.296875,
1085
+ "logps/rejected": -5392.39453125,
1086
+ "loss": 0.1281,
1087
+ "rewards/accuracies": 0.987500011920929,
1088
+ "rewards/chosen": -17.186960220336914,
1089
+ "rewards/margins": 33.95701217651367,
1090
+ "rewards/margins_max": 71.49807739257812,
1091
+ "rewards/margins_min": 9.499161720275879,
1092
+ "rewards/margins_std": 27.914098739624023,
1093
+ "rewards/rejected": -51.14397430419922,
1094
+ "step": 540
1095
+ },
1096
+ {
1097
+ "epoch": 1.55,
1098
+ "grad_norm": 2.140750073865154,
1099
+ "learning_rate": 2.794454096031429e-06,
1100
+ "logits/chosen": -1.3312510251998901,
1101
+ "logits/rejected": -0.5831511616706848,
1102
+ "logps/chosen": -1793.1119384765625,
1103
+ "logps/rejected": -6092.6904296875,
1104
+ "loss": 0.0615,
1105
+ "rewards/accuracies": 0.987500011920929,
1106
+ "rewards/chosen": -14.43737506866455,
1107
+ "rewards/margins": 42.91600036621094,
1108
+ "rewards/margins_max": 97.83891296386719,
1109
+ "rewards/margins_min": 9.349691390991211,
1110
+ "rewards/margins_std": 40.1955451965332,
1111
+ "rewards/rejected": -57.35337448120117,
1112
+ "step": 550
1113
+ },
1114
+ {
1115
+ "epoch": 1.58,
1116
+ "grad_norm": 2.5243522946384265,
1117
+ "learning_rate": 2.71289790878446e-06,
1118
+ "logits/chosen": -1.369600534439087,
1119
+ "logits/rejected": -0.7785685658454895,
1120
+ "logps/chosen": -1457.351318359375,
1121
+ "logps/rejected": -4937.0205078125,
1122
+ "loss": 0.061,
1123
+ "rewards/accuracies": 0.987500011920929,
1124
+ "rewards/chosen": -11.771065711975098,
1125
+ "rewards/margins": 34.56814956665039,
1126
+ "rewards/margins_max": 73.30443572998047,
1127
+ "rewards/margins_min": 9.829178810119629,
1128
+ "rewards/margins_std": 29.120563507080078,
1129
+ "rewards/rejected": -46.339210510253906,
1130
+ "step": 560
1131
+ },
1132
+ {
1133
+ "epoch": 1.61,
1134
+ "grad_norm": 21.43574111885225,
1135
+ "learning_rate": 2.6311127923312156e-06,
1136
+ "logits/chosen": -1.2619574069976807,
1137
+ "logits/rejected": -0.5961970090866089,
1138
+ "logps/chosen": -2181.08935546875,
1139
+ "logps/rejected": -6657.7587890625,
1140
+ "loss": 0.1197,
1141
+ "rewards/accuracies": 0.9750000238418579,
1142
+ "rewards/chosen": -18.670825958251953,
1143
+ "rewards/margins": 44.561912536621094,
1144
+ "rewards/margins_max": 88.48506164550781,
1145
+ "rewards/margins_min": 7.3762311935424805,
1146
+ "rewards/margins_std": 36.8846435546875,
1147
+ "rewards/rejected": -63.23274612426758,
1148
+ "step": 570
1149
+ },
1150
+ {
1151
+ "epoch": 1.63,
1152
+ "grad_norm": 7.871582743030215,
1153
+ "learning_rate": 2.549186690240057e-06,
1154
+ "logits/chosen": -1.2399415969848633,
1155
+ "logits/rejected": -0.647193431854248,
1156
+ "logps/chosen": -1575.2166748046875,
1157
+ "logps/rejected": -5238.3876953125,
1158
+ "loss": 0.0476,
1159
+ "rewards/accuracies": 0.987500011920929,
1160
+ "rewards/chosen": -13.128217697143555,
1161
+ "rewards/margins": 36.19137191772461,
1162
+ "rewards/margins_max": 77.69859313964844,
1163
+ "rewards/margins_min": 8.904541015625,
1164
+ "rewards/margins_std": 31.661495208740234,
1165
+ "rewards/rejected": -49.3195915222168,
1166
+ "step": 580
1167
+ },
1168
+ {
1169
+ "epoch": 1.66,
1170
+ "grad_norm": 3.101951593932612,
1171
+ "learning_rate": 2.4672076976812548e-06,
1172
+ "logits/chosen": -1.310465693473816,
1173
+ "logits/rejected": -0.7587507963180542,
1174
+ "logps/chosen": -1462.403564453125,
1175
+ "logps/rejected": -4328.84130859375,
1176
+ "loss": 0.1237,
1177
+ "rewards/accuracies": 0.949999988079071,
1178
+ "rewards/chosen": -11.766267776489258,
1179
+ "rewards/margins": 28.488006591796875,
1180
+ "rewards/margins_max": 57.8286247253418,
1181
+ "rewards/margins_min": 7.110069274902344,
1182
+ "rewards/margins_std": 22.86618423461914,
1183
+ "rewards/rejected": -40.254276275634766,
1184
+ "step": 590
1185
+ },
1186
+ {
1187
+ "epoch": 1.69,
1188
+ "grad_norm": 0.0,
1189
+ "learning_rate": 2.3852639666982218e-06,
1190
+ "logits/chosen": -1.1802486181259155,
1191
+ "logits/rejected": -0.5148967504501343,
1192
+ "logps/chosen": -1528.84228515625,
1193
+ "logps/rejected": -5615.95703125,
1194
+ "loss": 0.169,
1195
+ "rewards/accuracies": 0.9750000238418579,
1196
+ "rewards/chosen": -12.585975646972656,
1197
+ "rewards/margins": 40.26185989379883,
1198
+ "rewards/margins_max": 87.59795379638672,
1199
+ "rewards/margins_min": 7.060022830963135,
1200
+ "rewards/margins_std": 37.70138931274414,
1201
+ "rewards/rejected": -52.84783935546875,
1202
+ "step": 600
1203
+ },
1204
+ {
1205
+ "epoch": 1.69,
1206
+ "eval_logits/chosen": -1.1529114246368408,
1207
+ "eval_logits/rejected": -1.0369744300842285,
1208
+ "eval_logps/chosen": -1772.8465576171875,
1209
+ "eval_logps/rejected": -2317.167724609375,
1210
+ "eval_loss": 3.7088778018951416,
1211
+ "eval_rewards/accuracies": 0.6159999966621399,
1212
+ "eval_rewards/chosen": -14.979656219482422,
1213
+ "eval_rewards/margins": 5.616815567016602,
1214
+ "eval_rewards/margins_max": 36.04051208496094,
1215
+ "eval_rewards/margins_min": -19.89307975769043,
1216
+ "eval_rewards/margins_std": 18.072784423828125,
1217
+ "eval_rewards/rejected": -20.596471786499023,
1218
+ "eval_runtime": 739.6295,
1219
+ "eval_samples_per_second": 2.704,
1220
+ "eval_steps_per_second": 0.169,
1221
+ "step": 600
1222
+ },
1223
+ {
1224
+ "epoch": 1.72,
1225
+ "grad_norm": 0.41834345626108355,
1226
+ "learning_rate": 2.303443611417584e-06,
1227
+ "logits/chosen": -1.282365083694458,
1228
+ "logits/rejected": -0.7108389139175415,
1229
+ "logps/chosen": -1314.89794921875,
1230
+ "logps/rejected": -4228.2021484375,
1231
+ "loss": 0.0614,
1232
+ "rewards/accuracies": 0.9750000238418579,
1233
+ "rewards/chosen": -10.093293190002441,
1234
+ "rewards/margins": 29.427413940429688,
1235
+ "rewards/margins_max": 62.4293327331543,
1236
+ "rewards/margins_min": 9.642549514770508,
1237
+ "rewards/margins_std": 24.211936950683594,
1238
+ "rewards/rejected": -39.52070617675781,
1239
+ "step": 610
1240
+ },
1241
+ {
1242
+ "epoch": 1.75,
1243
+ "grad_norm": 6.90289983608622e-05,
1244
+ "learning_rate": 2.2218346133000264e-06,
1245
+ "logits/chosen": -1.1936254501342773,
1246
+ "logits/rejected": -0.497224897146225,
1247
+ "logps/chosen": -1679.509033203125,
1248
+ "logps/rejected": -5510.2685546875,
1249
+ "loss": 0.0599,
1250
+ "rewards/accuracies": 1.0,
1251
+ "rewards/chosen": -13.5853853225708,
1252
+ "rewards/margins": 38.249794006347656,
1253
+ "rewards/margins_max": 87.04119873046875,
1254
+ "rewards/margins_min": 6.039667129516602,
1255
+ "rewards/margins_std": 37.55732727050781,
1256
+ "rewards/rejected": -51.835174560546875,
1257
+ "step": 620
1258
+ },
1259
+ {
1260
+ "epoch": 1.77,
1261
+ "grad_norm": 6.551057267217207,
1262
+ "learning_rate": 2.140524726533792e-06,
1263
+ "logits/chosen": -1.1961278915405273,
1264
+ "logits/rejected": -0.6267169117927551,
1265
+ "logps/chosen": -1415.692626953125,
1266
+ "logps/rejected": -5236.87548828125,
1267
+ "loss": 0.0962,
1268
+ "rewards/accuracies": 0.9624999761581421,
1269
+ "rewards/chosen": -10.874711036682129,
1270
+ "rewards/margins": 38.274715423583984,
1271
+ "rewards/margins_max": 77.94049072265625,
1272
+ "rewards/margins_min": 6.714731693267822,
1273
+ "rewards/margins_std": 32.879066467285156,
1274
+ "rewards/rejected": -49.14943313598633,
1275
+ "step": 630
1276
+ },
1277
+ {
1278
+ "epoch": 1.8,
1279
+ "grad_norm": 6.856262195892754,
1280
+ "learning_rate": 2.059601383672566e-06,
1281
+ "logits/chosen": -1.358189344406128,
1282
+ "logits/rejected": -0.7102182507514954,
1283
+ "logps/chosen": -1562.2977294921875,
1284
+ "logps/rejected": -5266.2451171875,
1285
+ "loss": 0.0188,
1286
+ "rewards/accuracies": 1.0,
1287
+ "rewards/chosen": -12.751333236694336,
1288
+ "rewards/margins": 37.24244689941406,
1289
+ "rewards/margins_max": 71.85942077636719,
1290
+ "rewards/margins_min": 10.842636108398438,
1291
+ "rewards/margins_std": 27.61344337463379,
1292
+ "rewards/rejected": -49.99378204345703,
1293
+ "step": 640
1294
+ },
1295
+ {
1296
+ "epoch": 1.83,
1297
+ "grad_norm": 21.756292024970186,
1298
+ "learning_rate": 1.9791516016192214e-06,
1299
+ "logits/chosen": -1.2505046129226685,
1300
+ "logits/rejected": -0.5122033357620239,
1301
+ "logps/chosen": -2004.923095703125,
1302
+ "logps/rejected": -6630.80859375,
1303
+ "loss": 0.0833,
1304
+ "rewards/accuracies": 0.9624999761581421,
1305
+ "rewards/chosen": -16.955032348632812,
1306
+ "rewards/margins": 46.02431869506836,
1307
+ "rewards/margins_max": 99.85273742675781,
1308
+ "rewards/margins_min": 7.133955478668213,
1309
+ "rewards/margins_std": 42.62360382080078,
1310
+ "rewards/rejected": -62.9793586730957,
1311
+ "step": 650
1312
+ },
1313
+ {
1314
+ "epoch": 1.86,
1315
+ "grad_norm": 4.783156669269607,
1316
+ "learning_rate": 1.8992618880565039e-06,
1317
+ "logits/chosen": -1.26731538772583,
1318
+ "logits/rejected": -0.6431769132614136,
1319
+ "logps/chosen": -1309.8038330078125,
1320
+ "logps/rejected": -4725.5908203125,
1321
+ "loss": 0.0275,
1322
+ "rewards/accuracies": 0.987500011920929,
1323
+ "rewards/chosen": -10.83329963684082,
1324
+ "rewards/margins": 33.95838165283203,
1325
+ "rewards/margins_max": 75.65066528320312,
1326
+ "rewards/margins_min": 5.828721046447754,
1327
+ "rewards/margins_std": 32.41167449951172,
1328
+ "rewards/rejected": -44.79167938232422,
1329
+ "step": 660
1330
+ },
1331
+ {
1332
+ "epoch": 1.89,
1333
+ "grad_norm": 5.459662336311476,
1334
+ "learning_rate": 1.8200181484252888e-06,
1335
+ "logits/chosen": -1.3983234167099,
1336
+ "logits/rejected": -0.8143168687820435,
1337
+ "logps/chosen": -1471.9613037109375,
1338
+ "logps/rejected": -4750.08203125,
1339
+ "loss": 0.0401,
1340
+ "rewards/accuracies": 0.9750000238418579,
1341
+ "rewards/chosen": -11.914285659790039,
1342
+ "rewards/margins": 32.79840850830078,
1343
+ "rewards/margins_max": 64.46766662597656,
1344
+ "rewards/margins_min": 8.825395584106445,
1345
+ "rewards/margins_std": 24.9855899810791,
1346
+ "rewards/rejected": -44.71269226074219,
1347
+ "step": 670
1348
+ },
1349
+ {
1350
+ "epoch": 1.92,
1351
+ "grad_norm": 65.44842315709371,
1352
+ "learning_rate": 1.7415055935504234e-06,
1353
+ "logits/chosen": -1.2928615808486938,
1354
+ "logits/rejected": -0.5238109230995178,
1355
+ "logps/chosen": -2183.151123046875,
1356
+ "logps/rejected": -6693.0283203125,
1357
+ "loss": 0.0503,
1358
+ "rewards/accuracies": 0.9624999761581421,
1359
+ "rewards/chosen": -18.94681167602539,
1360
+ "rewards/margins": 45.092262268066406,
1361
+ "rewards/margins_max": 93.51702117919922,
1362
+ "rewards/margins_min": 9.774969100952148,
1363
+ "rewards/margins_std": 37.305381774902344,
1364
+ "rewards/rejected": -64.03907775878906,
1365
+ "step": 680
1366
+ },
1367
+ {
1368
+ "epoch": 1.94,
1369
+ "grad_norm": 1.5650985472245555,
1370
+ "learning_rate": 1.6638086480134954e-06,
1371
+ "logits/chosen": -1.207524061203003,
1372
+ "logits/rejected": -0.47601214051246643,
1373
+ "logps/chosen": -1792.2386474609375,
1374
+ "logps/rejected": -6021.9873046875,
1375
+ "loss": 0.1575,
1376
+ "rewards/accuracies": 0.9375,
1377
+ "rewards/chosen": -15.667875289916992,
1378
+ "rewards/margins": 42.214996337890625,
1379
+ "rewards/margins_max": 87.77252197265625,
1380
+ "rewards/margins_min": 9.487606048583984,
1381
+ "rewards/margins_std": 37.55704879760742,
1382
+ "rewards/rejected": -57.88287353515625,
1383
+ "step": 690
1384
+ },
1385
+ {
1386
+ "epoch": 1.97,
1387
+ "grad_norm": 19.45271789860168,
1388
+ "learning_rate": 1.5870108593710473e-06,
1389
+ "logits/chosen": -1.3592132329940796,
1390
+ "logits/rejected": -0.6936360001564026,
1391
+ "logps/chosen": -1344.2899169921875,
1392
+ "logps/rejected": -4588.08251953125,
1393
+ "loss": 0.0661,
1394
+ "rewards/accuracies": 0.9750000238418579,
1395
+ "rewards/chosen": -10.740570068359375,
1396
+ "rewards/margins": 32.726192474365234,
1397
+ "rewards/margins_max": 63.50341033935547,
1398
+ "rewards/margins_min": 8.110437393188477,
1399
+ "rewards/margins_std": 25.54091453552246,
1400
+ "rewards/rejected": -43.466758728027344,
1401
+ "step": 700
1402
+ },
1403
+ {
1404
+ "epoch": 1.97,
1405
+ "eval_logits/chosen": -1.2633495330810547,
1406
+ "eval_logits/rejected": -1.1163122653961182,
1407
+ "eval_logps/chosen": -1868.0721435546875,
1408
+ "eval_logps/rejected": -2522.087890625,
1409
+ "eval_loss": 4.195674896240234,
1410
+ "eval_rewards/accuracies": 0.621999979019165,
1411
+ "eval_rewards/chosen": -15.931914329528809,
1412
+ "eval_rewards/margins": 6.713759899139404,
1413
+ "eval_rewards/margins_max": 41.90719223022461,
1414
+ "eval_rewards/margins_min": -22.690624237060547,
1415
+ "eval_rewards/margins_std": 20.96088218688965,
1416
+ "eval_rewards/rejected": -22.645673751831055,
1417
+ "eval_runtime": 738.3261,
1418
+ "eval_samples_per_second": 2.709,
1419
+ "eval_steps_per_second": 0.169,
1420
+ "step": 700
1421
+ },
1422
+ {
1423
+ "epoch": 2.0,
1424
+ "grad_norm": 57.39478740350686,
1425
+ "learning_rate": 1.511194808315853e-06,
1426
+ "logits/chosen": -1.3517937660217285,
1427
+ "logits/rejected": -0.7227485775947571,
1428
+ "logps/chosen": -1058.769775390625,
1429
+ "logps/rejected": -4626.90625,
1430
+ "loss": 0.0338,
1431
+ "rewards/accuracies": 0.9750000238418579,
1432
+ "rewards/chosen": -8.303699493408203,
1433
+ "rewards/margins": 35.37030792236328,
1434
+ "rewards/margins_max": 70.94367218017578,
1435
+ "rewards/margins_min": 11.402629852294922,
1436
+ "rewards/margins_std": 27.145639419555664,
1437
+ "rewards/rejected": -43.67401123046875,
1438
+ "step": 710
1439
+ },
1440
+ {
1441
+ "epoch": 2.03,
1442
+ "grad_norm": 0.30572900295523964,
1443
+ "learning_rate": 1.4364420198778662e-06,
1444
+ "logits/chosen": -1.4744371175765991,
1445
+ "logits/rejected": -0.6496419906616211,
1446
+ "logps/chosen": -1649.629638671875,
1447
+ "logps/rejected": -6396.2255859375,
1448
+ "loss": 0.0131,
1449
+ "rewards/accuracies": 1.0,
1450
+ "rewards/chosen": -13.1373929977417,
1451
+ "rewards/margins": 47.363670349121094,
1452
+ "rewards/margins_max": 104.21504211425781,
1453
+ "rewards/margins_min": 10.33763313293457,
1454
+ "rewards/margins_std": 43.073524475097656,
1455
+ "rewards/rejected": -60.501068115234375,
1456
+ "step": 720
1457
+ },
1458
+ {
1459
+ "epoch": 2.06,
1460
+ "grad_norm": 4.84455677819935,
1461
+ "learning_rate": 1.3628328757603243e-06,
1462
+ "logits/chosen": -1.1539907455444336,
1463
+ "logits/rejected": -0.274676114320755,
1464
+ "logps/chosen": -2860.142578125,
1465
+ "logps/rejected": -8397.5234375,
1466
+ "loss": 0.0368,
1467
+ "rewards/accuracies": 0.9750000238418579,
1468
+ "rewards/chosen": -25.513622283935547,
1469
+ "rewards/margins": 55.286590576171875,
1470
+ "rewards/margins_max": 105.07356262207031,
1471
+ "rewards/margins_min": 14.340426445007324,
1472
+ "rewards/margins_std": 41.029579162597656,
1473
+ "rewards/rejected": -80.80020904541016,
1474
+ "step": 730
1475
+ },
1476
+ {
1477
+ "epoch": 2.08,
1478
+ "grad_norm": 0.0021241478585518002,
1479
+ "learning_rate": 1.2904465279052725e-06,
1480
+ "logits/chosen": -1.1582523584365845,
1481
+ "logits/rejected": -0.3782978653907776,
1482
+ "logps/chosen": -2420.54833984375,
1483
+ "logps/rejected": -7015.25,
1484
+ "loss": 0.0249,
1485
+ "rewards/accuracies": 0.987500011920929,
1486
+ "rewards/chosen": -21.189661026000977,
1487
+ "rewards/margins": 45.897850036621094,
1488
+ "rewards/margins_max": 94.14543914794922,
1489
+ "rewards/margins_min": 7.491143703460693,
1490
+ "rewards/margins_std": 39.1116828918457,
1491
+ "rewards/rejected": -67.08751678466797,
1492
+ "step": 740
1493
+ },
1494
+ {
1495
+ "epoch": 2.11,
1496
+ "grad_norm": 0.031617935973131554,
1497
+ "learning_rate": 1.219360813381446e-06,
1498
+ "logits/chosen": -1.1664245128631592,
1499
+ "logits/rejected": -0.344910204410553,
1500
+ "logps/chosen": -1666.4990234375,
1501
+ "logps/rejected": -6462.9248046875,
1502
+ "loss": 0.005,
1503
+ "rewards/accuracies": 1.0,
1504
+ "rewards/chosen": -14.041155815124512,
1505
+ "rewards/margins": 47.90277862548828,
1506
+ "rewards/margins_max": 93.52235412597656,
1507
+ "rewards/margins_min": 12.527769088745117,
1508
+ "rewards/margins_std": 36.832435607910156,
1509
+ "rewards/rejected": -61.943939208984375,
1510
+ "step": 750
1511
+ },
1512
+ {
1513
+ "epoch": 2.14,
1514
+ "grad_norm": 0.00013460482103239127,
1515
+ "learning_rate": 1.1496521706860392e-06,
1516
+ "logits/chosen": -1.1053615808486938,
1517
+ "logits/rejected": -0.3047180771827698,
1518
+ "logps/chosen": -1850.0924072265625,
1519
+ "logps/rejected": -6907.0830078125,
1520
+ "loss": 0.0315,
1521
+ "rewards/accuracies": 1.0,
1522
+ "rewards/chosen": -15.86078929901123,
1523
+ "rewards/margins": 50.24542999267578,
1524
+ "rewards/margins_max": 114.74400329589844,
1525
+ "rewards/margins_min": 12.046091079711914,
1526
+ "rewards/margins_std": 46.68362808227539,
1527
+ "rewards/rejected": -66.1062240600586,
1528
+ "step": 760
1529
+ },
1530
+ {
1531
+ "epoch": 2.17,
1532
+ "grad_norm": 0.0,
1533
+ "learning_rate": 1.0813955575503588e-06,
1534
+ "logits/chosen": -1.2353287935256958,
1535
+ "logits/rejected": -0.49261850118637085,
1536
+ "logps/chosen": -1610.0069580078125,
1537
+ "logps/rejected": -6046.82470703125,
1538
+ "loss": 0.0287,
1539
+ "rewards/accuracies": 0.987500011920929,
1540
+ "rewards/chosen": -13.354846000671387,
1541
+ "rewards/margins": 44.665653228759766,
1542
+ "rewards/margins_max": 94.2379379272461,
1543
+ "rewards/margins_min": 14.147542953491211,
1544
+ "rewards/margins_std": 37.06581115722656,
1545
+ "rewards/rejected": -58.02050018310547,
1546
+ "step": 770
1547
+ },
1548
+ {
1549
+ "epoch": 2.2,
1550
+ "grad_norm": 0.22671226780074974,
1551
+ "learning_rate": 1.0146643703377488e-06,
1552
+ "logits/chosen": -1.314743161201477,
1553
+ "logits/rejected": -0.6268264055252075,
1554
+ "logps/chosen": -1836.8834228515625,
1555
+ "logps/rejected": -5721.7490234375,
1556
+ "loss": 0.0229,
1557
+ "rewards/accuracies": 1.0,
1558
+ "rewards/chosen": -15.416688919067383,
1559
+ "rewards/margins": 39.22182083129883,
1560
+ "rewards/margins_max": 87.73088073730469,
1561
+ "rewards/margins_min": 7.594358921051025,
1562
+ "rewards/margins_std": 37.2376708984375,
1563
+ "rewards/rejected": -54.63850784301758,
1564
+ "step": 780
1565
+ },
1566
+ {
1567
+ "epoch": 2.23,
1568
+ "grad_norm": 0.14863704144029602,
1569
+ "learning_rate": 9.495303651204496e-07,
1570
+ "logits/chosen": -1.2842717170715332,
1571
+ "logits/rejected": -0.5629429817199707,
1572
+ "logps/chosen": -1538.7266845703125,
1573
+ "logps/rejected": -5633.85009765625,
1574
+ "loss": 0.0131,
1575
+ "rewards/accuracies": 0.987500011920929,
1576
+ "rewards/chosen": -12.397846221923828,
1577
+ "rewards/margins": 41.08926010131836,
1578
+ "rewards/margins_max": 78.1208724975586,
1579
+ "rewards/margins_min": 11.591024398803711,
1580
+ "rewards/margins_std": 29.9827880859375,
1581
+ "rewards/rejected": -53.48711013793945,
1582
+ "step": 790
1583
+ },
1584
+ {
1585
+ "epoch": 2.25,
1586
+ "grad_norm": 0.030764744246518656,
1587
+ "learning_rate": 8.860635805202616e-07,
1588
+ "logits/chosen": -1.3080496788024902,
1589
+ "logits/rejected": -0.35764509439468384,
1590
+ "logps/chosen": -1737.4599609375,
1591
+ "logps/rejected": -7442.0615234375,
1592
+ "loss": 0.0044,
1593
+ "rewards/accuracies": 1.0,
1594
+ "rewards/chosen": -14.408218383789062,
1595
+ "rewards/margins": 56.461143493652344,
1596
+ "rewards/margins_max": 120.32511901855469,
1597
+ "rewards/margins_min": 16.574840545654297,
1598
+ "rewards/margins_std": 48.085899353027344,
1599
+ "rewards/rejected": -70.8693618774414,
1600
+ "step": 800
1601
+ },
1602
+ {
1603
+ "epoch": 2.25,
1604
+ "eval_logits/chosen": -1.093600869178772,
1605
+ "eval_logits/rejected": -0.9370656609535217,
1606
+ "eval_logps/chosen": -2551.046142578125,
1607
+ "eval_logps/rejected": -3403.35693359375,
1608
+ "eval_loss": 5.91084098815918,
1609
+ "eval_rewards/accuracies": 0.6230000257492065,
1610
+ "eval_rewards/chosen": -22.761653900146484,
1611
+ "eval_rewards/margins": 8.696711540222168,
1612
+ "eval_rewards/margins_max": 56.638038635253906,
1613
+ "eval_rewards/margins_min": -31.93355941772461,
1614
+ "eval_rewards/margins_std": 28.60364532470703,
1615
+ "eval_rewards/rejected": -31.45836639404297,
1616
+ "eval_runtime": 738.0909,
1617
+ "eval_samples_per_second": 2.71,
1618
+ "eval_steps_per_second": 0.169,
1619
+ "step": 800
1620
+ },
1621
+ {
1622
+ "epoch": 2.28,
1623
+ "grad_norm": 2.1784962232415586,
1624
+ "learning_rate": 8.24332262395994e-07,
1625
+ "logits/chosen": -1.2908947467803955,
1626
+ "logits/rejected": -0.5336360931396484,
1627
+ "logps/chosen": -1974.3099365234375,
1628
+ "logps/rejected": -6812.33740234375,
1629
+ "loss": 0.014,
1630
+ "rewards/accuracies": 1.0,
1631
+ "rewards/chosen": -16.995155334472656,
1632
+ "rewards/margins": 48.02867889404297,
1633
+ "rewards/margins_max": 103.1453628540039,
1634
+ "rewards/margins_min": 12.88886833190918,
1635
+ "rewards/margins_std": 41.732765197753906,
1636
+ "rewards/rejected": -65.02383422851562,
1637
+ "step": 810
1638
+ },
1639
+ {
1640
+ "epoch": 2.31,
1641
+ "grad_norm": 0.06615521863125323,
1642
+ "learning_rate": 7.644027904586587e-07,
1643
+ "logits/chosen": -1.2683175802230835,
1644
+ "logits/rejected": -0.41825681924819946,
1645
+ "logps/chosen": -1580.210693359375,
1646
+ "logps/rejected": -7023.9873046875,
1647
+ "loss": 0.0061,
1648
+ "rewards/accuracies": 1.0,
1649
+ "rewards/chosen": -12.912153244018555,
1650
+ "rewards/margins": 54.1514778137207,
1651
+ "rewards/margins_max": 105.11991882324219,
1652
+ "rewards/margins_min": 14.542474746704102,
1653
+ "rewards/margins_std": 42.142704010009766,
1654
+ "rewards/rejected": -67.06363677978516,
1655
+ "step": 820
1656
+ },
1657
+ {
1658
+ "epoch": 2.34,
1659
+ "grad_norm": 5.130329059658452,
1660
+ "learning_rate": 7.06339606893347e-07,
1661
+ "logits/chosen": -1.0899183750152588,
1662
+ "logits/rejected": -0.23333916068077087,
1663
+ "logps/chosen": -2606.93798828125,
1664
+ "logps/rejected": -8327.251953125,
1665
+ "loss": 0.0214,
1666
+ "rewards/accuracies": 0.987500011920929,
1667
+ "rewards/chosen": -22.539045333862305,
1668
+ "rewards/margins": 57.628456115722656,
1669
+ "rewards/margins_max": 118.33135986328125,
1670
+ "rewards/margins_min": 17.841205596923828,
1671
+ "rewards/margins_std": 45.653324127197266,
1672
+ "rewards/rejected": -80.16749572753906,
1673
+ "step": 830
1674
+ },
1675
+ {
1676
+ "epoch": 2.37,
1677
+ "grad_norm": 0.9977784813046777,
1678
+ "learning_rate": 6.502051470645149e-07,
1679
+ "logits/chosen": -1.2162657976150513,
1680
+ "logits/rejected": -0.460705429315567,
1681
+ "logps/chosen": -2124.04736328125,
1682
+ "logps/rejected": -6999.44921875,
1683
+ "loss": 0.0297,
1684
+ "rewards/accuracies": 0.987500011920929,
1685
+ "rewards/chosen": -18.209606170654297,
1686
+ "rewards/margins": 48.93000793457031,
1687
+ "rewards/margins_max": 95.8235855102539,
1688
+ "rewards/margins_min": 14.549077987670898,
1689
+ "rewards/margins_std": 37.2866096496582,
1690
+ "rewards/rejected": -67.13961029052734,
1691
+ "step": 840
1692
+ },
1693
+ {
1694
+ "epoch": 2.39,
1695
+ "grad_norm": 0.6712946936221619,
1696
+ "learning_rate": 5.960597723792194e-07,
1697
+ "logits/chosen": -1.2636550664901733,
1698
+ "logits/rejected": -0.2811248004436493,
1699
+ "logps/chosen": -1696.5123291015625,
1700
+ "logps/rejected": -7235.36328125,
1701
+ "loss": 0.0267,
1702
+ "rewards/accuracies": 0.987500011920929,
1703
+ "rewards/chosen": -14.254020690917969,
1704
+ "rewards/margins": 55.350074768066406,
1705
+ "rewards/margins_max": 116.00260162353516,
1706
+ "rewards/margins_min": 13.50024700164795,
1707
+ "rewards/margins_std": 47.108055114746094,
1708
+ "rewards/rejected": -69.60409545898438,
1709
+ "step": 850
1710
+ },
1711
+ {
1712
+ "epoch": 2.42,
1713
+ "grad_norm": 2.1640840935576553,
1714
+ "learning_rate": 5.43961705380465e-07,
1715
+ "logits/chosen": -1.2832921743392944,
1716
+ "logits/rejected": -0.3425557315349579,
1717
+ "logps/chosen": -2057.831787109375,
1718
+ "logps/rejected": -7537.8369140625,
1719
+ "loss": 0.0145,
1720
+ "rewards/accuracies": 1.0,
1721
+ "rewards/chosen": -17.41219139099121,
1722
+ "rewards/margins": 55.01224899291992,
1723
+ "rewards/margins_max": 117.5909194946289,
1724
+ "rewards/margins_min": 12.145246505737305,
1725
+ "rewards/margins_std": 48.41987228393555,
1726
+ "rewards/rejected": -72.42444610595703,
1727
+ "step": 860
1728
+ },
1729
+ {
1730
+ "epoch": 2.45,
1731
+ "grad_norm": 85.12949407648767,
1732
+ "learning_rate": 4.939669671404871e-07,
1733
+ "logits/chosen": -1.2509949207305908,
1734
+ "logits/rejected": -0.4620714783668518,
1735
+ "logps/chosen": -1721.5452880859375,
1736
+ "logps/rejected": -6441.6572265625,
1737
+ "loss": 0.0141,
1738
+ "rewards/accuracies": 0.987500011920929,
1739
+ "rewards/chosen": -14.360589981079102,
1740
+ "rewards/margins": 46.867820739746094,
1741
+ "rewards/margins_max": 93.87699890136719,
1742
+ "rewards/margins_min": 13.444913864135742,
1743
+ "rewards/margins_std": 36.849510192871094,
1744
+ "rewards/rejected": -61.228416442871094,
1745
+ "step": 870
1746
+ },
1747
+ {
1748
+ "epoch": 2.48,
1749
+ "grad_norm": 0.0,
1750
+ "learning_rate": 4.461293170212644e-07,
1751
+ "logits/chosen": -1.229883074760437,
1752
+ "logits/rejected": -0.33345091342926025,
1753
+ "logps/chosen": -2521.58740234375,
1754
+ "logps/rejected": -7449.078125,
1755
+ "loss": 0.0109,
1756
+ "rewards/accuracies": 1.0,
1757
+ "rewards/chosen": -22.13032341003418,
1758
+ "rewards/margins": 49.36927032470703,
1759
+ "rewards/margins_max": 106.86543273925781,
1760
+ "rewards/margins_min": 12.414986610412598,
1761
+ "rewards/margins_std": 43.098392486572266,
1762
+ "rewards/rejected": -71.49959564208984,
1763
+ "step": 880
1764
+ },
1765
+ {
1766
+ "epoch": 2.51,
1767
+ "grad_norm": 1.1810182556014033,
1768
+ "learning_rate": 4.005001948670606e-07,
1769
+ "logits/chosen": -1.272280216217041,
1770
+ "logits/rejected": -0.45221084356307983,
1771
+ "logps/chosen": -2259.484130859375,
1772
+ "logps/rejected": -7555.74365234375,
1773
+ "loss": 0.0132,
1774
+ "rewards/accuracies": 1.0,
1775
+ "rewards/chosen": -19.396902084350586,
1776
+ "rewards/margins": 53.13331985473633,
1777
+ "rewards/margins_max": 103.8553237915039,
1778
+ "rewards/margins_min": 17.713207244873047,
1779
+ "rewards/margins_std": 38.8509407043457,
1780
+ "rewards/rejected": -72.53022003173828,
1781
+ "step": 890
1782
+ },
1783
+ {
1784
+ "epoch": 2.54,
1785
+ "grad_norm": 0.35529843117163307,
1786
+ "learning_rate": 3.571286656911377e-07,
1787
+ "logits/chosen": -1.2354185581207275,
1788
+ "logits/rejected": -0.25159841775894165,
1789
+ "logps/chosen": -2037.9954833984375,
1790
+ "logps/rejected": -7491.6142578125,
1791
+ "loss": 0.011,
1792
+ "rewards/accuracies": 1.0,
1793
+ "rewards/chosen": -17.313739776611328,
1794
+ "rewards/margins": 54.32684326171875,
1795
+ "rewards/margins_max": 107.57340240478516,
1796
+ "rewards/margins_min": 10.841619491577148,
1797
+ "rewards/margins_std": 45.87627029418945,
1798
+ "rewards/rejected": -71.64057922363281,
1799
+ "step": 900
1800
+ },
1801
+ {
1802
+ "epoch": 2.54,
1803
+ "eval_logits/chosen": -1.0846086740493774,
1804
+ "eval_logits/rejected": -0.9208425283432007,
1805
+ "eval_logps/chosen": -2583.26708984375,
1806
+ "eval_logps/rejected": -3463.187255859375,
1807
+ "eval_loss": 5.921300888061523,
1808
+ "eval_rewards/accuracies": 0.6230000257492065,
1809
+ "eval_rewards/chosen": -23.083864212036133,
1810
+ "eval_rewards/margins": 8.97280502319336,
1811
+ "eval_rewards/margins_max": 56.95476531982422,
1812
+ "eval_rewards/margins_min": -32.0980110168457,
1813
+ "eval_rewards/margins_std": 28.85976219177246,
1814
+ "eval_rewards/rejected": -32.05666732788086,
1815
+ "eval_runtime": 738.68,
1816
+ "eval_samples_per_second": 2.708,
1817
+ "eval_steps_per_second": 0.169,
1818
+ "step": 900
1819
+ },
1820
+ {
1821
+ "epoch": 2.56,
1822
+ "grad_norm": 0.10081394980985045,
1823
+ "learning_rate": 3.1606136691612555e-07,
1824
+ "logits/chosen": -1.226280689239502,
1825
+ "logits/rejected": -0.4225079119205475,
1826
+ "logps/chosen": -1866.7056884765625,
1827
+ "logps/rejected": -6545.6845703125,
1828
+ "loss": 0.0179,
1829
+ "rewards/accuracies": 1.0,
1830
+ "rewards/chosen": -15.806465148925781,
1831
+ "rewards/margins": 47.003273010253906,
1832
+ "rewards/margins_max": 105.39058685302734,
1833
+ "rewards/margins_min": 13.083778381347656,
1834
+ "rewards/margins_std": 42.84505081176758,
1835
+ "rewards/rejected": -62.80973434448242,
1836
+ "step": 910
1837
+ },
1838
+ {
1839
+ "epoch": 2.59,
1840
+ "grad_norm": 1.3123811940509262,
1841
+ "learning_rate": 2.773424582247844e-07,
1842
+ "logits/chosen": -1.2208049297332764,
1843
+ "logits/rejected": -0.3649698495864868,
1844
+ "logps/chosen": -1686.8743896484375,
1845
+ "logps/rejected": -6142.4833984375,
1846
+ "loss": 0.008,
1847
+ "rewards/accuracies": 1.0,
1848
+ "rewards/chosen": -14.011746406555176,
1849
+ "rewards/margins": 44.98779296875,
1850
+ "rewards/margins_max": 102.88818359375,
1851
+ "rewards/margins_min": 11.078018188476562,
1852
+ "rewards/margins_std": 43.19032669067383,
1853
+ "rewards/rejected": -58.999542236328125,
1854
+ "step": 920
1855
+ },
1856
+ {
1857
+ "epoch": 2.62,
1858
+ "grad_norm": 0.21161148338166058,
1859
+ "learning_rate": 2.410135740750821e-07,
1860
+ "logits/chosen": -1.2054741382598877,
1861
+ "logits/rejected": -0.38507014513015747,
1862
+ "logps/chosen": -2171.072021484375,
1863
+ "logps/rejected": -6896.95458984375,
1864
+ "loss": 0.0157,
1865
+ "rewards/accuracies": 1.0,
1866
+ "rewards/chosen": -19.08205795288086,
1867
+ "rewards/margins": 47.25795364379883,
1868
+ "rewards/margins_max": 104.60585021972656,
1869
+ "rewards/margins_min": 10.657424926757812,
1870
+ "rewards/margins_std": 42.776161193847656,
1871
+ "rewards/rejected": -66.34001922607422,
1872
+ "step": 930
1873
+ },
1874
+ {
1875
+ "epoch": 2.65,
1876
+ "grad_norm": 0.3466449666433785,
1877
+ "learning_rate": 2.0711377893064182e-07,
1878
+ "logits/chosen": -1.23805832862854,
1879
+ "logits/rejected": -0.43131136894226074,
1880
+ "logps/chosen": -2135.45068359375,
1881
+ "logps/rejected": -7015.8134765625,
1882
+ "loss": 0.014,
1883
+ "rewards/accuracies": 0.987500011920929,
1884
+ "rewards/chosen": -18.11324691772461,
1885
+ "rewards/margins": 48.709083557128906,
1886
+ "rewards/margins_max": 110.52925872802734,
1887
+ "rewards/margins_min": 12.27102279663086,
1888
+ "rewards/margins_std": 45.295265197753906,
1889
+ "rewards/rejected": -66.82232666015625,
1890
+ "step": 940
1891
+ },
1892
+ {
1893
+ "epoch": 2.68,
1894
+ "grad_norm": 2.4378946425006456,
1895
+ "learning_rate": 1.756795252547111e-07,
1896
+ "logits/chosen": -1.2234621047973633,
1897
+ "logits/rejected": -0.623192310333252,
1898
+ "logps/chosen": -1561.3631591796875,
1899
+ "logps/rejected": -4726.6142578125,
1900
+ "loss": 0.0703,
1901
+ "rewards/accuracies": 0.9750000238418579,
1902
+ "rewards/chosen": -13.122468948364258,
1903
+ "rewards/margins": 31.732192993164062,
1904
+ "rewards/margins_max": 68.03765869140625,
1905
+ "rewards/margins_min": 5.950772285461426,
1906
+ "rewards/margins_std": 28.550399780273438,
1907
+ "rewards/rejected": -44.85466384887695,
1908
+ "step": 950
1909
+ },
1910
+ {
1911
+ "epoch": 2.7,
1912
+ "grad_norm": 0.0,
1913
+ "learning_rate": 1.4674461431281013e-07,
1914
+ "logits/chosen": -1.3227882385253906,
1915
+ "logits/rejected": -0.6799469590187073,
1916
+ "logps/chosen": -1683.427490234375,
1917
+ "logps/rejected": -5307.63525390625,
1918
+ "loss": 0.0084,
1919
+ "rewards/accuracies": 1.0,
1920
+ "rewards/chosen": -14.210386276245117,
1921
+ "rewards/margins": 36.34208679199219,
1922
+ "rewards/margins_max": 72.8411636352539,
1923
+ "rewards/margins_min": 9.163644790649414,
1924
+ "rewards/margins_std": 29.5941104888916,
1925
+ "rewards/rejected": -50.5524787902832,
1926
+ "step": 960
1927
+ },
1928
+ {
1929
+ "epoch": 2.73,
1930
+ "grad_norm": 0.0577551737134431,
1931
+ "learning_rate": 1.2034015982622243e-07,
1932
+ "logits/chosen": -1.357311487197876,
1933
+ "logits/rejected": -0.47240549325942993,
1934
+ "logps/chosen": -1767.5843505859375,
1935
+ "logps/rejected": -6352.15283203125,
1936
+ "loss": 0.0052,
1937
+ "rewards/accuracies": 1.0,
1938
+ "rewards/chosen": -14.427714347839355,
1939
+ "rewards/margins": 45.86684799194336,
1940
+ "rewards/margins_max": 90.58439636230469,
1941
+ "rewards/margins_min": 9.973871231079102,
1942
+ "rewards/margins_std": 37.57789611816406,
1943
+ "rewards/rejected": -60.29457473754883,
1944
+ "step": 970
1945
+ },
1946
+ {
1947
+ "epoch": 2.76,
1948
+ "grad_norm": 0.22029503083466712,
1949
+ "learning_rate": 9.649455451539419e-08,
1950
+ "logits/chosen": -1.2634292840957642,
1951
+ "logits/rejected": -0.45305362343788147,
1952
+ "logps/chosen": -1745.5113525390625,
1953
+ "logps/rejected": -5825.91943359375,
1954
+ "loss": 0.0138,
1955
+ "rewards/accuracies": 1.0,
1956
+ "rewards/chosen": -14.848994255065918,
1957
+ "rewards/margins": 40.82499694824219,
1958
+ "rewards/margins_max": 80.49317932128906,
1959
+ "rewards/margins_min": 10.800613403320312,
1960
+ "rewards/margins_std": 31.47903060913086,
1961
+ "rewards/rejected": -55.67399215698242,
1962
+ "step": 980
1963
+ },
1964
+ {
1965
+ "epoch": 2.79,
1966
+ "grad_norm": 0.09652421472407109,
1967
+ "learning_rate": 7.523343956923196e-08,
1968
+ "logits/chosen": -1.3216396570205688,
1969
+ "logits/rejected": -0.4804636836051941,
1970
+ "logps/chosen": -1506.0504150390625,
1971
+ "logps/rejected": -6089.37158203125,
1972
+ "loss": 0.0055,
1973
+ "rewards/accuracies": 1.0,
1974
+ "rewards/chosen": -12.122316360473633,
1975
+ "rewards/margins": 45.81190490722656,
1976
+ "rewards/margins_max": 100.2818603515625,
1977
+ "rewards/margins_min": 9.738183975219727,
1978
+ "rewards/margins_std": 41.03260040283203,
1979
+ "rewards/rejected": -57.93422317504883,
1980
+ "step": 990
1981
+ },
1982
+ {
1983
+ "epoch": 2.82,
1984
+ "grad_norm": 0.37399026907919497,
1985
+ "learning_rate": 5.657967707312195e-08,
1986
+ "logits/chosen": -1.2263704538345337,
1987
+ "logits/rejected": -0.4276729226112366,
1988
+ "logps/chosen": -1708.02734375,
1989
+ "logps/rejected": -6663.94140625,
1990
+ "loss": 0.0138,
1991
+ "rewards/accuracies": 1.0,
1992
+ "rewards/chosen": -14.4015474319458,
1993
+ "rewards/margins": 49.11872482299805,
1994
+ "rewards/margins_max": 102.17176818847656,
1995
+ "rewards/margins_min": 15.611490249633789,
1996
+ "rewards/margins_std": 40.248619079589844,
1997
+ "rewards/rejected": -63.52027130126953,
1998
+ "step": 1000
1999
+ },
2000
+ {
2001
+ "epoch": 2.82,
2002
+ "eval_logits/chosen": -1.0810388326644897,
2003
+ "eval_logits/rejected": -0.9159793257713318,
2004
+ "eval_logps/chosen": -2609.25732421875,
2005
+ "eval_logps/rejected": -3499.874267578125,
2006
+ "eval_loss": 6.058404922485352,
2007
+ "eval_rewards/accuracies": 0.628000020980835,
2008
+ "eval_rewards/chosen": -23.343765258789062,
2009
+ "eval_rewards/margins": 9.0797700881958,
2010
+ "eval_rewards/margins_max": 58.32236862182617,
2011
+ "eval_rewards/margins_min": -32.86642837524414,
2012
+ "eval_rewards/margins_std": 29.538137435913086,
2013
+ "eval_rewards/rejected": -32.42353439331055,
2014
+ "eval_runtime": 739.0567,
2015
+ "eval_samples_per_second": 2.706,
2016
+ "eval_steps_per_second": 0.169,
2017
+ "step": 1000
2018
+ },
2019
+ {
2020
+ "epoch": 2.85,
2021
+ "grad_norm": 2.8531847376173625,
2022
+ "learning_rate": 4.055332542531959e-08,
2023
+ "logits/chosen": -1.26244056224823,
2024
+ "logits/rejected": -0.42530936002731323,
2025
+ "logps/chosen": -2178.673828125,
2026
+ "logps/rejected": -7108.33203125,
2027
+ "loss": 0.0333,
2028
+ "rewards/accuracies": 0.987500011920929,
2029
+ "rewards/chosen": -18.750629425048828,
2030
+ "rewards/margins": 48.864402770996094,
2031
+ "rewards/margins_max": 102.9088363647461,
2032
+ "rewards/margins_min": 12.413164138793945,
2033
+ "rewards/margins_std": 41.09243392944336,
2034
+ "rewards/rejected": -67.61502838134766,
2035
+ "step": 1010
2036
+ },
2037
+ {
2038
+ "epoch": 2.87,
2039
+ "grad_norm": 0.02965677453231411,
2040
+ "learning_rate": 2.7171617768147472e-08,
2041
+ "logits/chosen": -1.3160995244979858,
2042
+ "logits/rejected": -0.41138404607772827,
2043
+ "logps/chosen": -1946.414794921875,
2044
+ "logps/rejected": -6739.81494140625,
2045
+ "loss": 0.0229,
2046
+ "rewards/accuracies": 0.9750000238418579,
2047
+ "rewards/chosen": -16.537607192993164,
2048
+ "rewards/margins": 47.81861114501953,
2049
+ "rewards/margins_max": 101.61311340332031,
2050
+ "rewards/margins_min": 11.498300552368164,
2051
+ "rewards/margins_std": 40.96593475341797,
2052
+ "rewards/rejected": -64.35621643066406,
2053
+ "step": 1020
2054
+ },
2055
+ {
2056
+ "epoch": 2.9,
2057
+ "grad_norm": 0.22573854808180674,
2058
+ "learning_rate": 1.6448943457189616e-08,
2059
+ "logits/chosen": -1.2531920671463013,
2060
+ "logits/rejected": -0.3577966094017029,
2061
+ "logps/chosen": -1684.554443359375,
2062
+ "logps/rejected": -7091.48828125,
2063
+ "loss": 0.0258,
2064
+ "rewards/accuracies": 0.987500011920929,
2065
+ "rewards/chosen": -13.979682922363281,
2066
+ "rewards/margins": 53.9659538269043,
2067
+ "rewards/margins_max": 115.4454345703125,
2068
+ "rewards/margins_min": 13.953027725219727,
2069
+ "rewards/margins_std": 46.97876739501953,
2070
+ "rewards/rejected": -67.94563293457031,
2071
+ "step": 1030
2072
+ },
2073
+ {
2074
+ "epoch": 2.93,
2075
+ "grad_norm": 5.432210871045947,
2076
+ "learning_rate": 8.39683258841123e-09,
2077
+ "logits/chosen": -1.1665958166122437,
2078
+ "logits/rejected": -0.5134680867195129,
2079
+ "logps/chosen": -1703.557373046875,
2080
+ "logps/rejected": -5138.3486328125,
2081
+ "loss": 0.0249,
2082
+ "rewards/accuracies": 0.9750000238418579,
2083
+ "rewards/chosen": -14.348538398742676,
2084
+ "rewards/margins": 34.34075164794922,
2085
+ "rewards/margins_max": 77.13099670410156,
2086
+ "rewards/margins_min": 9.077662467956543,
2087
+ "rewards/margins_std": 31.803686141967773,
2088
+ "rewards/rejected": -48.68928909301758,
2089
+ "step": 1040
2090
+ },
2091
+ {
2092
+ "epoch": 2.96,
2093
+ "grad_norm": 0.11937178792801269,
2094
+ "learning_rate": 3.0239435998430376e-09,
2095
+ "logits/chosen": -1.2731530666351318,
2096
+ "logits/rejected": -0.38636043667793274,
2097
+ "logps/chosen": -1771.852783203125,
2098
+ "logps/rejected": -6674.75390625,
2099
+ "loss": 0.0232,
2100
+ "rewards/accuracies": 1.0,
2101
+ "rewards/chosen": -14.935644149780273,
2102
+ "rewards/margins": 49.16513442993164,
2103
+ "rewards/margins_max": 97.74002838134766,
2104
+ "rewards/margins_min": 9.202229499816895,
2105
+ "rewards/margins_std": 40.83153533935547,
2106
+ "rewards/rejected": -64.10078430175781,
2107
+ "step": 1050
2108
+ },
2109
+ {
2110
+ "epoch": 2.99,
2111
+ "grad_norm": 0.044483769560748045,
2112
+ "learning_rate": 3.3605396115826695e-10,
2113
+ "logits/chosen": -1.2338615655899048,
2114
+ "logits/rejected": -0.40085142850875854,
2115
+ "logps/chosen": -1394.041748046875,
2116
+ "logps/rejected": -6284.47021484375,
2117
+ "loss": 0.0075,
2118
+ "rewards/accuracies": 1.0,
2119
+ "rewards/chosen": -11.409103393554688,
2120
+ "rewards/margins": 48.55421447753906,
2121
+ "rewards/margins_max": 100.22550964355469,
2122
+ "rewards/margins_min": 13.580920219421387,
2123
+ "rewards/margins_std": 40.853858947753906,
2124
+ "rewards/rejected": -59.96331787109375,
2125
+ "step": 1060
2126
+ },
2127
+ {
2128
+ "epoch": 3.0,
2129
+ "step": 1065,
2130
+ "total_flos": 0.0,
2131
+ "train_loss": 0.1413031851636692,
2132
+ "train_runtime": 20921.2576,
2133
+ "train_samples_per_second": 0.814,
2134
+ "train_steps_per_second": 0.051
2135
+ }
2136
+ ],
2137
+ "logging_steps": 10,
2138
+ "max_steps": 1065,
2139
+ "num_input_tokens_seen": 0,
2140
+ "num_train_epochs": 3,
2141
+ "save_steps": 100,
2142
+ "total_flos": 0.0,
2143
+ "train_batch_size": 4,
2144
+ "trial_name": null,
2145
+ "trial_params": null
2146
+ }