ZeroUniqueness commited on
Commit
63a4e86
·
1 Parent(s): c7bce51

Training in progress, step 4300

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. adapter_config.json +4 -4
  2. adapter_model.bin +1 -1
  3. checkpoint-2900/README.md +0 -20
  4. checkpoint-2900/adapter_config.json +0 -26
  5. checkpoint-2900/adapter_model.bin +0 -3
  6. checkpoint-2900/adapter_model/README.md +0 -20
  7. checkpoint-2900/adapter_model/adapter_config.json +0 -26
  8. checkpoint-2900/adapter_model/adapter_model.bin +0 -3
  9. checkpoint-2900/optimizer.pt +0 -3
  10. checkpoint-2900/rng_state_0.pth +0 -3
  11. checkpoint-2900/rng_state_1.pth +0 -3
  12. checkpoint-2900/rng_state_10.pth +0 -3
  13. checkpoint-2900/rng_state_11.pth +0 -3
  14. checkpoint-2900/rng_state_12.pth +0 -3
  15. checkpoint-2900/rng_state_13.pth +0 -3
  16. checkpoint-2900/rng_state_2.pth +0 -3
  17. checkpoint-2900/rng_state_3.pth +0 -3
  18. checkpoint-2900/rng_state_4.pth +0 -3
  19. checkpoint-2900/rng_state_5.pth +0 -3
  20. checkpoint-2900/rng_state_6.pth +0 -3
  21. checkpoint-2900/rng_state_7.pth +0 -3
  22. checkpoint-2900/rng_state_8.pth +0 -3
  23. checkpoint-2900/rng_state_9.pth +0 -3
  24. checkpoint-2900/scheduler.pt +0 -3
  25. checkpoint-2900/trainer_state.json +0 -704
  26. checkpoint-2900/training_args.bin +0 -3
  27. checkpoint-3000/README.md +0 -20
  28. checkpoint-3000/adapter_config.json +0 -26
  29. checkpoint-3000/adapter_model.bin +0 -3
  30. checkpoint-3000/adapter_model/README.md +0 -20
  31. checkpoint-3000/adapter_model/adapter_config.json +0 -26
  32. checkpoint-3000/adapter_model/adapter_model.bin +0 -3
  33. checkpoint-3000/optimizer.pt +0 -3
  34. checkpoint-3000/rng_state_0.pth +0 -3
  35. checkpoint-3000/rng_state_1.pth +0 -3
  36. checkpoint-3000/rng_state_10.pth +0 -3
  37. checkpoint-3000/rng_state_11.pth +0 -3
  38. checkpoint-3000/rng_state_12.pth +0 -3
  39. checkpoint-3000/rng_state_13.pth +0 -3
  40. checkpoint-3000/rng_state_2.pth +0 -3
  41. checkpoint-3000/rng_state_3.pth +0 -3
  42. checkpoint-3000/rng_state_4.pth +0 -3
  43. checkpoint-3000/rng_state_5.pth +0 -3
  44. checkpoint-3000/rng_state_6.pth +0 -3
  45. checkpoint-3000/rng_state_7.pth +0 -3
  46. checkpoint-3000/rng_state_8.pth +0 -3
  47. checkpoint-3000/rng_state_9.pth +0 -3
  48. checkpoint-3000/scheduler.pt +0 -3
  49. checkpoint-3000/trainer_state.json +0 -736
  50. checkpoint-3000/training_args.bin +0 -3
adapter_config.json CHANGED
@@ -14,13 +14,13 @@
14
  "r": 32,
15
  "revision": null,
16
  "target_modules": [
17
- "down_proj",
18
- "k_proj",
19
  "gate_proj",
20
  "v_proj",
21
- "o_proj",
22
  "q_proj",
23
- "up_proj"
 
 
24
  ],
25
  "task_type": "CAUSAL_LM"
26
  }
 
14
  "r": 32,
15
  "revision": null,
16
  "target_modules": [
 
 
17
  "gate_proj",
18
  "v_proj",
19
+ "k_proj",
20
  "q_proj",
21
+ "up_proj",
22
+ "o_proj",
23
+ "down_proj"
24
  ],
25
  "task_type": "CAUSAL_LM"
26
  }
adapter_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:db5a92be0cd5f8b38b328e0f82e62452a3fa7b5052a0a1f93fd8c4b1dd18b7a7
3
  size 500897101
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dd1d9047d90b00aaf0a6c21147e9789af5c4ef9e3c1df5179a1b86f66b610c52
3
  size 500897101
checkpoint-2900/README.md DELETED
@@ -1,20 +0,0 @@
1
- ---
2
- library_name: peft
3
- ---
4
- ## Training procedure
5
-
6
-
7
- The following `bitsandbytes` quantization config was used during training:
8
- - load_in_8bit: False
9
- - load_in_4bit: True
10
- - llm_int8_threshold: 6.0
11
- - llm_int8_skip_modules: None
12
- - llm_int8_enable_fp32_cpu_offload: False
13
- - llm_int8_has_fp16_weight: False
14
- - bnb_4bit_quant_type: nf4
15
- - bnb_4bit_use_double_quant: True
16
- - bnb_4bit_compute_dtype: bfloat16
17
- ### Framework versions
18
-
19
-
20
- - PEFT 0.5.0.dev0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
checkpoint-2900/adapter_config.json DELETED
@@ -1,26 +0,0 @@
1
- {
2
- "auto_mapping": null,
3
- "base_model_name_or_path": "/workspace/webui/models/TheBloke_Llama-2-13B-fp16",
4
- "bias": "none",
5
- "fan_in_fan_out": null,
6
- "inference_mode": true,
7
- "init_lora_weights": true,
8
- "layers_pattern": null,
9
- "layers_to_transform": null,
10
- "lora_alpha": 16,
11
- "lora_dropout": 0.05,
12
- "modules_to_save": null,
13
- "peft_type": "LORA",
14
- "r": 32,
15
- "revision": null,
16
- "target_modules": [
17
- "down_proj",
18
- "k_proj",
19
- "gate_proj",
20
- "v_proj",
21
- "o_proj",
22
- "q_proj",
23
- "up_proj"
24
- ],
25
- "task_type": "CAUSAL_LM"
26
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
checkpoint-2900/adapter_model.bin DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:69e75ae7e1f69a41e91706ac778c4176886a39e991e0545bde52d6c1f744f678
3
- size 500897101
 
 
 
 
checkpoint-2900/adapter_model/README.md DELETED
@@ -1,20 +0,0 @@
1
- ---
2
- library_name: peft
3
- ---
4
- ## Training procedure
5
-
6
-
7
- The following `bitsandbytes` quantization config was used during training:
8
- - load_in_8bit: False
9
- - load_in_4bit: True
10
- - llm_int8_threshold: 6.0
11
- - llm_int8_skip_modules: None
12
- - llm_int8_enable_fp32_cpu_offload: False
13
- - llm_int8_has_fp16_weight: False
14
- - bnb_4bit_quant_type: nf4
15
- - bnb_4bit_use_double_quant: True
16
- - bnb_4bit_compute_dtype: bfloat16
17
- ### Framework versions
18
-
19
-
20
- - PEFT 0.5.0.dev0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
checkpoint-2900/adapter_model/adapter_config.json DELETED
@@ -1,26 +0,0 @@
1
- {
2
- "auto_mapping": null,
3
- "base_model_name_or_path": "/workspace/webui/models/TheBloke_Llama-2-13B-fp16",
4
- "bias": "none",
5
- "fan_in_fan_out": null,
6
- "inference_mode": true,
7
- "init_lora_weights": true,
8
- "layers_pattern": null,
9
- "layers_to_transform": null,
10
- "lora_alpha": 16,
11
- "lora_dropout": 0.05,
12
- "modules_to_save": null,
13
- "peft_type": "LORA",
14
- "r": 32,
15
- "revision": null,
16
- "target_modules": [
17
- "down_proj",
18
- "k_proj",
19
- "gate_proj",
20
- "v_proj",
21
- "o_proj",
22
- "q_proj",
23
- "up_proj"
24
- ],
25
- "task_type": "CAUSAL_LM"
26
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
checkpoint-2900/adapter_model/adapter_model.bin DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:69e75ae7e1f69a41e91706ac778c4176886a39e991e0545bde52d6c1f744f678
3
- size 500897101
 
 
 
 
checkpoint-2900/optimizer.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:c87b4dae7f70055caf6d97c6be4cde96969b4b232d33f2ec8df44468892e94ae
3
- size 1001752701
 
 
 
 
checkpoint-2900/rng_state_0.pth DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:ef95949da63c44bf2803034d897cf02b2c1404fb37a4930aa3e5a0ec33f3e973
3
- size 27772
 
 
 
 
checkpoint-2900/rng_state_1.pth DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:d659d12bcecefb93ed656fc3f6ed770926a62be67a034197a8fc61d6626623aa
3
- size 27772
 
 
 
 
checkpoint-2900/rng_state_10.pth DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:3c8b3dd09b89ccf70ecfb60c04b41270829ea90cfa0f805505a977df5bed72a8
3
- size 27789
 
 
 
 
checkpoint-2900/rng_state_11.pth DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:3a529130f265d55a197418034304b29c2f30e5460bdc34974ffa7dc75b218dee
3
- size 27789
 
 
 
 
checkpoint-2900/rng_state_12.pth DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:364a7220780238b4e23965c23a3f97f81bcf3f5672fb2c6ba1f95a8ce81dcd6a
3
- size 27789
 
 
 
 
checkpoint-2900/rng_state_13.pth DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:6ca8a5bf47b25db91e77352018364ed5cb8658c4ab84c74754d600a671025b52
3
- size 27789
 
 
 
 
checkpoint-2900/rng_state_2.pth DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:e38e3caba0fb2345524e8ba51c489e590ff9c29f6b6b1852244b5b876edef717
3
- size 27772
 
 
 
 
checkpoint-2900/rng_state_3.pth DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:e25391629539867dd50549dd648e30136ff0695597362098b5348ddfb9b13591
3
- size 27772
 
 
 
 
checkpoint-2900/rng_state_4.pth DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:05c76df7735faef919f1d2529cb22dc8b258235397aea29e5d522f742f0378d3
3
- size 27772
 
 
 
 
checkpoint-2900/rng_state_5.pth DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:04f49ff59341bf10436ddf12dfda352e27600dda201e545e829be606ffa4a75b
3
- size 27772
 
 
 
 
checkpoint-2900/rng_state_6.pth DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:5235f6c75f2e3a3b228ad84d02c69e6e58f2c65de54d888dd789ea3de5eccaae
3
- size 27772
 
 
 
 
checkpoint-2900/rng_state_7.pth DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:6d1e22c7b9e7f7aea18263f185bde2560e57a023912a137341f61fe9d96545ab
3
- size 27772
 
 
 
 
checkpoint-2900/rng_state_8.pth DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:466fa25c9331cd4ea53315fb2b9f257aa0623587238faa3ca9b184ccb63c1756
3
- size 27772
 
 
 
 
checkpoint-2900/rng_state_9.pth DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:71dcdb5ca148232df1ccde976ae62a837427d137bd880eb213220fe9d130a051
3
- size 27772
 
 
 
 
checkpoint-2900/scheduler.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:7bdab61020affcb7675661f6f3289658a70b84200495ea5e8c5d13b32c66edcc
3
- size 627
 
 
 
 
checkpoint-2900/trainer_state.json DELETED
@@ -1,704 +0,0 @@
1
- {
2
- "best_metric": null,
3
- "best_model_checkpoint": null,
4
- "epoch": 1.1244668476153548,
5
- "global_step": 2900,
6
- "is_hyper_param_search": false,
7
- "is_local_process_zero": true,
8
- "is_world_process_zero": true,
9
- "log_history": [
10
- {
11
- "epoch": 0.02,
12
- "learning_rate": 0.0001999867761371633,
13
- "loss": 1.0435,
14
- "step": 50
15
- },
16
- {
17
- "epoch": 0.04,
18
- "learning_rate": 0.00019993306018843102,
19
- "loss": 0.8918,
20
- "step": 100
21
- },
22
- {
23
- "epoch": 0.06,
24
- "learning_rate": 0.00019983804784290833,
25
- "loss": 0.8874,
26
- "step": 150
27
- },
28
- {
29
- "epoch": 0.08,
30
- "learning_rate": 0.00019970177836355307,
31
- "loss": 0.8839,
32
- "step": 200
33
- },
34
- {
35
- "epoch": 0.09,
36
- "learning_rate": 0.00019961818913082012,
37
- "loss": 0.8801,
38
- "step": 225
39
- },
40
- {
41
- "epoch": 0.1,
42
- "learning_rate": 0.00019952430806244534,
43
- "loss": 0.8753,
44
- "step": 250
45
- },
46
- {
47
- "epoch": 0.11,
48
- "learning_rate": 0.00019942014485754635,
49
- "loss": 0.8754,
50
- "step": 275
51
- },
52
- {
53
- "epoch": 0.12,
54
- "learning_rate": 0.00019930571027751713,
55
- "loss": 0.8751,
56
- "step": 300
57
- },
58
- {
59
- "epoch": 0.13,
60
- "learning_rate": 0.0001991810161449164,
61
- "loss": 0.8819,
62
- "step": 325
63
- },
64
- {
65
- "epoch": 0.14,
66
- "learning_rate": 0.00019904607534224612,
67
- "loss": 0.8744,
68
- "step": 350
69
- },
70
- {
71
- "epoch": 0.15,
72
- "learning_rate": 0.00019890090181062063,
73
- "loss": 0.8735,
74
- "step": 375
75
- },
76
- {
77
- "epoch": 0.16,
78
- "learning_rate": 0.00019874551054832625,
79
- "loss": 0.8703,
80
- "step": 400
81
- },
82
- {
83
- "epoch": 0.16,
84
- "learning_rate": 0.00019857991760927193,
85
- "loss": 0.8715,
86
- "step": 425
87
- },
88
- {
89
- "epoch": 0.17,
90
- "learning_rate": 0.00019840414010133045,
91
- "loss": 0.8714,
92
- "step": 450
93
- },
94
- {
95
- "epoch": 0.18,
96
- "learning_rate": 0.00019821819618457114,
97
- "loss": 0.8653,
98
- "step": 475
99
- },
100
- {
101
- "epoch": 0.19,
102
- "learning_rate": 0.0001980221050693837,
103
- "loss": 0.8716,
104
- "step": 500
105
- },
106
- {
107
- "epoch": 0.2,
108
- "learning_rate": 0.00019781588701449338,
109
- "loss": 0.8695,
110
- "step": 525
111
- },
112
- {
113
- "epoch": 0.21,
114
- "learning_rate": 0.0001975995633248682,
115
- "loss": 0.8746,
116
- "step": 550
117
- },
118
- {
119
- "epoch": 0.22,
120
- "learning_rate": 0.00019737315634951762,
121
- "loss": 0.8731,
122
- "step": 575
123
- },
124
- {
125
- "epoch": 0.23,
126
- "learning_rate": 0.00019713668947918386,
127
- "loss": 0.867,
128
- "step": 600
129
- },
130
- {
131
- "epoch": 0.24,
132
- "learning_rate": 0.0001968901871439252,
133
- "loss": 0.8706,
134
- "step": 625
135
- },
136
- {
137
- "epoch": 0.25,
138
- "learning_rate": 0.000196633674810592,
139
- "loss": 0.8595,
140
- "step": 650
141
- },
142
- {
143
- "epoch": 0.26,
144
- "learning_rate": 0.0001963671789801958,
145
- "loss": 0.8627,
146
- "step": 675
147
- },
148
- {
149
- "epoch": 0.27,
150
- "learning_rate": 0.0001960907271851712,
151
- "loss": 0.8607,
152
- "step": 700
153
- },
154
- {
155
- "epoch": 0.28,
156
- "learning_rate": 0.00019580434798653173,
157
- "loss": 0.858,
158
- "step": 725
159
- },
160
- {
161
- "epoch": 0.29,
162
- "learning_rate": 0.00019550807097091876,
163
- "loss": 0.8589,
164
- "step": 750
165
- },
166
- {
167
- "epoch": 0.3,
168
- "learning_rate": 0.00019520192674754515,
169
- "loss": 0.8561,
170
- "step": 775
171
- },
172
- {
173
- "epoch": 0.31,
174
- "learning_rate": 0.00019488594694503264,
175
- "loss": 0.8576,
176
- "step": 800
177
- },
178
- {
179
- "epoch": 0.32,
180
- "learning_rate": 0.00019456016420814446,
181
- "loss": 0.8597,
182
- "step": 825
183
- },
184
- {
185
- "epoch": 0.33,
186
- "learning_rate": 0.00019422461219441254,
187
- "loss": 0.862,
188
- "step": 850
189
- },
190
- {
191
- "epoch": 0.34,
192
- "learning_rate": 0.00019387932557066035,
193
- "loss": 0.8577,
194
- "step": 875
195
- },
196
- {
197
- "epoch": 0.35,
198
- "learning_rate": 0.00019352434000942127,
199
- "loss": 0.8632,
200
- "step": 900
201
- },
202
- {
203
- "epoch": 0.36,
204
- "learning_rate": 0.00019315969218525333,
205
- "loss": 0.8567,
206
- "step": 925
207
- },
208
- {
209
- "epoch": 0.37,
210
- "learning_rate": 0.00019278541977095005,
211
- "loss": 0.8501,
212
- "step": 950
213
- },
214
- {
215
- "epoch": 0.38,
216
- "learning_rate": 0.00019240156143364844,
217
- "loss": 0.8596,
218
- "step": 975
219
- },
220
- {
221
- "epoch": 0.39,
222
- "learning_rate": 0.00019200815683083434,
223
- "loss": 0.8556,
224
- "step": 1000
225
- },
226
- {
227
- "epoch": 0.39,
228
- "eval_loss": 0.8521950244903564,
229
- "eval_runtime": 59.8838,
230
- "eval_samples_per_second": 12.19,
231
- "eval_steps_per_second": 0.885,
232
- "step": 1000
233
- },
234
- {
235
- "epoch": 0.4,
236
- "learning_rate": 0.00019160524660624505,
237
- "loss": 0.8531,
238
- "step": 1025
239
- },
240
- {
241
- "epoch": 0.41,
242
- "learning_rate": 0.00019119287238567045,
243
- "loss": 0.8513,
244
- "step": 1050
245
- },
246
- {
247
- "epoch": 0.42,
248
- "learning_rate": 0.00019077107677265253,
249
- "loss": 0.8502,
250
- "step": 1075
251
- },
252
- {
253
- "epoch": 0.43,
254
- "learning_rate": 0.00019033990334408384,
255
- "loss": 0.8469,
256
- "step": 1100
257
- },
258
- {
259
- "epoch": 0.44,
260
- "learning_rate": 0.00018989939664570545,
261
- "loss": 0.8495,
262
- "step": 1125
263
- },
264
- {
265
- "epoch": 0.45,
266
- "learning_rate": 0.00018944960218750484,
267
- "loss": 0.8485,
268
- "step": 1150
269
- },
270
- {
271
- "epoch": 0.46,
272
- "learning_rate": 0.00018899056643901404,
273
- "loss": 0.8534,
274
- "step": 1175
275
- },
276
- {
277
- "epoch": 0.47,
278
- "learning_rate": 0.00018852233682450893,
279
- "loss": 0.8531,
280
- "step": 1200
281
- },
282
- {
283
- "epoch": 0.47,
284
- "learning_rate": 0.00018804496171810948,
285
- "loss": 0.8509,
286
- "step": 1225
287
- },
288
- {
289
- "epoch": 0.48,
290
- "learning_rate": 0.00018755849043878222,
291
- "loss": 0.8445,
292
- "step": 1250
293
- },
294
- {
295
- "epoch": 0.49,
296
- "learning_rate": 0.0001870629732452449,
297
- "loss": 0.8548,
298
- "step": 1275
299
- },
300
- {
301
- "epoch": 0.5,
302
- "learning_rate": 0.00018655846133077417,
303
- "loss": 0.8441,
304
- "step": 1300
305
- },
306
- {
307
- "epoch": 0.51,
308
- "learning_rate": 0.00018604500681791656,
309
- "loss": 0.8533,
310
- "step": 1325
311
- },
312
- {
313
- "epoch": 0.52,
314
- "learning_rate": 0.00018552266275310373,
315
- "loss": 0.8505,
316
- "step": 1350
317
- },
318
- {
319
- "epoch": 0.53,
320
- "learning_rate": 0.0001849914831011719,
321
- "loss": 0.8544,
322
- "step": 1375
323
- },
324
- {
325
- "epoch": 0.54,
326
- "learning_rate": 0.00018445152273978668,
327
- "loss": 0.845,
328
- "step": 1400
329
- },
330
- {
331
- "epoch": 0.55,
332
- "learning_rate": 0.00018390283745377354,
333
- "loss": 0.8376,
334
- "step": 1425
335
- },
336
- {
337
- "epoch": 0.56,
338
- "learning_rate": 0.0001833454839293545,
339
- "loss": 0.847,
340
- "step": 1450
341
- },
342
- {
343
- "epoch": 0.57,
344
- "learning_rate": 0.00018277951974829163,
345
- "loss": 0.8473,
346
- "step": 1475
347
- },
348
- {
349
- "epoch": 0.58,
350
- "learning_rate": 0.0001822050033819382,
351
- "loss": 0.8438,
352
- "step": 1500
353
- },
354
- {
355
- "epoch": 0.59,
356
- "learning_rate": 0.00018162199418519785,
357
- "loss": 0.8418,
358
- "step": 1525
359
- },
360
- {
361
- "epoch": 0.6,
362
- "learning_rate": 0.00018103055239039243,
363
- "loss": 0.842,
364
- "step": 1550
365
- },
366
- {
367
- "epoch": 0.61,
368
- "learning_rate": 0.0001804307391010393,
369
- "loss": 0.8435,
370
- "step": 1575
371
- },
372
- {
373
- "epoch": 0.62,
374
- "learning_rate": 0.00017982261628553842,
375
- "loss": 0.8349,
376
- "step": 1600
377
- },
378
- {
379
- "epoch": 0.63,
380
- "learning_rate": 0.0001792062467707703,
381
- "loss": 0.8483,
382
- "step": 1625
383
- },
384
- {
385
- "epoch": 0.64,
386
- "learning_rate": 0.0001785816942356052,
387
- "loss": 0.8387,
388
- "step": 1650
389
- },
390
- {
391
- "epoch": 0.65,
392
- "learning_rate": 0.00017794902320432429,
393
- "loss": 0.843,
394
- "step": 1675
395
- },
396
- {
397
- "epoch": 0.66,
398
- "learning_rate": 0.00017730829903995333,
399
- "loss": 0.8424,
400
- "step": 1700
401
- },
402
- {
403
- "epoch": 0.67,
404
- "learning_rate": 0.00017665958793751006,
405
- "loss": 0.8418,
406
- "step": 1725
407
- },
408
- {
409
- "epoch": 0.68,
410
- "learning_rate": 0.00017600295691716522,
411
- "loss": 0.8384,
412
- "step": 1750
413
- },
414
- {
415
- "epoch": 0.69,
416
- "learning_rate": 0.00017533847381731856,
417
- "loss": 0.8445,
418
- "step": 1775
419
- },
420
- {
421
- "epoch": 0.7,
422
- "learning_rate": 0.00017466620728759033,
423
- "loss": 0.8446,
424
- "step": 1800
425
- },
426
- {
427
- "epoch": 0.71,
428
- "learning_rate": 0.00017398622678172878,
429
- "loss": 0.838,
430
- "step": 1825
431
- },
432
- {
433
- "epoch": 0.72,
434
- "learning_rate": 0.0001732986025504348,
435
- "loss": 0.8415,
436
- "step": 1850
437
- },
438
- {
439
- "epoch": 0.73,
440
- "learning_rate": 0.000172603405634104,
441
- "loss": 0.8357,
442
- "step": 1875
443
- },
444
- {
445
- "epoch": 0.74,
446
- "learning_rate": 0.00017190070785548755,
447
- "loss": 0.8311,
448
- "step": 1900
449
- },
450
- {
451
- "epoch": 0.75,
452
- "learning_rate": 0.0001711905818122717,
453
- "loss": 0.8333,
454
- "step": 1925
455
- },
456
- {
457
- "epoch": 0.76,
458
- "learning_rate": 0.0001704731008695777,
459
- "loss": 0.8387,
460
- "step": 1950
461
- },
462
- {
463
- "epoch": 0.77,
464
- "learning_rate": 0.0001697483391523821,
465
- "loss": 0.8442,
466
- "step": 1975
467
- },
468
- {
469
- "epoch": 0.78,
470
- "learning_rate": 0.00016901637153785885,
471
- "loss": 0.8399,
472
- "step": 2000
473
- },
474
- {
475
- "epoch": 0.78,
476
- "eval_loss": 0.8339959383010864,
477
- "eval_runtime": 58.5829,
478
- "eval_samples_per_second": 12.461,
479
- "eval_steps_per_second": 0.905,
480
- "step": 2000
481
- },
482
- {
483
- "epoch": 0.79,
484
- "learning_rate": 0.0001682772736476434,
485
- "loss": 0.8334,
486
- "step": 2025
487
- },
488
- {
489
- "epoch": 0.79,
490
- "learning_rate": 0.0001675311218400201,
491
- "loss": 0.835,
492
- "step": 2050
493
- },
494
- {
495
- "epoch": 0.8,
496
- "learning_rate": 0.00016677799320203332,
497
- "loss": 0.8368,
498
- "step": 2075
499
- },
500
- {
501
- "epoch": 0.81,
502
- "learning_rate": 0.00016601796554152344,
503
- "loss": 0.8278,
504
- "step": 2100
505
- },
506
- {
507
- "epoch": 0.82,
508
- "learning_rate": 0.00016525111737908827,
509
- "loss": 0.8334,
510
- "step": 2125
511
- },
512
- {
513
- "epoch": 0.83,
514
- "learning_rate": 0.00016447752793997096,
515
- "loss": 0.8416,
516
- "step": 2150
517
- },
518
- {
519
- "epoch": 0.84,
520
- "learning_rate": 0.00016369727714587483,
521
- "loss": 0.8297,
522
- "step": 2175
523
- },
524
- {
525
- "epoch": 0.85,
526
- "learning_rate": 0.0001629104456067066,
527
- "loss": 0.8327,
528
- "step": 2200
529
- },
530
- {
531
- "epoch": 0.86,
532
- "learning_rate": 0.00016211711461224825,
533
- "loss": 0.8324,
534
- "step": 2225
535
- },
536
- {
537
- "epoch": 0.87,
538
- "learning_rate": 0.0001613173661237589,
539
- "loss": 0.8313,
540
- "step": 2250
541
- },
542
- {
543
- "epoch": 0.88,
544
- "learning_rate": 0.0001605112827655069,
545
- "loss": 0.8292,
546
- "step": 2275
547
- },
548
- {
549
- "epoch": 0.89,
550
- "learning_rate": 0.0001596989478162339,
551
- "loss": 0.8334,
552
- "step": 2300
553
- },
554
- {
555
- "epoch": 0.9,
556
- "learning_rate": 0.00015888044520055106,
557
- "loss": 0.8352,
558
- "step": 2325
559
- },
560
- {
561
- "epoch": 0.91,
562
- "learning_rate": 0.00015805585948026852,
563
- "loss": 0.823,
564
- "step": 2350
565
- },
566
- {
567
- "epoch": 0.92,
568
- "learning_rate": 0.000157225275845659,
569
- "loss": 0.8293,
570
- "step": 2375
571
- },
572
- {
573
- "epoch": 0.93,
574
- "learning_rate": 0.00015638878010665672,
575
- "loss": 0.8289,
576
- "step": 2400
577
- },
578
- {
579
- "epoch": 0.94,
580
- "learning_rate": 0.00015554645868399205,
581
- "loss": 0.832,
582
- "step": 2425
583
- },
584
- {
585
- "epoch": 0.95,
586
- "learning_rate": 0.00015469839860026308,
587
- "loss": 0.8294,
588
- "step": 2450
589
- },
590
- {
591
- "epoch": 0.96,
592
- "learning_rate": 0.0001538446874709452,
593
- "loss": 0.8281,
594
- "step": 2475
595
- },
596
- {
597
- "epoch": 0.97,
598
- "learning_rate": 0.00015298541349533925,
599
- "loss": 0.8314,
600
- "step": 2500
601
- },
602
- {
603
- "epoch": 0.98,
604
- "learning_rate": 0.00015212066544745926,
605
- "loss": 0.831,
606
- "step": 2525
607
- },
608
- {
609
- "epoch": 0.99,
610
- "learning_rate": 0.00015125053266686124,
611
- "loss": 0.8319,
612
- "step": 2550
613
- },
614
- {
615
- "epoch": 1.0,
616
- "learning_rate": 0.00015037510504941303,
617
- "loss": 0.8259,
618
- "step": 2575
619
- },
620
- {
621
- "epoch": 1.01,
622
- "learning_rate": 0.00014949447303800695,
623
- "loss": 0.8133,
624
- "step": 2600
625
- },
626
- {
627
- "epoch": 1.02,
628
- "learning_rate": 0.00014860872761321593,
629
- "loss": 0.8139,
630
- "step": 2625
631
- },
632
- {
633
- "epoch": 1.03,
634
- "learning_rate": 0.00014771796028389405,
635
- "loss": 0.804,
636
- "step": 2650
637
- },
638
- {
639
- "epoch": 1.04,
640
- "learning_rate": 0.0001468222630777225,
641
- "loss": 0.8011,
642
- "step": 2675
643
- },
644
- {
645
- "epoch": 1.05,
646
- "learning_rate": 0.00014592172853170193,
647
- "loss": 0.8037,
648
- "step": 2700
649
- },
650
- {
651
- "epoch": 1.06,
652
- "learning_rate": 0.00014501644968259212,
653
- "loss": 0.8063,
654
- "step": 2725
655
- },
656
- {
657
- "epoch": 1.07,
658
- "learning_rate": 0.00014410652005730025,
659
- "loss": 0.8155,
660
- "step": 2750
661
- },
662
- {
663
- "epoch": 1.08,
664
- "learning_rate": 0.00014319203366321826,
665
- "loss": 0.8066,
666
- "step": 2775
667
- },
668
- {
669
- "epoch": 1.09,
670
- "learning_rate": 0.0001422730849785107,
671
- "loss": 0.8091,
672
- "step": 2800
673
- },
674
- {
675
- "epoch": 1.1,
676
- "learning_rate": 0.0001413497689423539,
677
- "loss": 0.8067,
678
- "step": 2825
679
- },
680
- {
681
- "epoch": 1.11,
682
- "learning_rate": 0.00014042218094512755,
683
- "loss": 0.8046,
684
- "step": 2850
685
- },
686
- {
687
- "epoch": 1.11,
688
- "learning_rate": 0.00013949041681855985,
689
- "loss": 0.8053,
690
- "step": 2875
691
- },
692
- {
693
- "epoch": 1.12,
694
- "learning_rate": 0.0001385545728258264,
695
- "loss": 0.8075,
696
- "step": 2900
697
- }
698
- ],
699
- "max_steps": 7737,
700
- "num_train_epochs": 3,
701
- "total_flos": 1.248869156726086e+19,
702
- "trial_name": null,
703
- "trial_params": null
704
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
checkpoint-2900/training_args.bin DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:7292138fecd854f5f17371c439bbd450ee3c48e738b75818b778a55f4e26ef57
3
- size 4027
 
 
 
 
checkpoint-3000/README.md DELETED
@@ -1,20 +0,0 @@
1
- ---
2
- library_name: peft
3
- ---
4
- ## Training procedure
5
-
6
-
7
- The following `bitsandbytes` quantization config was used during training:
8
- - load_in_8bit: False
9
- - load_in_4bit: True
10
- - llm_int8_threshold: 6.0
11
- - llm_int8_skip_modules: None
12
- - llm_int8_enable_fp32_cpu_offload: False
13
- - llm_int8_has_fp16_weight: False
14
- - bnb_4bit_quant_type: nf4
15
- - bnb_4bit_use_double_quant: True
16
- - bnb_4bit_compute_dtype: bfloat16
17
- ### Framework versions
18
-
19
-
20
- - PEFT 0.5.0.dev0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
checkpoint-3000/adapter_config.json DELETED
@@ -1,26 +0,0 @@
1
- {
2
- "auto_mapping": null,
3
- "base_model_name_or_path": "/workspace/webui/models/TheBloke_Llama-2-13B-fp16",
4
- "bias": "none",
5
- "fan_in_fan_out": null,
6
- "inference_mode": true,
7
- "init_lora_weights": true,
8
- "layers_pattern": null,
9
- "layers_to_transform": null,
10
- "lora_alpha": 16,
11
- "lora_dropout": 0.05,
12
- "modules_to_save": null,
13
- "peft_type": "LORA",
14
- "r": 32,
15
- "revision": null,
16
- "target_modules": [
17
- "down_proj",
18
- "k_proj",
19
- "gate_proj",
20
- "v_proj",
21
- "o_proj",
22
- "q_proj",
23
- "up_proj"
24
- ],
25
- "task_type": "CAUSAL_LM"
26
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
checkpoint-3000/adapter_model.bin DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:3768dd1339753637e98fb5a78e49089bfc20cfbb2e5d5ab1d79b249f12bd91d6
3
- size 500897101
 
 
 
 
checkpoint-3000/adapter_model/README.md DELETED
@@ -1,20 +0,0 @@
1
- ---
2
- library_name: peft
3
- ---
4
- ## Training procedure
5
-
6
-
7
- The following `bitsandbytes` quantization config was used during training:
8
- - load_in_8bit: False
9
- - load_in_4bit: True
10
- - llm_int8_threshold: 6.0
11
- - llm_int8_skip_modules: None
12
- - llm_int8_enable_fp32_cpu_offload: False
13
- - llm_int8_has_fp16_weight: False
14
- - bnb_4bit_quant_type: nf4
15
- - bnb_4bit_use_double_quant: True
16
- - bnb_4bit_compute_dtype: bfloat16
17
- ### Framework versions
18
-
19
-
20
- - PEFT 0.5.0.dev0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
checkpoint-3000/adapter_model/adapter_config.json DELETED
@@ -1,26 +0,0 @@
1
- {
2
- "auto_mapping": null,
3
- "base_model_name_or_path": "/workspace/webui/models/TheBloke_Llama-2-13B-fp16",
4
- "bias": "none",
5
- "fan_in_fan_out": null,
6
- "inference_mode": true,
7
- "init_lora_weights": true,
8
- "layers_pattern": null,
9
- "layers_to_transform": null,
10
- "lora_alpha": 16,
11
- "lora_dropout": 0.05,
12
- "modules_to_save": null,
13
- "peft_type": "LORA",
14
- "r": 32,
15
- "revision": null,
16
- "target_modules": [
17
- "down_proj",
18
- "k_proj",
19
- "gate_proj",
20
- "v_proj",
21
- "o_proj",
22
- "q_proj",
23
- "up_proj"
24
- ],
25
- "task_type": "CAUSAL_LM"
26
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
checkpoint-3000/adapter_model/adapter_model.bin DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:3768dd1339753637e98fb5a78e49089bfc20cfbb2e5d5ab1d79b249f12bd91d6
3
- size 500897101
 
 
 
 
checkpoint-3000/optimizer.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:43dd2594437f90014ceb24dae35f0caf1408f106ca0640abfdb6f930ec7d1917
3
- size 1001752701
 
 
 
 
checkpoint-3000/rng_state_0.pth DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:372977c5ad5707d01ab93c603a2084e21dcc5bbe3746e25cbd04e791984a40ad
3
- size 27772
 
 
 
 
checkpoint-3000/rng_state_1.pth DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:0cf974f69d3a79d20401ab16c60981d40397dff4557e89f3fb4f166a1c2b6988
3
- size 27772
 
 
 
 
checkpoint-3000/rng_state_10.pth DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:f46cf42d5b149dfdabbc17334f090d493267bd9a2a1e982e5a7904ba8ad96c66
3
- size 27789
 
 
 
 
checkpoint-3000/rng_state_11.pth DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:0d782b19c86bb619094e572c4d5fa6b1b8aff8745fd27fd20d67ad46b26e3500
3
- size 27789
 
 
 
 
checkpoint-3000/rng_state_12.pth DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:46ed59f9d93f80f01f93314ed6347cd1950aad081d7b1afa92318f622eae07dc
3
- size 27789
 
 
 
 
checkpoint-3000/rng_state_13.pth DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:de734c628aa46f0b8f66418bc77802de4cf179c013f3fa6520a4c967c9c44ee9
3
- size 27789
 
 
 
 
checkpoint-3000/rng_state_2.pth DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:8b3ee02de5d832b0550492ad9197099cc49a1ef39883f9f21d43ee5eb7abbe91
3
- size 27772
 
 
 
 
checkpoint-3000/rng_state_3.pth DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:f47b89653cf7379ccd6b6642b1cb9305e7b0ff4639356066795aec75e109dcd0
3
- size 27772
 
 
 
 
checkpoint-3000/rng_state_4.pth DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:896ef18d13ff8a9f7f4850c38ec321a8050c0001f5f2abe7e1f217fabe5940c7
3
- size 27772
 
 
 
 
checkpoint-3000/rng_state_5.pth DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:f03081aff382542ac4332b7380a218da9fc6f3cc8c0e182a596a1aa05e2c4a86
3
- size 27772
 
 
 
 
checkpoint-3000/rng_state_6.pth DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:8ba56e43a346f575d11975e33c17fd24d061dc495ff64f652662b4fe2bd5ba2a
3
- size 27772
 
 
 
 
checkpoint-3000/rng_state_7.pth DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:cd77f76c2f0d692464a663d61dae016bcf5dd8966ed8742d59dea407d2a7048f
3
- size 27772
 
 
 
 
checkpoint-3000/rng_state_8.pth DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:20c5b17ace5a48bc38935d68f2ceef000b52c248bec852ba042e42f55d1fdcec
3
- size 27772
 
 
 
 
checkpoint-3000/rng_state_9.pth DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:2362bce85dfe8a924fd3e27761bcd0046ea4d1f8b878596ba3abccb85ffaaa3d
3
- size 27772
 
 
 
 
checkpoint-3000/scheduler.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:f5eb3f4a2e8a234b77c028b4206015afec84f155c6d5bcb4ec7cd2c9f89b304b
3
- size 627
 
 
 
 
checkpoint-3000/trainer_state.json DELETED
@@ -1,736 +0,0 @@
1
- {
2
- "best_metric": null,
3
- "best_model_checkpoint": null,
4
- "epoch": 1.1632415664986429,
5
- "global_step": 3000,
6
- "is_hyper_param_search": false,
7
- "is_local_process_zero": true,
8
- "is_world_process_zero": true,
9
- "log_history": [
10
- {
11
- "epoch": 0.02,
12
- "learning_rate": 0.0001999867761371633,
13
- "loss": 1.0435,
14
- "step": 50
15
- },
16
- {
17
- "epoch": 0.04,
18
- "learning_rate": 0.00019993306018843102,
19
- "loss": 0.8918,
20
- "step": 100
21
- },
22
- {
23
- "epoch": 0.06,
24
- "learning_rate": 0.00019983804784290833,
25
- "loss": 0.8874,
26
- "step": 150
27
- },
28
- {
29
- "epoch": 0.08,
30
- "learning_rate": 0.00019970177836355307,
31
- "loss": 0.8839,
32
- "step": 200
33
- },
34
- {
35
- "epoch": 0.09,
36
- "learning_rate": 0.00019961818913082012,
37
- "loss": 0.8801,
38
- "step": 225
39
- },
40
- {
41
- "epoch": 0.1,
42
- "learning_rate": 0.00019952430806244534,
43
- "loss": 0.8753,
44
- "step": 250
45
- },
46
- {
47
- "epoch": 0.11,
48
- "learning_rate": 0.00019942014485754635,
49
- "loss": 0.8754,
50
- "step": 275
51
- },
52
- {
53
- "epoch": 0.12,
54
- "learning_rate": 0.00019930571027751713,
55
- "loss": 0.8751,
56
- "step": 300
57
- },
58
- {
59
- "epoch": 0.13,
60
- "learning_rate": 0.0001991810161449164,
61
- "loss": 0.8819,
62
- "step": 325
63
- },
64
- {
65
- "epoch": 0.14,
66
- "learning_rate": 0.00019904607534224612,
67
- "loss": 0.8744,
68
- "step": 350
69
- },
70
- {
71
- "epoch": 0.15,
72
- "learning_rate": 0.00019890090181062063,
73
- "loss": 0.8735,
74
- "step": 375
75
- },
76
- {
77
- "epoch": 0.16,
78
- "learning_rate": 0.00019874551054832625,
79
- "loss": 0.8703,
80
- "step": 400
81
- },
82
- {
83
- "epoch": 0.16,
84
- "learning_rate": 0.00019857991760927193,
85
- "loss": 0.8715,
86
- "step": 425
87
- },
88
- {
89
- "epoch": 0.17,
90
- "learning_rate": 0.00019840414010133045,
91
- "loss": 0.8714,
92
- "step": 450
93
- },
94
- {
95
- "epoch": 0.18,
96
- "learning_rate": 0.00019821819618457114,
97
- "loss": 0.8653,
98
- "step": 475
99
- },
100
- {
101
- "epoch": 0.19,
102
- "learning_rate": 0.0001980221050693837,
103
- "loss": 0.8716,
104
- "step": 500
105
- },
106
- {
107
- "epoch": 0.2,
108
- "learning_rate": 0.00019781588701449338,
109
- "loss": 0.8695,
110
- "step": 525
111
- },
112
- {
113
- "epoch": 0.21,
114
- "learning_rate": 0.0001975995633248682,
115
- "loss": 0.8746,
116
- "step": 550
117
- },
118
- {
119
- "epoch": 0.22,
120
- "learning_rate": 0.00019737315634951762,
121
- "loss": 0.8731,
122
- "step": 575
123
- },
124
- {
125
- "epoch": 0.23,
126
- "learning_rate": 0.00019713668947918386,
127
- "loss": 0.867,
128
- "step": 600
129
- },
130
- {
131
- "epoch": 0.24,
132
- "learning_rate": 0.0001968901871439252,
133
- "loss": 0.8706,
134
- "step": 625
135
- },
136
- {
137
- "epoch": 0.25,
138
- "learning_rate": 0.000196633674810592,
139
- "loss": 0.8595,
140
- "step": 650
141
- },
142
- {
143
- "epoch": 0.26,
144
- "learning_rate": 0.0001963671789801958,
145
- "loss": 0.8627,
146
- "step": 675
147
- },
148
- {
149
- "epoch": 0.27,
150
- "learning_rate": 0.0001960907271851712,
151
- "loss": 0.8607,
152
- "step": 700
153
- },
154
- {
155
- "epoch": 0.28,
156
- "learning_rate": 0.00019580434798653173,
157
- "loss": 0.858,
158
- "step": 725
159
- },
160
- {
161
- "epoch": 0.29,
162
- "learning_rate": 0.00019550807097091876,
163
- "loss": 0.8589,
164
- "step": 750
165
- },
166
- {
167
- "epoch": 0.3,
168
- "learning_rate": 0.00019520192674754515,
169
- "loss": 0.8561,
170
- "step": 775
171
- },
172
- {
173
- "epoch": 0.31,
174
- "learning_rate": 0.00019488594694503264,
175
- "loss": 0.8576,
176
- "step": 800
177
- },
178
- {
179
- "epoch": 0.32,
180
- "learning_rate": 0.00019456016420814446,
181
- "loss": 0.8597,
182
- "step": 825
183
- },
184
- {
185
- "epoch": 0.33,
186
- "learning_rate": 0.00019422461219441254,
187
- "loss": 0.862,
188
- "step": 850
189
- },
190
- {
191
- "epoch": 0.34,
192
- "learning_rate": 0.00019387932557066035,
193
- "loss": 0.8577,
194
- "step": 875
195
- },
196
- {
197
- "epoch": 0.35,
198
- "learning_rate": 0.00019352434000942127,
199
- "loss": 0.8632,
200
- "step": 900
201
- },
202
- {
203
- "epoch": 0.36,
204
- "learning_rate": 0.00019315969218525333,
205
- "loss": 0.8567,
206
- "step": 925
207
- },
208
- {
209
- "epoch": 0.37,
210
- "learning_rate": 0.00019278541977095005,
211
- "loss": 0.8501,
212
- "step": 950
213
- },
214
- {
215
- "epoch": 0.38,
216
- "learning_rate": 0.00019240156143364844,
217
- "loss": 0.8596,
218
- "step": 975
219
- },
220
- {
221
- "epoch": 0.39,
222
- "learning_rate": 0.00019200815683083434,
223
- "loss": 0.8556,
224
- "step": 1000
225
- },
226
- {
227
- "epoch": 0.39,
228
- "eval_loss": 0.8521950244903564,
229
- "eval_runtime": 59.8838,
230
- "eval_samples_per_second": 12.19,
231
- "eval_steps_per_second": 0.885,
232
- "step": 1000
233
- },
234
- {
235
- "epoch": 0.4,
236
- "learning_rate": 0.00019160524660624505,
237
- "loss": 0.8531,
238
- "step": 1025
239
- },
240
- {
241
- "epoch": 0.41,
242
- "learning_rate": 0.00019119287238567045,
243
- "loss": 0.8513,
244
- "step": 1050
245
- },
246
- {
247
- "epoch": 0.42,
248
- "learning_rate": 0.00019077107677265253,
249
- "loss": 0.8502,
250
- "step": 1075
251
- },
252
- {
253
- "epoch": 0.43,
254
- "learning_rate": 0.00019033990334408384,
255
- "loss": 0.8469,
256
- "step": 1100
257
- },
258
- {
259
- "epoch": 0.44,
260
- "learning_rate": 0.00018989939664570545,
261
- "loss": 0.8495,
262
- "step": 1125
263
- },
264
- {
265
- "epoch": 0.45,
266
- "learning_rate": 0.00018944960218750484,
267
- "loss": 0.8485,
268
- "step": 1150
269
- },
270
- {
271
- "epoch": 0.46,
272
- "learning_rate": 0.00018899056643901404,
273
- "loss": 0.8534,
274
- "step": 1175
275
- },
276
- {
277
- "epoch": 0.47,
278
- "learning_rate": 0.00018852233682450893,
279
- "loss": 0.8531,
280
- "step": 1200
281
- },
282
- {
283
- "epoch": 0.47,
284
- "learning_rate": 0.00018804496171810948,
285
- "loss": 0.8509,
286
- "step": 1225
287
- },
288
- {
289
- "epoch": 0.48,
290
- "learning_rate": 0.00018755849043878222,
291
- "loss": 0.8445,
292
- "step": 1250
293
- },
294
- {
295
- "epoch": 0.49,
296
- "learning_rate": 0.0001870629732452449,
297
- "loss": 0.8548,
298
- "step": 1275
299
- },
300
- {
301
- "epoch": 0.5,
302
- "learning_rate": 0.00018655846133077417,
303
- "loss": 0.8441,
304
- "step": 1300
305
- },
306
- {
307
- "epoch": 0.51,
308
- "learning_rate": 0.00018604500681791656,
309
- "loss": 0.8533,
310
- "step": 1325
311
- },
312
- {
313
- "epoch": 0.52,
314
- "learning_rate": 0.00018552266275310373,
315
- "loss": 0.8505,
316
- "step": 1350
317
- },
318
- {
319
- "epoch": 0.53,
320
- "learning_rate": 0.0001849914831011719,
321
- "loss": 0.8544,
322
- "step": 1375
323
- },
324
- {
325
- "epoch": 0.54,
326
- "learning_rate": 0.00018445152273978668,
327
- "loss": 0.845,
328
- "step": 1400
329
- },
330
- {
331
- "epoch": 0.55,
332
- "learning_rate": 0.00018390283745377354,
333
- "loss": 0.8376,
334
- "step": 1425
335
- },
336
- {
337
- "epoch": 0.56,
338
- "learning_rate": 0.0001833454839293545,
339
- "loss": 0.847,
340
- "step": 1450
341
- },
342
- {
343
- "epoch": 0.57,
344
- "learning_rate": 0.00018277951974829163,
345
- "loss": 0.8473,
346
- "step": 1475
347
- },
348
- {
349
- "epoch": 0.58,
350
- "learning_rate": 0.0001822050033819382,
351
- "loss": 0.8438,
352
- "step": 1500
353
- },
354
- {
355
- "epoch": 0.59,
356
- "learning_rate": 0.00018162199418519785,
357
- "loss": 0.8418,
358
- "step": 1525
359
- },
360
- {
361
- "epoch": 0.6,
362
- "learning_rate": 0.00018103055239039243,
363
- "loss": 0.842,
364
- "step": 1550
365
- },
366
- {
367
- "epoch": 0.61,
368
- "learning_rate": 0.0001804307391010393,
369
- "loss": 0.8435,
370
- "step": 1575
371
- },
372
- {
373
- "epoch": 0.62,
374
- "learning_rate": 0.00017982261628553842,
375
- "loss": 0.8349,
376
- "step": 1600
377
- },
378
- {
379
- "epoch": 0.63,
380
- "learning_rate": 0.0001792062467707703,
381
- "loss": 0.8483,
382
- "step": 1625
383
- },
384
- {
385
- "epoch": 0.64,
386
- "learning_rate": 0.0001785816942356052,
387
- "loss": 0.8387,
388
- "step": 1650
389
- },
390
- {
391
- "epoch": 0.65,
392
- "learning_rate": 0.00017794902320432429,
393
- "loss": 0.843,
394
- "step": 1675
395
- },
396
- {
397
- "epoch": 0.66,
398
- "learning_rate": 0.00017730829903995333,
399
- "loss": 0.8424,
400
- "step": 1700
401
- },
402
- {
403
- "epoch": 0.67,
404
- "learning_rate": 0.00017665958793751006,
405
- "loss": 0.8418,
406
- "step": 1725
407
- },
408
- {
409
- "epoch": 0.68,
410
- "learning_rate": 0.00017600295691716522,
411
- "loss": 0.8384,
412
- "step": 1750
413
- },
414
- {
415
- "epoch": 0.69,
416
- "learning_rate": 0.00017533847381731856,
417
- "loss": 0.8445,
418
- "step": 1775
419
- },
420
- {
421
- "epoch": 0.7,
422
- "learning_rate": 0.00017466620728759033,
423
- "loss": 0.8446,
424
- "step": 1800
425
- },
426
- {
427
- "epoch": 0.71,
428
- "learning_rate": 0.00017398622678172878,
429
- "loss": 0.838,
430
- "step": 1825
431
- },
432
- {
433
- "epoch": 0.72,
434
- "learning_rate": 0.0001732986025504348,
435
- "loss": 0.8415,
436
- "step": 1850
437
- },
438
- {
439
- "epoch": 0.73,
440
- "learning_rate": 0.000172603405634104,
441
- "loss": 0.8357,
442
- "step": 1875
443
- },
444
- {
445
- "epoch": 0.74,
446
- "learning_rate": 0.00017190070785548755,
447
- "loss": 0.8311,
448
- "step": 1900
449
- },
450
- {
451
- "epoch": 0.75,
452
- "learning_rate": 0.0001711905818122717,
453
- "loss": 0.8333,
454
- "step": 1925
455
- },
456
- {
457
- "epoch": 0.76,
458
- "learning_rate": 0.0001704731008695777,
459
- "loss": 0.8387,
460
- "step": 1950
461
- },
462
- {
463
- "epoch": 0.77,
464
- "learning_rate": 0.0001697483391523821,
465
- "loss": 0.8442,
466
- "step": 1975
467
- },
468
- {
469
- "epoch": 0.78,
470
- "learning_rate": 0.00016901637153785885,
471
- "loss": 0.8399,
472
- "step": 2000
473
- },
474
- {
475
- "epoch": 0.78,
476
- "eval_loss": 0.8339959383010864,
477
- "eval_runtime": 58.5829,
478
- "eval_samples_per_second": 12.461,
479
- "eval_steps_per_second": 0.905,
480
- "step": 2000
481
- },
482
- {
483
- "epoch": 0.79,
484
- "learning_rate": 0.0001682772736476434,
485
- "loss": 0.8334,
486
- "step": 2025
487
- },
488
- {
489
- "epoch": 0.79,
490
- "learning_rate": 0.0001675311218400201,
491
- "loss": 0.835,
492
- "step": 2050
493
- },
494
- {
495
- "epoch": 0.8,
496
- "learning_rate": 0.00016677799320203332,
497
- "loss": 0.8368,
498
- "step": 2075
499
- },
500
- {
501
- "epoch": 0.81,
502
- "learning_rate": 0.00016601796554152344,
503
- "loss": 0.8278,
504
- "step": 2100
505
- },
506
- {
507
- "epoch": 0.82,
508
- "learning_rate": 0.00016525111737908827,
509
- "loss": 0.8334,
510
- "step": 2125
511
- },
512
- {
513
- "epoch": 0.83,
514
- "learning_rate": 0.00016447752793997096,
515
- "loss": 0.8416,
516
- "step": 2150
517
- },
518
- {
519
- "epoch": 0.84,
520
- "learning_rate": 0.00016369727714587483,
521
- "loss": 0.8297,
522
- "step": 2175
523
- },
524
- {
525
- "epoch": 0.85,
526
- "learning_rate": 0.0001629104456067066,
527
- "loss": 0.8327,
528
- "step": 2200
529
- },
530
- {
531
- "epoch": 0.86,
532
- "learning_rate": 0.00016211711461224825,
533
- "loss": 0.8324,
534
- "step": 2225
535
- },
536
- {
537
- "epoch": 0.87,
538
- "learning_rate": 0.0001613173661237589,
539
- "loss": 0.8313,
540
- "step": 2250
541
- },
542
- {
543
- "epoch": 0.88,
544
- "learning_rate": 0.0001605112827655069,
545
- "loss": 0.8292,
546
- "step": 2275
547
- },
548
- {
549
- "epoch": 0.89,
550
- "learning_rate": 0.0001596989478162339,
551
- "loss": 0.8334,
552
- "step": 2300
553
- },
554
- {
555
- "epoch": 0.9,
556
- "learning_rate": 0.00015888044520055106,
557
- "loss": 0.8352,
558
- "step": 2325
559
- },
560
- {
561
- "epoch": 0.91,
562
- "learning_rate": 0.00015805585948026852,
563
- "loss": 0.823,
564
- "step": 2350
565
- },
566
- {
567
- "epoch": 0.92,
568
- "learning_rate": 0.000157225275845659,
569
- "loss": 0.8293,
570
- "step": 2375
571
- },
572
- {
573
- "epoch": 0.93,
574
- "learning_rate": 0.00015638878010665672,
575
- "loss": 0.8289,
576
- "step": 2400
577
- },
578
- {
579
- "epoch": 0.94,
580
- "learning_rate": 0.00015554645868399205,
581
- "loss": 0.832,
582
- "step": 2425
583
- },
584
- {
585
- "epoch": 0.95,
586
- "learning_rate": 0.00015469839860026308,
587
- "loss": 0.8294,
588
- "step": 2450
589
- },
590
- {
591
- "epoch": 0.96,
592
- "learning_rate": 0.0001538446874709452,
593
- "loss": 0.8281,
594
- "step": 2475
595
- },
596
- {
597
- "epoch": 0.97,
598
- "learning_rate": 0.00015298541349533925,
599
- "loss": 0.8314,
600
- "step": 2500
601
- },
602
- {
603
- "epoch": 0.98,
604
- "learning_rate": 0.00015212066544745926,
605
- "loss": 0.831,
606
- "step": 2525
607
- },
608
- {
609
- "epoch": 0.99,
610
- "learning_rate": 0.00015125053266686124,
611
- "loss": 0.8319,
612
- "step": 2550
613
- },
614
- {
615
- "epoch": 1.0,
616
- "learning_rate": 0.00015037510504941303,
617
- "loss": 0.8259,
618
- "step": 2575
619
- },
620
- {
621
- "epoch": 1.01,
622
- "learning_rate": 0.00014949447303800695,
623
- "loss": 0.8133,
624
- "step": 2600
625
- },
626
- {
627
- "epoch": 1.02,
628
- "learning_rate": 0.00014860872761321593,
629
- "loss": 0.8139,
630
- "step": 2625
631
- },
632
- {
633
- "epoch": 1.03,
634
- "learning_rate": 0.00014771796028389405,
635
- "loss": 0.804,
636
- "step": 2650
637
- },
638
- {
639
- "epoch": 1.04,
640
- "learning_rate": 0.0001468222630777225,
641
- "loss": 0.8011,
642
- "step": 2675
643
- },
644
- {
645
- "epoch": 1.05,
646
- "learning_rate": 0.00014592172853170193,
647
- "loss": 0.8037,
648
- "step": 2700
649
- },
650
- {
651
- "epoch": 1.06,
652
- "learning_rate": 0.00014501644968259212,
653
- "loss": 0.8063,
654
- "step": 2725
655
- },
656
- {
657
- "epoch": 1.07,
658
- "learning_rate": 0.00014410652005730025,
659
- "loss": 0.8155,
660
- "step": 2750
661
- },
662
- {
663
- "epoch": 1.08,
664
- "learning_rate": 0.00014319203366321826,
665
- "loss": 0.8066,
666
- "step": 2775
667
- },
668
- {
669
- "epoch": 1.09,
670
- "learning_rate": 0.0001422730849785107,
671
- "loss": 0.8091,
672
- "step": 2800
673
- },
674
- {
675
- "epoch": 1.1,
676
- "learning_rate": 0.0001413497689423539,
677
- "loss": 0.8067,
678
- "step": 2825
679
- },
680
- {
681
- "epoch": 1.11,
682
- "learning_rate": 0.00014042218094512755,
683
- "loss": 0.8046,
684
- "step": 2850
685
- },
686
- {
687
- "epoch": 1.11,
688
- "learning_rate": 0.00013949041681855985,
689
- "loss": 0.8053,
690
- "step": 2875
691
- },
692
- {
693
- "epoch": 1.12,
694
- "learning_rate": 0.0001385545728258264,
695
- "loss": 0.8075,
696
- "step": 2900
697
- },
698
- {
699
- "epoch": 1.13,
700
- "learning_rate": 0.0001376147456516055,
701
- "loss": 0.8015,
702
- "step": 2925
703
- },
704
- {
705
- "epoch": 1.14,
706
- "learning_rate": 0.00013667103239208903,
707
- "loss": 0.8016,
708
- "step": 2950
709
- },
710
- {
711
- "epoch": 1.15,
712
- "learning_rate": 0.00013572353054495126,
713
- "loss": 0.8029,
714
- "step": 2975
715
- },
716
- {
717
- "epoch": 1.16,
718
- "learning_rate": 0.0001347723379992762,
719
- "loss": 0.8017,
720
- "step": 3000
721
- },
722
- {
723
- "epoch": 1.16,
724
- "eval_loss": 0.8229297995567322,
725
- "eval_runtime": 59.3398,
726
- "eval_samples_per_second": 12.302,
727
- "eval_steps_per_second": 0.893,
728
- "step": 3000
729
- }
730
- ],
731
- "max_steps": 7737,
732
- "num_train_epochs": 3,
733
- "total_flos": 1.2918900565647294e+19,
734
- "trial_name": null,
735
- "trial_params": null
736
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
checkpoint-3000/training_args.bin DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:7292138fecd854f5f17371c439bbd450ee3c48e738b75818b778a55f4e26ef57
3
- size 4027