chansung committed
Commit fbdc666 · verified · 1 parent: 7df4136

Model save

README.md ADDED
@@ -0,0 +1,118 @@
+ ---
+ license: gemma
+ library_name: peft
+ tags:
+ - trl
+ - sft
+ - generated_from_trainer
+ base_model: google/gemma-7b
+ datasets:
+ - generator
+ model-index:
+ - name: gemma-7b-sft-qlora-no-robots
+ results: []
+ ---
+
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
+ should probably proofread and complete it, then remove this comment. -->
+
+ # gemma-7b-sft-qlora-no-robots
+
+ This model is a fine-tuned version of [google/gemma-7b](https://huggingface.co/google/gemma-7b) on the generator dataset.
+ It achieves the following results on the evaluation set:
+ - Loss: 4.2389
+
+ ## Model description
+
+ More information needed
+
+ ## Intended uses & limitations
+
+ More information needed
+
+ ## Training and evaluation data
+
+ More information needed
+
+ ## Training procedure
+
+ ### Training hyperparameters
+
+ The following hyperparameters were used during training:
+ - learning_rate: 0.0002
+ - train_batch_size: 4
+ - eval_batch_size: 4
+ - seed: 42
+ - distributed_type: multi-GPU
+ - num_devices: 4
+ - gradient_accumulation_steps: 2
+ - total_train_batch_size: 32
+ - total_eval_batch_size: 16
+ - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
+ - lr_scheduler_type: cosine
+ - lr_scheduler_warmup_ratio: 0.1
+ - num_epochs: 50
+
+ ### Training results
+
+ | Training Loss | Epoch | Step | Validation Loss |
+ |:-------------:|:-----:|:----:|:---------------:|
+ | 17.168 | 1.0 | 3 | 15.0876 |
+ | 14.9207 | 2.0 | 6 | 8.7644 |
+ | 14.9207 | 3.0 | 9 | 4.8425 |
+ | 7.3214 | 4.0 | 12 | 3.0239 |
+ | 2.9627 | 5.0 | 15 | 2.2565 |
+ | 2.9627 | 6.0 | 18 | 1.8792 |
+ | 1.7971 | 7.0 | 21 | 1.7648 |
+ | 1.7971 | 8.0 | 24 | 1.7012 |
+ | 1.4939 | 9.0 | 27 | 1.5479 |
+ | 1.2756 | 10.0 | 30 | 1.5051 |
+ | 1.2756 | 11.0 | 33 | 1.3975 |
+ | 1.0884 | 12.0 | 36 | 1.4440 |
+ | 1.0884 | 13.0 | 39 | 1.4135 |
+ | 0.9429 | 14.0 | 42 | 1.4587 |
+ | 0.7653 | 15.0 | 45 | 1.4874 |
+ | 0.7653 | 16.0 | 48 | 1.5958 |
+ | 0.6424 | 17.0 | 51 | 1.5928 |
+ | 0.6424 | 18.0 | 54 | 1.6838 |
+ | 0.5346 | 19.0 | 57 | 1.8264 |
+ | 0.4249 | 20.0 | 60 | 1.9655 |
+ | 0.4249 | 21.0 | 63 | 2.1370 |
+ | 0.3347 | 22.0 | 66 | 2.6981 |
+ | 0.3347 | 23.0 | 69 | 2.7131 |
+ | 0.2655 | 24.0 | 72 | 2.7668 |
+ | 0.2026 | 25.0 | 75 | 2.8615 |
+ | 0.2026 | 26.0 | 78 | 3.1596 |
+ | 0.1588 | 27.0 | 81 | 3.3286 |
+ | 0.1588 | 28.0 | 84 | 3.5463 |
+ | 0.1319 | 29.0 | 87 | 3.3686 |
+ | 0.1111 | 30.0 | 90 | 3.6859 |
+ | 0.1111 | 31.0 | 93 | 3.7810 |
+ | 0.0939 | 32.0 | 96 | 3.7559 |
+ | 0.0939 | 33.0 | 99 | 3.9164 |
+ | 0.082 | 34.0 | 102 | 3.9693 |
+ | 0.0709 | 35.0 | 105 | 4.0430 |
+ | 0.0709 | 36.0 | 108 | 4.1017 |
+ | 0.0638 | 37.0 | 111 | 4.1449 |
+ | 0.0638 | 38.0 | 114 | 4.1639 |
+ | 0.0597 | 39.0 | 117 | 4.1880 |
+ | 0.0556 | 40.0 | 120 | 4.2123 |
+ | 0.0556 | 41.0 | 123 | 4.2196 |
+ | 0.0535 | 42.0 | 126 | 4.2262 |
+ | 0.0535 | 43.0 | 129 | 4.2301 |
+ | 0.0521 | 44.0 | 132 | 4.2314 |
+ | 0.0521 | 45.0 | 135 | 4.2365 |
+ | 0.0521 | 46.0 | 138 | 4.2350 |
+ | 0.0525 | 47.0 | 141 | 4.2364 |
+ | 0.0525 | 48.0 | 144 | 4.2320 |
+ | 0.0509 | 49.0 | 147 | 4.2361 |
+ | 0.0505 | 50.0 | 150 | 4.2389 |
+
+
+ ### Framework versions
+
+ - PEFT 0.7.1
+ - Transformers 4.39.0.dev0
+ - Pytorch 2.2.2+cu121
+ - Datasets 2.14.6
+ - Tokenizers 0.15.2
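
The hyperparameters in the card above describe a TRL SFT run of a QLoRA adapter on google/gemma-7b across 4 GPUs. As a rough, hedged sketch only (the training script itself is not part of this commit), they map onto `transformers.TrainingArguments` roughly as follows; the precision flag is an assumption:

```python
from transformers import TrainingArguments

# Hedged reconstruction of the hyperparameters reported in the model card above;
# the actual training script is not included in this commit.
training_args = TrainingArguments(
    output_dir="gemma-7b-sft-qlora-no-robots",
    learning_rate=2e-4,
    per_device_train_batch_size=4,   # train_batch_size: 4
    per_device_eval_batch_size=4,    # eval_batch_size: 4
    gradient_accumulation_steps=2,   # 4 devices x 4 x 2 = total_train_batch_size 32
    num_train_epochs=50,
    lr_scheduler_type="cosine",
    warmup_ratio=0.1,
    seed=42,
    bf16=True,                       # assumption: precision is not stated in the card
)
# Adam betas=(0.9, 0.999) and epsilon=1e-08 match the TrainingArguments defaults.
```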
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:e6be4368d41073daecac18bce49e11d7383c6c086b444be7a16796d8cacf18aa
+ oid sha256:91039626473e4cb5c0fa03d2c01cc67cc749e62f55112ce48049e7a9d69817ec
  size 100060536
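
The change above replaces the LoRA adapter weights (`adapter_model.safetensors`) saved by PEFT. A minimal sketch of loading the adapter on top of the base model, assuming this repo is published as `chansung/gemma-7b-sft-qlora-no-robots` (the repo id is an assumption) and that you have access to `google/gemma-7b`:

```python
import torch
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer

base_id = "google/gemma-7b"
adapter_id = "chansung/gemma-7b-sft-qlora-no-robots"  # assumed repo id for this adapter

tokenizer = AutoTokenizer.from_pretrained(base_id)
base_model = AutoModelForCausalLM.from_pretrained(
    base_id, torch_dtype=torch.bfloat16, device_map="auto"
)
# Applies the adapter_model.safetensors weights from this commit on top of the base model.
model = PeftModel.from_pretrained(base_model, adapter_id)

inputs = tokenizer("Write a short note about QLoRA.", return_tensors="pt").to(base_model.device)
outputs = model.generate(**inputs, max_new_tokens=64)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```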
all_results.json ADDED
@@ -0,0 +1,8 @@
+ {
+ "epoch": 50.0,
+ "train_loss": 1.2187181319793066,
+ "train_runtime": 1336.0597,
+ "train_samples": 926,
+ "train_samples_per_second": 3.256,
+ "train_steps_per_second": 0.112
+ }
runs/Apr09_02-14-22_deep-diver-main-silent-eagle-1-0-0/events.out.tfevents.1712643385.deep-diver-main-silent-eagle-1-0-0.585.0 CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:1f3d80f50b420db512037fa96b2b95afcdc5a426b141de6c75e37ed4149da1c8
- size 18411
+ oid sha256:e44c0e8a00fbb8bd0844082d7cf225f1cf63320795c06f79e37a3394b08d1586
+ size 25417
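
The updated `events.out.tfevents.*` file under `runs/` carries the TensorBoard scalars logged during this run. A minimal sketch of inspecting it programmatically with TensorBoard's event accumulator, assuming the repository has been cloned locally; the available scalar tag names are not listed in this commit, so they are printed rather than assumed:

```python
from tensorboard.backend.event_processing.event_accumulator import EventAccumulator

# Run directory added in this commit (path relative to a local clone of the repo).
run_dir = "runs/Apr09_02-14-22_deep-diver-main-silent-eagle-1-0-0"

acc = EventAccumulator(run_dir)
acc.Reload()                          # parse the tfevents file
scalar_tags = acc.Tags()["scalars"]   # e.g. training/eval loss curves
print(scalar_tags)
for event in acc.Scalars(scalar_tags[0]):
    print(event.step, event.value)
```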
train_results.json ADDED
@@ -0,0 +1,8 @@
+ {
+ "epoch": 50.0,
+ "train_loss": 1.2187181319793066,
+ "train_runtime": 1336.0597,
+ "train_samples": 926,
+ "train_samples_per_second": 3.256,
+ "train_steps_per_second": 0.112
+ }
trainer_state.json ADDED
@@ -0,0 +1,647 @@
+ {
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 50.0,
+ "eval_steps": 500,
+ "global_step": 150,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "epoch": 0.33,
+ "grad_norm": 74.0,
+ "learning_rate": 1.3333333333333333e-05,
+ "loss": 17.168,
+ "step": 1
+ },
+ {
+ "epoch": 1.0,
+ "eval_loss": 15.087593078613281,
+ "eval_runtime": 1.3097,
+ "eval_samples_per_second": 1.527,
+ "eval_steps_per_second": 0.764,
+ "step": 3
+ },
+ {
+ "epoch": 1.67,
+ "grad_norm": 67.5,
+ "learning_rate": 6.666666666666667e-05,
+ "loss": 14.9207,
+ "step": 5
+ },
+ {
+ "epoch": 2.0,
+ "eval_loss": 8.764444351196289,
+ "eval_runtime": 1.3178,
+ "eval_samples_per_second": 1.518,
+ "eval_steps_per_second": 0.759,
+ "step": 6
+ },
+ {
+ "epoch": 3.0,
+ "eval_loss": 4.842519760131836,
+ "eval_runtime": 1.321,
+ "eval_samples_per_second": 1.514,
+ "eval_steps_per_second": 0.757,
+ "step": 9
+ },
+ {
+ "epoch": 3.33,
+ "grad_norm": 15.875,
+ "learning_rate": 0.00013333333333333334,
+ "loss": 7.3214,
+ "step": 10
+ },
+ {
+ "epoch": 4.0,
+ "eval_loss": 3.023890495300293,
+ "eval_runtime": 1.3205,
+ "eval_samples_per_second": 1.515,
+ "eval_steps_per_second": 0.757,
+ "step": 12
+ },
+ {
+ "epoch": 5.0,
+ "grad_norm": 5.53125,
+ "learning_rate": 0.0002,
+ "loss": 2.9627,
+ "step": 15
+ },
+ {
+ "epoch": 5.0,
+ "eval_loss": 2.256518602371216,
+ "eval_runtime": 1.3203,
+ "eval_samples_per_second": 1.515,
+ "eval_steps_per_second": 0.757,
+ "step": 15
+ },
+ {
+ "epoch": 6.0,
+ "eval_loss": 1.8792051076889038,
+ "eval_runtime": 1.3249,
+ "eval_samples_per_second": 1.51,
+ "eval_steps_per_second": 0.755,
+ "step": 18
+ },
+ {
+ "epoch": 6.67,
+ "grad_norm": 1.640625,
+ "learning_rate": 0.00019932383577419432,
+ "loss": 1.7971,
+ "step": 20
+ },
+ {
+ "epoch": 7.0,
+ "eval_loss": 1.7647607326507568,
+ "eval_runtime": 1.324,
+ "eval_samples_per_second": 1.511,
+ "eval_steps_per_second": 0.755,
+ "step": 21
+ },
+ {
+ "epoch": 8.0,
+ "eval_loss": 1.7012015581130981,
+ "eval_runtime": 1.3248,
+ "eval_samples_per_second": 1.51,
+ "eval_steps_per_second": 0.755,
+ "step": 24
+ },
+ {
+ "epoch": 8.33,
+ "grad_norm": 2.234375,
+ "learning_rate": 0.00019730448705798239,
+ "loss": 1.4939,
+ "step": 25
+ },
+ {
+ "epoch": 9.0,
+ "eval_loss": 1.547886610031128,
+ "eval_runtime": 1.3259,
+ "eval_samples_per_second": 1.508,
+ "eval_steps_per_second": 0.754,
+ "step": 27
+ },
+ {
+ "epoch": 10.0,
+ "grad_norm": 1.78125,
+ "learning_rate": 0.00019396926207859084,
+ "loss": 1.2756,
+ "step": 30
+ },
+ {
+ "epoch": 10.0,
+ "eval_loss": 1.5050724744796753,
+ "eval_runtime": 1.3222,
+ "eval_samples_per_second": 1.513,
+ "eval_steps_per_second": 0.756,
+ "step": 30
+ },
+ {
+ "epoch": 11.0,
+ "eval_loss": 1.3975391387939453,
+ "eval_runtime": 1.3256,
+ "eval_samples_per_second": 1.509,
+ "eval_steps_per_second": 0.754,
+ "step": 33
+ },
+ {
+ "epoch": 11.67,
+ "grad_norm": 1.1796875,
+ "learning_rate": 0.00018936326403234125,
+ "loss": 1.0884,
+ "step": 35
+ },
+ {
+ "epoch": 12.0,
+ "eval_loss": 1.444014549255371,
+ "eval_runtime": 1.325,
+ "eval_samples_per_second": 1.509,
+ "eval_steps_per_second": 0.755,
+ "step": 36
+ },
+ {
+ "epoch": 13.0,
+ "eval_loss": 1.413475513458252,
+ "eval_runtime": 1.3258,
+ "eval_samples_per_second": 1.509,
+ "eval_steps_per_second": 0.754,
+ "step": 39
+ },
+ {
+ "epoch": 13.33,
+ "grad_norm": 1.3046875,
+ "learning_rate": 0.00018354878114129367,
+ "loss": 0.9429,
+ "step": 40
+ },
+ {
+ "epoch": 14.0,
+ "eval_loss": 1.4587093591690063,
+ "eval_runtime": 1.3247,
+ "eval_samples_per_second": 1.51,
+ "eval_steps_per_second": 0.755,
+ "step": 42
+ },
+ {
+ "epoch": 15.0,
+ "grad_norm": 2.21875,
+ "learning_rate": 0.0001766044443118978,
+ "loss": 0.7653,
+ "step": 45
+ },
+ {
+ "epoch": 15.0,
+ "eval_loss": 1.487448811531067,
+ "eval_runtime": 1.3231,
+ "eval_samples_per_second": 1.512,
+ "eval_steps_per_second": 0.756,
+ "step": 45
+ },
+ {
+ "epoch": 16.0,
+ "eval_loss": 1.5958000421524048,
+ "eval_runtime": 1.3242,
+ "eval_samples_per_second": 1.51,
+ "eval_steps_per_second": 0.755,
+ "step": 48
+ },
+ {
+ "epoch": 16.67,
+ "grad_norm": 0.9921875,
+ "learning_rate": 0.0001686241637868734,
+ "loss": 0.6424,
+ "step": 50
+ },
+ {
+ "epoch": 17.0,
+ "eval_loss": 1.592842698097229,
+ "eval_runtime": 1.3242,
+ "eval_samples_per_second": 1.51,
+ "eval_steps_per_second": 0.755,
+ "step": 51
+ },
+ {
+ "epoch": 18.0,
+ "eval_loss": 1.683807373046875,
+ "eval_runtime": 1.3252,
+ "eval_samples_per_second": 1.509,
+ "eval_steps_per_second": 0.755,
+ "step": 54
+ },
+ {
+ "epoch": 18.33,
+ "grad_norm": 4.78125,
+ "learning_rate": 0.00015971585917027862,
+ "loss": 0.5346,
+ "step": 55
+ },
+ {
+ "epoch": 19.0,
+ "eval_loss": 1.8263951539993286,
+ "eval_runtime": 1.3248,
+ "eval_samples_per_second": 1.51,
+ "eval_steps_per_second": 0.755,
+ "step": 57
+ },
+ {
+ "epoch": 20.0,
+ "grad_norm": 0.99609375,
+ "learning_rate": 0.00015000000000000001,
+ "loss": 0.4249,
+ "step": 60
+ },
+ {
+ "epoch": 20.0,
+ "eval_loss": 1.9654637575149536,
+ "eval_runtime": 1.3233,
+ "eval_samples_per_second": 1.511,
+ "eval_steps_per_second": 0.756,
+ "step": 60
+ },
+ {
+ "epoch": 21.0,
+ "eval_loss": 2.137000560760498,
+ "eval_runtime": 1.3247,
+ "eval_samples_per_second": 1.51,
+ "eval_steps_per_second": 0.755,
+ "step": 63
+ },
+ {
+ "epoch": 21.67,
+ "grad_norm": 1.484375,
+ "learning_rate": 0.0001396079766039157,
+ "loss": 0.3347,
+ "step": 65
+ },
+ {
+ "epoch": 22.0,
+ "eval_loss": 2.698075294494629,
+ "eval_runtime": 1.3255,
+ "eval_samples_per_second": 1.509,
+ "eval_steps_per_second": 0.754,
+ "step": 66
+ },
+ {
+ "epoch": 23.0,
+ "eval_loss": 2.713052272796631,
+ "eval_runtime": 1.3259,
+ "eval_samples_per_second": 1.508,
+ "eval_steps_per_second": 0.754,
+ "step": 69
+ },
+ {
+ "epoch": 23.33,
+ "grad_norm": 1.71875,
+ "learning_rate": 0.00012868032327110904,
+ "loss": 0.2655,
+ "step": 70
+ },
+ {
+ "epoch": 24.0,
+ "eval_loss": 2.7668490409851074,
+ "eval_runtime": 1.3252,
+ "eval_samples_per_second": 1.509,
+ "eval_steps_per_second": 0.755,
+ "step": 72
+ },
+ {
+ "epoch": 25.0,
+ "grad_norm": 0.734375,
+ "learning_rate": 0.00011736481776669306,
+ "loss": 0.2026,
+ "step": 75
+ },
+ {
+ "epoch": 25.0,
+ "eval_loss": 2.8614566326141357,
+ "eval_runtime": 1.3226,
+ "eval_samples_per_second": 1.512,
+ "eval_steps_per_second": 0.756,
+ "step": 75
+ },
+ {
+ "epoch": 26.0,
+ "eval_loss": 3.1595633029937744,
+ "eval_runtime": 1.3267,
+ "eval_samples_per_second": 1.507,
+ "eval_steps_per_second": 0.754,
+ "step": 78
+ },
+ {
+ "epoch": 26.67,
+ "grad_norm": 0.73828125,
+ "learning_rate": 0.00010581448289104758,
+ "loss": 0.1588,
+ "step": 80
+ },
+ {
+ "epoch": 27.0,
+ "eval_loss": 3.3285796642303467,
+ "eval_runtime": 1.3243,
+ "eval_samples_per_second": 1.51,
+ "eval_steps_per_second": 0.755,
+ "step": 81
+ },
+ {
+ "epoch": 28.0,
+ "eval_loss": 3.546278953552246,
+ "eval_runtime": 1.3248,
+ "eval_samples_per_second": 1.51,
+ "eval_steps_per_second": 0.755,
+ "step": 84
+ },
+ {
+ "epoch": 28.33,
+ "grad_norm": 1.015625,
+ "learning_rate": 9.418551710895243e-05,
+ "loss": 0.1319,
+ "step": 85
+ },
+ {
+ "epoch": 29.0,
+ "eval_loss": 3.3686463832855225,
+ "eval_runtime": 1.3245,
+ "eval_samples_per_second": 1.51,
+ "eval_steps_per_second": 0.755,
+ "step": 87
+ },
+ {
+ "epoch": 30.0,
+ "grad_norm": 0.56640625,
+ "learning_rate": 8.263518223330697e-05,
+ "loss": 0.1111,
+ "step": 90
+ },
+ {
+ "epoch": 30.0,
+ "eval_loss": 3.685863733291626,
+ "eval_runtime": 1.3215,
+ "eval_samples_per_second": 1.513,
+ "eval_steps_per_second": 0.757,
+ "step": 90
+ },
+ {
+ "epoch": 31.0,
+ "eval_loss": 3.780993700027466,
+ "eval_runtime": 1.3257,
+ "eval_samples_per_second": 1.509,
+ "eval_steps_per_second": 0.754,
+ "step": 93
+ },
+ {
+ "epoch": 31.67,
+ "grad_norm": 0.4375,
+ "learning_rate": 7.131967672889101e-05,
+ "loss": 0.0939,
+ "step": 95
+ },
+ {
+ "epoch": 32.0,
+ "eval_loss": 3.7559256553649902,
+ "eval_runtime": 1.3244,
+ "eval_samples_per_second": 1.51,
+ "eval_steps_per_second": 0.755,
+ "step": 96
+ },
+ {
+ "epoch": 33.0,
+ "eval_loss": 3.916355848312378,
+ "eval_runtime": 1.3252,
+ "eval_samples_per_second": 1.509,
+ "eval_steps_per_second": 0.755,
+ "step": 99
+ },
+ {
+ "epoch": 33.33,
+ "grad_norm": 0.390625,
+ "learning_rate": 6.039202339608432e-05,
+ "loss": 0.082,
+ "step": 100
+ },
+ {
+ "epoch": 34.0,
+ "eval_loss": 3.9693491458892822,
+ "eval_runtime": 1.3217,
+ "eval_samples_per_second": 1.513,
+ "eval_steps_per_second": 0.757,
+ "step": 102
+ },
+ {
+ "epoch": 35.0,
+ "grad_norm": 0.283203125,
+ "learning_rate": 5.000000000000002e-05,
+ "loss": 0.0709,
+ "step": 105
+ },
+ {
+ "epoch": 35.0,
+ "eval_loss": 4.04301118850708,
+ "eval_runtime": 1.3186,
+ "eval_samples_per_second": 1.517,
+ "eval_steps_per_second": 0.758,
+ "step": 105
+ },
+ {
+ "epoch": 36.0,
+ "eval_loss": 4.101677417755127,
+ "eval_runtime": 1.3238,
+ "eval_samples_per_second": 1.511,
+ "eval_steps_per_second": 0.755,
+ "step": 108
+ },
+ {
+ "epoch": 36.67,
+ "grad_norm": 0.27734375,
+ "learning_rate": 4.028414082972141e-05,
+ "loss": 0.0638,
+ "step": 110
+ },
+ {
+ "epoch": 37.0,
+ "eval_loss": 4.144949913024902,
+ "eval_runtime": 1.3248,
+ "eval_samples_per_second": 1.51,
+ "eval_steps_per_second": 0.755,
+ "step": 111
+ },
+ {
+ "epoch": 38.0,
+ "eval_loss": 4.1638994216918945,
+ "eval_runtime": 1.3218,
+ "eval_samples_per_second": 1.513,
+ "eval_steps_per_second": 0.757,
+ "step": 114
+ },
+ {
+ "epoch": 38.33,
+ "grad_norm": 0.3125,
+ "learning_rate": 3.137583621312665e-05,
+ "loss": 0.0597,
+ "step": 115
+ },
+ {
+ "epoch": 39.0,
+ "eval_loss": 4.187974452972412,
+ "eval_runtime": 1.3246,
+ "eval_samples_per_second": 1.51,
+ "eval_steps_per_second": 0.755,
+ "step": 117
+ },
+ {
+ "epoch": 40.0,
+ "grad_norm": 0.318359375,
+ "learning_rate": 2.339555568810221e-05,
+ "loss": 0.0556,
+ "step": 120
+ },
+ {
+ "epoch": 40.0,
+ "eval_loss": 4.212304592132568,
+ "eval_runtime": 1.3229,
+ "eval_samples_per_second": 1.512,
+ "eval_steps_per_second": 0.756,
+ "step": 120
+ },
+ {
+ "epoch": 41.0,
+ "eval_loss": 4.219560623168945,
+ "eval_runtime": 1.3269,
+ "eval_samples_per_second": 1.507,
+ "eval_steps_per_second": 0.754,
+ "step": 123
+ },
+ {
+ "epoch": 41.67,
+ "grad_norm": 0.201171875,
+ "learning_rate": 1.6451218858706374e-05,
+ "loss": 0.0535,
+ "step": 125
+ },
+ {
+ "epoch": 42.0,
+ "eval_loss": 4.226192951202393,
+ "eval_runtime": 1.3243,
+ "eval_samples_per_second": 1.51,
+ "eval_steps_per_second": 0.755,
+ "step": 126
+ },
+ {
+ "epoch": 43.0,
+ "eval_loss": 4.230067729949951,
+ "eval_runtime": 1.3252,
+ "eval_samples_per_second": 1.509,
+ "eval_steps_per_second": 0.755,
+ "step": 129
+ },
+ {
+ "epoch": 43.33,
+ "grad_norm": 0.2490234375,
+ "learning_rate": 1.0636735967658784e-05,
+ "loss": 0.0521,
+ "step": 130
+ },
+ {
+ "epoch": 44.0,
+ "eval_loss": 4.231449604034424,
+ "eval_runtime": 1.3262,
+ "eval_samples_per_second": 1.508,
+ "eval_steps_per_second": 0.754,
+ "step": 132
+ },
+ {
+ "epoch": 45.0,
+ "grad_norm": 0.10986328125,
+ "learning_rate": 6.030737921409169e-06,
+ "loss": 0.0521,
+ "step": 135
+ },
+ {
+ "epoch": 45.0,
+ "eval_loss": 4.2365241050720215,
+ "eval_runtime": 1.3238,
+ "eval_samples_per_second": 1.511,
+ "eval_steps_per_second": 0.755,
+ "step": 135
+ },
+ {
+ "epoch": 46.0,
+ "eval_loss": 4.234958171844482,
+ "eval_runtime": 1.3255,
+ "eval_samples_per_second": 1.509,
+ "eval_steps_per_second": 0.754,
+ "step": 138
+ },
+ {
+ "epoch": 46.67,
+ "grad_norm": 0.12451171875,
+ "learning_rate": 2.6955129420176196e-06,
+ "loss": 0.0525,
+ "step": 140
+ },
+ {
+ "epoch": 47.0,
+ "eval_loss": 4.236445426940918,
+ "eval_runtime": 1.3261,
+ "eval_samples_per_second": 1.508,
+ "eval_steps_per_second": 0.754,
+ "step": 141
+ },
+ {
+ "epoch": 48.0,
+ "eval_loss": 4.231955051422119,
+ "eval_runtime": 1.3237,
+ "eval_samples_per_second": 1.511,
+ "eval_steps_per_second": 0.755,
+ "step": 144
+ },
+ {
+ "epoch": 48.33,
+ "grad_norm": 0.1357421875,
+ "learning_rate": 6.761642258056978e-07,
+ "loss": 0.0509,
+ "step": 145
+ },
+ {
+ "epoch": 49.0,
+ "eval_loss": 4.236112594604492,
+ "eval_runtime": 1.3264,
+ "eval_samples_per_second": 1.508,
+ "eval_steps_per_second": 0.754,
+ "step": 147
+ },
+ {
+ "epoch": 50.0,
+ "grad_norm": 0.09326171875,
+ "learning_rate": 0.0,
+ "loss": 0.0505,
+ "step": 150
+ },
+ {
+ "epoch": 50.0,
+ "eval_loss": 4.238922595977783,
+ "eval_runtime": 1.3223,
+ "eval_samples_per_second": 1.513,
+ "eval_steps_per_second": 0.756,
+ "step": 150
+ },
+ {
+ "epoch": 50.0,
+ "step": 150,
+ "total_flos": 4.601366279314473e+17,
+ "train_loss": 1.2187181319793066,
+ "train_runtime": 1336.0597,
+ "train_samples_per_second": 3.256,
+ "train_steps_per_second": 0.112
+ }
+ ],
+ "logging_steps": 5,
+ "max_steps": 150,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 50,
+ "save_steps": 100,
+ "total_flos": 4.601366279314473e+17,
+ "train_batch_size": 4,
+ "trial_name": null,
+ "trial_params": null
+ }
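
`trainer_state.json` records the full `log_history`: training entries carry `loss`, `grad_norm`, and `learning_rate`, while the per-epoch evaluation entries carry `eval_loss`. A minimal sketch of extracting the evaluation curve from it (the same numbers as the results table in the README above):

```python
import json

with open("trainer_state.json") as f:
    state = json.load(f)

# Evaluation entries are the log_history items that include an "eval_loss" key.
eval_points = [
    (entry["epoch"], entry["step"], entry["eval_loss"])
    for entry in state["log_history"]
    if "eval_loss" in entry
]

for epoch, step, loss in eval_points:
    print(f"epoch {epoch:5.1f}  step {step:4d}  eval_loss {loss:.4f}")
```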