Jinchen Ge committed on
Commit
36baff5
1 Parent(s): fd841b9

Add fine-tuned model

Browse files
README.md ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ tags:
3
+ - generated_from_trainer
4
+ datasets:
5
+ - wikitext
6
+ model-index:
7
+ - name: clm_output_medium
8
+ results: []
9
+ ---
10
+
11
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
12
+ should probably proofread and complete it, then remove this comment. -->
13
+
14
+ # clm_output_medium
15
+
16
+ This model is a fine-tuned version of [gpt2-medium](https://huggingface.co/gpt2-medium) on the wikitext dataset (wikitext-103-raw-v1 configuration).
17
+ It achieves the following results on the evaluation set:
18
+ - Loss: 2.6973
19
+
20
+ ## Model description
21
+
22
+ More information needed
23
+
24
+ ## Intended uses & limitations
25
+
26
+ More information needed
27
+
28
+ ## Training and evaluation data
29
+
30
+ More information needed
31
+
32
+ ## Training procedure
33
+
34
+ ### Training hyperparameters
35
+
36
+ The following hyperparameters were used during training:
37
+ - learning_rate: 1e-05
38
+ - train_batch_size: 1
39
+ - eval_batch_size: 1
40
+ - seed: 42
41
+ - distributed_type: IPU
42
+ - gradient_accumulation_steps: 256
43
+ - total_train_batch_size: 1024
44
+ - total_eval_batch_size: 18
45
+ - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
46
+ - lr_scheduler_type: linear
47
+ - lr_scheduler_warmup_ratio: 0.1
48
+ - num_epochs: 10.0
49
+ - training precision: Mixed Precision
50
+
51
+ ### Training results
52
+
53
+
54
+
55
+ ### Framework versions
56
+
57
+ - Transformers 4.18.0.dev0
58
+ - Pytorch 1.10.0+cpu
59
+ - Datasets 2.0.0
60
+ - Tokenizers 0.11.6
all_results.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 10.0,
3
+ "eval_loss": 2.697265625,
4
+ "eval_runtime": 1045.8094,
5
+ "eval_samples": 240,
6
+ "eval_samples_per_second": 0.229,
7
+ "eval_steps_per_second": 0.013,
8
+ "perplexity": 14.83910053420958,
9
+ "train_loss": 2.8070910754504506,
10
+ "train_runtime": 11217.8167,
11
+ "train_samples": 114248,
12
+ "train_samples_per_second": 101.845,
13
+ "train_steps_per_second": 0.099
14
+ }
config.json ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "gpt2-medium",
3
+ "activation_function": "gelu_new",
4
+ "architectures": [
5
+ "PoptorchPipelinedGPT2LMHeadModel"
6
+ ],
7
+ "attn_pdrop": 0.1,
8
+ "bos_token_id": 50256,
9
+ "embd_pdrop": 0.1,
10
+ "eos_token_id": 50256,
11
+ "initializer_range": 0.02,
12
+ "layer_norm_epsilon": 1e-05,
13
+ "model_type": "gpt2",
14
+ "n_ctx": 1024,
15
+ "n_embd": 1024,
16
+ "n_head": 16,
17
+ "n_inner": null,
18
+ "n_layer": 24,
19
+ "n_positions": 1024,
20
+ "n_special": 0,
21
+ "predict_special_tokens": true,
22
+ "reorder_and_upcast_attn": false,
23
+ "resid_pdrop": 0.1,
24
+ "scale_attn_by_inverse_layer_idx": false,
25
+ "scale_attn_weights": true,
26
+ "summary_activation": null,
27
+ "summary_first_dropout": 0.1,
28
+ "summary_proj_to_labels": true,
29
+ "summary_type": "cls_index",
30
+ "summary_use_proj": true,
31
+ "task_specific_params": {
32
+ "text-generation": {
33
+ "do_sample": true,
34
+ "max_length": 50
35
+ }
36
+ },
37
+ "torch_dtype": "float16",
38
+ "transformers_version": "4.18.0.dev0",
39
+ "use_cache": true,
40
+ "vocab_size": 50257
41
+ }
eval_results.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 10.0,
3
+ "eval_loss": 2.697265625,
4
+ "eval_runtime": 1045.8094,
5
+ "eval_samples": 240,
6
+ "eval_samples_per_second": 0.229,
7
+ "eval_steps_per_second": 0.013,
8
+ "perplexity": 14.83910053420958
9
+ }
ipu_config.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "device_iterations": 2,
3
+ "embedding_serialization_factor": 5,
4
+ "enable_half_first_order_momentum": true,
5
+ "enable_half_partials": true,
6
+ "executable_cache_dir": "./exe_cache",
7
+ "gradient_accumulation_steps": 256,
8
+ "inference_device_iterations": 9,
9
+ "inference_replication_factor": 2,
10
+ "ipus_per_replica": 8,
11
+ "layers_per_ipu": [
12
+ 0,
13
+ 3,
14
+ 3,
15
+ 3,
16
+ 3,
17
+ 4,
18
+ 4,
19
+ 4
20
+ ],
21
+ "matmul_proportion": 0.25,
22
+ "optimizer_state_offchip": true,
23
+ "optimum_version": "1.0.0",
24
+ "output_mode": "final",
25
+ "profile_dir": "",
26
+ "recompute_checkpoint_every_layer": true,
27
+ "replicated_tensor_sharding": true,
28
+ "replication_factor": 2,
29
+ "seed": 42,
30
+ "use_popdist": false
31
+ }
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6d31e1d5b593aa0a1bae1062b163e60bc9e0c9d6a7d0f0c7b147d053027cf921
3
+ size 734915801
special_tokens_map.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"bos_token": "<|endoftext|>", "eos_token": "<|endoftext|>", "unk_token": "<|endoftext|>"}
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"unk_token": "<|endoftext|>", "bos_token": "<|endoftext|>", "eos_token": "<|endoftext|>", "add_prefix_space": false, "model_max_length": 1024, "special_tokens_map_file": null, "name_or_path": "gpt2-medium", "tokenizer_class": "GPT2Tokenizer"}
train_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 10.0,
3
+ "train_loss": 2.8070910754504506,
4
+ "train_runtime": 11217.8167,
5
+ "train_samples": 114248,
6
+ "train_samples_per_second": 101.845,
7
+ "train_steps_per_second": 0.099
8
+ }
trainer_state.json ADDED
@@ -0,0 +1,1357 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 10.0,
5
+ "global_step": 1110,
6
+ "is_hyper_param_search": false,
7
+ "is_local_process_zero": true,
8
+ "is_world_process_zero": true,
9
+ "log_history": [
10
+ {
11
+ "epoch": 0.05,
12
+ "learning_rate": 4.504504504504505e-07,
13
+ "loss": 3.2559,
14
+ "step": 5
15
+ },
16
+ {
17
+ "epoch": 0.09,
18
+ "learning_rate": 9.00900900900901e-07,
19
+ "loss": 3.2281,
20
+ "step": 10
21
+ },
22
+ {
23
+ "epoch": 0.14,
24
+ "learning_rate": 1.3513513513513515e-06,
25
+ "loss": 3.1953,
26
+ "step": 15
27
+ },
28
+ {
29
+ "epoch": 0.18,
30
+ "learning_rate": 1.801801801801802e-06,
31
+ "loss": 3.0844,
32
+ "step": 20
33
+ },
34
+ {
35
+ "epoch": 0.23,
36
+ "learning_rate": 2.2522522522522524e-06,
37
+ "loss": 3.2563,
38
+ "step": 25
39
+ },
40
+ {
41
+ "epoch": 0.27,
42
+ "learning_rate": 2.702702702702703e-06,
43
+ "loss": 3.098,
44
+ "step": 30
45
+ },
46
+ {
47
+ "epoch": 0.32,
48
+ "learning_rate": 3.1531531531531532e-06,
49
+ "loss": 2.9754,
50
+ "step": 35
51
+ },
52
+ {
53
+ "epoch": 0.36,
54
+ "learning_rate": 3.603603603603604e-06,
55
+ "loss": 3.1168,
56
+ "step": 40
57
+ },
58
+ {
59
+ "epoch": 0.41,
60
+ "learning_rate": 4.0540540540540545e-06,
61
+ "loss": 3.2441,
62
+ "step": 45
63
+ },
64
+ {
65
+ "epoch": 0.45,
66
+ "learning_rate": 4.504504504504505e-06,
67
+ "loss": 3.084,
68
+ "step": 50
69
+ },
70
+ {
71
+ "epoch": 0.5,
72
+ "learning_rate": 4.954954954954955e-06,
73
+ "loss": 3.1965,
74
+ "step": 55
75
+ },
76
+ {
77
+ "epoch": 0.54,
78
+ "learning_rate": 5.405405405405406e-06,
79
+ "loss": 2.9336,
80
+ "step": 60
81
+ },
82
+ {
83
+ "epoch": 0.59,
84
+ "learning_rate": 5.855855855855856e-06,
85
+ "loss": 2.977,
86
+ "step": 65
87
+ },
88
+ {
89
+ "epoch": 0.63,
90
+ "learning_rate": 6.3063063063063065e-06,
91
+ "loss": 3.0297,
92
+ "step": 70
93
+ },
94
+ {
95
+ "epoch": 0.68,
96
+ "learning_rate": 6.7567567567567575e-06,
97
+ "loss": 2.9363,
98
+ "step": 75
99
+ },
100
+ {
101
+ "epoch": 0.72,
102
+ "learning_rate": 7.207207207207208e-06,
103
+ "loss": 2.9477,
104
+ "step": 80
105
+ },
106
+ {
107
+ "epoch": 0.77,
108
+ "learning_rate": 7.657657657657658e-06,
109
+ "loss": 3.0219,
110
+ "step": 85
111
+ },
112
+ {
113
+ "epoch": 0.81,
114
+ "learning_rate": 8.108108108108109e-06,
115
+ "loss": 2.9004,
116
+ "step": 90
117
+ },
118
+ {
119
+ "epoch": 0.86,
120
+ "learning_rate": 8.55855855855856e-06,
121
+ "loss": 2.991,
122
+ "step": 95
123
+ },
124
+ {
125
+ "epoch": 0.9,
126
+ "learning_rate": 9.00900900900901e-06,
127
+ "loss": 2.8973,
128
+ "step": 100
129
+ },
130
+ {
131
+ "epoch": 0.95,
132
+ "learning_rate": 9.45945945945946e-06,
133
+ "loss": 2.9184,
134
+ "step": 105
135
+ },
136
+ {
137
+ "epoch": 0.99,
138
+ "learning_rate": 9.90990990990991e-06,
139
+ "loss": 2.868,
140
+ "step": 110
141
+ },
142
+ {
143
+ "epoch": 1.04,
144
+ "learning_rate": 9.95995995995996e-06,
145
+ "loss": 2.8301,
146
+ "step": 115
147
+ },
148
+ {
149
+ "epoch": 1.08,
150
+ "learning_rate": 9.90990990990991e-06,
151
+ "loss": 2.716,
152
+ "step": 120
153
+ },
154
+ {
155
+ "epoch": 1.13,
156
+ "learning_rate": 9.85985985985986e-06,
157
+ "loss": 2.8859,
158
+ "step": 125
159
+ },
160
+ {
161
+ "epoch": 1.17,
162
+ "learning_rate": 9.80980980980981e-06,
163
+ "loss": 2.8418,
164
+ "step": 130
165
+ },
166
+ {
167
+ "epoch": 1.22,
168
+ "learning_rate": 9.75975975975976e-06,
169
+ "loss": 2.8004,
170
+ "step": 135
171
+ },
172
+ {
173
+ "epoch": 1.26,
174
+ "learning_rate": 9.70970970970971e-06,
175
+ "loss": 2.8203,
176
+ "step": 140
177
+ },
178
+ {
179
+ "epoch": 1.31,
180
+ "learning_rate": 9.65965965965966e-06,
181
+ "loss": 2.8668,
182
+ "step": 145
183
+ },
184
+ {
185
+ "epoch": 1.35,
186
+ "learning_rate": 9.60960960960961e-06,
187
+ "loss": 2.8055,
188
+ "step": 150
189
+ },
190
+ {
191
+ "epoch": 1.4,
192
+ "learning_rate": 9.55955955955956e-06,
193
+ "loss": 2.7934,
194
+ "step": 155
195
+ },
196
+ {
197
+ "epoch": 1.44,
198
+ "learning_rate": 9.50950950950951e-06,
199
+ "loss": 2.9164,
200
+ "step": 160
201
+ },
202
+ {
203
+ "epoch": 1.49,
204
+ "learning_rate": 9.45945945945946e-06,
205
+ "loss": 2.9328,
206
+ "step": 165
207
+ },
208
+ {
209
+ "epoch": 1.53,
210
+ "learning_rate": 9.40940940940941e-06,
211
+ "loss": 2.9363,
212
+ "step": 170
213
+ },
214
+ {
215
+ "epoch": 1.58,
216
+ "learning_rate": 9.35935935935936e-06,
217
+ "loss": 2.841,
218
+ "step": 175
219
+ },
220
+ {
221
+ "epoch": 1.62,
222
+ "learning_rate": 9.30930930930931e-06,
223
+ "loss": 2.7398,
224
+ "step": 180
225
+ },
226
+ {
227
+ "epoch": 1.67,
228
+ "learning_rate": 9.25925925925926e-06,
229
+ "loss": 2.8695,
230
+ "step": 185
231
+ },
232
+ {
233
+ "epoch": 1.71,
234
+ "learning_rate": 9.20920920920921e-06,
235
+ "loss": 2.8363,
236
+ "step": 190
237
+ },
238
+ {
239
+ "epoch": 1.76,
240
+ "learning_rate": 9.15915915915916e-06,
241
+ "loss": 2.8844,
242
+ "step": 195
243
+ },
244
+ {
245
+ "epoch": 1.8,
246
+ "learning_rate": 9.10910910910911e-06,
247
+ "loss": 2.7594,
248
+ "step": 200
249
+ },
250
+ {
251
+ "epoch": 1.85,
252
+ "learning_rate": 9.05905905905906e-06,
253
+ "loss": 2.8078,
254
+ "step": 205
255
+ },
256
+ {
257
+ "epoch": 1.89,
258
+ "learning_rate": 9.00900900900901e-06,
259
+ "loss": 2.9926,
260
+ "step": 210
261
+ },
262
+ {
263
+ "epoch": 1.94,
264
+ "learning_rate": 8.95895895895896e-06,
265
+ "loss": 2.8766,
266
+ "step": 215
267
+ },
268
+ {
269
+ "epoch": 1.98,
270
+ "learning_rate": 8.90890890890891e-06,
271
+ "loss": 2.8145,
272
+ "step": 220
273
+ },
274
+ {
275
+ "epoch": 2.03,
276
+ "learning_rate": 8.85885885885886e-06,
277
+ "loss": 2.7383,
278
+ "step": 225
279
+ },
280
+ {
281
+ "epoch": 2.07,
282
+ "learning_rate": 8.80880880880881e-06,
283
+ "loss": 2.7613,
284
+ "step": 230
285
+ },
286
+ {
287
+ "epoch": 2.12,
288
+ "learning_rate": 8.75875875875876e-06,
289
+ "loss": 2.9582,
290
+ "step": 235
291
+ },
292
+ {
293
+ "epoch": 2.16,
294
+ "learning_rate": 8.70870870870871e-06,
295
+ "loss": 2.8043,
296
+ "step": 240
297
+ },
298
+ {
299
+ "epoch": 2.21,
300
+ "learning_rate": 8.65865865865866e-06,
301
+ "loss": 2.7922,
302
+ "step": 245
303
+ },
304
+ {
305
+ "epoch": 2.25,
306
+ "learning_rate": 8.60860860860861e-06,
307
+ "loss": 2.9848,
308
+ "step": 250
309
+ },
310
+ {
311
+ "epoch": 2.3,
312
+ "learning_rate": 8.55855855855856e-06,
313
+ "loss": 2.8777,
314
+ "step": 255
315
+ },
316
+ {
317
+ "epoch": 2.34,
318
+ "learning_rate": 8.50850850850851e-06,
319
+ "loss": 2.9402,
320
+ "step": 260
321
+ },
322
+ {
323
+ "epoch": 2.39,
324
+ "learning_rate": 8.45845845845846e-06,
325
+ "loss": 2.7785,
326
+ "step": 265
327
+ },
328
+ {
329
+ "epoch": 2.43,
330
+ "learning_rate": 8.408408408408409e-06,
331
+ "loss": 2.8871,
332
+ "step": 270
333
+ },
334
+ {
335
+ "epoch": 2.48,
336
+ "learning_rate": 8.358358358358359e-06,
337
+ "loss": 2.8348,
338
+ "step": 275
339
+ },
340
+ {
341
+ "epoch": 2.52,
342
+ "learning_rate": 8.308308308308309e-06,
343
+ "loss": 2.7664,
344
+ "step": 280
345
+ },
346
+ {
347
+ "epoch": 2.57,
348
+ "learning_rate": 8.258258258258259e-06,
349
+ "loss": 2.7973,
350
+ "step": 285
351
+ },
352
+ {
353
+ "epoch": 2.61,
354
+ "learning_rate": 8.208208208208209e-06,
355
+ "loss": 2.7527,
356
+ "step": 290
357
+ },
358
+ {
359
+ "epoch": 2.66,
360
+ "learning_rate": 8.158158158158159e-06,
361
+ "loss": 2.7844,
362
+ "step": 295
363
+ },
364
+ {
365
+ "epoch": 2.7,
366
+ "learning_rate": 8.108108108108109e-06,
367
+ "loss": 2.8102,
368
+ "step": 300
369
+ },
370
+ {
371
+ "epoch": 2.75,
372
+ "learning_rate": 8.058058058058059e-06,
373
+ "loss": 2.923,
374
+ "step": 305
375
+ },
376
+ {
377
+ "epoch": 2.79,
378
+ "learning_rate": 8.00800800800801e-06,
379
+ "loss": 2.7535,
380
+ "step": 310
381
+ },
382
+ {
383
+ "epoch": 2.84,
384
+ "learning_rate": 7.95795795795796e-06,
385
+ "loss": 2.8164,
386
+ "step": 315
387
+ },
388
+ {
389
+ "epoch": 2.88,
390
+ "learning_rate": 7.90790790790791e-06,
391
+ "loss": 2.8312,
392
+ "step": 320
393
+ },
394
+ {
395
+ "epoch": 2.93,
396
+ "learning_rate": 7.85785785785786e-06,
397
+ "loss": 2.8027,
398
+ "step": 325
399
+ },
400
+ {
401
+ "epoch": 2.97,
402
+ "learning_rate": 7.807807807807808e-06,
403
+ "loss": 2.7281,
404
+ "step": 330
405
+ },
406
+ {
407
+ "epoch": 3.02,
408
+ "learning_rate": 7.757757757757758e-06,
409
+ "loss": 2.8328,
410
+ "step": 335
411
+ },
412
+ {
413
+ "epoch": 3.06,
414
+ "learning_rate": 7.707707707707708e-06,
415
+ "loss": 2.8695,
416
+ "step": 340
417
+ },
418
+ {
419
+ "epoch": 3.11,
420
+ "learning_rate": 7.657657657657658e-06,
421
+ "loss": 2.8,
422
+ "step": 345
423
+ },
424
+ {
425
+ "epoch": 3.15,
426
+ "learning_rate": 7.607607607607608e-06,
427
+ "loss": 2.8832,
428
+ "step": 350
429
+ },
430
+ {
431
+ "epoch": 3.2,
432
+ "learning_rate": 7.557557557557558e-06,
433
+ "loss": 2.7445,
434
+ "step": 355
435
+ },
436
+ {
437
+ "epoch": 3.24,
438
+ "learning_rate": 7.507507507507507e-06,
439
+ "loss": 2.7359,
440
+ "step": 360
441
+ },
442
+ {
443
+ "epoch": 3.29,
444
+ "learning_rate": 7.457457457457457e-06,
445
+ "loss": 2.6559,
446
+ "step": 365
447
+ },
448
+ {
449
+ "epoch": 3.33,
450
+ "learning_rate": 7.4074074074074075e-06,
451
+ "loss": 2.7598,
452
+ "step": 370
453
+ },
454
+ {
455
+ "epoch": 3.38,
456
+ "learning_rate": 7.3573573573573575e-06,
457
+ "loss": 2.8637,
458
+ "step": 375
459
+ },
460
+ {
461
+ "epoch": 3.42,
462
+ "learning_rate": 7.307307307307308e-06,
463
+ "loss": 2.7266,
464
+ "step": 380
465
+ },
466
+ {
467
+ "epoch": 3.47,
468
+ "learning_rate": 7.257257257257258e-06,
469
+ "loss": 2.7289,
470
+ "step": 385
471
+ },
472
+ {
473
+ "epoch": 3.51,
474
+ "learning_rate": 7.207207207207208e-06,
475
+ "loss": 2.7895,
476
+ "step": 390
477
+ },
478
+ {
479
+ "epoch": 3.56,
480
+ "learning_rate": 7.157157157157158e-06,
481
+ "loss": 2.7336,
482
+ "step": 395
483
+ },
484
+ {
485
+ "epoch": 3.6,
486
+ "learning_rate": 7.107107107107107e-06,
487
+ "loss": 2.6094,
488
+ "step": 400
489
+ },
490
+ {
491
+ "epoch": 3.65,
492
+ "learning_rate": 7.057057057057057e-06,
493
+ "loss": 2.9809,
494
+ "step": 405
495
+ },
496
+ {
497
+ "epoch": 3.69,
498
+ "learning_rate": 7.007007007007007e-06,
499
+ "loss": 2.8156,
500
+ "step": 410
501
+ },
502
+ {
503
+ "epoch": 3.74,
504
+ "learning_rate": 6.956956956956957e-06,
505
+ "loss": 2.798,
506
+ "step": 415
507
+ },
508
+ {
509
+ "epoch": 3.78,
510
+ "learning_rate": 6.906906906906907e-06,
511
+ "loss": 2.6703,
512
+ "step": 420
513
+ },
514
+ {
515
+ "epoch": 3.83,
516
+ "learning_rate": 6.856856856856857e-06,
517
+ "loss": 2.8105,
518
+ "step": 425
519
+ },
520
+ {
521
+ "epoch": 3.87,
522
+ "learning_rate": 6.8068068068068075e-06,
523
+ "loss": 2.6707,
524
+ "step": 430
525
+ },
526
+ {
527
+ "epoch": 3.92,
528
+ "learning_rate": 6.7567567567567575e-06,
529
+ "loss": 2.7289,
530
+ "step": 435
531
+ },
532
+ {
533
+ "epoch": 3.96,
534
+ "learning_rate": 6.706706706706707e-06,
535
+ "loss": 2.7004,
536
+ "step": 440
537
+ },
538
+ {
539
+ "epoch": 4.01,
540
+ "learning_rate": 6.656656656656657e-06,
541
+ "loss": 2.8719,
542
+ "step": 445
543
+ },
544
+ {
545
+ "epoch": 4.05,
546
+ "learning_rate": 6.606606606606607e-06,
547
+ "loss": 2.6273,
548
+ "step": 450
549
+ },
550
+ {
551
+ "epoch": 4.1,
552
+ "learning_rate": 6.556556556556557e-06,
553
+ "loss": 2.8629,
554
+ "step": 455
555
+ },
556
+ {
557
+ "epoch": 4.14,
558
+ "learning_rate": 6.506506506506507e-06,
559
+ "loss": 2.8672,
560
+ "step": 460
561
+ },
562
+ {
563
+ "epoch": 4.19,
564
+ "learning_rate": 6.456456456456457e-06,
565
+ "loss": 2.725,
566
+ "step": 465
567
+ },
568
+ {
569
+ "epoch": 4.23,
570
+ "learning_rate": 6.406406406406407e-06,
571
+ "loss": 2.8004,
572
+ "step": 470
573
+ },
574
+ {
575
+ "epoch": 4.28,
576
+ "learning_rate": 6.356356356356357e-06,
577
+ "loss": 2.882,
578
+ "step": 475
579
+ },
580
+ {
581
+ "epoch": 4.32,
582
+ "learning_rate": 6.3063063063063065e-06,
583
+ "loss": 2.8027,
584
+ "step": 480
585
+ },
586
+ {
587
+ "epoch": 4.37,
588
+ "learning_rate": 6.2562562562562565e-06,
589
+ "loss": 2.8988,
590
+ "step": 485
591
+ },
592
+ {
593
+ "epoch": 4.41,
594
+ "learning_rate": 6.206206206206207e-06,
595
+ "loss": 2.6898,
596
+ "step": 490
597
+ },
598
+ {
599
+ "epoch": 4.46,
600
+ "learning_rate": 6.156156156156157e-06,
601
+ "loss": 2.9855,
602
+ "step": 495
603
+ },
604
+ {
605
+ "epoch": 4.5,
606
+ "learning_rate": 6.106106106106107e-06,
607
+ "loss": 2.8246,
608
+ "step": 500
609
+ },
610
+ {
611
+ "epoch": 4.55,
612
+ "learning_rate": 6.056056056056057e-06,
613
+ "loss": 2.8918,
614
+ "step": 505
615
+ },
616
+ {
617
+ "epoch": 4.59,
618
+ "learning_rate": 6.006006006006007e-06,
619
+ "loss": 2.8988,
620
+ "step": 510
621
+ },
622
+ {
623
+ "epoch": 4.64,
624
+ "learning_rate": 5.955955955955957e-06,
625
+ "loss": 2.7313,
626
+ "step": 515
627
+ },
628
+ {
629
+ "epoch": 4.68,
630
+ "learning_rate": 5.905905905905906e-06,
631
+ "loss": 2.8082,
632
+ "step": 520
633
+ },
634
+ {
635
+ "epoch": 4.73,
636
+ "learning_rate": 5.855855855855856e-06,
637
+ "loss": 2.8164,
638
+ "step": 525
639
+ },
640
+ {
641
+ "epoch": 4.77,
642
+ "learning_rate": 5.805805805805806e-06,
643
+ "loss": 2.7496,
644
+ "step": 530
645
+ },
646
+ {
647
+ "epoch": 4.82,
648
+ "learning_rate": 5.755755755755756e-06,
649
+ "loss": 2.5914,
650
+ "step": 535
651
+ },
652
+ {
653
+ "epoch": 4.86,
654
+ "learning_rate": 5.7057057057057065e-06,
655
+ "loss": 2.7824,
656
+ "step": 540
657
+ },
658
+ {
659
+ "epoch": 4.91,
660
+ "learning_rate": 5.6556556556556565e-06,
661
+ "loss": 2.6574,
662
+ "step": 545
663
+ },
664
+ {
665
+ "epoch": 4.95,
666
+ "learning_rate": 5.605605605605607e-06,
667
+ "loss": 2.8434,
668
+ "step": 550
669
+ },
670
+ {
671
+ "epoch": 5.0,
672
+ "learning_rate": 5.555555555555557e-06,
673
+ "loss": 2.8793,
674
+ "step": 555
675
+ },
676
+ {
677
+ "epoch": 5.05,
678
+ "learning_rate": 5.505505505505506e-06,
679
+ "loss": 2.7293,
680
+ "step": 560
681
+ },
682
+ {
683
+ "epoch": 5.09,
684
+ "learning_rate": 5.455455455455456e-06,
685
+ "loss": 2.7664,
686
+ "step": 565
687
+ },
688
+ {
689
+ "epoch": 5.14,
690
+ "learning_rate": 5.405405405405406e-06,
691
+ "loss": 2.7891,
692
+ "step": 570
693
+ },
694
+ {
695
+ "epoch": 5.18,
696
+ "learning_rate": 5.355355355355356e-06,
697
+ "loss": 2.868,
698
+ "step": 575
699
+ },
700
+ {
701
+ "epoch": 5.23,
702
+ "learning_rate": 5.305305305305306e-06,
703
+ "loss": 2.8664,
704
+ "step": 580
705
+ },
706
+ {
707
+ "epoch": 5.27,
708
+ "learning_rate": 5.255255255255256e-06,
709
+ "loss": 2.8324,
710
+ "step": 585
711
+ },
712
+ {
713
+ "epoch": 5.32,
714
+ "learning_rate": 5.205205205205206e-06,
715
+ "loss": 2.7469,
716
+ "step": 590
717
+ },
718
+ {
719
+ "epoch": 5.36,
720
+ "learning_rate": 5.155155155155156e-06,
721
+ "loss": 2.7973,
722
+ "step": 595
723
+ },
724
+ {
725
+ "epoch": 5.41,
726
+ "learning_rate": 5.105105105105106e-06,
727
+ "loss": 2.8254,
728
+ "step": 600
729
+ },
730
+ {
731
+ "epoch": 5.45,
732
+ "learning_rate": 5.055055055055056e-06,
733
+ "loss": 2.7656,
734
+ "step": 605
735
+ },
736
+ {
737
+ "epoch": 5.5,
738
+ "learning_rate": 5.005005005005006e-06,
739
+ "loss": 2.766,
740
+ "step": 610
741
+ },
742
+ {
743
+ "epoch": 5.54,
744
+ "learning_rate": 4.954954954954955e-06,
745
+ "loss": 2.8266,
746
+ "step": 615
747
+ },
748
+ {
749
+ "epoch": 5.59,
750
+ "learning_rate": 4.904904904904905e-06,
751
+ "loss": 2.868,
752
+ "step": 620
753
+ },
754
+ {
755
+ "epoch": 5.63,
756
+ "learning_rate": 4.854854854854855e-06,
757
+ "loss": 2.7523,
758
+ "step": 625
759
+ },
760
+ {
761
+ "epoch": 5.68,
762
+ "learning_rate": 4.804804804804805e-06,
763
+ "loss": 2.8625,
764
+ "step": 630
765
+ },
766
+ {
767
+ "epoch": 5.72,
768
+ "learning_rate": 4.754754754754755e-06,
769
+ "loss": 2.7098,
770
+ "step": 635
771
+ },
772
+ {
773
+ "epoch": 5.77,
774
+ "learning_rate": 4.704704704704705e-06,
775
+ "loss": 2.584,
776
+ "step": 640
777
+ },
778
+ {
779
+ "epoch": 5.81,
780
+ "learning_rate": 4.654654654654655e-06,
781
+ "loss": 2.7652,
782
+ "step": 645
783
+ },
784
+ {
785
+ "epoch": 5.86,
786
+ "learning_rate": 4.604604604604605e-06,
787
+ "loss": 2.7668,
788
+ "step": 650
789
+ },
790
+ {
791
+ "epoch": 5.9,
792
+ "learning_rate": 4.554554554554555e-06,
793
+ "loss": 2.9934,
794
+ "step": 655
795
+ },
796
+ {
797
+ "epoch": 5.95,
798
+ "learning_rate": 4.504504504504505e-06,
799
+ "loss": 2.807,
800
+ "step": 660
801
+ },
802
+ {
803
+ "epoch": 5.99,
804
+ "learning_rate": 4.454454454454455e-06,
805
+ "loss": 2.7543,
806
+ "step": 665
807
+ },
808
+ {
809
+ "epoch": 6.04,
810
+ "learning_rate": 4.404404404404405e-06,
811
+ "loss": 2.7055,
812
+ "step": 670
813
+ },
814
+ {
815
+ "epoch": 6.08,
816
+ "learning_rate": 4.354354354354355e-06,
817
+ "loss": 2.6199,
818
+ "step": 675
819
+ },
820
+ {
821
+ "epoch": 6.13,
822
+ "learning_rate": 4.304304304304305e-06,
823
+ "loss": 2.757,
824
+ "step": 680
825
+ },
826
+ {
827
+ "epoch": 6.17,
828
+ "learning_rate": 4.254254254254255e-06,
829
+ "loss": 2.7941,
830
+ "step": 685
831
+ },
832
+ {
833
+ "epoch": 6.22,
834
+ "learning_rate": 4.204204204204204e-06,
835
+ "loss": 2.7836,
836
+ "step": 690
837
+ },
838
+ {
839
+ "epoch": 6.26,
840
+ "learning_rate": 4.154154154154154e-06,
841
+ "loss": 2.7062,
842
+ "step": 695
843
+ },
844
+ {
845
+ "epoch": 6.31,
846
+ "learning_rate": 4.1041041041041045e-06,
847
+ "loss": 2.807,
848
+ "step": 700
849
+ },
850
+ {
851
+ "epoch": 6.35,
852
+ "learning_rate": 4.0540540540540545e-06,
853
+ "loss": 2.8246,
854
+ "step": 705
855
+ },
856
+ {
857
+ "epoch": 6.4,
858
+ "learning_rate": 4.004004004004005e-06,
859
+ "loss": 2.8395,
860
+ "step": 710
861
+ },
862
+ {
863
+ "epoch": 6.44,
864
+ "learning_rate": 3.953953953953955e-06,
865
+ "loss": 2.7781,
866
+ "step": 715
867
+ },
868
+ {
869
+ "epoch": 6.49,
870
+ "learning_rate": 3.903903903903904e-06,
871
+ "loss": 2.732,
872
+ "step": 720
873
+ },
874
+ {
875
+ "epoch": 6.53,
876
+ "learning_rate": 3.853853853853854e-06,
877
+ "loss": 2.7281,
878
+ "step": 725
879
+ },
880
+ {
881
+ "epoch": 6.58,
882
+ "learning_rate": 3.803803803803804e-06,
883
+ "loss": 2.6957,
884
+ "step": 730
885
+ },
886
+ {
887
+ "epoch": 6.62,
888
+ "learning_rate": 3.7537537537537537e-06,
889
+ "loss": 2.7516,
890
+ "step": 735
891
+ },
892
+ {
893
+ "epoch": 6.67,
894
+ "learning_rate": 3.7037037037037037e-06,
895
+ "loss": 2.741,
896
+ "step": 740
897
+ },
898
+ {
899
+ "epoch": 6.71,
900
+ "learning_rate": 3.653653653653654e-06,
901
+ "loss": 2.798,
902
+ "step": 745
903
+ },
904
+ {
905
+ "epoch": 6.76,
906
+ "learning_rate": 3.603603603603604e-06,
907
+ "loss": 2.741,
908
+ "step": 750
909
+ },
910
+ {
911
+ "epoch": 6.8,
912
+ "learning_rate": 3.5535535535535535e-06,
913
+ "loss": 2.7691,
914
+ "step": 755
915
+ },
916
+ {
917
+ "epoch": 6.85,
918
+ "learning_rate": 3.5035035035035036e-06,
919
+ "loss": 2.7039,
920
+ "step": 760
921
+ },
922
+ {
923
+ "epoch": 6.89,
924
+ "learning_rate": 3.4534534534534537e-06,
925
+ "loss": 2.7797,
926
+ "step": 765
927
+ },
928
+ {
929
+ "epoch": 6.94,
930
+ "learning_rate": 3.4034034034034037e-06,
931
+ "loss": 2.918,
932
+ "step": 770
933
+ },
934
+ {
935
+ "epoch": 6.98,
936
+ "learning_rate": 3.3533533533533534e-06,
937
+ "loss": 2.6195,
938
+ "step": 775
939
+ },
940
+ {
941
+ "epoch": 7.03,
942
+ "learning_rate": 3.3033033033033035e-06,
943
+ "loss": 2.5387,
944
+ "step": 780
945
+ },
946
+ {
947
+ "epoch": 7.07,
948
+ "learning_rate": 3.2532532532532535e-06,
949
+ "loss": 2.7352,
950
+ "step": 785
951
+ },
952
+ {
953
+ "epoch": 7.12,
954
+ "learning_rate": 3.2032032032032036e-06,
955
+ "loss": 2.5961,
956
+ "step": 790
957
+ },
958
+ {
959
+ "epoch": 7.16,
960
+ "learning_rate": 3.1531531531531532e-06,
961
+ "loss": 2.8273,
962
+ "step": 795
963
+ },
964
+ {
965
+ "epoch": 7.21,
966
+ "learning_rate": 3.1031031031031033e-06,
967
+ "loss": 2.6625,
968
+ "step": 800
969
+ },
970
+ {
971
+ "epoch": 7.25,
972
+ "learning_rate": 3.0530530530530534e-06,
973
+ "loss": 2.7,
974
+ "step": 805
975
+ },
976
+ {
977
+ "epoch": 7.3,
978
+ "learning_rate": 3.0030030030030034e-06,
979
+ "loss": 2.7527,
980
+ "step": 810
981
+ },
982
+ {
983
+ "epoch": 7.34,
984
+ "learning_rate": 2.952952952952953e-06,
985
+ "loss": 2.7172,
986
+ "step": 815
987
+ },
988
+ {
989
+ "epoch": 7.39,
990
+ "learning_rate": 2.902902902902903e-06,
991
+ "loss": 2.7395,
992
+ "step": 820
993
+ },
994
+ {
995
+ "epoch": 7.43,
996
+ "learning_rate": 2.8528528528528532e-06,
997
+ "loss": 2.6105,
998
+ "step": 825
999
+ },
1000
+ {
1001
+ "epoch": 7.48,
1002
+ "learning_rate": 2.8028028028028033e-06,
1003
+ "loss": 2.7566,
1004
+ "step": 830
1005
+ },
1006
+ {
1007
+ "epoch": 7.52,
1008
+ "learning_rate": 2.752752752752753e-06,
1009
+ "loss": 2.6984,
1010
+ "step": 835
1011
+ },
1012
+ {
1013
+ "epoch": 7.57,
1014
+ "learning_rate": 2.702702702702703e-06,
1015
+ "loss": 2.8969,
1016
+ "step": 840
1017
+ },
1018
+ {
1019
+ "epoch": 7.61,
1020
+ "learning_rate": 2.652652652652653e-06,
1021
+ "loss": 2.6301,
1022
+ "step": 845
1023
+ },
1024
+ {
1025
+ "epoch": 7.66,
1026
+ "learning_rate": 2.602602602602603e-06,
1027
+ "loss": 2.7102,
1028
+ "step": 850
1029
+ },
1030
+ {
1031
+ "epoch": 7.7,
1032
+ "learning_rate": 2.552552552552553e-06,
1033
+ "loss": 2.9152,
1034
+ "step": 855
1035
+ },
1036
+ {
1037
+ "epoch": 7.75,
1038
+ "learning_rate": 2.502502502502503e-06,
1039
+ "loss": 2.6543,
1040
+ "step": 860
1041
+ },
1042
+ {
1043
+ "epoch": 7.79,
1044
+ "learning_rate": 2.4524524524524525e-06,
1045
+ "loss": 2.6109,
1046
+ "step": 865
1047
+ },
1048
+ {
1049
+ "epoch": 7.84,
1050
+ "learning_rate": 2.4024024024024026e-06,
1051
+ "loss": 2.5871,
1052
+ "step": 870
1053
+ },
1054
+ {
1055
+ "epoch": 7.88,
1056
+ "learning_rate": 2.3523523523523527e-06,
1057
+ "loss": 2.752,
1058
+ "step": 875
1059
+ },
1060
+ {
1061
+ "epoch": 7.93,
1062
+ "learning_rate": 2.3023023023023023e-06,
1063
+ "loss": 2.716,
1064
+ "step": 880
1065
+ },
1066
+ {
1067
+ "epoch": 7.97,
1068
+ "learning_rate": 2.2522522522522524e-06,
1069
+ "loss": 2.807,
1070
+ "step": 885
1071
+ },
1072
+ {
1073
+ "epoch": 8.02,
1074
+ "learning_rate": 2.2022022022022024e-06,
1075
+ "loss": 2.877,
1076
+ "step": 890
1077
+ },
1078
+ {
1079
+ "epoch": 8.06,
1080
+ "learning_rate": 2.1521521521521525e-06,
1081
+ "loss": 2.716,
1082
+ "step": 895
1083
+ },
1084
+ {
1085
+ "epoch": 8.11,
1086
+ "learning_rate": 2.102102102102102e-06,
1087
+ "loss": 2.6879,
1088
+ "step": 900
1089
+ },
1090
+ {
1091
+ "epoch": 8.15,
1092
+ "learning_rate": 2.0520520520520522e-06,
1093
+ "loss": 2.7137,
1094
+ "step": 905
1095
+ },
1096
+ {
1097
+ "epoch": 8.2,
1098
+ "learning_rate": 2.0020020020020023e-06,
1099
+ "loss": 2.7133,
1100
+ "step": 910
1101
+ },
1102
+ {
1103
+ "epoch": 8.24,
1104
+ "learning_rate": 1.951951951951952e-06,
1105
+ "loss": 2.7453,
1106
+ "step": 915
1107
+ },
1108
+ {
1109
+ "epoch": 8.29,
1110
+ "learning_rate": 1.901901901901902e-06,
1111
+ "loss": 2.8281,
1112
+ "step": 920
1113
+ },
1114
+ {
1115
+ "epoch": 8.33,
1116
+ "learning_rate": 1.8518518518518519e-06,
1117
+ "loss": 2.857,
1118
+ "step": 925
1119
+ },
1120
+ {
1121
+ "epoch": 8.38,
1122
+ "learning_rate": 1.801801801801802e-06,
1123
+ "loss": 2.7105,
1124
+ "step": 930
1125
+ },
1126
+ {
1127
+ "epoch": 8.42,
1128
+ "learning_rate": 1.7517517517517518e-06,
1129
+ "loss": 2.7879,
1130
+ "step": 935
1131
+ },
1132
+ {
1133
+ "epoch": 8.47,
1134
+ "learning_rate": 1.7017017017017019e-06,
1135
+ "loss": 2.8539,
1136
+ "step": 940
1137
+ },
1138
+ {
1139
+ "epoch": 8.51,
1140
+ "learning_rate": 1.6516516516516517e-06,
1141
+ "loss": 2.7656,
1142
+ "step": 945
1143
+ },
1144
+ {
1145
+ "epoch": 8.56,
1146
+ "learning_rate": 1.6016016016016018e-06,
1147
+ "loss": 2.677,
1148
+ "step": 950
1149
+ },
1150
+ {
1151
+ "epoch": 8.6,
1152
+ "learning_rate": 1.5515515515515517e-06,
1153
+ "loss": 2.809,
1154
+ "step": 955
1155
+ },
1156
+ {
1157
+ "epoch": 8.65,
1158
+ "learning_rate": 1.5015015015015017e-06,
1159
+ "loss": 2.5945,
1160
+ "step": 960
1161
+ },
1162
+ {
1163
+ "epoch": 8.69,
1164
+ "learning_rate": 1.4514514514514516e-06,
1165
+ "loss": 2.8148,
1166
+ "step": 965
1167
+ },
1168
+ {
1169
+ "epoch": 8.74,
1170
+ "learning_rate": 1.4014014014014016e-06,
1171
+ "loss": 2.7242,
1172
+ "step": 970
1173
+ },
1174
+ {
1175
+ "epoch": 8.78,
1176
+ "learning_rate": 1.3513513513513515e-06,
1177
+ "loss": 2.9172,
1178
+ "step": 975
1179
+ },
1180
+ {
1181
+ "epoch": 8.83,
1182
+ "learning_rate": 1.3013013013013016e-06,
1183
+ "loss": 2.7965,
1184
+ "step": 980
1185
+ },
1186
+ {
1187
+ "epoch": 8.87,
1188
+ "learning_rate": 1.2512512512512514e-06,
1189
+ "loss": 2.8918,
1190
+ "step": 985
1191
+ },
1192
+ {
1193
+ "epoch": 8.92,
1194
+ "learning_rate": 1.2012012012012013e-06,
1195
+ "loss": 2.6551,
1196
+ "step": 990
1197
+ },
1198
+ {
1199
+ "epoch": 8.96,
1200
+ "learning_rate": 1.1511511511511512e-06,
1201
+ "loss": 2.8613,
1202
+ "step": 995
1203
+ },
1204
+ {
1205
+ "epoch": 9.01,
1206
+ "learning_rate": 1.1011011011011012e-06,
1207
+ "loss": 2.7629,
1208
+ "step": 1000
1209
+ },
1210
+ {
1211
+ "epoch": 9.05,
1212
+ "learning_rate": 1.051051051051051e-06,
1213
+ "loss": 2.5543,
1214
+ "step": 1005
1215
+ },
1216
+ {
1217
+ "epoch": 9.1,
1218
+ "learning_rate": 1.0010010010010011e-06,
1219
+ "loss": 2.9523,
1220
+ "step": 1010
1221
+ },
1222
+ {
1223
+ "epoch": 9.14,
1224
+ "learning_rate": 9.50950950950951e-07,
1225
+ "loss": 2.7785,
1226
+ "step": 1015
1227
+ },
1228
+ {
1229
+ "epoch": 9.19,
1230
+ "learning_rate": 9.00900900900901e-07,
1231
+ "loss": 2.6758,
1232
+ "step": 1020
1233
+ },
1234
+ {
1235
+ "epoch": 9.23,
1236
+ "learning_rate": 8.508508508508509e-07,
1237
+ "loss": 2.5539,
1238
+ "step": 1025
1239
+ },
1240
+ {
1241
+ "epoch": 9.28,
1242
+ "learning_rate": 8.008008008008009e-07,
1243
+ "loss": 2.6223,
1244
+ "step": 1030
1245
+ },
1246
+ {
1247
+ "epoch": 9.32,
1248
+ "learning_rate": 7.507507507507509e-07,
1249
+ "loss": 2.882,
1250
+ "step": 1035
1251
+ },
1252
+ {
1253
+ "epoch": 9.37,
1254
+ "learning_rate": 7.007007007007008e-07,
1255
+ "loss": 2.9254,
1256
+ "step": 1040
1257
+ },
1258
+ {
1259
+ "epoch": 9.41,
1260
+ "learning_rate": 6.506506506506508e-07,
1261
+ "loss": 2.825,
1262
+ "step": 1045
1263
+ },
1264
+ {
1265
+ "epoch": 9.46,
1266
+ "learning_rate": 6.006006006006006e-07,
1267
+ "loss": 2.7879,
1268
+ "step": 1050
1269
+ },
1270
+ {
1271
+ "epoch": 9.5,
1272
+ "learning_rate": 5.505505505505506e-07,
1273
+ "loss": 2.7621,
1274
+ "step": 1055
1275
+ },
1276
+ {
1277
+ "epoch": 9.55,
1278
+ "learning_rate": 5.005005005005006e-07,
1279
+ "loss": 2.7711,
1280
+ "step": 1060
1281
+ },
1282
+ {
1283
+ "epoch": 9.59,
1284
+ "learning_rate": 4.504504504504505e-07,
1285
+ "loss": 2.7539,
1286
+ "step": 1065
1287
+ },
1288
+ {
1289
+ "epoch": 9.64,
1290
+ "learning_rate": 4.0040040040040045e-07,
1291
+ "loss": 2.7781,
1292
+ "step": 1070
1293
+ },
1294
+ {
1295
+ "epoch": 9.68,
1296
+ "learning_rate": 3.503503503503504e-07,
1297
+ "loss": 2.7246,
1298
+ "step": 1075
1299
+ },
1300
+ {
1301
+ "epoch": 9.73,
1302
+ "learning_rate": 3.003003003003003e-07,
1303
+ "loss": 2.9125,
1304
+ "step": 1080
1305
+ },
1306
+ {
1307
+ "epoch": 9.77,
1308
+ "learning_rate": 2.502502502502503e-07,
1309
+ "loss": 2.757,
1310
+ "step": 1085
1311
+ },
1312
+ {
1313
+ "epoch": 9.82,
1314
+ "learning_rate": 2.0020020020020022e-07,
1315
+ "loss": 2.7824,
1316
+ "step": 1090
1317
+ },
1318
+ {
1319
+ "epoch": 9.86,
1320
+ "learning_rate": 1.5015015015015016e-07,
1321
+ "loss": 2.6824,
1322
+ "step": 1095
1323
+ },
1324
+ {
1325
+ "epoch": 9.91,
1326
+ "learning_rate": 1.0010010010010011e-07,
1327
+ "loss": 2.6035,
1328
+ "step": 1100
1329
+ },
1330
+ {
1331
+ "epoch": 9.95,
1332
+ "learning_rate": 5.0050050050050056e-08,
1333
+ "loss": 2.7152,
1334
+ "step": 1105
1335
+ },
1336
+ {
1337
+ "epoch": 10.0,
1338
+ "learning_rate": 0.0,
1339
+ "loss": 2.7191,
1340
+ "step": 1110
1341
+ },
1342
+ {
1343
+ "epoch": 10.0,
1344
+ "step": 1110,
1345
+ "total_flos": 2.1618654360131705e+21,
1346
+ "train_loss": 2.8070910754504506,
1347
+ "train_runtime": 11217.8167,
1348
+ "train_samples_per_second": 101.845,
1349
+ "train_steps_per_second": 0.099
1350
+ }
1351
+ ],
1352
+ "max_steps": 1110,
1353
+ "num_train_epochs": 10,
1354
+ "total_flos": 2.1618654360131705e+21,
1355
+ "trial_name": null,
1356
+ "trial_params": null
1357
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:979ff9182aeb728abc8d230808f9beacd1fa8f4313b31cd7cbe459946f5c8a87
3
+ size 2927
vocab.json ADDED
The diff for this file is too large to render. See raw diff