Training on a large Slovak corpus
Files changed:
- config.json (+5 -1)
- hydra/config.yaml (+9 -8)
- hydra/hydra.yaml (+11 -7)
- hydra/overrides.yaml (+9 -5)
- main.log (+0 -0)
- model.safetensors (+2 -2)
- optimizer.bin (+2 -2)
- random_states_0.pkl (+1 -1)
- scheduler.bin (+1 -1)
- spiece.model (+2 -2)
- spiece.vocab (+0 -0)
- train-model.sh (+1 -0)
config.json CHANGED
@@ -21,5 +21,9 @@
   "pad_token_id": 0,
   "relative_attention_num_buckets": 32,
   "tie_word_embeddings": false,
-  "vocab_size":
+  "vocab_size": 120100,
+  "sp_model_kwargs":{
+    "enable_sampling": true,
+    "nbest_size": 4
+  }
 }
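The added sp_model_kwargs turn on SentencePiece subword regularization: with enable_sampling and nbest_size set to 4, the tokenizer samples one of the four best segmentations of each input instead of always returning the single best one, which acts as light data augmentation during pre-training. A minimal sketch of the effect, using the sentencepiece library directly (the example sentence and the alpha smoothing value are illustrative, not taken from this repository):

```python
# Sketch of what the added sp_model_kwargs do: with enable_sampling=True and
# nbest_size=4, SentencePiece samples among the 4 best segmentations.
import sentencepiece as spm

sp = spm.SentencePieceProcessor(model_file="spiece.model")

text = "Toto je pokusná slovenská veta."  # illustrative sentence, not from the corpus
for _ in range(3):
    # Each call may return a different segmentation (subword regularization).
    print(sp.encode(text, out_type=str, enable_sampling=True, nbest_size=4, alpha=0.1))
```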
hydra/config.yaml CHANGED
@@ -6,36 +6,37 @@ predict_only: false
 seed: 2137
 model:
   klass: hf_t5
-  name: /home/jovyan/bert-train/nanot5/
+  name: /home/jovyan/bert-train/nanot5/templates/base_sklarge_120k
   overwrite:
     dropout_rate: 0.0
   add_config:
-    is_bf16:
+    is_bf16: true
   checkpoint_path: ''
   random_init: true
   compile: false
 data:
+  train_path: /home/jovyan/data/sklarge-shards
   input_length: 512
   mlm_probability: 0.15
   mean_noise_span_length: 3.0
-  num_workers:
+  num_workers: 8
 optim:
-  name:
+  name: adafactor
   base_lr: 0.02
   batch_size: 128
-  total_steps:
+  total_steps: 120000
   epochs: -1
   warmup_steps: 10000
-  lr_scheduler:
+  lr_scheduler: legacy
   weight_decay: 0.0
   grad_clip: 1.0
-  grad_acc:
+  grad_acc: 8
   final_cosine: 1.0e-05
 eval:
   every_steps: 5000
   steps: 500
 checkpoint:
-  every_steps:
+  every_steps: 10000
 logging:
   neptune: false
   neptune_creds:
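Rough training budget implied by these values, assuming (as in the nanoT5 reference setup) that batch_size is the total batch per optimizer step and grad_acc only splits it into micro-batches:

```python
# Back-of-the-envelope training budget from hydra/config.yaml.
# Assumption: batch_size is the per-optimizer-step batch; grad_acc=8 only
# controls gradient accumulation into micro-batches.
batch_size = 128          # sequences per optimizer step
input_length = 512        # tokens per sequence
total_steps = 120_000     # optimizer steps
warmup_steps = 10_000

tokens_per_step = batch_size * input_length
total_tokens = tokens_per_step * total_steps
print(f"{tokens_per_step:,} tokens/step, ~{total_tokens / 1e9:.1f}B tokens total")
print(f"warmup covers {warmup_steps / total_steps:.0%} of training")
```

Under that assumption the run sees roughly 65k tokens per optimizer step and about 7.9B tokens over the 120k steps, with warmup covering the first ~8% of training.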
hydra/hydra.yaml CHANGED
@@ -112,17 +112,21 @@ hydra:
     hydra:
     - hydra.mode=RUN
     task:
-    - optim.name=
-    - optim.lr_scheduler=
-    - model.name=/home/jovyan/bert-train/nanot5/
-
+    - optim.name=adafactor
+    - optim.lr_scheduler=legacy
+    - model.name=/home/jovyan/bert-train/nanot5/templates/base_sklarge_120k
+    - data.train_path=/home/jovyan/data/sklarge-shards
+    - optim.grad_acc=8
     - model.klass=hf_t5
     - eval.every_steps=5000
-
+    - optim.total_steps=120000
+    - model.add_config.is_bf16=True
+    - checkpoint.every_steps=10000
+    - model.compile=False
   job:
     name: main
     chdir: true
-    override_dirname: checkpoint.every_steps=
+    override_dirname: checkpoint.every_steps=10000,data.train_path=/home/jovyan/data/sklarge-shards,eval.every_steps=5000,model.add_config.is_bf16=True,model.compile=False,model.klass=hf_t5,model.name=/home/jovyan/bert-train/nanot5/templates/base_sklarge_120k,optim.grad_acc=8,optim.lr_scheduler=legacy,optim.name=adafactor,optim.total_steps=120000
     id: ???
     num: ???
     config_name: default
@@ -147,7 +151,7 @@ hydra:
   - path: ''
     schema: structured
     provider: schema
-  output_dir: /home/jovyan/nanoT5/logs/2024-
+  output_dir: /home/jovyan/nanoT5/logs/2024-07-29/11-48-34-
   choices:
     local_env: default
     task: pt
hydra/overrides.yaml CHANGED
@@ -1,7 +1,11 @@
-- optim.name=
-- optim.lr_scheduler=
-- model.name=/home/jovyan/bert-train/nanot5/
-
+- optim.name=adafactor
+- optim.lr_scheduler=legacy
+- model.name=/home/jovyan/bert-train/nanot5/templates/base_sklarge_120k
+- data.train_path=/home/jovyan/data/sklarge-shards
+- optim.grad_acc=8
 - model.klass=hf_t5
 - eval.every_steps=5000
-
+- optim.total_steps=120000
+- model.add_config.is_bf16=True
+- checkpoint.every_steps=10000
+- model.compile=False
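overrides.yaml records exactly the overrides that were passed on the command line, so the run can be reproduced from it. A hedged sketch (assumes PyYAML is installed and the script is run from this repository's root, next to the hydra/ directory):

```python
# Rebuild the launch command from Hydra's recorded overrides.
import yaml

with open("hydra/overrides.yaml") as f:
    overrides = yaml.safe_load(f)  # a plain list of "key=value" strings

cmd = "python -m nanoT5.main " + " ".join(overrides)
print(cmd)  # should match the command stored in train-model.sh below
```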
main.log CHANGED
The diff for this file is too large to render.
model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:d083edc4ffae1fc5025aeec7b6417c459a1654010f0bd9fee0ae6c57ab97332f
+size 1530845040
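The pointer now references a ~1.53 GB weights file. A hedged sketch for inspecting it without loading the whole model, assuming `git lfs pull` has materialized model.safetensors locally and the safetensors package is installed:

```python
# Count parameters in the updated checkpoint from tensor metadata only.
from math import prod
from safetensors import safe_open

total = 0
with safe_open("model.safetensors", framework="pt", device="cpu") as f:
    for name in f.keys():
        total += prod(f.get_slice(name).get_shape())

print(f"{total:,} parameters")  # the LFS pointer records a 1,530,845,040-byte file
```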
optimizer.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:b2d8ba594f622898a38b758a0bfc7a6bf1e79acde08d843d03b771a5b220106b
+size 3075653
random_states_0.pkl CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:032d5c0c40c80570faf14e7a3a2ba39d76b91afecb36586b35a861be0d83556d
 size 14663
scheduler.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:c26cfd01bde4900b7b03e0e9b7ec7d389da736cdcf1ccb3d46a64498895077f0
 size 819
spiece.model CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:9944e5920f922d26793ec2b15ae90f576584035eecde1e9eee0923bc8c3fc328
+size 2575539
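With the updated spiece.model and config.json, the repository should be loadable as a Hugging Face T5 checkpoint. A hedged sketch (the paths, the explicit sp_model_kwargs wiring, and the Slovak prompt are illustrative; this assumes the LFS files are present and that torch, transformers, and sentencepiece are installed):

```python
# Load the trained checkpoint and its SentencePiece tokenizer from this repo.
from transformers import T5ForConditionalGeneration, T5Tokenizer

repo_dir = "."  # root of this model repository (after git lfs pull)
tokenizer = T5Tokenizer(
    vocab_file=f"{repo_dir}/spiece.model",
    sp_model_kwargs={"enable_sampling": True, "nbest_size": 4},  # mirrors config.json
)
model = T5ForConditionalGeneration.from_pretrained(repo_dir)

# Illustrative span-corruption style prompt; the model is pre-trained only,
# so the output is a denoising continuation rather than a polished answer.
ids = tokenizer("Dokončite túto vetu: <extra_id_0>.", return_tensors="pt")
print(tokenizer.decode(model.generate(**ids, max_new_tokens=20)[0]))
```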
spiece.vocab CHANGED
The diff for this file is too large to render.
train-model.sh ADDED
@@ -0,0 +1 @@
+python -m nanoT5.main optim.name=adafactor optim.lr_scheduler=legacy model.name=/home/jovyan/bert-train/nanot5/templates/base_sklarge_120k data.train_path=/home/jovyan/data/sklarge-shards optim.grad_acc=8 model.klass=hf_t5 eval.every_steps=5000 optim.total_steps=120000 model.add_config.is_bf16=True checkpoint.every_steps=10000 model.compile=False