Retrain tokenizer for case sensitivity
- tokenizer.json +0 -0
- train_tokenizer.py +1 -3
tokenizer.json
CHANGED
The diff for this file is too large to render. See raw diff.
train_tokenizer.py
CHANGED
@@ -18,7 +18,7 @@ def train_val_files():
         print(f"Number of files {len(data_files)} after adding {path}")
 
     # add_jsonlines_dir(f"{data_dir}/oscar_nl_cleaned")
-    add_jsonlines_dir(f"{data_dir}/
+    add_jsonlines_dir(f"{data_dir}/c4_cleaned2", "*47*.gz")
     add_jsonlines_dir(f"{data_dir}/nrc_uniq_cleaned_20210223", "*.gz")
     add_jsonlines_dir(f"{data_dir}/nu_uniq_cleaned_20210225", "*.gz")
     random.Random(SEED).shuffle(data_files)
@@ -42,8 +42,6 @@ train, val = train_val_files()
 
 dataset = load_dataset('json', data_files={'train': train, 'validation': val}, split='train')
 
-model_dir = "/t5-small-dutch"  # ${MODEL_DIR}
-
 vocab_size = 32000
 input_sentence_size = None
 tokenizer = SentencePieceUnigramTokenizer(unk_token="<unk>", eos_token="</s>", pad_token="<pad>")
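The visible hunks stop at the tokenizer construction. As a minimal sketch of how such a script typically continues (an assumption, not part of this diff), the Unigram tokenizer is trained on batched text from the loaded dataset and then serialized; the batch size, the "text" column name, and the output path below are illustrative only.

def batch_iterator(batch_size=1000):
    # Yield batches of raw text strings from the loaded dataset.
    for i in range(0, len(dataset), batch_size):
        yield dataset[i : i + batch_size]["text"]

# Train the Unigram model on the corpus and write the result; retraining like
# this is what would regenerate the tokenizer.json committed alongside this change.
tokenizer.train_from_iterator(
    iterator=batch_iterator(),
    vocab_size=vocab_size,
    show_progress=True,
)
tokenizer.save("./tokenizer.json")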