File size: 2,002 Bytes
fdc4101 a37447a fdc4101 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 |
from datasets import load_dataset
from t5_tokenizer_model import SentencePieceUnigramTokenizer
# from tokenizers import trainers, Tokenizer, normalizers, ByteLevelBPETokenizer
data_dir = "/home/yeb"
data_files = []
def train_val_files():
import glob
import random
SEED = 12345
def add_jsonlines_dir(path, filespec):
global data_files
data_files += glob.glob(f"{path}/{filespec}")
print(f"Number of files {len(data_files)} after adding {path}")
# add_jsonlines_dir(f"{data_dir}/oscar_nl_cleaned")
add_jsonlines_dir(f"{data_dir}/c4_cleaned2", "*47*.gz")
add_jsonlines_dir(f"{data_dir}/nrc_uniq_cleaned_20210223", "*.gz")
add_jsonlines_dir(f"{data_dir}/nu_uniq_cleaned_20210225", "*.gz")
random.Random(SEED).shuffle(data_files)
print(data_files)
total = len(data_files)
print(total)
perc = 0.01
val_size = int(perc * total)
train_size = total - val_size
train = data_files[:train_size]
val = data_files[train_size:]
print(f"Got {len(train)} training files and {perc * 100} % {len(val)} validation files")
assert list(set(train) & set(val)) == [], "Train overlaps with test"
return train, val
train, val = train_val_files()
dataset = load_dataset('json', data_files={'train': train, 'validation': val}, split='train')
vocab_size = 32000
input_sentence_size = None
tokenizer = SentencePieceUnigramTokenizer(unk_token="<unk>", eos_token="</s>", pad_token="<pad>")
# Build an iterator over this dataset
def batch_iterator(input_sentence_size=None):
if input_sentence_size is None:
input_sentence_size = len(dataset)
batch_length = 100
for i in range(0, input_sentence_size, batch_length):
yield dataset[i: i + batch_length]["text"]
# Train tokenizer
tokenizer.train_from_iterator(
iterator=batch_iterator(input_sentence_size=input_sentence_size),
vocab_size=vocab_size,
show_progress=True,
)
# Save files to disk
tokenizer.save("./tokenizer.json")
|