from datasets import load_dataset

from t5_tokenizer_model import SentencePieceUnigramTokenizer


# Base directory containing the cleaned jsonlines (.gz) corpora.
data_dir = "/home/yeb"
data_files = []


def train_val_files():
    import glob
    import random

    SEED = 12345

    def add_jsonlines_dir(path, filespec):
        # Collect every file under `path` that matches `filespec`.
        global data_files
        data_files += glob.glob(f"{path}/{filespec}")
        print(f"Number of files {len(data_files)} after adding {path}")

    add_jsonlines_dir(f"{data_dir}/c4_cleaned2", "*47*.gz")
    add_jsonlines_dir(f"{data_dir}/nrc_uniq_cleaned_20210223", "*.gz")
    add_jsonlines_dir(f"{data_dir}/nu_uniq_cleaned_20210225", "*.gz")

    # Shuffle with a fixed seed so the train/validation split is reproducible.
    random.Random(SEED).shuffle(data_files)
    print(data_files)

    total = len(data_files)
    print(total)

    # Hold out 1% of the files for validation.
    perc = 0.01
    val_size = int(perc * total)
    train_size = total - val_size
    train = data_files[:train_size]
    val = data_files[train_size:]
    print(f"Got {len(train)} training files and {len(val)} validation files ({perc * 100}% held out)")

    assert not (set(train) & set(val)), "Train overlaps with validation"

    return train, val


train, val = train_val_files()

# Only the train split is used for tokenizer training.
dataset = load_dataset("json", data_files={"train": train, "validation": val}, split="train")

vocab_size = 32000
input_sentence_size = None
tokenizer = SentencePieceUnigramTokenizer(unk_token="<unk>", eos_token="</s>", pad_token="<pad>")


def batch_iterator(input_sentence_size=None):
    # Yield batches of raw text for the tokenizer trainer.
    if input_sentence_size is None:
        input_sentence_size = len(dataset)
    batch_length = 100
    for i in range(0, input_sentence_size, batch_length):
        yield dataset[i : i + batch_length]["text"]


tokenizer.train_from_iterator(
    iterator=batch_iterator(input_sentence_size=input_sentence_size),
    vocab_size=vocab_size,
    show_progress=True,
)

tokenizer.save("./tokenizer.json")
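
# Optional sanity check (not part of the original script): a minimal sketch
# that loads the saved tokenizer.json back with the `tokenizers` library and
# encodes a sample sentence. The sample text is purely illustrative.
from tokenizers import Tokenizer

loaded_tokenizer = Tokenizer.from_file("./tokenizer.json")
print(loaded_tokenizer.encode("Dit is een test.").tokens)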