"""Train a SentencePiece unigram tokenizer on gzipped JSON-lines corpora.

The script globs the corpus files below ``data_dir``, splits them into
train/validation file lists, loads the training split with ``datasets``,
trains the tokenizer on its ``"text"`` column and writes ``./tokenizer.json``.
"""
import glob
import random

from datasets import load_dataset

from t5_tokenizer_model import SentencePieceUnigramTokenizer

data_dir = "/home/yeb"
# Filled (in place) by train_val_files() with the shuffled list of corpus files.
data_files = []


def train_val_files():
    """Collect the corpus files and split them into train and validation lists.

    Every ``(directory, pattern)`` source is globbed, the combined list is
    shuffled with a fixed seed, and the last 1 % (rounded down) of the files
    is set aside for validation.

    Side effect: the module-level ``data_files`` list is replaced in place
    with the shuffled file list, so calling this function more than once does
    not accumulate duplicates.

    Returns:
        A ``(train, val)`` tuple of lists of file paths.

    Raises:
        FileNotFoundError: if none of the patterns matched any file.
    """
    SEED = 12345
    perc = 0.01
    # NOTE: f"{data_dir}/oscar_nl_cleaned" used to be a candidate source but
    # is currently not included.
    sources = [
        (f"{data_dir}/c4_cleaned2", "*47*.gz"),
        (f"{data_dir}/nrc_uniq_cleaned_20210223", "*.gz"),
        (f"{data_dir}/nu_uniq_cleaned_20210225", "*.gz"),
    ]

    files = []
    for path, filespec in sources:
        # glob returns entries in arbitrary (filesystem dependent) order;
        # sorting first makes the seeded shuffle below reproducible.
        files += sorted(glob.glob(f"{path}/{filespec}"))
        print(f"Number of files {len(files)} after adding {path}")

    if not files:
        raise FileNotFoundError(f"No data files found under {data_dir}")

    random.Random(SEED).shuffle(files)

    # Publish the result through the module-level list without rebinding it.
    data_files[:] = files
    print(data_files)

    total = len(files)
    print(total)
    val_size = int(perc * total)
    train_size = total - val_size
    train = files[:train_size]
    val = files[train_size:]
    print(f"Got {len(train)} training files and {perc * 100} % {len(val)} validation files")

    # Sanity check: a file must never end up in both splits.
    assert set(train).isdisjoint(val), "Train overlaps with test"

    return train, val


train, val = train_val_files()

# Only the 'train' split is returned; the validation files are deliberately
# kept out of the data the tokenizer is trained on.
dataset = load_dataset('json', data_files={'train': train, 'validation': val}, split='train')

vocab_size = 32000
input_sentence_size = None

# Three empty strings would make unk/eos/pad indistinguishable; use the
# standard T5 special tokens instead.
tokenizer = SentencePieceUnigramTokenizer(unk_token="<unk>", eos_token="</s>", pad_token="<pad>")


def batch_iterator(input_sentence_size=None):
    """Yield batches of texts from the module-level ``dataset``.

    Args:
        input_sentence_size: maximum number of records to iterate over.
            ``None`` (the default) means the whole dataset; larger values are
            clamped to the dataset length.

    Yields:
        The ``"text"`` column of up to 100 consecutive records at a time.
    """
    total = len(dataset)
    if input_sentence_size is None:
        limit = total
    else:
        limit = min(input_sentence_size, total)
    batch_length = 100
    for start in range(0, limit, batch_length):
        # Clamp the last batch so no more than `limit` records are yielded.
        stop = min(start + batch_length, limit)
        yield dataset[start:stop]["text"]


# Train tokenizer
tokenizer.train_from_iterator(
    iterator=batch_iterator(input_sentence_size=input_sentence_size),
    vocab_size=vocab_size,
    show_progress=True,
)

# Save files to disk
tokenizer.save("./tokenizer.json")