"""Build a cleaned Dutch text dataset.

Collects local jsonlines shards into a seeded train/validation split, loads a
source dataset, maps `clean_text` over every example, and filters out examples
the cleaner rejected (returned None). Prints the first few examples of the raw
and cleaned datasets as a sanity check.
"""
from clean import clean_text
from datasets import load_dataset

# data_dir = "/home/yeb"
data_dir = "/home/yeb/Developer/data"

data_files = []


def train_val_files():
    """Glob jsonlines shards into a deterministic train/validation split.

    Appends matching files to the module-level ``data_files`` list,
    de-duplicates, shuffles with a fixed seed, and holds out 5% for
    validation.

    Returns:
        tuple[list[str], list[str]]: (train_files, val_files), disjoint.

    Raises:
        AssertionError: if the two splits overlap (should be impossible
            after de-duplication; kept as a sanity check).
    """
    import glob
    import random

    SEED = 12345

    def add_jsonlines_dir(path, filespec):
        # Accumulates into the module-level list so repeated calls merge
        # shards from several directories; de-duplicated in case globs overlap.
        global data_files
        data_files += glob.glob(f"{path}/{filespec}")
        data_files = list(set(data_files))
        print(f"Number of files {len(data_files)} after adding {path} glob {filespec}")

    # Other shard sources used in earlier experiments, e.g.:
    #   add_jsonlines_dir(f"{data_dir}/oscar_nl_cleaned", "*.gz")
    #   add_jsonlines_dir(f"{data_dir}/c4_cleaned", "*47*.gz")   # ... *12*, *29*, *74*, etc.
    #   add_jsonlines_dir(f"{data_dir}/nrc_uniq_cleaned_20210223", "*.gz")
    #   add_jsonlines_dir(f"{data_dir}/nu_uniq_cleaned_20210225", "*.gz")
    add_jsonlines_dir(f"{data_dir}/c4_cleaned2", "*73*.gz")

    # Fix: sort before the seeded shuffle. `set()` iteration order depends on
    # string hash randomization, so without this the split differed between
    # runs even though the shuffle seed was fixed.
    data_files.sort()
    random.Random(SEED).shuffle(data_files)

    total = len(data_files)
    print(total)
    perc = 0.05  # fraction of files held out for validation
    val_size = int(perc * total)
    train_size = total - val_size
    train = data_files[:train_size]
    val = data_files[train_size:]
    print(f"Got {len(train)} training files and {perc * 100} % {len(val)} validation files")
    # Explicit raise instead of `assert`, which is stripped under `python -O`.
    if set(train) & set(val):
        raise AssertionError("Train overlaps with test")
    return train, val


train, val = train_val_files()

# NOTE(review): the original script assigned `dataset_v0` three times in a row
# (a streaming OSCAR load, the local json shards, then OSCAR again), so only
# the last assignment ever took effect and the first two were dead work.
# The dead loads are kept here, commented, as selectable alternatives:
# dataset_v0 = load_dataset('oscar', "unshuffled_deduplicated_nl", split='train', streaming=True)
# dataset_v0 = load_dataset('json', data_files={'train': train, 'validation': val})
dataset_v0 = load_dataset('oscar', "unshuffled_deduplicated_nl")


def f(obj):
    """Clean the 'text' field of one example in place and return the example."""
    obj["text"] = clean_text(obj["text"])
    return obj


dataset_v1 = dataset_v0.map(
    f,
    batched=False,
    num_proc=10,
)

# Drop examples the cleaner rejected (clean_text returned None).
dataset_v2 = dataset_v1.filter(
    lambda obj: obj['text'] is not None,
    num_proc=10,
)
# Backward-compatible alias: the original bound this to `datasets`, shadowing
# the `datasets` library name. Prefer `dataset_v2` in new code.
datasets = dataset_v2

# Sanity check: print the first examples before and after cleaning.
it = iter(dataset_v0['train'])
print(next(it))
print(next(it))
print(next(it))

it = iter(dataset_v1['train'])
print(next(it))
print(next(it))
print(next(it))

# it = iter(dataset_v2['train'])
# print(next(it))
# print(next(it))
# print(next(it))