from clean import clean_text
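# clean_text lives in the local clean.py module, which is not shown here.
# A hypothetical sketch of its contract, assuming it normalizes whitespace
# and returns None for documents that should be dropped downstream:
#
#     def clean_text(text):
#         text = " ".join(text.split())
#         return text if text else None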

from datasets import load_dataset

# Stream the raw OSCAR Dutch corpus so records can be inspected without
# downloading the whole dataset first.
dataset_v0 = load_dataset('oscar', "unshuffled_deduplicated_nl", split='train', streaming=True)
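# With streaming=True nothing is materialized up front, so a few records can
# be previewed lazily. A minimal sketch using itertools.islice:
from itertools import islice

for sample in islice(dataset_v0, 3):
    print(sample["text"][:100])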

data_dir = "/home/yeb/Developer/data"
data_files = []


def train_val_files():
    import glob
    import random
    SEED = 12345

    def add_jsonlines_dir(path, filespec):
        global data_files
        data_files += glob.glob(f"{path}/{filespec}")
        # Deduplicate in case globs from different directories overlap.
        data_files = list(set(data_files))
        print(f"Number of files {len(data_files)} after adding {path} glob {filespec}")

    add_jsonlines_dir(f"{data_dir}/c4_cleaned2", "*73*.gz")

    # Shuffle with a fixed seed so the split is reproducible across runs.
    random.Random(SEED).shuffle(data_files)

    total = len(data_files)
    print(total)
    perc = 0.05
    val_size = int(perc * total)
    train_size = total - val_size
    train = data_files[:train_size]
    val = data_files[train_size:]
    print(f"Got {len(train)} training files and {len(val)} validation files ({perc * 100}% held out)")

    assert not set(train) & set(val), "Train overlaps with validation"

    return train, val


train, val = train_val_files()

dataset_v0 = load_dataset('json', data_files={'train': train, 'validation': val})
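# The json loader returns a DatasetDict keyed by the names passed in
# data_files; printing it shows the splits, columns, and row counts.
print(dataset_v0)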

# Alternative source: the full (non-streaming) OSCAR Dutch corpus. Note that
# this overwrites the jsonlines DatasetDict loaded above.
dataset_v0 = load_dataset('oscar', "unshuffled_deduplicated_nl")


# Map callback: clean each document in place. clean_text may return None,
# which is filtered out below.
def f(obj):
    obj["text"] = clean_text(obj["text"])
    return obj

dataset_v1 = dataset_v0.map(
    f,
    batched=False,
    num_proc=10,
)

# Drop documents that clean_text rejected. Renamed from `datasets` to
# dataset_v2 to avoid shadowing the Hugging Face library name.
dataset_v2 = dataset_v1.filter(
    lambda obj: obj['text'] is not None,
    num_proc=10,
)
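# Sanity check (variable names as used above): compare split sizes before
# and after filtering to see how many documents clean_text dropped.
for split in dataset_v2:
    print(split, len(dataset_v1[split]), "->", len(dataset_v2[split]))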

# Spot-check the first few records before and after cleaning.
it = iter(dataset_v0['train'])
print(next(it))
print(next(it))
print(next(it))

it = iter(dataset_v1['train'])
print(next(it))
print(next(it))
print(next(it))