# t5-base-dutch / streaming_dataset_filter_test.py
from clean import clean_text
from datasets import load_dataset
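# clean_text comes from the local clean.py in this repo; judging by the filter
# further down, it appears to return None for documents that should be dropped.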
# Streaming load of OSCAR Dutch (lazy, no full download up front). Note that
# dataset_v0 is reassigned further down, so this handle is superseded before
# the map/filter test runs.
dataset_v0 = load_dataset('oscar', "unshuffled_deduplicated_nl", split='train', streaming=True)
# data_dir = "/home/yeb"
data_dir = "/home/yeb/Developer/data"
data_files = []
def train_val_files():
    import glob
    import random
    SEED = 12345

    def add_jsonlines_dir(path, filespec):
        # Collect jsonlines shards matching the glob, de-duplicated across calls.
        global data_files
        data_files += glob.glob(f"{path}/{filespec}")
        data_files = list(set(data_files))
        print(f"Number of files {len(data_files)} after adding {path} glob {filespec}")
# add_jsonlines_dir(f"{data_dir}/oscar_nl_cleaned")
add_jsonlines_dir(f"{data_dir}/c4_cleaned2", "*73*.gz")
# add_jsonlines_dir(f"{data_dir}/c4_cleaned", "*47*.gz")
# add_jsonlines_dir(f"{data_dir}/c4_cleaned", "*12*.gz")
# add_jsonlines_dir(f"{data_dir}/c4_cleaned", "*29*.gz")
# add_jsonlines_dir(f"{data_dir}/c4_cleaned", "*74*.gz")
# add_jsonlines_dir(f"{data_dir}/c4_cleaned", "*26*.gz")
# add_jsonlines_dir(f"{data_dir}/c4_cleaned", "*54*.gz")
# add_jsonlines_dir(f"{data_dir}/c4_cleaned", "*68*.gz")
# add_jsonlines_dir(f"{data_dir}/c4_cleaned", "*57*.gz")
# add_jsonlines_dir(f"{data_dir}/c4_cleaned", "*46*.gz")
# add_jsonlines_dir(f"{data_dir}/c4_cleaned", "*35*.gz")
# add_jsonlines_dir(f"{data_dir}/c4_cleaned", "*13*.gz")
# add_jsonlines_dir(f"{data_dir}/c4_cleaned", "*41*.gz")
# add_jsonlines_dir(f"{data_dir}/c4_cleaned", "*52*.gz")
# add_jsonlines_dir(f"{data_dir}/c4_cleaned", "*63*.gz")
# add_jsonlines_dir(f"{data_dir}/c4_cleaned", "*85*.gz")
# add_jsonlines_dir(f"{data_dir}/c4_cleaned", "*81*.gz")
# add_jsonlines_dir(f"{data_dir}/c4_cleaned", "*96*.gz")
# add_jsonlines_dir(f"{data_dir}/nrc_uniq_cleaned_20210223", "*.gz")
# add_jsonlines_dir(f"{data_dir}/nu_uniq_cleaned_20210225", "*.gz")
    # Shuffle with a fixed seed so the train/validation split is reproducible.
    random.Random(SEED).shuffle(data_files)
    total = len(data_files)
    print(total)
    perc = 0.05
    val_size = int(perc * total)
    train_size = total - val_size
    train = data_files[:train_size]
    val = data_files[train_size:]
    print(f"Got {len(train)} training files and {len(val)} ({perc * 100:.0f}%) validation files")
    assert not set(train) & set(val), "Train overlaps with validation"
    return train, val
train, val = train_val_files()
# dataset_v0 is reassigned twice: first to the local jsonlines split (a
# DatasetDict with 'train' and 'validation'), then to the full non-streaming
# OSCAR Dutch set; only the last assignment is used by the test below.
dataset_v0 = load_dataset('json', data_files={'train': train, 'validation': val})
dataset_v0 = load_dataset('oscar', "unshuffled_deduplicated_nl")
def f(obj):
    # Normalize one example's text in place; clean_text may return None.
    obj["text"] = clean_text(obj["text"])
    return obj
dataset_v1 = dataset_v0.map(
f,
batched=False,
num_proc=10,
)
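# batched=False means f receives one example dict at a time; num_proc=10 forks
# worker processes, which assumes a regular (non-streaming) Dataset at this point.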
# Drop examples whose text clean_text rejected (returned None).
dataset_v2 = dataset_v1.filter(
    lambda obj: obj['text'] is not None,
    num_proc=10,
)
# Spot-check: three raw examples, then three cleaned ones.
it = iter(dataset_v0['train'])
print(next(it))
print(next(it))
print(next(it))
it = iter(dataset_v1['train'])
print(next(it))
print(next(it))
print(next(it))
# Three filtered examples.
it = iter(dataset_v2['train'])
print(next(it))
print(next(it))
print(next(it))
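
# A minimal sketch (not part of the original test) of the same clean/filter
# pass on the streaming dataset: IterableDataset also supports map and filter,
# applied lazily per example and without num_proc. The streaming_* names are
# illustrative only.
streaming_v0 = load_dataset('oscar', "unshuffled_deduplicated_nl", split='train', streaming=True)
streaming_v1 = streaming_v0.map(f)
streaming_v2 = streaming_v1.filter(lambda obj: obj['text'] is not None)
print(next(iter(streaming_v2)))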