import os

from abc import ABC, abstractmethod
from multiprocessing import cpu_count

""" |
|
This registry is for automatically downloading and extracting datasets. |
|
|
|
To register a class you need to inherit the DataDownloader class, and provide name and url attributes, and (optionally) |
|
the number of documents. |
|
|
|
When done, add it to the DATA_DOWNLOADERS dict. The function process_data runs the pre-processing for the selected |
|
dataset. |
|
""" |

GPT2_VOCAB_URL = "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json"
GPT2_MERGE_URL = "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt"


class DataDownloader(ABC):
    """Dataset registry class to automatically download / extract datasets"""

    def __init__(
        self,
        tokenizer_type=None,
        merge_file=None,
        vocab_file=None,
        data_dir=None,
        force_redownload=None,
        num_workers=None,
    ):
        if tokenizer_type is None:
            tokenizer_type = "GPT2BPETokenizer"
        if data_dir is None:
            data_dir = os.environ.get("DATA_DIR", "./data")
        if merge_file is None:
            merge_file = f"{data_dir}/gpt2-merges.txt"
        if force_redownload is None:
            force_redownload = False
        if vocab_file is None:
            # pick a default vocab file based on the tokenizer type
            if tokenizer_type == "GPT2BPETokenizer":
                vocab_file = f"{data_dir}/gpt2-vocab.json"
            elif tokenizer_type == "HFGPT2Tokenizer":
                vocab_file = "gpt2"
            elif tokenizer_type == "CharLevelTokenizer":
                pass  # character-level tokenization needs no vocab file
            else:
                assert vocab_file is not None, "No vocab file provided"
        if num_workers is None:
            num_workers = cpu_count()
        self._tokenizer_type = tokenizer_type
        self._merge_file = merge_file
        self._vocab_file = vocab_file
        self._data_dir = data_dir
        self._force_redownload = force_redownload
        self._num_workers = num_workers

    @property
    def base_dir(self):
        """base data directory"""
        return self._data_dir

    @property
    @abstractmethod
    def name(self):
        """name of dataset"""
        pass

    @property
    @abstractmethod
    def urls(self):
        """URLs from which to download dataset"""
        pass

    @property
    def tokenizer_type(self):
        """tokenizer type to use when tokenizing data"""
        return self._tokenizer_type

    @property
    def merge_file(self):
        """Merge file for tokenizer"""
        return self._merge_file

    @property
    def vocab_file(self):
        """Vocab file for tokenizer"""
        return self._vocab_file

    @property
    def num_workers(self):
        """Number of workers to use in preprocessing"""
        return self._num_workers

    @property
    def num_docs(self):
        """Number of documents in the dataset (if known)"""
        return None

    @property
    def ftfy(self):
        """Use ftfy (https://github.com/LuminosoInsight/python-ftfy) to fix text encodings"""
        return False

    def exists(self):
        """Checks if the dataset is present"""
        return os.path.isdir(f"{self.base_dir}/{self.name}")

    def download(self):
        """downloads dataset"""
        os.makedirs(os.path.join(self.base_dir, self.name), exist_ok=True)
        for url in self.urls:
            try:
                os_cmd = f"wget {url} -O {os.path.join(self.base_dir, self.name, os.path.basename(url))}"
                if os.system(os_cmd) != 0:
                    raise Exception(
                        f"Cannot download file at URL {url}: server may be down"
                    )
            except Exception as e:
                raise Exception(f"Download error: {e}")

    def tokenize(self):
        """tokenizes dataset"""
        parent_folder = os.path.join(self.base_dir, self.name)
        jsonl_filepath = ",".join(
            [os.path.join(parent_folder, os.path.basename(url)) for url in self.urls]
        )

        cmd = f"python tools/datasets/preprocess_data.py \
            --input {jsonl_filepath} \
            --output-prefix {parent_folder}/{self.name} \
            --vocab {self.vocab_file} \
            --dataset-impl mmap \
            --tokenizer-type {self.tokenizer_type} \
            --merge-file {self.merge_file} \
            --append-eod \
            --workers {self.num_workers} "

        if self.num_docs is not None:
            cmd += f"--num-docs {self.num_docs} "

        if self.ftfy:
            cmd += "--ftfy "

        os.system(cmd)

    def prepare(self):
        if self._force_redownload:
            self.download()
        else:
            if not self.exists():
                self.download()

        self.tokenize()


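# A rough sketch (for illustration; paths assume the default ./data directory and
# the GPT2BPETokenizer defaults) of the shell command tokenize() assembles for the
# Enron dataset defined below:
#
#     python tools/datasets/preprocess_data.py \
#         --input ./data/enron/enron_emails.jsonl.zst \
#         --output-prefix ./data/enron/enron \
#         --vocab ./data/gpt2-vocab.json \
#         --dataset-impl mmap \
#         --tokenizer-type GPT2BPETokenizer \
#         --merge-file ./data/gpt2-merges.txt \
#         --append-eod \
#         --workers <cpu_count()> --num-docs 517401

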
class Enron(DataDownloader):
    name = "enron"
    urls = ["http://eaidata.bmk.sh/data/enron_emails.jsonl.zst"]
    num_docs = 517401


class PileSubset(DataDownloader):
    name = "pile_00"
    urls = ["https://the-eye.eu/public/AI/pile/train/00.jsonl.zst"]


class Pile(DataDownloader):
    name = "pile"
    urls = [
        f"https://the-eye.eu/public/AI/pile/train/{i:02}.jsonl.zst" for i in range(30)
    ]


class Github(DataDownloader):
    name = "github"
    urls = ["http://eaidata.bmk.sh/data/github_small.jsonl.zst"]


class ArXiv(DataDownloader):
    name = "arxiv"
    urls = [
        "https://the-eye.eu/public/AI/pile_preliminary_components/2020-09-08-arxiv-extracts-nofallback-until-2007-068.tar.gz"
    ]


class EuroParl(DataDownloader):
    name = "europarl"
    urls = [
        "https://the-eye.eu/public/AI/pile_preliminary_components/EuroParliamentProceedings_1996_2011.jsonl.zst"
    ]


class FreeLaw(DataDownloader):
    name = "freelaw"
    urls = [
        "https://the-eye.eu/public/AI/pile_preliminary_components/FreeLaw_Opinions.jsonl.zst"
    ]


class NiH(DataDownloader):
    name = "nih"
    urls = [
        "https://the-eye.eu/public/AI/pile_preliminary_components/NIH_ExPORTER_awarded_grant_text.jsonl.zst"
    ]


class PubMed(DataDownloader):
    name = "pubmed"
    urls = [
        "https://the-eye.eu/public/AI/pile_preliminary_components/PMC_extracts.tar.gz"
    ]


class Books1(DataDownloader):
    name = "books1"
    urls = ["https://the-eye.eu/public/AI/pile_preliminary_components/books1.tar.gz"]


class Books3(DataDownloader):
    name = "books3"
    urls = ["https://the-eye.eu/public/AI/pile_preliminary_components/books3.tar.gz"]


class HackerNews(DataDownloader):
    name = "hackernews"
    urls = ["https://the-eye.eu/public/AI/pile_preliminary_components/hn.tar.gz"]
    num_docs = 373000


class OpenWebText2(DataDownloader):
    name = "openwebtext2"
    urls = [
        "https://huggingface.co/datasets/segyges/OpenWebText2/resolve/main/openwebtext2.jsonl.zst.tar"
    ]
    num_docs = 17103000


class StackExchange(DataDownloader):
    name = "stackexchange"
    urls = [
        "https://the-eye.eu/public/AI/pile_preliminary_components/stackexchange_dataset.tar"
    ]


class UbuntuIRC(DataDownloader):
    name = "ubuntu_irc"
    urls = [
        "https://the-eye.eu/public/AI/pile_preliminary_components/ubuntu_irc_until_2020_9_1.jsonl.zst"
    ]


class YoutubeSubtitles(DataDownloader):
    name = "youtube_subtitles"
    urls = [
        "https://the-eye.eu/public/AI/pile_preliminary_components/yt_subs.jsonl.zst"
    ]


class C4(DataDownloader):
    name = "c4"
    urls = [
        f"https://the-eye.eu/eleuther_staging/c4/en/c4-train.{i:05}-of-01024.json.gz"
        for i in range(1024)
    ]


class C4OpenWebText(DataDownloader):
    name = "c4_openwebtext"
    urls = [
        f"https://the-eye.eu/eleuther_staging/c4/realnewslike/c4-train.{i:05}-of-00512.json.gz"
        for i in range(512)
    ]


class Enwik8(DataDownloader):
    name = "enwik8"
    urls = ["http://mattmahoney.net/dc/enwik8.zip"]


def maybe_download_gpt2_tokenizer_data(tokenizer_type, data_dir):
    if tokenizer_type is None or tokenizer_type == "GPT2BPETokenizer":
        GPT2_VOCAB_FP = f"{data_dir}/gpt2-vocab.json"
        GPT2_MERGE_FP = f"{data_dir}/gpt2-merges.txt"
        if not os.path.isfile(GPT2_VOCAB_FP):
            os.system(f"wget {GPT2_VOCAB_URL} -O {GPT2_VOCAB_FP}")
        if not os.path.isfile(GPT2_MERGE_FP):
            os.system(f"wget {GPT2_MERGE_URL} -O {GPT2_MERGE_FP}")


DATA_DOWNLOADERS = {
    "pass": "pass",
    "enron": Enron,
    "pile_subset": PileSubset,
    "pile": Pile,
    "github": Github,
    "arxiv": ArXiv,
    "europarl": EuroParl,
    "freelaw": FreeLaw,
    "nih": NiH,
    "pubmed": PubMed,
    "books1": Books1,
    "books3": Books3,
    "hackernews": HackerNews,
    "openwebtext2": OpenWebText2,
    "stackexchange": StackExchange,
    "ubuntu_irc": UbuntuIRC,
    "youtube_subtitles": YoutubeSubtitles,
    "c4": C4,
    "c4_openwebtext": C4OpenWebText,
    "enwik8": Enwik8,
}


def prepare_dataset(
    dataset_name: str,
    tokenizer_type: str = None,
    data_dir: str = None,
    vocab_file: str = None,
    merge_file: str = None,
    force_redownload: bool = None,
    num_workers: int = None,
):
    """
    Downloads + tokenizes a dataset in the registry (dataset_name) and saves output .npy files to data_dir.
    """
    if data_dir is None:
        data_dir = os.environ.get("DATA_DIR", "./data")
    os.makedirs(data_dir, exist_ok=True)
    maybe_download_gpt2_tokenizer_data(tokenizer_type, data_dir)
    DownloaderClass = DATA_DOWNLOADERS.get(dataset_name.lower(), None)
    if DownloaderClass is None:
        raise NotImplementedError(
            f'Dataset "{dataset_name}" not recognized - please choose from {list(DATA_DOWNLOADERS.keys())}'
        )
    elif DownloaderClass == "pass":
        # the "pass" entry is a no-op: skip downloading and tokenization entirely
        pass
    else:
        num_workers = 1 if dataset_name == "enwik8" else num_workers
        d = DownloaderClass(
            tokenizer_type=tokenizer_type,
            vocab_file=vocab_file,
            merge_file=merge_file,
            data_dir=data_dir,
            force_redownload=force_redownload,
            num_workers=num_workers,
        )
        d.prepare()
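

# Example usage (a minimal sketch; assumes DATA_DIR is unset so output lands under
# ./data, and that the process runs from the repo root so tools/datasets/preprocess_data.py
# resolves; the import path below is an assumption about where this module lives):
#
#     from tools.datasets.corpora import prepare_dataset
#     prepare_dataset("enron")  # downloads enron_emails.jsonl.zst, then tokenizes it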