# tools/datasets/corpora.py
# Copyright (c) 2024, EleutherAI
# This file is based on code by the authors denoted below and has been modified from its original version.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
from abc import ABC, abstractmethod
from multiprocessing import cpu_count
"""
This registry is for automatically downloading and extracting datasets.
To register a class you need to inherit the DataDownloader class, and provide name and url attributes, and (optionally)
the number of documents.
When done, add it to the DATA_DOWNLOADERS dict. The function process_data runs the pre-processing for the selected
dataset.
"""
GPT2_VOCAB_URL = "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json"
GPT2_MERGE_URL = "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt"
class DataDownloader(ABC):
"""Dataset registry class to automatically download / extract datasets"""
def __init__(
self,
tokenizer_type=None,
merge_file=None,
vocab_file=None,
data_dir=None,
force_redownload=None,
num_workers=None,
):
if tokenizer_type is None:
tokenizer_type = "GPT2BPETokenizer"
if data_dir is None:
data_dir = os.environ.get("DATA_DIR", "./data")
if merge_file is None:
merge_file = f"{data_dir}/gpt2-merges.txt"
if force_redownload is None:
force_redownload = False
if vocab_file is None:
if tokenizer_type == "GPT2BPETokenizer":
vocab_file = f"{data_dir}/gpt2-vocab.json"
elif tokenizer_type == "HFGPT2Tokenizer":
vocab_file = "gpt2"
elif tokenizer_type == "CharLevelTokenizer":
pass
            else:
                # No default vocab file is known for this tokenizer type,
                # so the caller must supply one explicitly.
                assert vocab_file is not None, "No vocab file provided"
if num_workers is None:
num_workers = cpu_count()
self._tokenizer_type = tokenizer_type
self._merge_file = merge_file
self._vocab_file = vocab_file
self._data_dir = data_dir
self._force_redownload = force_redownload
self._num_workers = num_workers
@property
def base_dir(self):
"""base data directory"""
return self._data_dir
@property
@abstractmethod
def name(self):
"""name of dataset"""
pass
@property
@abstractmethod
def urls(self):
"""URLs from which to download dataset"""
pass
@property
def tokenizer_type(self):
"""tokenizer type to use when tokenizing data"""
return self._tokenizer_type
@property
def merge_file(self):
"""Merge file for tokenizer"""
return self._merge_file
@property
def vocab_file(self):
"""Vocab file for tokenizer"""
return self._vocab_file
@property
def num_workers(self):
"""Number of workers to use in preprocessing"""
return self._num_workers
@property
def num_docs(self):
"""Number of documents in the dataset (if known)"""
return None
@property
def ftfy(self):
"""Use ftfy (https://github.com/LuminosoInsight/python-ftfy) to fix text encodings"""
return False
def exists(self):
"""Checks if the dataset is present"""
return os.path.isdir(f"{self.base_dir}/{self.name}")
def download(self):
"""downloads dataset"""
os.makedirs(os.path.join(self.base_dir, self.name), exist_ok=True)
for url in self.urls:
try:
os_cmd = f"wget {url} -O {os.path.join(self.base_dir, self.name, os.path.basename(url))}"
if os.system(os_cmd) != 0:
raise Exception(
f"Cannot download file at URL {url}: server may be down"
)
except Exception as e:
raise Exception(f"Download error: {e}")
def tokenize(self):
"""tokenizes dataset"""
parent_folder = os.path.join(self.base_dir, self.name)
jsonl_filepath = ",".join(
[os.path.join(parent_folder, os.path.basename(url)) for url in self.urls]
)
cmd = f"python tools/datasets/preprocess_data.py \
--input {jsonl_filepath} \
--output-prefix {parent_folder}/{self.name} \
--vocab {self.vocab_file} \
--dataset-impl mmap \
--tokenizer-type {self.tokenizer_type} \
--merge-file {self.merge_file} \
--append-eod \
--workers {self.num_workers} "
if self.num_docs is not None:
cmd += f"--num-docs {self.num_docs} "
if self.ftfy:
            cmd += "--ftfy "
os.system(cmd)
def prepare(self):
if self._force_redownload:
self.download()
else:
if not self.exists():
self.download()
self.tokenize()
class Enron(DataDownloader):
name = "enron"
urls = ["http://eaidata.bmk.sh/data/enron_emails.jsonl.zst"]
num_docs = 517401
class PileSubset(DataDownloader):
name = "pile_00"
urls = ["https://the-eye.eu/public/AI/pile/train/00.jsonl.zst"]
class Pile(DataDownloader):
name = "pile"
urls = [
f"https://the-eye.eu/public/AI/pile/train/{i:02}.jsonl.zst" for i in range(30)
]
class Github(DataDownloader):
name = "github"
urls = ["http://eaidata.bmk.sh/data/github_small.jsonl.zst"]
class ArXiv(DataDownloader):
name = "arxiv"
urls = [
"https://the-eye.eu/public/AI/pile_preliminary_components/2020-09-08-arxiv-extracts-nofallback-until-2007-068.tar.gz"
]
class EuroParl(DataDownloader):
name = "europarl"
urls = [
"https://the-eye.eu/public/AI/pile_preliminary_components/EuroParliamentProceedings_1996_2011.jsonl.zst"
]
class FreeLaw(DataDownloader):
name = "freelaw"
urls = [
"https://the-eye.eu/public/AI/pile_preliminary_components/FreeLaw_Opinions.jsonl.zst"
]
class NiH(DataDownloader):
name = "nih"
urls = [
"https://the-eye.eu/public/AI/pile_preliminary_components/NIH_ExPORTER_awarded_grant_text.jsonl.zst"
]
class PubMed(DataDownloader):
name = "pubmed"
urls = [
"https://the-eye.eu/public/AI/pile_preliminary_components/PMC_extracts.tar.gz"
]
class Books1(DataDownloader):
name = "books1"
urls = ["https://the-eye.eu/public/AI/pile_preliminary_components/books1.tar.gz"]
class Books3(DataDownloader):
name = "books3"
urls = ["https://the-eye.eu/public/AI/pile_preliminary_components/books3.tar.gz"]
class HackerNews(DataDownloader):
name = "hackernews"
urls = ["https://the-eye.eu/public/AI/pile_preliminary_components/hn.tar.gz"]
num_docs = 373000
class OpenWebText2(DataDownloader):
name = "openwebtext2"
urls = [
"https://huggingface.co/datasets/segyges/OpenWebText2/resolve/main/openwebtext2.jsonl.zst.tar"
]
num_docs = 17103000
class StackExchange(DataDownloader):
name = "stackexchange"
urls = [
"https://the-eye.eu/public/AI/pile_preliminary_components/stackexchange_dataset.tar"
]
class UbuntuIRC(DataDownloader):
name = "ubuntu_irc"
urls = [
"https://the-eye.eu/public/AI/pile_preliminary_components/ubuntu_irc_until_2020_9_1.jsonl.zst"
]
class YoutubeSubtitles(DataDownloader):
name = "youtube_subtitles"
urls = [
"https://the-eye.eu/public/AI/pile_preliminary_components/yt_subs.jsonl.zst"
]
class C4(DataDownloader):
name = "c4"
urls = [
f"https://the-eye.eu/eleuther_staging/c4/en/c4-train.{i:05}-of-01024.json.gz"
for i in range(1024)
]
class C4OpenWebText(DataDownloader):
name = "c4_openwebtext"
urls = [
f"https://the-eye.eu/eleuther_staging/c4/realnewslike/c4-train.{i:05}-of-00512.json.gz"
for i in range(512)
]
class Enwik8(DataDownloader):
name = "enwik8"
urls = ["http://mattmahoney.net/dc/enwik8.zip"]
def maybe_download_gpt2_tokenizer_data(tokenizer_type, data_dir):
    """Download the GPT-2 vocab and merge files into data_dir if not already present."""
    if tokenizer_type is None or tokenizer_type == "GPT2BPETokenizer":
        GPT2_VOCAB_FP = f"{data_dir}/gpt2-vocab.json"
        GPT2_MERGE_FP = f"{data_dir}/gpt2-merges.txt"
if not os.path.isfile(GPT2_VOCAB_FP):
os.system(f"wget {GPT2_VOCAB_URL} -O {GPT2_VOCAB_FP}")
if not os.path.isfile(GPT2_MERGE_FP):
os.system(f"wget {GPT2_MERGE_URL} -O {GPT2_MERGE_FP}")
DATA_DOWNLOADERS = {
"pass": "pass",
"enron": Enron,
"pile_subset": PileSubset,
"pile": Pile,
"github": Github,
"arxiv": ArXiv,
"europarl": EuroParl,
"freelaw": FreeLaw,
"nih": NiH,
"pubmed": PubMed,
"books1": Books1,
"books3": Books3,
"hackernews": HackerNews,
"openwebtext2": OpenWebText2,
"stackexchange": StackExchange,
"ubuntu_irc": UbuntuIRC,
"youtube_subtitles": YoutubeSubtitles,
"c4": C4,
"c4_openwebtext": C4OpenWebText,
"enwik8": Enwik8,
}
def prepare_dataset(
dataset_name: str,
tokenizer_type: str = None,
data_dir: str = None,
vocab_file: str = None,
merge_file: str = None,
force_redownload: bool = None,
num_workers: int = None,
):
"""
Downloads + tokenizes a dataset in the registry (dataset_name) and saves output .npy files to data_dir.
"""
if data_dir is None:
data_dir = os.environ.get("DATA_DIR", "./data")
os.makedirs(data_dir, exist_ok=True)
maybe_download_gpt2_tokenizer_data(tokenizer_type, data_dir)
DownloaderClass = DATA_DOWNLOADERS.get(dataset_name.lower(), None)
if DownloaderClass is None:
raise NotImplementedError(
f'Dataset "{dataset_name}" not recognized - please choose from {list(DATA_DOWNLOADERS.keys())}'
)
elif DownloaderClass == "pass":
# pass on building dataset (for unit tests)
pass
else:
num_workers = 1 if dataset_name == "enwik8" else num_workers
d = DownloaderClass(
tokenizer_type=tokenizer_type,
vocab_file=vocab_file,
merge_file=merge_file,
data_dir=data_dir,
force_redownload=force_redownload,
num_workers=num_workers,
)
d.prepare()
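# Usage sketch (illustrative; the dataset name, tokenizer, and data_dir below are
# assumptions, and in the GPT-NeoX repo this module is normally driven by a small
# prepare_data.py wrapper script rather than imported directly):
#
#     from tools.datasets.corpora import prepare_dataset
#
#     prepare_dataset(
#         dataset_name="enwik8",
#         tokenizer_type="GPT2BPETokenizer",
#         data_dir="./data",
#     )
#
# This downloads enwik8 into ./data/enwik8 and shells out to
# tools/datasets/preprocess_data.py to write the tokenized .bin/.idx files there.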