from transformers import AutoTokenizer
from datasets import load_dataset

# Load BookCorpusOpen and hold out 5% of the books for evaluation.
raw_datasets = load_dataset('lucadiliello/bookcorpusopen')
raw_datasets = raw_datasets['train'].train_test_split(test_size=0.05)
print(raw_datasets)

# Load the tokenizer from the current working directory.
tokenizer = AutoTokenizer.from_pretrained('.')

seq_len = 512
def tokenize_fn(examples):
    # Chunk each book into sequences of at most seq_len tokens;
    # return_overflowing_tokens keeps the leftover tokens as extra rows
    # instead of discarding everything past the truncation point.
    return tokenizer(examples['text'],
                     max_length=seq_len,
                     return_overflowing_tokens=True,
                     truncation=True)
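# Quick illustration (a sketch, not part of the pipeline): with
# return_overflowing_tokens=True a single long book becomes several rows,
# each holding at most seq_len tokens.
sample = raw_datasets['train'][0]['text']
chunks = tokenize_fn({'text': [sample]})
print(len(chunks['input_ids']))     # number of chunks produced for this book
print(len(chunks['input_ids'][0]))  # typically equal to seq_len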
# Tokenize both splits in batches; remove_columns drops the raw text
# columns so each resulting row is a single tokenized chunk.
tokenized_datasets = raw_datasets.map(
    tokenize_fn,
    batched=True,
    batch_size=500,
    remove_columns=raw_datasets['train'].column_names,
)
tokenized_datasets.save_to_disk('data')
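# Optional sanity check (a sketch): reload the tokenized splits that
# save_to_disk wrote to the 'data' directory above.
from datasets import load_from_disk

reloaded = load_from_disk('data')
print(reloaded)
print(len(reloaded['train'][0]['input_ids']))  # at most seq_len tokens per row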