from datasets import load_dataset


def load_data_for_training(
    tokenizer,
    loader_path,
    dataset_dir,
    max_input_length=256,
):
    def preprocess_function(examples):
        # Tokenize the raw documents, truncating to at most max_input_length tokens.
        inputs = [doc for doc in examples["document"]]
        model_inputs = tokenizer(
            inputs, max_length=max_input_length, truncation=True
        )
        return model_inputs

    # Load the dataset with the given loader script/path, then tokenize every split.
    datasets = load_dataset(
        path=loader_path,
        data_dir=dataset_dir,
    )
    tokenized_datasets = datasets.map(preprocess_function, batched=True)
    return tokenized_datasets
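

# A minimal usage sketch, assuming a seq2seq checkpoint and a dataset loader script;
# the checkpoint name, loader path, and data directory below are illustrative
# placeholders, not part of the original file.
if __name__ == "__main__":
    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("t5-small")  # assumed checkpoint
    tokenized = load_data_for_training(
        tokenizer=tokenizer,
        loader_path="my_dataset_loader.py",  # assumed loader script
        dataset_dir="data/",                 # assumed data directory
        max_input_length=256,
    )
    print(tokenized)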