File size: 920 Bytes
ee3e9cf |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 |
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments
from datasets import load_dataset
# Fine-tune GPT-2 as a causal language model on the 'text' column of the
# i2ebuddy/website_data dataset, evaluating on a held-out slice once per epoch.

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
# GPT-2 ships without a padding token, so padding="max_length" fails unless
# one is assigned. Reusing EOS avoids resizing the model's embedding matrix.
tokenizer.pad_token = tokenizer.eos_token
model = GPT2LMHeadModel.from_pretrained("gpt2")


def tokenize_batch(examples):
    """Tokenize a batch of rows and attach causal-LM labels.

    Args:
        examples: A batch from ``Dataset.map(batched=True)``; must contain a
            'text' column holding a list of strings.

    Returns:
        The tokenizer output ('input_ids', 'attention_mask') extended with
        'labels': a copy of 'input_ids' in which padded positions are set to
        -100 so they do not contribute to the loss.
    """
    encoded = tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=512,
    )
    # Mask via attention_mask rather than by token id: pad and EOS share an id,
    # and genuine EOS tokens inside the text should still be learned.
    encoded["labels"] = [
        [token_id if keep else -100 for token_id, keep in zip(ids, mask)]
        for ids, mask in zip(encoded["input_ids"], encoded["attention_mask"])
    ]
    return encoded


dataset = load_dataset("i2ebuddy/website_data", split="train")
dataset = dataset.map(tokenize_batch, batched=True)

# Only a 'train' split is loaded, but per-epoch evaluation is requested below,
# so carve out a reproducible 10% validation slice.
splits = dataset.train_test_split(test_size=0.1, seed=42)
splits.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

training_args = TrainingArguments(
    output_dir="./results",
    # NOTE(review): newer transformers releases rename this to `eval_strategy`;
    # switch the keyword if the installed version rejects it.
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=3,
    report_to="none",  # do not report to any service for logging
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=splits["train"],
    # Required whenever the evaluation strategy is not "no".
    eval_dataset=splits["test"],
)
trainer.train()
|