|
from docx import Document
import gradio as gr
import torch
from transformers import (
    DataCollatorForLanguageModeling,
    GPT2LMHeadModel,
    GPT2Tokenizer,
    TextDataset,
    Trainer,
    TrainingArguments,
)
|
|
|
# Load the pretrained GPT-2 model and its tokenizer.
model_name = "gpt2"

model = GPT2LMHeadModel.from_pretrained(model_name)

tokenizer = GPT2Tokenizer.from_pretrained(model_name)

# GPT-2 ships without a padding token; the language-modeling data collator
# needs one to batch variable-length sequences, so reuse EOS as PAD.
tokenizer.pad_token = tokenizer.eos_token
|
|
|
|
|
# Chunk the raw text corpus into fixed-length token blocks for training.
# (The original constructed this dataset twice — once is enough — and
# omitted block_size, which TextDataset requires.)
dataset = TextDataset(
    tokenizer=tokenizer,
    file_path="your_dataset.txt",  # plain-text training corpus
    block_size=128,  # tokens per training example
)
|
|
|
|
|
# Collator for causal-LM batches: pads inputs and mirrors them into labels.
# mlm=False selects causal (next-token) rather than masked language modeling.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Hyperparameters for the fine-tuning run: 3 epochs, batch size 4 per
# device, checkpoints every 10k steps keeping only the 2 most recent.
training_args = TrainingArguments(
    output_dir="./output",
    overwrite_output_dir=True,
    per_device_train_batch_size=4,
    num_train_epochs=3,
    save_steps=10_000,
    save_total_limit=2,
)
|
|
|
|
|
# Wire up the Trainer. eval_dataset is supplied so that evaluate() below has
# data to run on — the original omitted it, making evaluate() raise a
# ValueError ("No evaluation dataset").
# NOTE(review): evaluating on the training split is a placeholder; supply a
# held-out split for a meaningful metric.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
    eval_dataset=dataset,
)

# Fine-tune the model on the dataset.
trainer.train()

# Report evaluation loss on the eval split.
trainer.evaluate()
|
|
|
|
|
def chatbot(input_text):
    """Generate a GPT-2 continuation of *input_text*.

    Args:
        input_text: The user's prompt string.

    Returns:
        The decoded generation (includes the prompt itself, since GPT-2
        continues the input sequence).
    """
    # Tokenize via __call__ so we also get an attention mask; generate()
    # warns (and can produce wrong results with padding) when the mask
    # is omitted.
    encoded = tokenizer(input_text, return_tensors="pt")

    # Inference only — skip gradient tracking.
    with torch.no_grad():
        output_ids = model.generate(
            encoded["input_ids"],
            attention_mask=encoded["attention_mask"],
            max_length=50,
            pad_token_id=tokenizer.eos_token_id,
        )

    return tokenizer.decode(output_ids[0], skip_special_tokens=True)
|
|
|
|
|
# Expose the chatbot through a simple text-in / text-out Gradio UI.
chatbot_interface = gr.Interface(
    fn=chatbot,
    inputs="textbox",
    outputs="textbox",
    title="Chatbot",
)

# Start the local web server for the interface.
chatbot_interface.launch()
|
|