from transformers import MT5ForConditionalGeneration, T5Tokenizer
from transformers import Trainer, TrainingArguments
from datasets import Dataset
import pandas as pd
import streamlit as st
import torch

# Load the mT5 tokenizer and model
tokenizer = T5Tokenizer.from_pretrained("google/mt5-base")
model = MT5ForConditionalGeneration.from_pretrained("google/mt5-base")
# st.write(model)

# Load the proverb/meaning pairs and convert them to a Hugging Face Dataset
df = pd.read_csv("proverbs.csv")
df  # Streamlit "magic" renders the dataframe in the app
dataset = Dataset.from_pandas(df)

def preprocess_function(examples):
    """Tokenize proverbs as inputs and their meanings as target labels."""
    inputs = examples["Proverb"]
    targets = examples["Meaning"]
    model_inputs = tokenizer(inputs, max_length=128, truncation=True, padding="max_length")
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=128, truncation=True, padding="max_length")
    # Replace padding token ids in the labels with -100 so they are ignored by the loss
    model_inputs["labels"] = [
        [(tok if tok != tokenizer.pad_token_id else -100) for tok in label]
        for label in labels["input_ids"]
    ]
    return model_inputs

tokenized_dataset = dataset.map(preprocess_function, batched=True)

# Hold out 20% of the examples for evaluation
dataset_split = tokenized_dataset.train_test_split(test_size=0.2)
train_dataset = dataset_split["train"]
test_dataset = dataset_split["test"]
print(f"Training dataset size: {len(train_dataset)}")
print(f"Testing dataset size: {len(test_dataset)}")

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=2,
    save_steps=500,
)

# Initialize the Trainer with the train/test split created above
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

# Fine-tune the model and save the result
trainer.train()
model.save_pretrained("./fine-tuned-mt5-marathi-proverbs")
tokenizer.save_pretrained("./fine-tuned-mt5-marathi-proverbs")
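
# --- Inference sketch (assumption: not part of the original script) ---
# A minimal example of loading the checkpoint saved above and generating the
# meaning of a proverb. The proverb string, generation parameters, and the
# explain_proverb helper are illustrative placeholders, not fixed choices.
ft_tokenizer = T5Tokenizer.from_pretrained("./fine-tuned-mt5-marathi-proverbs")
ft_model = MT5ForConditionalGeneration.from_pretrained("./fine-tuned-mt5-marathi-proverbs")

def explain_proverb(proverb: str) -> str:
    """Generate a meaning for a proverb with the fine-tuned mT5 model."""
    inputs = ft_tokenizer(proverb, return_tensors="pt", max_length=128, truncation=True)
    output_ids = ft_model.generate(**inputs, max_length=128, num_beams=4, early_stopping=True)
    return ft_tokenizer.decode(output_ids[0], skip_special_tokens=True)

# Example usage with a hypothetical row from proverbs.csv:
# print(explain_proverb("<Marathi proverb here>"))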