"""Fine-tune google/mt5-base on a Marathi proverb -> meaning dataset.

Reads `proverbs.csv` (expected columns: 'Proverb', 'Meaning'), tokenizes
both sides, fine-tunes with the Hugging Face Trainer, saves the model and
tokenizer locally, then runs one sample generation as a sanity check.
"""
import streamlit as st  # only used by the commented-out st.write debug line below
import pandas as pd
import torch
from datasets import Dataset, DatasetDict
from transformers import (
    AutoTokenizer,
    MT5ForConditionalGeneration,
    T5Tokenizer,
    Trainer,
    TrainingArguments,
)

# Pretrained mT5 checkpoint (multilingual, covers Marathi).
tokenizer = T5Tokenizer.from_pretrained('google/mt5-base')
model = MT5ForConditionalGeneration.from_pretrained("google/mt5-base")
#st.write(model)

# Load the proverb/meaning pairs. The original had a bare `df` expression
# (notebook display residue) — print a preview instead so the script shows it.
df = pd.read_csv('proverbs.csv')
print(df.head())
dataset = Dataset.from_pandas(df)


def preprocess_function(examples):
    """Tokenize proverbs (encoder inputs) and meanings (labels) to length 128.

    Pad-token positions in the labels are replaced with -100 so that the
    cross-entropy loss ignores padding instead of training on it.
    """
    inputs = examples['Proverb']
    targets = examples['Meaning']
    model_inputs = tokenizer(
        inputs, max_length=128, truncation=True, padding="max_length"
    )
    # `text_target=` replaces the deprecated `as_target_tokenizer()` manager.
    labels = tokenizer(
        text_target=targets, max_length=128, truncation=True, padding="max_length"
    )
    # Mask label padding with -100 (the ignore_index of the seq2seq loss).
    model_inputs["labels"] = [
        [(tok if tok != tokenizer.pad_token_id else -100) for tok in seq]
        for seq in labels["input_ids"]
    ]
    return model_inputs


tokenized_dataset = dataset.map(preprocess_function, batched=True)

# 80/20 train/eval split.
dataset_split = tokenized_dataset.train_test_split(test_size=0.2)
train_dataset = dataset_split['train']
test_dataset = dataset_split['test']
print(f"Training dataset size: {len(train_dataset)}")
print(f"Testing dataset size: {len(test_dataset)}")

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=2,
    save_steps=500,
)

# BUG FIX: the original passed the full `tokenized_dataset` as BOTH the train
# and eval set, so the 80/20 split above was never used and evaluation ran on
# the training data. Use the split datasets.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

# Fine-tune the model and save it (plus the tokenizer) locally.
trainer.train()
model.save_pretrained("./fine-tuned-mt5-marathi-proverbs")
tokenizer.save_pretrained("./fine-tuned-mt5-marathi-proverbs")

repo_id = "grpathak22/mt5-proverbs"
# Optional: push to the Hugging Face Hub (kept disabled, as in the original).
# api = HfApi()
# api.login(token=hf_token)
# api.create_repo(repo_id, exist_ok=True)
# repo = Repository(local_dir="./fine-tuned-mt5-marathi-proverbs", clone_from=repo_id)
# repo.push_to_hub(commit_message="Add fine-tuned MT5 model for Marathi proverbs")

# Sanity check: generate a meaning for one sample proverb.
prompt = "अति शहाणा त्याचा बैल रिकामा"
input_ids = tokenizer.encode(prompt, return_tensors='pt')
output_ids = model.generate(input_ids, max_length=256)
output_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
# The original computed output_text but never emitted it.
print(output_text)