# --- Scraped Hugging Face Space page header (not Python code) ---
# Spaces: Sleeping
# File size: 2,649 Bytes
# Commit hashes: e3660e1 09d32b7 e3660e1 599bc02 e3660e1 1c50f5e e3660e1 36fa541
# (original rendered line-number gutter 1-89 omitted)
from transformers import AutoTokenizer, MT5ForConditionalGeneration
from transformers import T5Tokenizer
import streamlit as st
import pandas as pd
from datasets import Dataset
import torch
from datasets import Dataset, DatasetDict
from transformers import Trainer, TrainingArguments
# Load the mT5 tokenizer and model to be fine-tuned.
tokenizer = T5Tokenizer.from_pretrained('google/mt5-base')
model = MT5ForConditionalGeneration.from_pretrained("google/mt5-base")
# Load the proverb/meaning pairs; columns 'Proverb' and 'Meaning' are
# required by preprocess_function below — TODO confirm against proverbs.csv.
df = pd.read_csv('proverbs.csv')
# Explicit st.write instead of Streamlit "magic": a bare `df` expression is
# only rendered inside a running Streamlit app, while the explicit call is
# equivalent there and self-documenting.
st.write(df)
dataset = Dataset.from_pandas(df)
def preprocess_function(examples):
    """Tokenize a batch of proverb/meaning pairs for seq2seq fine-tuning.

    Args:
        examples: batch dict from ``Dataset.map(batched=True)`` with
            string lists under 'Proverb' (model input) and 'Meaning'
            (target text).

    Returns:
        The tokenizer output dict, including a ``labels`` key holding
        the tokenized target ids.
    """
    inputs = examples['Proverb']
    targets = examples['Meaning']
    # Tokenize source and target in one call; `text_target=` supersedes the
    # deprecated `tokenizer.as_target_tokenizer()` context manager and
    # populates `labels` directly (transformers >= 4.22).
    # NOTE(review): with padding="max_length" the padded label positions keep
    # the pad token id; consider masking them to -100 so the loss ignores
    # padding — confirm desired behavior.
    model_inputs = tokenizer(
        inputs,
        text_target=targets,
        max_length=128,
        truncation=True,
        padding="max_length",
    )
    return model_inputs
# Apply the preprocessing to the whole dataset in batches.
tokenized_dataset = dataset.map(preprocess_function, batched=True)
# Hold out 20% of the tokenized examples for evaluation.
dataset_split = tokenized_dataset.train_test_split(test_size=0.2)
train_dataset = dataset_split['train']
test_dataset = dataset_split['test']
print(f"Training dataset size: {len(train_dataset)}")
print(f"Testing dataset size: {len(test_dataset)}")
# Hyperparameters for Trainer; small batch sizes suggest a memory-constrained
# environment (mT5-base is large).
training_args = TrainingArguments(
    output_dir="./results",  # checkpoints and trainer state are written here
    # NOTE(review): this argument was renamed `eval_strategy` in
    # transformers >= 4.46 — keep `evaluation_strategy` only if the
    # installed version is older; confirm the pinned version.
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=2,  # keep only the 2 most recent checkpoints on disk
    # NOTE(review): saving is step-based while evaluation is epoch-based;
    # if best-model loading is ever enabled these strategies must match.
    save_steps=500,
)
# Initialize Trainer on the held-out split. The original passed the full
# `tokenized_dataset` as both the train and eval set (despite having split it
# above), which leaks evaluation data into training and makes the per-epoch
# evaluation meaningless; use the 80/20 split instead.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,  # 80% split computed above
    eval_dataset=test_dataset,    # held-out 20% split
)
# Fine-tune the model
trainer.train()
# Persist the fine-tuned weights and tokenizer so they can be reloaded with
# `from_pretrained("./fine-tuned-mt5-marathi-proverbs")`.
model.save_pretrained("./fine-tuned-mt5-marathi-proverbs")
tokenizer.save_pretrained("./fine-tuned-mt5-marathi-proverbs")
# Target repo on the Hugging Face Hub; the push code below is currently
# disabled (requires HfApi/Repository imports and an auth token).
repo_id = "grpathak22/mt5-proverbs"
# # Log in and create the repo
# api = HfApi()
# api.login(token=hf_token)
# api.create_repo(repo_id, exist_ok=True)
# # Initialize the Repository object
# repo = Repository(local_dir="./fine-tuned-mt5-marathi-proverbs", clone_from=repo_id)
# # Push the model and tokenizer to the Hugging Face Hub
# repo.push_to_hub(commit_message="Add fine-tuned MT5 model for Marathi proverbs")
# Quick sanity-check inference on a single Marathi proverb.
prompt = "अति शहाणा त्याचा बैल रिकामा"
# Tokenize the prompt and move it to the model's device: after trainer.train()
# the model may live on GPU, and passing a CPU input tensor to generate()
# would raise a device-mismatch error.
input_ids = tokenizer.encode(prompt, return_tensors='pt').to(model.device)
# Generate the output (up to 256 tokens)
output_ids = model.generate(input_ids, max_length=256)
# Decode the output to text
output_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
|