grpathak22 commited on
Commit
e3660e1
1 Parent(s): ad6284c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +83 -17
app.py CHANGED
@@ -1,22 +1,88 @@
1
- import modelrun.py
2
- # from transformers import AutoTokenizer, MT5ForConditionalGeneration
3
- # from transformers import T5Tokenizer
4
- # import streamlit as st
5
- # import pandas as pd
6
- # from datasets import Dataset
7
- # import torch
8
- # from datasets import Dataset, DatasetDict
9
- # from transformers import Trainer, TrainingArguments
10
 
 
 
 
11
 
12
- # prompt = st.text_input("Enter your proverb: ")
 
 
13
 
14
- # # Tokenize the input prompt
15
- # input_ids = tokenizer.encode(prompt, return_tensors='pt')
 
 
 
 
 
 
16
 
17
- # # Generate the output
18
- # output_ids = model.generate(input_ids, max_length=256)
19
 
20
- # # Decode the output to text
21
- # output_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
22
- # st.write(output_text)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import AutoTokenizer, MT5ForConditionalGeneration
2
+ from transformers import T5Tokenizer
3
+ import streamlit as st
4
+ import pandas as pd
5
+ from datasets import Dataset
6
+ import torch
7
+ from datasets import Dataset, DatasetDict
8
+ from transformers import Trainer, TrainingArguments
 
9
 
10
# --- Model & data setup --------------------------------------------------
# Load the mT5 base tokenizer and model to be fine-tuned on Marathi proverbs.
tokenizer = T5Tokenizer.from_pretrained('google/mt5-base')
model = MT5ForConditionalGeneration.from_pretrained("google/mt5-base")
#st.write(model)

# Load the proverb/meaning pairs and wrap them in a HF Dataset.
# NOTE(review): assumes proverbs.csv has 'Proverb' and 'Meaning' columns —
# confirm against the dataset file (preprocess_function reads those keys).
df = pd.read_csv('proverbs.csv')
# (removed a bare `df` expression here — it was a notebook-style display
#  remnant that is a no-op in a plain script)
dataset = Dataset.from_pandas(df)
17
 
18
def preprocess_function(examples):
    """Tokenize a batch of examples for seq2seq fine-tuning.

    Proverbs become the encoder inputs; their meanings become the labels.
    Both sides are truncated/padded to 128 tokens.

    Args:
        examples: a batched dict with 'Proverb' and 'Meaning' string lists.

    Returns:
        dict of tokenizer outputs including a 'labels' key.
    """
    inputs = examples['Proverb']
    targets = examples['Meaning']
    # `text_target=` replaces the deprecated `tokenizer.as_target_tokenizer()`
    # context manager and fills in the 'labels' key directly.
    model_inputs = tokenizer(
        inputs,
        text_target=targets,
        max_length=128,
        truncation=True,
        padding="max_length",
    )
    # NOTE(review): pad tokens in labels are left as-is (not masked to -100),
    # matching the original behavior — they will contribute to the loss.
    return model_inputs
26
 
 
 
27
 
28
# --- Tokenize, split, and fine-tune --------------------------------------
tokenized_dataset = dataset.map(preprocess_function, batched=True)

# Hold out 20% of the data for evaluation during training.
dataset_split = tokenized_dataset.train_test_split(test_size=0.2)
train_dataset = dataset_split['train']
test_dataset = dataset_split['test']

print(f"Training dataset size: {len(train_dataset)}")
print(f"Testing dataset size: {len(test_dataset)}")

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=2,
    save_steps=500,
)

# Initialize Trainer.
# Fix: the original passed the full `tokenized_dataset` as BOTH
# train_dataset and eval_dataset, which made the split above dead code
# and evaluated the model on its own training data. Use the split.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

# Fine-tune the model, then persist model + tokenizer for upload/inference.
trainer.train()

model.save_pretrained("./fine-tuned-mt5-marathi-proverbs")
tokenizer.save_pretrained("./fine-tuned-mt5-marathi-proverbs")
repo_id = "grpathak22/mt5-proverbs"
67
+
68
+ # # Log in and create the repo
69
+ # api = HfApi()
70
+ # api.login(token=hf_token)
71
+ # api.create_repo(repo_id, exist_ok=True)
72
+
73
+ # # Initialize the Repository object
74
+ # repo = Repository(local_dir="./fine-tuned-mt5-marathi-proverbs", clone_from=repo_id)
75
+
76
+ # # Push the model and tokenizer to the Hugging Face Hub
77
+ # repo.push_to_hub(commit_message="Add fine-tuned MT5 model for Marathi proverbs")
78
+
79
# --- Quick inference smoke test ------------------------------------------
prompt = "अति शहाणा त्याचा बैल रिकामा"

# Tokenize the input prompt
input_ids = tokenizer.encode(prompt, return_tensors='pt')

# Generate the output (default decoding, capped at 256 tokens)
output_ids = model.generate(input_ids, max_length=256)

# Decode the output to text.
# Fix: the original computed output_text and then discarded it (the old
# `st.write(output_text)` call was removed in this commit) — print it so
# the generation result is actually visible.
output_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
print(output_text)