Spaces:
Sleeping
Sleeping
madhavkotecha
committed on
Create app.py
Browse files
app.py
ADDED
@@ -0,0 +1,246 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import numpy as np
|
2 |
+
import math
|
3 |
+
import nltk
|
4 |
+
import matplotlib.pyplot as plt
|
5 |
+
import re
|
6 |
+
import gradio as gr
|
7 |
+
from collections import Counter, defaultdict
|
8 |
+
from sklearn.model_selection import KFold
|
9 |
+
from sklearn import metrics
|
10 |
+
|
11 |
+
# Fetch the NLTK resources the tagger loads: the Brown corpus and the
# universal tagset resource.
for resource in ('brown', 'universal_tagset'):
    nltk.download(resource)
|
13 |
+
|
14 |
+
class HMM:
    """Bigram hidden-Markov-model part-of-speech tagger.

    A "model" is the triple ``(tagsCount, wordTagMapping, tagTagMapping)``
    produced by :meth:`mapping`:

    * ``tagsCount`` -- ``Counter`` of how often each tag occurs,
    * ``wordTagMapping`` -- ``word -> Counter(tag -> count)``,
    * ``tagTagMapping`` -- ``tag -> Counter(next tag -> count)``.

    Sentences are lists of ``(word, tag)`` pairs that start with
    ``(start_token, start_token)`` and end with ``(end_token, end_token)``.
    """

    # Tags reported in the evaluation plots. Shared by all instances; treat
    # it as read-only.
    tagset = ['.', 'ADJ', 'ADP', 'ADV', 'CONJ', 'DET', 'NOUN', 'NUM', 'PRON', 'PRT', 'VERB', 'X']
    # Pseudo word/tag placed before the first and after the last real word.
    start_token = '^'
    end_token = '$'
    # Floor returned instead of a zero transition probability so that
    # math.log() stays finite during decoding.
    _MIN_TRANSITION = 10 ** -9

    def __init__(self):
        """Load the Brown corpus with universal tags, lower-case the words
        and wrap every sentence in the start/end boundary pairs."""
        corpus = nltk.corpus.brown.tagged_sents(tagset='universal')
        start_pair = (self.start_token, self.start_token)
        end_pair = (self.end_token, self.end_token)
        self.tagged_sentences = [
            [start_pair] + [(word.lower(), tag) for word, tag in sentence] + [end_pair]
            for sentence in corpus
        ]

    def train(self):
        """Fit the model on the first 80% of ``self.tagged_sentences``, print
        weighted metrics measured on the remaining 20% and return the model.

        Returns:
            The ``(tagsCount, wordTagMapping, tagTagMapping)`` triple.
        """
        sentences = self.tagged_sentences
        split = int(0.8 * len(sentences))
        train_sentences = sentences[:split]
        test_sentences = sentences[split:]
        tagsCount, wordTagMapping, tagTagMapping = self.mapping(train_sentences)

        y_true, y_pred = self._predict_tags(test_sentences, tagsCount, wordTagMapping, tagTagMapping)
        if y_true:  # nothing to score when the held-out split is empty
            precision, recall, f05_Score, f1_Score, f2_Score = self._weighted_scores(y_true, y_pred)
            print(f"Precision = {precision:.2f}, Recall = {recall:.2f}, f05-Score = {f05_Score:.2f}, f1-Score = {f1_Score:.2f}, f2-Score = {f2_Score:.2f}")
        return tagsCount, wordTagMapping, tagTagMapping

    def _predict_tags(self, sentences, tagsCount, wordTagMapping, tagTagMapping):
        """Decode every sentence and return ``(y_true, y_pred)`` as flat lists
        covering the real words only (boundary tokens are skipped)."""
        y_true, y_pred = [], []
        for sentence in sentences:
            words = [word for word, _ in sentence]
            predicted = self.viterbi(words, tagsCount, wordTagMapping, tagTagMapping)
            y_pred.extend(predicted[1:-1])
            y_true.extend(tag for _, tag in sentence[1:-1])
        return y_true, y_pred

    def _weighted_scores(self, y_true, y_pred):
        """Return weighted ``(precision, recall, F0.5, F1, F2)`` for one split."""
        options = {'average': 'weighted', 'zero_division': 0}
        precision = metrics.precision_score(y_true, y_pred, **options)
        recall = metrics.recall_score(y_true, y_pred, **options)
        f05_Score = metrics.fbeta_score(y_true, y_pred, beta=0.5, **options)
        f1_Score = metrics.fbeta_score(y_true, y_pred, beta=1, **options)
        f2_Score = metrics.fbeta_score(y_true, y_pred, beta=2, **options)
        return precision, recall, f05_Score, f1_Score, f2_Score

    def viterbi(self, untaggedWords, tagsCount, wordTagMapping, tagTagMapping):
        """Return the most likely tag sequence for ``untaggedWords``.

        Args:
            untaggedWords: words of one sentence; the first and last entries
                are taken to be the start/end boundary tokens and are never
                decoded.
            tagsCount, wordTagMapping, tagTagMapping: the model (see class
                docstring). The model is not modified.

        Returns:
            A list as long as ``untaggedWords``: ``start_token`` first,
            ``end_token`` last, one predicted tag per word in between.

        Raises:
            ValueError: if there are words to tag but the model contains no
                tag other than the boundary tokens.
        """
        sent_len = len(untaggedWords)
        taglist = [self.end_token] * sent_len
        if sent_len:
            taglist[0] = self.start_token
        if sent_len <= 2:
            # Only boundary tokens (or nothing at all): no word to decode.
            return taglist

        # Real words can only carry real tags, never the boundary pseudo-tags.
        boundary = (self.start_token, self.end_token)
        states = [tag for tag in tagsCount if tag not in boundary]
        if not states:
            raise ValueError("model has no tags to assign; build it with mapping() first")

        # Scores are log-probabilities. The sentence starts in the start
        # state with probability 1 (log 1 == 0), so the first word's tag is
        # scored with P(tag | start). If the model was built without
        # boundary tokens there is no start distribution; fall back to a
        # uniform start over the real tags.
        if tagsCount.get(self.start_token, 0) > 0:
            prev = {self.start_token: 0.0}
        else:
            prev = {tag: 0.0 for tag in states}

        path = {}  # path[i][tag] -> best tag at position i - 1
        for i in range(1, sent_len - 1):
            word = untaggedWords[i]
            curr, pointers = {}, {}
            for tag in states:
                # The emission term does not depend on the previous tag.
                emission = math.log(self.lexical_probability(word, tag, tagsCount, wordTagMapping))
                best_prev, best_score = None, float('-inf')
                for prev_tag, prev_score in prev.items():
                    lprob = prev_score + emission + math.log(self.transition_probability(prev_tag, tag, tagsCount, tagTagMapping))
                    if lprob > best_score:
                        best_prev, best_score = prev_tag, lprob
                curr[tag] = best_score
                pointers[tag] = best_prev
            path[i] = pointers
            prev = curr

        # Pick the best final tag, then follow the back-pointers.
        last = sent_len - 2
        taglist[last] = max(states, key=prev.get)
        for i in range(last - 1, 0, -1):
            taglist[i] = path[i + 1][taglist[i + 1]]
        return taglist

    def mapping(self, sentences):
        """Count tags, word/tag pairs and tag bigrams in ``sentences``.

        Args:
            sentences: iterable of sentences, each a sequence of
                ``(word, tag)`` pairs.

        Returns:
            ``(tagsCount, wordTagMapping, tagTagMapping)`` -- a ``Counter``
            and two ``defaultdict(Counter)`` objects.
        """
        tagsCount = Counter()
        wordTagMapping = defaultdict(Counter)
        tagTagMapping = defaultdict(Counter)
        for sentence in sentences:
            tags = [tag for _, tag in sentence]
            tagsCount.update(tags)
            for word, tag in sentence:
                wordTagMapping[word][tag] += 1
            for tag, following in zip(tags, tags[1:]):
                tagTagMapping[tag][following] += 1
        return tagsCount, wordTagMapping, tagTagMapping

    def transition_probability(self, curr, next, tagsCount, tagTagMapping):
        """Return ``P(next | curr)`` from the bigram counts.

        A zero estimate -- including the case where ``curr`` was never seen --
        is replaced by a ``10**-9`` floor so callers can take its logarithm.
        The lookup never inserts keys into ``tagTagMapping``.
        """
        currCount = tagsCount.get(curr, 0)
        if currCount == 0:
            return self._MIN_TRANSITION if self is not None else 10 ** -9
        followers = tagTagMapping.get(curr)
        currToNextCount = followers.get(next, 0) if followers else 0
        probability = currToNextCount / currCount
        if probability == 0:
            return self._MIN_TRANSITION if self is not None else 10 ** -9
        return probability

    def lexical_probability(self, word, tag, tagsCount, wordTagMapping):
        """Return the Laplace-smoothed emission probability ``P(word | tag)``.

        The vocabulary size used for smoothing is ``len(wordTagMapping)``.
        Unknown words are looked up without being inserted, so the model --
        and therefore the smoothing denominator -- stays fixed at inference.
        """
        tagCounts = wordTagMapping.get(word)
        wordTagCount = tagCounts.get(tag, 0) if tagCounts else 0
        tagCount = tagsCount.get(tag, 0)
        return (wordTagCount + 1) / (tagCount + len(wordTagMapping))  # Adding Laplace Smoothing

    def cross_validation(self, tagged_sentences):
        """Run shuffled 5-fold cross-validation over ``tagged_sentences``.

        Prints the accuracy of every fold and the fold-averaged weighted
        precision / recall / F-scores, then writes the per-tag report and the
        confusion matrix images via :meth:`per_pos_report` and
        :meth:`confusion_matrix`.
        """
        sentences = list(tagged_sentences)
        kfold = KFold(n_splits=5, shuffle=True, random_state=1)

        y_pred_list = []
        y_true_list = []
        # KFold only needs the number of samples; splitting an index range
        # avoids converting ragged sentences into a NumPy array.
        for fold, (train_idx, test_idx) in enumerate(kfold.split(np.arange(len(sentences)))):
            train_sentences = [sentences[i] for i in train_idx]
            test_sentences = [sentences[i] for i in test_idx]
            tagsCount, wordTagMapping, tagTagMapping = self.mapping(train_sentences)

            y_true, y_pred = self._predict_tags(test_sentences, tagsCount, wordTagMapping, tagTagMapping)
            y_pred_list.append(np.array(y_pred))
            y_true_list.append(np.array(y_true))
            accuracy = metrics.accuracy_score(y_true_list[-1], y_pred_list[-1], normalize=True)
            print(f'Fold {fold + 1} Accuracy: {accuracy}')

        totals = np.zeros(5)
        for y_true, y_pred in zip(y_true_list, y_pred_list):
            totals += np.array(self._weighted_scores(y_true, y_pred))
        precision, recall, f05_Score, f1_Score, f2_Score = totals / len(y_true_list)

        print(f"Average Precision = {precision:.2f}, Average Recall = {recall:.2f}, Average f05-Score = {f05_Score:.2f}, Average f1-Score = {f1_Score:.2f}, Average f2-Score = {f2_Score:.2f}")
        self.per_pos_report(y_true_list, y_pred_list)
        self.confusion_matrix(y_true_list, y_pred_list)

    def confusion_matrix(self, y_true_list, y_pred_list):
        """Average the per-fold confusion matrices over ``self.tagset``,
        normalise each row and save the plot to ``Confusion_Matrix.png``.

        Raises:
            ValueError: if no folds are supplied.
        """
        n_folds = len(y_true_list)
        if n_folds == 0:
            raise ValueError("confusion_matrix needs at least one fold")

        size = len(self.tagset)
        total = np.zeros((size, size))
        for y_true, y_pred in zip(y_true_list, y_pred_list):
            total += metrics.confusion_matrix(y_true, y_pred, labels=self.tagset)

        matrix = total / n_folds
        row_sums = np.sum(matrix, axis=1, keepdims=True)
        # A tag that never occurs in y_true has an all-zero row; leave it at
        # zero instead of dividing by zero and plotting NaN.
        normalized_matrix = np.divide(matrix, row_sums, out=np.zeros_like(matrix), where=row_sums != 0)

        fig, _ = plt.subplots(figsize=(12, 10))
        try:
            plt.xticks(np.arange(size), self.tagset)
            plt.yticks(np.arange(size), self.tagset)
            for i in range(normalized_matrix.shape[0]):
                for j in range(normalized_matrix.shape[1]):
                    plt.text(j, i, format(normalized_matrix[i, j], '0.2f'), horizontalalignment="center")
            plt.imshow(normalized_matrix, interpolation='nearest', cmap=plt.cm.Greens)
            plt.colorbar()
            plt.savefig('Confusion_Matrix.png')
        finally:
            plt.close(fig)  # release the figure once it is on disk

    def per_pos_report(self, y_true_list, y_pred_list):
        """Average per-tag precision / recall / F1 over the folds and save a
        heat map to ``Per_POS_Accuracy.png``.

        Rows are the tags of ``self.tagset`` followed by ``MicroAvg``,
        ``MacroAvg`` and ``WeightedAvg``; each label shows the fold-averaged
        support in brackets.

        Raises:
            ValueError: if no folds are supplied.
        """
        n_folds = len(y_true_list)
        if n_folds == 0:
            raise ValueError("per_pos_report needs at least one fold")

        averages = (('micro avg', 'MicroAvg'), ('macro avg', 'MacroAvg'), ('weighted avg', 'WeightedAvg'))
        keys = [str(tag) for tag in self.tagset] + [key for key, _ in averages]
        tags = [str(tag) for tag in self.tagset] + [label for _, label in averages]

        report = np.zeros((len(keys), 3))
        support = np.zeros(len(keys))
        for y_true, y_pred in zip(y_true_list, y_pred_list):
            # The dict form gives one fixed-shape row per label; the text
            # form varies its layout between folds.
            cr = metrics.classification_report(y_true, y_pred, labels=self.tagset, zero_division=0, output_dict=True)
            if 'micro avg' not in cr:
                # sklearn reports a single 'accuracy' value instead of the
                # micro-average row when both are the same number.
                accuracy = cr['accuracy']
                cr['micro avg'] = {
                    'precision': accuracy,
                    'recall': accuracy,
                    'f1-score': accuracy,
                    'support': cr['macro avg']['support'],
                }
            for row, key in enumerate(keys):
                scores = cr[key]
                report[row] += (scores['precision'], scores['recall'], scores['f1-score'])
                support[row] += scores['support']

        report = report / n_folds
        support = support / n_folds
        xlabels = ['Precision', 'Recall', 'F1 Score']
        ylabels = ['{0}[{1}]'.format(tag, sup) for tag, sup in zip(tags, support)]

        fig, ax = plt.subplots(figsize=(18, 10))
        try:
            ax.xaxis.set_tick_params()
            ax.yaxis.set_tick_params()
            plt.imshow(report, aspect='auto', cmap=plt.cm.RdYlGn)

            plt.xticks(np.arange(3), xlabels)
            plt.yticks(np.arange(len(tags)), ylabels)
            plt.colorbar()
            for i in range(report.shape[0]):
                for j in range(report.shape[1]):
                    plt.text(j, i, format(report[i, j], '.2f'), horizontalalignment="center", verticalalignment="center")
            plt.savefig('Per_POS_Accuracy.png')
        finally:
            plt.close(fig)  # release the figure once it is on disk

    def doTagging(self, input_sentence, prevTagsCount, prevWordTagMapping, prevTagTagMapping):
        """Tag a raw sentence with an already-built model.

        The text is stripped, a space is inserted before ``. , ; : ! ?`` when
        one follows a non-space character, and the result is lower-cased and
        split on whitespace.

        Returns:
            ``"word[TAG] "`` for every word, concatenated (each entry keeps
            its trailing space); an empty string for blank input.
        """
        spaced = re.sub(r'(\S)([.,;:!?])', r'\1 \2', input_sentence.strip())
        untaggedWords = [self.start_token] + spaced.lower().split() + [self.end_token]
        tags = self.viterbi(untaggedWords, prevTagsCount, prevWordTagMapping, prevTagTagMapping)
        return ''.join(f'{word}[{tag}] ' for word, tag in zip(untaggedWords[1:-1], tags[1:-1]))
|
224 |
+
|
225 |
+
# Build the tagger, print its cross-validation metrics (which also writes the
# report images), then keep the counts returned by train() for the web UI.
hmm = HMM()
hmm.cross_validation(hmm.tagged_sentences)
(tagsCount, wordTagMapping, tagTagMapping) = hmm.train()
|
228 |
+
|
229 |
+
# test_sent = "the united kingdom and the usa are on two sides of the atlantic"
|
230 |
+
def tagging(input_sentence):
    """Gradio callback: tag ``input_sentence`` with the module-level model."""
    model = (tagsCount, wordTagMapping, tagTagMapping)
    return hmm.doTagging(input_sentence, *model)
|
232 |
+
|
233 |
+
|
234 |
+
# Web UI: one text box in, one text box out, wired to tagging().
sentence_box = gr.Textbox(
    label="Input Sentence",
    placeholder="Enter your sentence here...",
)
tagged_box = gr.Textbox(
    label="Tagged Output",
    placeholder="Tagged sentence appears here...",
)
interface = gr.Interface(
    fn=tagging,
    inputs=sentence_box,
    outputs=tagged_box,
    title="Hidden Markov Model POS Tagger",
    description="CS626 Assignment 1A (Autumn 2024)",
    theme=gr.themes.Soft(),
)
interface.launch(inline=False, share=True)
|