madhavkotecha committed on
Commit
0515619
·
verified ·
1 Parent(s): eb13651

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +246 -0
app.py ADDED
@@ -0,0 +1,246 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import math
3
+ import nltk
4
+ import matplotlib.pyplot as plt
5
+ import re
6
+ import gradio as gr
7
+ from collections import Counter, defaultdict
8
+ from sklearn.model_selection import KFold
9
+ from sklearn import metrics
10
+
11
+ nltk.download('brown')
12
+ nltk.download('universal_tagset')
13
+
14
class HMM:
    """Bigram hidden Markov model part-of-speech tagger over the Brown corpus.

    Every sentence is lower-cased and wrapped in a start marker (``^``) and an
    end marker (``$``), each tagged with itself.  A trained model is the triple
    ``(tagsCount, wordTagMapping, tagTagMapping)`` returned by :meth:`mapping`;
    it is passed explicitly to the decoding methods instead of being stored on
    the instance.
    """

    def __init__(self):
        """Load the Brown corpus with universal tags and add boundary markers."""
        self.tagset = ['.', 'ADJ', 'ADP', 'ADV', 'CONJ', 'DET', 'NOUN', 'NUM', 'PRON', 'PRT', 'VERB', 'X']
        self.start_token = '^'
        self.end_token = '$'

        corpus = nltk.corpus.brown.tagged_sents(tagset='universal')
        self.tagged_sentences = [
            [(self.start_token, self.start_token)]
            + [(word.lower(), tag) for word, tag in sentence]
            + [(self.end_token, self.end_token)]
            for sentence in corpus
        ]

    def _tag_sentences(self, sentences, tagsCount, wordTagMapping, tagTagMapping):
        """Decode *sentences* and return flat ``(y_true, y_pred)`` tag lists.

        The boundary markers at both ends of each sentence are left out of the
        returned lists so they do not count towards the metrics.
        """
        y_true, y_pred = [], []
        for sentence in sentences:
            words = [word for word, _ in sentence]
            predicted = self.viterbi(words, tagsCount, wordTagMapping, tagTagMapping)
            for (_, gold_tag), predicted_tag in zip(sentence[1:-1], predicted[1:-1]):
                y_true.append(gold_tag)
                y_pred.append(predicted_tag)
        return y_true, y_pred

    def _weighted_scores(self, y_true, y_pred):
        """Return support-weighted precision, recall and F0.5/F1/F2 as a dict."""
        options = {'average': 'weighted', 'zero_division': 0}
        return {
            'precision': metrics.precision_score(y_true, y_pred, **options),
            'recall': metrics.recall_score(y_true, y_pred, **options),
            'f05': metrics.fbeta_score(y_true, y_pred, beta=0.5, **options),
            'f1': metrics.fbeta_score(y_true, y_pred, beta=1, **options),
            'f2': metrics.fbeta_score(y_true, y_pred, beta=2, **options),
        }

    def train(self):
        """Fit counts on the first 80% of the corpus and score the remaining 20%.

        Prints the weighted metrics for the held-out part.

        Returns:
            The ``(tagsCount, wordTagMapping, tagTagMapping)`` tuple fitted on
            the 80% training part.
        """
        # Plain list slicing: wrapping the ragged corpus in an object ndarray
        # silently becomes a 3-D array when all sentences share one length.
        split = int(0.8 * len(self.tagged_sentences))
        train_sentences = self.tagged_sentences[:split]
        test_sentences = self.tagged_sentences[split:]
        tagsCount, wordTagMapping, tagTagMapping = self.mapping(train_sentences)

        y_true, y_pred = self._tag_sentences(test_sentences, tagsCount, wordTagMapping, tagTagMapping)
        scores = self._weighted_scores(y_true, y_pred)
        precision, recall = scores['precision'], scores['recall']
        f05_Score, f1_Score, f2_Score = scores['f05'], scores['f1'], scores['f2']

        print(f"Precision = {precision:.2f}, Recall = {recall:.2f}, f05-Score = {f05_Score:.2f}, f1-Score = {f1_Score:.2f}, f2-Score = {f2_Score:.2f}")
        return tagsCount, wordTagMapping, tagTagMapping

    def viterbi(self, untaggedWords, tagsCount, wordTagMapping, tagTagMapping):
        """Return the most probable tag sequence for *untaggedWords*.

        Args:
            untaggedWords: Tokens including the leading start marker and the
                trailing end marker.
            tagsCount: Tag frequencies; its keys (minus the boundary markers)
                are the candidate states.
            wordTagMapping: ``word -> tag -> count`` emission counts.
            tagTagMapping: ``tag -> next tag -> count`` transition counts.

        Returns:
            A list as long as *untaggedWords*: the start marker, one tag per
            inner word, then the end marker.

        Raises:
            ValueError: If there are words to tag but the model holds no tag
                other than the boundary markers.
        """
        sent_len = len(untaggedWords)
        start, end = self.start_token, self.end_token
        if sent_len <= 2:
            # Nothing between the boundary markers.
            return [start, end][:sent_len]

        # Boundary markers are not parts of speech, so they are never offered
        # as a tag for a real word.
        states = [tag for tag in tagsCount if tag not in (start, end)]
        if not states:
            raise ValueError("cannot tag: the model contains no part-of-speech tags")

        # Transition log-probabilities do not depend on the word, so compute
        # them once per call instead of once per word.
        log_trans = {
            (source, target): math.log(
                self.transition_probability(source, target, tagsCount, tagTagMapping))
            for source in [start] + states
            for target in states + [end]
        }

        # Log-space scores; before the first word the only state is the start
        # marker, with probability 1 (log 0.0).
        scores = {start: 0.0}
        backpointers = []  # backpointers[k][tag] = best previous tag for word k + 1
        for word in untaggedWords[1:-1]:
            next_scores, pointers = {}, {}
            for tag in states:
                best_prev, best_score = None, float('-inf')
                for prev_tag, prev_score in scores.items():
                    candidate = prev_score + log_trans[prev_tag, tag]
                    if candidate > best_score:
                        best_prev, best_score = prev_tag, candidate
                emission = math.log(
                    self.lexical_probability(word, tag, tagsCount, wordTagMapping))
                next_scores[tag] = best_score + emission
                pointers[tag] = best_prev
            scores = next_scores
            backpointers.append(pointers)

        # Close the sequence with the transition into the end marker.
        best_last, best_final = None, float('-inf')
        for tag, score in scores.items():
            candidate = score + log_trans[tag, end]
            if candidate > best_final:
                best_last, best_final = tag, candidate

        taglist = [end] * sent_len
        taglist[0] = start
        tag = best_last
        for position in range(sent_len - 2, 0, -1):
            taglist[position] = tag
            tag = backpointers[position - 1][tag]
        return taglist

    def mapping(self, sentences):
        """Count tags, word/tag pairs and tag bigrams in *sentences*.

        Args:
            sentences: Iterable of sentences, each a sequence of
                ``(word, tag)`` pairs.

        Returns:
            ``(tagsCount, wordTagMapping, tagTagMapping)`` where ``tagsCount``
            is a ``Counter`` of tags and the other two are
            ``defaultdict(Counter)`` keyed by word and by preceding tag.
        """
        tagsCount = Counter()
        wordTagMapping = defaultdict(Counter)
        tagTagMapping = defaultdict(Counter)
        for sentence in sentences:
            previous_tag = None
            for word, tag in sentence:
                tagsCount[tag] += 1
                wordTagMapping[word][tag] += 1
                if previous_tag is not None:
                    tagTagMapping[previous_tag][tag] += 1
                previous_tag = tag
        return tagsCount, wordTagMapping, tagTagMapping

    def transition_probability(self, curr, next, tagsCount, tagTagMapping):
        """Return P(next | curr), floored at 1e-9 for unseen transitions.

        The lookups use ``.get`` so that querying an unseen tag does not insert
        an empty entry into the ``defaultdict`` model.
        """
        currCount = tagsCount.get(curr, 0)
        currToNextCount = tagTagMapping.get(curr, {}).get(next, 0)
        if currCount == 0 or currToNextCount == 0:
            return 10**-9
        return currToNextCount / currCount

    def lexical_probability(self, word, tag, tagsCount, wordTagMapping):
        """Return the Laplace-smoothed emission probability P(word | tag).

        The lookups use ``.get`` so that an unseen word is not inserted into
        ``wordTagMapping``; inserting it would enlarge the vocabulary size used
        in the smoothing denominator on every query.
        """
        wordTagCount = wordTagMapping.get(word, {}).get(tag, 0)
        denominator = tagsCount.get(tag, 0) + len(wordTagMapping)
        if denominator == 0:
            # Empty model: no evidence at all, avoid dividing by zero.
            return 1.0
        return (wordTagCount + 1) / denominator  # Adding Laplace Smoothing

    def cross_validation(self, tagged_sentences, n_splits=5):
        """Run shuffled k-fold cross-validation and save the report plots.

        Prints the accuracy of each fold and the fold-averaged weighted
        metrics, then calls :meth:`per_pos_report` and
        :meth:`confusion_matrix`.

        Args:
            tagged_sentences: Sentences of ``(word, tag)`` pairs with boundary
                markers already added.
            n_splits: Number of folds (default 5).
        """
        sentences = list(tagged_sentences)
        kfold = KFold(n_splits=n_splits, shuffle=True, random_state=1)

        y_pred_list, y_true_list, fold_scores = [], [], []
        # Only the sample count matters for the split, so split over indices
        # and keep the ragged sentences in a plain list.
        for fold, (train_idx, test_idx) in enumerate(kfold.split(np.arange(len(sentences)))):
            train_sentences = [sentences[i] for i in train_idx]
            test_sentences = [sentences[i] for i in test_idx]
            tagsCount, wordTagMapping, tagTagMapping = self.mapping(train_sentences)

            y_true, y_pred = self._tag_sentences(test_sentences, tagsCount, wordTagMapping, tagTagMapping)
            y_pred_list.append(np.array(y_pred))
            y_true_list.append(np.array(y_true))
            accuracy = metrics.accuracy_score(y_true_list[-1], y_pred_list[-1], normalize=True)
            print(f'Fold {fold + 1} Accuracy: {accuracy}')
            fold_scores.append(self._weighted_scores(y_true_list[-1], y_pred_list[-1]))

        precision = np.mean([scores['precision'] for scores in fold_scores])
        recall = np.mean([scores['recall'] for scores in fold_scores])
        f05_Score = np.mean([scores['f05'] for scores in fold_scores])
        f1_Score = np.mean([scores['f1'] for scores in fold_scores])
        f2_Score = np.mean([scores['f2'] for scores in fold_scores])

        print(f"Average Precision = {precision:.2f}, Average Recall = {recall:.2f}, Average f05-Score = {f05_Score:.2f}, Average f1-Score = {f1_Score:.2f}, Average f2-Score = {f2_Score:.2f}")
        self.per_pos_report(y_true_list, y_pred_list)
        self.confusion_matrix(y_true_list, y_pred_list)

    def confusion_matrix(self, y_true_list, y_pred_list):
        """Save the fold-averaged, row-normalised confusion matrix as a PNG.

        Rows are gold tags and columns are predicted tags, both in
        ``self.tagset`` order.  The figure is written to
        ``Confusion_Matrix.png``.

        Raises:
            ValueError: If no folds are given.
        """
        if len(y_true_list) == 0:
            raise ValueError("confusion_matrix needs at least one fold")

        n_tags = len(self.tagset)
        total = np.zeros((n_tags, n_tags))
        for y_true, y_pred in zip(y_true_list, y_pred_list):
            total += metrics.confusion_matrix(y_true, y_pred, labels=self.tagset)

        matrix = total / len(y_true_list)
        row_sums = matrix.sum(axis=1, keepdims=True)
        # A tag that never occurs in the gold data has an all-zero row; leave
        # it at zero instead of dividing by zero.
        normalized_matrix = np.divide(
            matrix, row_sums, out=np.zeros_like(matrix), where=row_sums != 0)

        fig, ax = plt.subplots(figsize=(12, 10))
        image = ax.imshow(normalized_matrix, interpolation='nearest', cmap=plt.cm.Greens)
        ticks = np.arange(n_tags)
        ax.set_xticks(ticks)
        ax.set_xticklabels(self.tagset)
        ax.set_yticks(ticks)
        ax.set_yticklabels(self.tagset)
        for (i, j), value in np.ndenumerate(normalized_matrix):
            ax.text(j, i, format(value, '0.2f'), horizontalalignment="center")
        fig.colorbar(image, ax=ax)
        fig.savefig('Confusion_Matrix.png')
        plt.close(fig)  # release the figure once it is on disk

    def per_pos_report(self, y_true_list, y_pred_list):
        """Save fold-averaged per-tag precision, recall and F1 as a PNG.

        One row per tag in ``self.tagset`` followed by the micro, macro and
        weighted averages; each row label also shows the fold-averaged
        support.  The figure is written to ``Per_POS_Accuracy.png``.

        Raises:
            ValueError: If no folds are given.
        """
        if len(y_true_list) == 0:
            raise ValueError("per_pos_report needs at least one fold")

        averages = ('micro', 'macro', 'weighted')
        tags = list(self.tagset) + ['MicroAvg', 'MacroAvg', 'WeightedAvg']
        report = np.zeros((len(tags), 3))
        support = np.zeros(len(tags))
        for y_true, y_pred in zip(y_true_list, y_pred_list):
            # Read the numbers straight from scikit-learn rather than parsing
            # the printed report, whose "accuracy" row has fewer columns.
            precision, recall, f1, counts = metrics.precision_recall_fscore_support(
                y_true, y_pred, labels=self.tagset, zero_division=0)
            fold_rows = [np.column_stack((precision, recall, f1))]
            for average in averages:
                avg_p, avg_r, avg_f1, _ = metrics.precision_recall_fscore_support(
                    y_true, y_pred, labels=self.tagset, average=average, zero_division=0)
                fold_rows.append([[avg_p, avg_r, avg_f1]])
            report += np.vstack(fold_rows)
            support += np.concatenate((counts, [counts.sum()] * len(averages)))

        n_folds = len(y_true_list)
        report = report / n_folds
        support = support / n_folds
        xlabels = ['Precision', 'Recall', 'F1 Score']
        ylabels = ['{0}[{1}]'.format(tags[i], sup) for i, sup in enumerate(support)]

        fig, ax = plt.subplots(figsize=(18, 10))
        image = ax.imshow(report, aspect='auto', cmap=plt.cm.RdYlGn)
        ax.set_xticks(np.arange(3))
        ax.set_xticklabels(xlabels)
        ax.set_yticks(np.arange(len(tags)))
        ax.set_yticklabels(ylabels)
        fig.colorbar(image, ax=ax)
        for (i, j), value in np.ndenumerate(report):
            ax.text(j, i, format(value, '.2f'), horizontalalignment="center", verticalalignment="center")
        fig.savefig('Per_POS_Accuracy.png')
        plt.close(fig)  # release the figure once it is on disk

    def doTagging(self, input_sentence, prevTagsCount, prevWordTagMapping, prevTagTagMapping):
        """Tag a raw sentence and return it as ``word[TAG] `` pairs.

        Punctuation from ``.,;:!?`` that directly follows a non-space
        character is split off into its own token, the text is lower-cased and
        split on whitespace, then decoded with :meth:`viterbi`.  Each pair in
        the result is followed by a single space; empty input gives ``''``.
        """
        spaced = re.sub(r'(\S)([.,;:!?])', r'\1 \2', input_sentence.strip())
        untaggedWords = [self.start_token] + spaced.lower().split() + [self.end_token]
        tags = self.viterbi(untaggedWords, prevTagsCount, prevWordTagMapping, prevTagTagMapping)
        output_sentence = ''.join(
            f'{word}[{tag}] ' for word, tag in zip(untaggedWords[1:-1], tags[1:-1]))
        return output_sentence
224
+
225
# Build the tagger, run cross-validation over the whole corpus, then keep the
# counts returned by train() for the interactive demo below.
hmm = HMM()
hmm.cross_validation(hmm.tagged_sentences)
tagsCount, wordTagMapping, tagTagMapping = hmm.train()
228
+
229
+ # test_sent = "the united kingdom and the usa are on two sides of the atlantic"
230
def tagging(input_sentence):
    """Tag *input_sentence* with the module-level tagger and trained counts."""
    return hmm.doTagging(
        input_sentence,
        tagsCount,
        wordTagMapping,
        tagTagMapping,
    )
232
+
233
+
234
# Gradio front end: one text box in, one text box out, backed by `tagging`.
interface = gr.Interface(
    fn=tagging,
    inputs=gr.Textbox(label="Input Sentence", placeholder="Enter your sentence here..."),
    outputs=gr.Textbox(label="Tagged Output", placeholder="Tagged sentence appears here..."),
    title="Hidden Markov Model POS Tagger",
    description="CS626 Assignment 1A (Autumn 2024)",
    theme=gr.themes.Soft(),
)
interface.launch(inline=False, share=True)