Spaces:
Sleeping
Sleeping
madhavkotecha
committed on
Create app.py
Browse files
app.py
ADDED
@@ -0,0 +1,233 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import nltk
|
2 |
+
import sklearn_crfsuite
|
3 |
+
from sklearn_crfsuite import metrics
|
4 |
+
from nltk.stem import LancasterStemmer
|
5 |
+
import numpy as np
|
6 |
+
from sklearn.metrics import confusion_matrix
|
7 |
+
import seaborn as sns
|
8 |
+
import matplotlib.pyplot as plt
|
9 |
+
import re
|
10 |
+
import gradio as gr
|
11 |
+
# Module-level Lancaster stemmer instance; CRF_POS_Tagger.word_features calls
# lancaster.stem() to build the 'stem', 'prev_stem' and 'next_stem' features.
lancaster = LancasterStemmer()
|
12 |
+
|
13 |
+
|
14 |
+
|
15 |
+
class CRF_POS_Tagger:
    """CRF part-of-speech tagger trained on the Brown corpus (universal tagset).

    Constructing an instance loads the Brown corpus, lower-cases every token,
    converts each token into a feature dict, keeps the first 80% of the
    sentences for training and immediately fits a ``sklearn_crfsuite.CRF``
    model on them.
    """

    def __init__(self):
        self.corpus = nltk.corpus.brown.tagged_sents(tagset='universal')
        # Every token is lower-cased, so the model only ever sees lower-case
        # words; `tagging` lower-cases its input for the same reason.
        self.corpus = [[(word.lower(), tag) for word, tag in sentence] for sentence in self.corpus]
        # Flat gold / predicted tag lists, filled by `accuracy` and consumed by
        # `con_matrix`.
        self.actual_tag = []
        self.predicted_tag = []
        self._init_affixes()

        self.X = [[self.word_features(sentence, i) for i in range(len(sentence))] for sentence in self.corpus]
        self.y = [[postag for _, postag in sentence] for sentence in self.corpus]

        # Sequential 80/20 train/test split (no shuffling).
        self.split = int(0.8 * len(self.X))
        self.X_train = self.X[:self.split]
        self.y_train = self.y[:self.split]
        self.X_test = self.X[self.split:]
        self.y_test = self.y[self.split:]
        self.crf_model = sklearn_crfsuite.CRF(
            algorithm='lbfgs',
            c1=0.1,
            c2=0.1,
            max_iterations=100,
            all_possible_transitions=True,
        )
        self.train()

    @staticmethod
    def _alternation(affixes):
        """Return a regex alternation of *affixes*, longest alternative first.

        Regex alternation picks the first branch that matches, so a short
        affix listed before a longer one that starts with it (e.g. "a" before
        "anti") would make the longer one unreachable.
        """
        ordered = sorted(affixes, key=len, reverse=True)
        return "|".join(re.escape(affix) for affix in ordered)

    def _init_affixes(self):
        """Set the known prefix/suffix lists and the regex patterns built from them."""
        self.prefixes = [
            "a", "anti", "auto", "bi", "co", "dis", "en", "em", "ex", "in", "im",
            "inter", "mis", "non", "over", "pre", "re", "sub", "trans", "un", "under",
        ]
        self.suffixes = [
            "able", "ible", "al", "ance", "ence", "dom", "er", "or", "ful", "hood",
            "ic", "ing", "ion", "tion", "ity", "ty", "ive", "less", "ly", "ment",
            "ness", "ous", "ship", "y", "es", "s",
        ]
        self.prefix_pattern = f"^({self._alternation(self.prefixes)})"
        self.suffix_pattern = f"({self._alternation(self.suffixes)})$"

    def word_splitter(self, word):
        """Split *word* into ``(prefix, stem, suffix)`` using the affix patterns.

        The longest known prefix is stripped first; the suffix is then searched
        in what remains. Parts that do not match are returned as "".
        """
        prefix = ""
        stem = word
        suffix = ""

        prefix_match = re.match(self.prefix_pattern, word)
        if prefix_match:
            prefix = prefix_match.group(1)
            stem = word[len(prefix):]

        suffix_match = re.search(self.suffix_pattern, stem)
        if suffix_match:
            suffix = suffix_match.group(1)
            stem = stem[: -len(suffix)]

        return prefix, stem, suffix

    def _context_features(self, label, word):
        """Return the neighbour-word features for *word*, keyed with *label*.

        *label* is "prev" or "next" and only selects the key names.
        """
        prefix, _, suffix = self.word_splitter(word)
        return {
            f'{label}_word': word,
            f'{label}_prefix': prefix,
            f'{label}_stem': lancaster.stem(word),
            f'{label}_suffix': suffix,
            f'{label}:is_all_caps': word.isupper(),
            f'{label}:is_all_lower': word.islower(),
            f'{label}:is_numeric': word.isdigit(),
            f'{label}:is_title_case': word.istitle(),
        }

    def word_features(self, sentence, i):
        """Build the CRF feature dict for the token at index *i* of *sentence*.

        *sentence* is a sequence whose items carry the word at position 0
        (``(word, tag)`` pairs during training, ``[word]`` lists in `tagging`).
        Features of the previous / next word are added when those neighbours
        exist.
        """
        word = sentence[i][0]
        prefix, _, suffix = self.word_splitter(word)
        features = {
            'word': word,
            'prefix': prefix,
            'stem': lancaster.stem(word),
            'suffix': suffix,
            'position': i,
            'is_first': i == 0,
            'is_last': i == len(sentence) - 1,
            'is_all_caps': word.isupper(),
            'is_all_lower': word.islower(),

            # Slices instead of word[0] / word[-1] so an empty token cannot
            # raise IndexError.
            'prefix-1': word[:1],
            'prefix-2': word[:2],
            'prefix-3': word[:3],
            'suffix-1': word[-1:],
            'suffix-2': word[-2:],
            'suffix-3': word[-3:],

            # startswith/endswith avoid slice-length mismatches such as
            # word[:4] == 'dis', which is false for every longer word.
            'prefix-un': word.startswith('un'),
            'prefix-re': word.startswith('re'),
            'prefix-over': word.startswith('over'),
            'prefix-dis': word.startswith('dis'),
            'prefix-mis': word.startswith('mis'),
            'prefix-pre': word.startswith('pre'),
            'prefix-non': word.startswith('non'),
            'prefix-de': word.startswith('de'),
            'prefix-in': word.startswith('in'),
            'prefix-en': word.startswith('en'),

            'suffix-ed': word.endswith('ed'),
            'suffix-ing': word.endswith('ing'),
            'suffix-es': word.endswith('es'),
            'suffix-ly': word.endswith('ly'),
            'suffix-ment': word.endswith('ment'),
            'suffix-er': word.endswith('er'),
            'suffix-ive': word.endswith('ive'),
            'suffix-ous': word.endswith('ous'),
            'suffix-ness': word.endswith('ness'),
            'ends_with_s': word.endswith('s'),
            'ends_with_es': word.endswith('es'),

            'has_hyphen': '-' in word,
            'is_numeric': word.isdigit(),
            'capitals_inside': word[1:].lower() != word[1:],
            'is_title_case': word.istitle(),
        }

        if i > 0:
            features.update(self._context_features('prev', sentence[i - 1][0]))

        if i < len(sentence) - 1:
            # The following token is at i + 1 (the original read i - 1, which
            # duplicated the previous word).
            features.update(self._context_features('next', sentence[i + 1][0]))

        return features

    def train(self, train_data=None):
        """Fit the CRF model.

        Args:
            train_data: optional sequence of ``(features, tags)`` pairs, one per
                sentence (the shape `cross_validation` passes). When omitted,
                the model is fitted on ``self.X_train`` / ``self.y_train``.

        Raises:
            ValueError: if *train_data* is given but empty.
        """
        if train_data is None:
            X_train, y_train = self.X_train, self.y_train
        else:
            X_train, y_train = (list(part) for part in zip(*train_data))
        self.crf_model.fit(X_train, y_train)

    def predict(self, X_test):
        """Return the model's predicted tag sequences for *X_test*."""
        return self.crf_model.predict(X_test)

    def accuracy(self, test_data):
        """Return the flat token accuracy on *test_data*.

        *test_data* is a sequence of ``(features, tags)`` pairs, one per
        sentence. The gold and predicted tags are also appended to
        ``self.actual_tag`` / ``self.predicted_tag`` for `con_matrix`.
        """
        X_test, y_test = zip(*test_data)
        y_pred = self.predict(X_test)
        self.actual_tag.extend([item for sublist in y_test for item in sublist])
        self.predicted_tag.extend([item for sublist in y_pred for item in sublist])
        return metrics.flat_accuracy_score(y_test, y_pred)

    def cross_validation(self, data, folds=5):
        """Run sequential k-fold cross-validation over *data*.

        Args:
            data: sequence of ``(features, tags)`` pairs, one per sentence.
            folds: number of contiguous folds (default 5).

        Returns:
            ``(accuracies, mean_accuracy)``: the per-fold accuracies and their
            mean.

        Note: ``self.crf_model`` is left fitted on the last fold's training
        portion, and every fold adds to ``self.actual_tag`` /
        ``self.predicted_tag``.
        """
        data = list(data)
        accuracies = []
        for fold in range(folds):
            start = int(fold / float(folds) * len(data))
            stop = int((fold + 1) / float(folds) * len(data))
            test_data = data[start:stop]
            train_data = data[:start] + data[stop:]
            self.train(train_data)
            accuracies.append(self.accuracy(test_data))
        return accuracies, sum(accuracies) / float(folds)

    def con_matrix(self):
        """Compute, plot and save the confusion matrix of the collected tags.

        Uses the tags gathered by `accuracy`, writes the heatmap to
        "Confusion_matrix.png", shows it, and stores the tag labels in
        ``self.labels`` (which `per_pos_accuracy` reads).

        Returns:
            The confusion matrix, rows = actual tags, columns = predicted tags.
        """
        self.labels = np.unique(self.actual_tag)
        conf_matrix = confusion_matrix(self.actual_tag, self.predicted_tag, labels=self.labels)

        fig = plt.figure(figsize=(10, 7))
        sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=self.labels, yticklabels=self.labels)
        plt.xlabel('Predicted Tags')
        plt.ylabel('Actual Tags')
        plt.title('Confusion Matrix Heatmap')
        plt.savefig("Confusion_matrix.png")
        plt.show()
        # Release the figure so repeated calls do not accumulate open figures.
        plt.close(fig)

        return conf_matrix

    @staticmethod
    def _f_beta(precision, recall, beta):
        """Return the F-beta score, or 0 when precision and recall are both 0."""
        if (precision + recall) > 0:
            return (1 + beta**2) * (precision * recall) / ((beta**2 * precision) + recall)
        return 0

    def per_pos_accuracy(self, conf_matrix):
        """Print and return precision, recall, F1, F0.5 and F2 for every tag.

        Args:
            conf_matrix: 2-D array indexed ``[actual, predicted]`` in the order
                of ``self.labels`` (set by `con_matrix`).

        Returns:
            Dict mapping each tag to a dict with the keys 'Precision',
            'Recall', 'f1-Score', 'f05-Score' and 'f2-Score'.
        """
        print("Per Tag Precision, Recall, and F-Score:")
        per_tag_metrics = {}

        for i, tag in enumerate(self.labels):
            true_positives = conf_matrix[i, i]
            # Column sum = everything predicted as this tag; row sum =
            # everything that actually is this tag.
            false_positives = np.sum(conf_matrix[:, i]) - true_positives
            false_negatives = np.sum(conf_matrix[i, :]) - true_positives

            precision = true_positives / (true_positives + false_positives) if (true_positives + false_positives) > 0 else 0
            recall = true_positives / (true_positives + false_negatives) if (true_positives + false_negatives) > 0 else 0
            f1_score = self._f_beta(precision, recall, 1)
            f0_5_score = self._f_beta(precision, recall, 0.5)
            f2_score = self._f_beta(precision, recall, 2.0)

            per_tag_metrics[tag] = {
                'Precision': precision,
                'Recall': recall,
                'f1-Score': f1_score,
                'f05-Score': f0_5_score,
                'f2-Score': f2_score,
            }

            print(f"{tag}: Precision = {precision:.2f}, Recall = {recall:.2f}, f1-Score = {f1_score:.2f}, "
                  f"f05-Score = {f0_5_score:.2f}, f2-Score = {f2_score:.2f}")

        return per_tag_metrics

    # The parameter keeps its original name `input` (it shadows the builtin)
    # so existing keyword callers keep working.
    def tagging(self, input):
        """Tag a raw sentence and return it as ``"word[TAG] word[TAG] "``.

        Punctuation from ``.,;:!?`` that directly follows a non-space character
        is split off into its own token before tagging. Blank input yields "".
        """
        sentence = re.sub(r'(\S)([.,;:!?])', r'\1 \2', input.strip()).split()
        if not sentence:
            return ""
        # Features are built from lower-cased tokens to match the lower-cased
        # training corpus; the output keeps the user's original casing.
        sentence_list = [[word.lower()] for word in sentence]
        features = [self.word_features(sentence_list, i) for i in range(len(sentence_list))]

        predicted_tags = self.crf_model.predict([features])
        return "".join(f"{word}[{tag}] " for word, tag in zip(sentence, predicted_tags[0]))
|
226 |
+
|
227 |
+
# Build the tagger (its constructor trains the CRF model), then serve its
# `tagging` method through a plain text-in / text-out Gradio interface.
tagger = CRF_POS_Tagger()
interface = gr.Interface(
    fn=tagger.tagging,
    inputs="text",
    outputs="text",
    title="CRF POS Tagger",
    description="CS626 Assignment 1b by 24M0797, 24M0798, 24M0815, 24M0833",
)
interface.launch(inline=False)
|