Spaces:
Sleeping
Sleeping
madhavkotecha
committed on
Update app.py
Browse files
app.py
CHANGED
@@ -30,6 +30,13 @@ class CRF_POS_Tagger:
|
|
30 |
"ness", "ous", "ship", "y", "es", "s"
|
31 |
]
|
32 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
33 |
self.prefix_pattern = f"^({'|'.join(self.prefixes)})"
|
34 |
self.suffix_pattern = f"({'|'.join(self.suffixes)})$"
|
35 |
|
@@ -42,7 +49,7 @@ class CRF_POS_Tagger:
|
|
42 |
self.X_test = self.X[self.split:]
|
43 |
self.y_test = self.y[self.split:]
|
44 |
self.crf_model = sklearn_crfsuite.CRF(algorithm='lbfgs', c1=0.1, c2=0.1, max_iterations=100, all_possible_transitions=True)
|
45 |
-
self.train()
|
46 |
|
47 |
def word_splitter(self, word):
|
48 |
prefix = ""
|
@@ -148,8 +155,9 @@ class CRF_POS_Tagger:
|
|
148 |
|
149 |
return features
|
150 |
|
151 |
-
def train(self):
|
152 |
-
|
|
|
153 |
|
154 |
def predict(self, X_test):
|
155 |
return self.crf_model.predict(X_test)
|
@@ -161,7 +169,8 @@ class CRF_POS_Tagger:
|
|
161 |
self.predicted_tag.extend([item for sublist in y_pred for item in sublist])
|
162 |
return metrics.flat_accuracy_score(y_test, y_pred)
|
163 |
|
164 |
-
def cross_validation(self
|
|
|
165 |
accuracies = []
|
166 |
for i in range(5):
|
167 |
n1 = int(i / 5.0 * len(data))
|
@@ -218,7 +227,7 @@ class CRF_POS_Tagger:
|
|
218 |
|
219 |
def tagging(self, input):
|
220 |
sentence = (re.sub(r'(\S)([.,;:!?])', r'\1 \2', input.strip())).split()
|
221 |
-
sentence_list = [[word] for word in sentence]
|
222 |
features = [self.word_features(sentence_list, i) for i in range(len(sentence_list))]
|
223 |
|
224 |
predicted_tags = self.crf_model.predict([features])
|
@@ -226,6 +235,14 @@ class CRF_POS_Tagger:
|
|
226 |
return output
|
227 |
|
228 |
tagger = CRF_POS_Tagger()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
229 |
interface = gr.Interface(fn = tagger.tagging,
|
230 |
inputs = gr.Textbox(
|
231 |
label="Input Sentence",
|
|
|
30 |
"ness", "ous", "ship", "y", "es", "s"
|
31 |
]
|
32 |
|
33 |
+
self.interjections = {
|
34 |
+
'ah', 'alas', 'aha', 'bravo', 'darn', 'drat', 'eh', 'eek', 'eww',
|
35 |
+
'gosh', 'ha', 'hooray', 'hmm', 'huh', 'oops', 'ouch', 'phew',
|
36 |
+
'pow', 'yay', 'whoa', 'wow', 'yikes', 'yippee', 'uh', 'um',
|
37 |
+
'hey', 'hello'
|
38 |
+
}
|
39 |
+
|
40 |
self.prefix_pattern = f"^({'|'.join(self.prefixes)})"
|
41 |
self.suffix_pattern = f"({'|'.join(self.suffixes)})$"
|
42 |
|
|
|
49 |
self.X_test = self.X[self.split:]
|
50 |
self.y_test = self.y[self.split:]
|
51 |
self.crf_model = sklearn_crfsuite.CRF(algorithm='lbfgs', c1=0.1, c2=0.1, max_iterations=100, all_possible_transitions=True)
|
52 |
+
# self.train()
|
53 |
|
54 |
def word_splitter(self, word):
|
55 |
prefix = ""
|
|
|
155 |
|
156 |
return features
|
157 |
|
158 |
+
def train(self, data=None):
    """Fit the CRF model on the given data or on the default training split.

    Args:
        data: Optional sequence of ``(features, labels)`` pairs, one pair
            per sentence. When provided and non-empty, the pairs are
            unzipped into parallel feature/label sequences and used for
            fitting. Otherwise ``self.X_train`` / ``self.y_train`` are used.
    """
    # The two sources must be selected as a pair. Writing this as
    # ``a, b = zip(*data) if data else x, y`` parses as
    # ``((zip(*data) if data else x), y)``, which would feed the raw zip
    # object as X and always use self.y_train as y.
    if data:
        X_train, y_train = zip(*data)
    else:
        X_train, y_train = self.X_train, self.y_train
    self.crf_model.fit(X_train, y_train)
|
161 |
|
162 |
def predict(self, X_test):
    """Return the CRF model's predictions for the feature sequences in X_test."""
    model = self.crf_model
    predictions = model.predict(X_test)
    return predictions
|
|
|
169 |
self.predicted_tag.extend([item for sublist in y_pred for item in sublist])
|
170 |
return metrics.flat_accuracy_score(y_test, y_pred)
|
171 |
|
172 |
+
def cross_validation(self):
|
173 |
+
data = list(zip(self.X, self.y))
|
174 |
accuracies = []
|
175 |
for i in range(5):
|
176 |
n1 = int(i / 5.0 * len(data))
|
|
|
227 |
|
228 |
def tagging(self, input):
|
229 |
sentence = (re.sub(r'(\S)([.,;:!?])', r'\1 \2', input.strip())).split()
|
230 |
+
sentence_list = [[word.lower()] for word in sentence]
|
231 |
features = [self.word_features(sentence_list, i) for i in range(len(sentence_list))]
|
232 |
|
233 |
predicted_tags = self.crf_model.predict([features])
|
|
|
235 |
return output
|
236 |
|
237 |
tagger = CRF_POS_Tagger()
|
238 |
+
|
239 |
+
accuracies, avg_accuracy = tagger.cross_validation()
|
240 |
+
print(f"Cross-Validation Accuracies: {accuracies}")
|
241 |
+
print(f"Average Accuracy: {avg_accuracy}")
|
242 |
+
|
243 |
+
conf_matrix = tagger.con_matrix()
|
244 |
+
print(tagger.per_pos_accuracy(conf_matrix))
|
245 |
+
|
246 |
interface = gr.Interface(fn = tagger.tagging,
|
247 |
inputs = gr.Textbox(
|
248 |
label="Input Sentence",
|