madhavkotecha committed on
Commit
389f641
·
verified ·
1 Parent(s): e1f628a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +36 -42
app.py CHANGED
@@ -18,7 +18,7 @@ class CRF_POS_Tagger:
18
  print("Loading Data...")
19
  self.corpus = nltk.corpus.brown.tagged_sents(tagset='universal')
20
  print("Data Loaded...")
21
- self.corpus = [[(word.lower(), tag) for word, tag in sentence] for sentence in self.corpus]
22
  self.actual_tag = []
23
  self.predicted_tag = []
24
  self.prefixes = [
@@ -31,12 +31,6 @@ class CRF_POS_Tagger:
31
  "ic", "ing", "ion", "tion", "ity", "ty", "ive", "less", "ly", "ment",
32
  "ness", "ous", "ship", "y", "es", "s"
33
  ]
34
- self.interjections = {
35
- 'ah', 'alas', 'aha', 'bravo', 'darn', 'drat', 'eh', 'eek', 'eww',
36
- 'gosh', 'ha', 'hooray', 'hmm', 'huh', 'oops', 'ouch', 'phew',
37
- 'pow', 'yay', 'whoa', 'wow', 'yikes', 'yippee', 'uh', 'um',
38
- 'hey', 'hello'
39
- }
40
 
41
  self.prefix_pattern = f"^({'|'.join(self.prefixes)})"
42
  self.suffix_pattern = f"({'|'.join(self.suffixes)})$"
@@ -96,34 +90,34 @@ class CRF_POS_Tagger:
96
  'suffix-2': word[-2:],
97
  'suffix-3': word[-3:],
98
 
99
- # 'prefix-un': word[:2] == 'un', #if word starts with un
100
- # 'prefix-re': word[:2] == 're', #if word starts with re
101
- # 'prefix-over': word[:4] == 'over', #if word starts with over
102
- # 'prefix-dis': word[:4] == 'dis', #if word starts with dis
103
- # 'prefix-mis': word[:4] == 'mis', #if word starts with mis
104
- # 'prefix-pre': word[:4] == 'pre', #if word starts with pre
105
- # 'prefix-non': word[:4] == 'non', #if word starts with non
106
- # 'prefix-de': word[:3] == 'de', #if word starts with de
107
- # 'prefix-in': word[:3] == 'in', #if word starts with in
108
- # 'prefix-en': word[:3] == 'en', #if word starts with en
109
-
110
- # 'suffix-ed': word[-2:] == 'ed', #if word ends with ed
111
- # 'suffix-ing': word[-3:] == 'ing', #if word ends with ing
112
- # 'suffix-es': word[-2:] == 'es', #if word ends with es
113
- # 'suffix-ly': word[-2:] == 'ly', #if word ends with ly
114
- # 'suffix-ment': word[-4:] == 'ment', #if word ends with ment
115
- # 'suffix-er': word[-2:] == 'er', #if word ends with er
116
- # 'suffix-ive': word[-3:] == 'ive',
117
- # 'suffix-ous': word[-3:] == 'ous',
118
- # 'suffix-ness': word[-4:] == 'ness',
119
- # 'ends_with_s': word[-1] == 's',
120
- # 'ends_with_es': word[-2:] == 'es',
121
 
122
  'has_hyphen': '-' in word, #if word has hypen
123
  'is_numeric': word.isdigit(), #if word is in numeric
124
  'capitals_inside': word[1:].lower() != word[1:],
125
  'is_title_case': word.istitle(), #if first letter is in uppercase
126
- 'is_interjection': word.lower() in self.interjections,
127
  }
128
 
129
  if i > 0:
@@ -144,7 +138,7 @@ class CRF_POS_Tagger:
144
  })
145
 
146
  if i < len(sentence)-1:
147
- next_word = sentence[i+1][0]
148
  next_prefix, next_stem, next_suffix = self.word_splitter(next_word)
149
  features.update({
150
  'next_word': next_word,
@@ -201,18 +195,18 @@ class CRF_POS_Tagger:
201
 
202
  def con_matrix(self):
203
  self.labels = np.unique(self.actual_tag)
204
- # print(self.labels, self.actual_tag, self.predicted_tag)
205
  conf_matrix = confusion_matrix(self.actual_tag, self.predicted_tag, labels=self.labels)
206
-
207
  plt.figure(figsize=(10, 7))
208
- sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=self.labels, yticklabels=self.labels)
209
  plt.xlabel('Predicted Tags')
210
  plt.ylabel('Actual Tags')
211
  plt.title('Confusion Matrix Heatmap')
212
  plt.savefig("Confusion_matrix.png")
213
  plt.show()
214
 
215
- return conf_matrix
216
 
217
  def per_pos_accuracy(self, conf_matrix):
218
  print("Per Tag Precision, Recall, and F-Score:")
@@ -245,7 +239,7 @@ class CRF_POS_Tagger:
245
 
246
  def tagging(self, input):
247
  sentence = (re.sub(r'(\S)([.,;:!?])', r'\1 \2', input.strip())).split()
248
- sentence_list = [[word.lower()] for word in sentence]
249
  features = [self.word_features(sentence_list, i) for i in range(len(sentence_list))]
250
 
251
  predicted_tags = self.crf_model.predict([features])
@@ -253,13 +247,13 @@ class CRF_POS_Tagger:
253
  return output
254
 
255
 
256
- # validate = CRF_POS_Tagger()
257
- # accuracies, avg_accuracy = validate.cross_validation()
258
- # print(f"Cross-Validation Accuracies: {accuracies}")
259
- # print(f"Average Accuracy: {avg_accuracy}")
260
 
261
- # conf_matrix = validate.con_matrix()
262
- # print(validate.per_pos_accuracy(conf_matrix))
263
 
264
  tagger = CRF_POS_Tagger(True)
265
  interface = gr.Interface(fn = tagger.tagging,
 
18
  print("Loading Data...")
19
  self.corpus = nltk.corpus.brown.tagged_sents(tagset='universal')
20
  print("Data Loaded...")
21
+ self.corpus = [[(word, tag) for word, tag in sentence] for sentence in self.corpus]
22
  self.actual_tag = []
23
  self.predicted_tag = []
24
  self.prefixes = [
 
31
  "ic", "ing", "ion", "tion", "ity", "ty", "ive", "less", "ly", "ment",
32
  "ness", "ous", "ship", "y", "es", "s"
33
  ]
 
 
 
 
 
 
34
 
35
  self.prefix_pattern = f"^({'|'.join(self.prefixes)})"
36
  self.suffix_pattern = f"({'|'.join(self.suffixes)})$"
 
90
  'suffix-2': word[-2:],
91
  'suffix-3': word[-3:],
92
 
93
+ 'prefix-un': word[:2] == 'un', #if word starts with un
94
+ 'prefix-re': word[:2] == 're', #if word starts with re
95
+ 'prefix-over': word[:4] == 'over', #if word starts with over
96
+ 'prefix-dis': word[:4] == 'dis', #if word starts with dis
97
+ 'prefix-mis': word[:4] == 'mis', #if word starts with mis
98
+ 'prefix-pre': word[:4] == 'pre', #if word starts with pre
99
+ 'prefix-non': word[:4] == 'non', #if word starts with non
100
+ 'prefix-de': word[:3] == 'de', #if word starts with de
101
+ 'prefix-in': word[:3] == 'in', #if word starts with in
102
+ 'prefix-en': word[:3] == 'en', #if word starts with en
103
+
104
+ 'suffix-ed': word[-2:] == 'ed', #if word ends with ed
105
+ 'suffix-ing': word[-3:] == 'ing', #if word ends with ing
106
+ 'suffix-es': word[-2:] == 'es', #if word ends with es
107
+ 'suffix-ly': word[-2:] == 'ly', #if word ends with ly
108
+ 'suffix-ment': word[-4:] == 'ment', #if word ends with ment
109
+ 'suffix-er': word[-2:] == 'er', #if word ends with er
110
+ 'suffix-ive': word[-3:] == 'ive',
111
+ 'suffix-ous': word[-3:] == 'ous',
112
+ 'suffix-ness': word[-4:] == 'ness',
113
+ 'ends_with_s': word[-1] == 's',
114
+ 'ends_with_es': word[-2:] == 'es',
115
 
116
  'has_hyphen': '-' in word, #if word has hypen
117
  'is_numeric': word.isdigit(), #if word is in numeric
118
  'capitals_inside': word[1:].lower() != word[1:],
119
  'is_title_case': word.istitle(), #if first letter is in uppercase
120
+
121
  }
122
 
123
  if i > 0:
 
138
  })
139
 
140
  if i < len(sentence)-1:
141
+ next_word = sentence[i-1][0]
142
  next_prefix, next_stem, next_suffix = self.word_splitter(next_word)
143
  features.update({
144
  'next_word': next_word,
 
195
 
196
  def con_matrix(self):
197
  self.labels = np.unique(self.actual_tag)
198
+ print(self.labels, self.actual_tag, self.predicted_tag)
199
  conf_matrix = confusion_matrix(self.actual_tag, self.predicted_tag, labels=self.labels)
200
+ normalized_matrix = conf_matrix/np.sum(conf_matrix, axis=1, keepdims=True)
201
  plt.figure(figsize=(10, 7))
202
+ sns.heatmap(normalized_matrix, annot=True, fmt='.2f', cmap='Blues', xticklabels=self.labels, yticklabels=self.labels)
203
  plt.xlabel('Predicted Tags')
204
  plt.ylabel('Actual Tags')
205
  plt.title('Confusion Matrix Heatmap')
206
  plt.savefig("Confusion_matrix.png")
207
  plt.show()
208
 
209
+ return normalized_matrix
210
 
211
  def per_pos_accuracy(self, conf_matrix):
212
  print("Per Tag Precision, Recall, and F-Score:")
 
239
 
240
  def tagging(self, input):
241
  sentence = (re.sub(r'(\S)([.,;:!?])', r'\1 \2', input.strip())).split()
242
+ sentence_list = [[word] for word in sentence]
243
  features = [self.word_features(sentence_list, i) for i in range(len(sentence_list))]
244
 
245
  predicted_tags = self.crf_model.predict([features])
 
247
  return output
248
 
249
 
250
+ validate = CRF_POS_Tagger()
251
+ accuracies, avg_accuracy = validate.cross_validation()
252
+ print(f"Cross-Validation Accuracies: {accuracies}")
253
+ print(f"Average Accuracy: {avg_accuracy}")
254
 
255
+ conf_matrix = validate.con_matrix()
256
+ print(validate.per_pos_accuracy(conf_matrix))
257
 
258
  tagger = CRF_POS_Tagger(True)
259
  interface = gr.Interface(fn = tagger.tagging,