madhavkotecha committed
Update app.py
app.py
CHANGED
@@ -15,18 +15,20 @@ nltk.download('universal_tagset')
 
 class CRF_POS_Tagger:
     def __init__(self, train=False):
+        print("Loading Data...")
         self.corpus = nltk.corpus.brown.tagged_sents(tagset='universal')
+        print("Data Loaded...")
         self.corpus = [[(word.lower(), tag) for word, tag in sentence] for sentence in self.corpus]
         self.actual_tag = []
         self.predicted_tag = []
         self.prefixes = [
             "a", "anti", "auto", "bi", "co", "dis", "en", "em", "ex", "in", "im",
             "inter", "mis", "non", "over", "pre", "re", "sub", "trans", "un", "under"
         ]
 
         self.suffixes = [
             "able", "ible", "al", "ance", "ence", "dom", "er", "or", "ful", "hood",
             "ic", "ing", "ion", "tion", "ity", "ty", "ive", "less", "ly", "ment",
             "ness", "ous", "ship", "y", "es", "s"
         ]
 
@@ -35,16 +37,17 @@ class CRF_POS_Tagger:
 
         self.X = [[self.word_features(sentence, i) for i in range(len(sentence))] for sentence in self.corpus]
         self.y = [[postag for _, postag in sentence] for sentence in self.corpus]
 
         self.split = int(0.8 * len(self.X))
         self.X_train = self.X[:self.split]
         self.y_train = self.y[:self.split]
         self.X_test = self.X[self.split:]
         self.y_test = self.y[self.split:]
+        print("Data Loaded...")
         self.crf_model = sklearn_crfsuite.CRF(algorithm='lbfgs', c1=0.1, c2=0.1, max_iterations=100, all_possible_transitions=True)
+        print("Model Created...")
         if train:
-
-            self.train(data)
+            self.train()
 
     def word_splitter(self, word):
         prefix = ""
@@ -62,7 +65,7 @@ class CRF_POS_Tagger:
                 stem = stem[: -len(suffix)]
 
         return prefix, stem, suffix
 
     # Define a function to extract features for each word in a sentence
     def word_features(self, sentence, i):
         word = sentence[i][0]
@@ -79,7 +82,7 @@ class CRF_POS_Tagger:
             # 'is_capitalized': word[0].upper() == word[0],
             'is_all_caps': word.isupper(), #word is in uppercase
             'is_all_lower': word.islower(), #word is in lowercase
 
             'prefix-1': word[0],
             'prefix-2': word[:2],
             'prefix-3': word[:3],
@@ -97,31 +100,31 @@ class CRF_POS_Tagger:
             'prefix-de': word[:3] == 'de', #if word starts with de
             'prefix-in': word[:3] == 'in', #if word starts with in
             'prefix-en': word[:3] == 'en', #if word starts with en
 
             'suffix-ed': word[-2:] == 'ed', #if word ends with ed
             'suffix-ing': word[-3:] == 'ing', #if word ends with ing
             'suffix-es': word[-2:] == 'es', #if word ends with es
             'suffix-ly': word[-2:] == 'ly', #if word ends with ly
             'suffix-ment': word[-4:] == 'ment', #if word ends with ment
             'suffix-er': word[-2:] == 'er', #if word ends with er
             'suffix-ive': word[-3:] == 'ive',
             'suffix-ous': word[-3:] == 'ous',
             'suffix-ness': word[-4:] == 'ness',
             'ends_with_s': word[-1] == 's',
             'ends_with_es': word[-2:] == 'es',
 
             'has_hyphen': '-' in word, #if word has hypen
             'is_numeric': word.isdigit(), #if word is in numeric
             'capitals_inside': word[1:].lower() != word[1:],
             'is_title_case': word.istitle(), #if first letter is in uppercase
 
         }
 
         if i > 0:
             # prev_word, prev_postag = sentence[i-1]
             prev_word = sentence[i-1][0]
             prev_prefix, prev_stem, prev_suffix = self.word_splitter(prev_word)
 
             features.update({
                 'prev_word': prev_word,
                 # 'prev_postag': prev_postag,
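
Note: the feature dictionary built above can be inspected directly. A minimal sketch, not part of this commit (the toy sentence and variable names are illustrative; constructing the tagger first loads the Brown corpus and precomputes features for every sentence, so it is slow):

    tagger = CRF_POS_Tagger()
    sample = [["the"], ["dog"], ["barked"]]   # word_features only reads sentence[i][0]
    feats = tagger.word_features(sample, 1)   # features for "dog", which has both a previous and a next word
    print(feats["prefix-2"], feats["suffix-ing"], feats["prev_word"], feats["next:is_title_case"])
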
@@ -131,7 +134,7 @@ class CRF_POS_Tagger:
                 'prev:is_all_caps': prev_word.isupper(),
                 'prev:is_all_lower': prev_word.islower(),
                 'prev:is_numeric': prev_word.isdigit(),
                 'prev:is_title_case': prev_word.istitle(),
             })
 
         if i < len(sentence)-1:
@@ -145,28 +148,38 @@ class CRF_POS_Tagger:
                 'next:is_all_caps': next_word.isupper(),
                 'next:is_all_lower': next_word.islower(),
                 'next:is_numeric': next_word.isdigit(),
                 'next:is_title_case': next_word.istitle(),
             })
 
         return features
 
-    def train(self, data):
-
+    def train(self, data=None):
+        if data:
+            X_train, y_train = zip(*data)
+        else:
+            X_train, y_train = self.X_train, self.y_train
+
+        print("Training CRF Model...", len(self.X_train), len(self.y_train))
+
+        # Ensure X_train is a list of lists of dictionaries
+        X_train = [list(map(dict, x)) for x in X_train]
         self.crf_model.fit(X_train, y_train)
 
     def predict(self, X_test):
         return self.crf_model.predict(X_test)
 
     def accuracy(self, test_data):
         X_test, y_test = zip(*test_data)
         y_pred = self.predict(X_test)
         self.actual_tag.extend([item for sublist in y_test for item in sublist])
         self.predicted_tag.extend([item for sublist in y_pred for item in sublist])
+        print(len(self.actual_tag), len(self.predicted_tag))
         return metrics.flat_accuracy_score(y_test, y_pred)
 
     def cross_validation(self):
         validator = CRF_POS_Tagger()
         data = list(zip(self.X, self.y))
+        print("Cross-Validation...")
         accuracies = []
         for i in range(5):
             n1 = int(i / 5.0 * len(data))
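
Note: the reworked train() now has two call paths: with no argument it falls back to the 80/20 split prepared in __init__, and with a list of (features, tags) pairs it trains on just that fold, which is what cross_validation() passes in. A minimal sketch, not part of the commit (the fold slice below is illustrative):

    tagger = CRF_POS_Tagger()
    tagger.train()                                    # uses self.X_train / self.y_train
    fold = list(zip(tagger.X[:100], tagger.y[:100]))  # one (features, tags) pair per sentence
    tagger.train(fold)                                # unzipped back into X_train, y_train before fitting
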
@@ -176,12 +189,15 @@ class CRF_POS_Tagger:
             validator.train(train_data)
             acc = validator.accuracy(test_data)
             accuracies.append(acc)
+            self.actual_tag = validator.actual_tag
+            self.predicted_tag = validator.predicted_tag
         return accuracies, sum(accuracies) / 5.0
 
     def con_matrix(self):
         self.labels = np.unique(self.actual_tag)
+        print(self.labels, self.actual_tag, self.predicted_tag)
         conf_matrix = confusion_matrix(self.actual_tag, self.predicted_tag, labels=self.labels)
 
         plt.figure(figsize=(10, 7))
         sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=self.labels, yticklabels=self.labels)
         plt.xlabel('Predicted Tags')
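
Note: copying validator.actual_tag / validator.predicted_tag back onto self is what lets con_matrix() run after cross-validation: the folds are evaluated on an inner CRF_POS_Tagger, so without the copy the outer object would have no tags to build the confusion matrix from. The resulting flow, mirroring the new module-level code further down in this diff (variable names illustrative):

    cv = CRF_POS_Tagger()
    accs, avg = cv.cross_validation()   # inner validator's tag lists are copied back onto cv
    cm = cv.con_matrix()                # confusion matrix over those accumulated tags
    cv.per_pos_accuracy(cm)
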
@@ -189,9 +205,9 @@ class CRF_POS_Tagger:
         plt.title('Confusion Matrix Heatmap')
         plt.savefig("Confusion_matrix.png")
         plt.show()
 
         return conf_matrix
 
     def per_pos_accuracy(self, conf_matrix):
         print("Per Tag Precision, Recall, and F-Score:")
         per_tag_metrics = {}
@@ -220,7 +236,7 @@ class CRF_POS_Tagger:
 
             print(f"{tag}: Precision = {precision:.2f}, Recall = {recall:.2f}, f1-Score = {f1_score:.2f}, "
                   f"f05-Score = {f0_5_score:.2f}, f2-Score = {f2_score:.2f}")
 
     def tagging(self, input):
         sentence = (re.sub(r'(\S)([.,;:!?])', r'\1 \2', input.strip())).split()
         sentence_list = [[word.lower()] for word in sentence]
@@ -231,23 +247,24 @@ class CRF_POS_Tagger:
         return output
 
 
-
-accuracies, avg_accuracy =
+validate = CRF_POS_Tagger()
+accuracies, avg_accuracy = validate.cross_validation()
 print(f"Cross-Validation Accuracies: {accuracies}")
 print(f"Average Accuracy: {avg_accuracy}")
 
-conf_matrix =
-print(
+conf_matrix = validate.con_matrix()
+print(validate.per_pos_accuracy(conf_matrix))
 
-
-
-
-
-
+tagger = CRF_POS_Tagger(True)
+interface = gr.Interface(fn = tagger.tagging,
+                         inputs = gr.Textbox(
+                             label="Input Sentence",
+                             placeholder="Enter your sentence here...",
+                         ),
                          outputs = gr.Textbox(
-
-
-
+                             label="Tagged Output",
+                             placeholder="Tagged sentence appears here...",
+                         ),
                          title = "Conditional Random Field POS Tagger",
                          description = "CS626 Assignment 1B (Autumn 2024)",
                          theme=gr.themes.Soft())
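
Note: the new module-level block constructs the Gradio interface, but the call that actually starts the app is outside the changed hunks; presumably the file ends with the standard launch call, roughly:

    interface.launch()   # standard Gradio entry point; not shown in this diff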