# CRF-NLP / app.py
# madhavkotecha's picture
# Update app.py
# 2ba6acd verified
import numpy as np
import nltk
import sklearn_crfsuite
from sklearn_crfsuite import metrics
import gradio as gr
import re
# Fetch the Brown corpus and the universal tagset mapping used to load it.
nltk.download('brown')
nltk.download('universal_tagset')

# Hand-written tagged sentence that is spliced into the corpus below.
sentence = [
    ('The', 'DET'), ('dog', 'NOUN'), ('jumps', 'VERB'),
    ('over', 'ADP'), ('the', 'DET'), ('car', 'NOUN')
]

# Materialise the tagged sentences as a plain list so that one entry can be
# replaced by index, then overwrite sentence 21058 with the example above.
corpus = list(nltk.corpus.brown.tagged_sents(tagset='universal'))
corpus[21058] = sentence
def word_features(sentence, i, prev_tag):
    """Build the CRF feature dictionary for the token at position ``i``.

    Args:
        sentence: Sequence of ``(word, tag)`` pairs. Only the word (index 0)
            of each pair is read here.
        i: Index of the token to describe.
        prev_tag: Tag of the preceding token; callers pass ``''`` for the
            first token of a sentence.

    Returns:
        A dict mapping feature names to string or boolean values.
    """
    word = sentence[i][0]
    last = len(sentence) - 1
    # Slices (word[:1], word[-1:]) are used instead of indexing so that an
    # empty token yields '' rather than raising IndexError.
    first_char = word[:1]
    ends_with_s = word.endswith('s')
    features = {
        'word': word,
        'is_first': i == 0,  # token opens the sentence
        'is_last': i == last,  # token closes the sentence
        # True whenever upper-casing the first character leaves it unchanged,
        # which also holds for digits and punctuation.
        'is_capitalized': first_char.upper() == first_char,
        'is_all_caps': word.upper() == word,
        'is_all_lower': word.lower() == word,
        'prefix-1': first_char,
        'prefix-2': word[:2],
        'prefix-3': word[:3],
        # Prefix flags use startswith so the compared length always matches
        # the prefix; the previous fixed-width slices for 'dis', 'mis', 'pre',
        # 'non', 'de', 'in' and 'en' were one character too long and only
        # matched when the whole word equalled the prefix.
        'prefix-un': word.startswith('un'),
        'prefix-re': word.startswith('re'),
        'prefix-over': word.startswith('over'),
        'prefix-dis': word.startswith('dis'),
        'prefix-mis': word.startswith('mis'),
        'prefix-pre': word.startswith('pre'),
        'prefix-non': word.startswith('non'),
        'prefix-de': word.startswith('de'),
        'prefix-in': word.startswith('in'),
        'prefix-en': word.startswith('en'),
        'suffix-1': word[-1:],
        'suffix-2': word[-2:],
        'suffix-3': word[-3:],
        'suffix-ed': word.endswith('ed'),
        'suffix-ing': word.endswith('ing'),
        'suffix-es': word.endswith('es'),
        'suffix-s': ends_with_s,
        'suffix-ly': word.endswith('ly'),
        'suffix-ment': word.endswith('ment'),
        'suffix-er': word.endswith('er'),
        # Neighbouring words; '' marks the sentence boundary.
        'prev_word': '' if i == 0 else sentence[i - 1][0],
        'next_word': '' if i == last else sentence[i + 1][0],
        'has_hyphen': '-' in word,  # word contains a hyphen
        'is_numeric': word.isdigit(),  # word consists only of digits
        'capitals_inside': word[1:].lower() != word[1:],  # uppercase after the first character
        'is_first_capital': first_char.upper() == first_char,  # same test as 'is_capitalized'
        # Word ends with 's' and the previous tag is NOUN.
        'suffix-s_and-prev_tag_noun': ends_with_s and prev_tag == 'NOUN',
        'prev_tag': prev_tag,
    }
    return features
# Per-sentence feature sequences (X) and the matching gold tag sequences (y).
X = []
y = []
for sentence in corpus:
    gold_tags = [pair[1] for pair in sentence]
    # Each token sees the gold tag of the token before it; the first token
    # of a sentence gets ''.
    X.append([
        word_features(sentence, pos, gold_tags[pos - 1] if pos else '')
        for pos in range(len(sentence))
    ])
    y.append(gold_tags)
# Use the first 80% of the sentences for training and hold out the rest.
split = int(0.8 * len(X))
X_train, X_test = X[:split], X[split:]
y_train, y_test = y[:split], y[split:]

# Fit an L-BFGS-trained CRF; c1 and c2 are the regularisation coefficients
# passed to sklearn-crfsuite.
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1, c2=0.1,
    max_iterations=100,
    all_possible_transitions=True,
)
crf.fit(X_train, y_train)

# Report token-level accuracy on the held-out sentences.
y_pred = crf.predict(X_test)
print(metrics.flat_accuracy_score(y_test, y_pred))
def predict_tags(sentence):
    """Tag a whitespace-separated sentence with the trained CRF.

    Features are built left to right. Because each token's features include
    the tag of the previous token, the tag of token ``i - 1`` is predicted
    from the features built so far before the features of token ``i`` are
    created. This mirrors training, where ``prev_tag`` is the tag of the
    immediately preceding token.

    Args:
        sentence: Raw text; tokens are obtained with ``str.split()``.

    Returns:
        A list of ``(token, tag)`` pairs, empty when the input has no tokens.
    """
    tokens = sentence.split()
    if not tokens:
        return []
    # word_features expects (word, tag) pairs; the tag slot is unused here.
    tokens2 = [(token, '') for token in tokens]
    features = []
    prev_tag = ''
    for i in range(len(tokens)):
        if i > 0:
            # `features` currently covers tokens 0..i-1, so the last predicted
            # tag is the one for token i-1 (the previous token).
            prev_tag = crf.predict([features])[0][-1]
        features.append(word_features(tokens2, i, prev_tag))
    predicted_tags = crf.predict([features])[0]
    return list(zip(tokens, predicted_tags))
# Example usage: tag a sample sentence once at start-up and print the
# resulting list of (token, tag) pairs.
new_sentence = "The dog walks over the car"
predicted_tags = predict_tags(new_sentence)
print(predicted_tags)
def tagging(input):
    """Tag the text entered in the UI and render it as ``word[TAG] `` pieces.

    Surrounding whitespace is stripped and a space is inserted between a
    non-space character and a following ``. , ; : ! ?`` so that the
    punctuation mark becomes its own token for ``predict_tags``.

    Args:
        input: Raw sentence text.

    Returns:
        One string with every token rendered as ``word[TAG]`` followed by a
        space (so a non-empty result ends with a trailing space).
    """
    spaced = re.sub(r'(\S)([.,;:!?])', r'\1 \2', input.strip())
    pieces = []
    for word, tag in predict_tags(spaced):
        pieces.append(f"{word}[{tag}] ")
    return ''.join(pieces)
# Text boxes for the sentence typed by the user and for the tagged result.
input_box = gr.Textbox(
    label="Input Sentence",
    placeholder="Enter your sentence here...",
)
output_box = gr.Textbox(
    label="Tagged Output",
    placeholder="Tagged sentence appears here...",
)

# Wire the tagging function into a Gradio interface and start serving it.
interface = gr.Interface(
    fn=tagging,
    inputs=input_box,
    outputs=output_box,
    title="Conditional Random Field POS Tagger",
    description="CS626 Assignment 1B (Autumn 2024)",
    theme=gr.themes.Soft(),
)
interface.launch(inline=False)