Spaces:
Build error
import numpy as np
import nltk
import sklearn_crfsuite
from sklearn_crfsuite import metrics
import gradio as gr
import re

nltk.download('brown')
nltk.download('universal_tagset')

# Brown corpus with the universal tagset: a list of sentences,
# each a list of (word, tag) pairs.
corpus = nltk.corpus.brown.tagged_sents(tagset='universal')

sentence = [
    ('The', 'DET'),
    ('dog', 'NOUN'),
    ('jumps', 'VERB'),
    ('over', 'ADP'),
    ('the', 'DET'),
    ('car', 'NOUN')
]

# Materialise the lazy corpus view and replace one entry with the
# hand-built example sentence above.
corpus = list(corpus)
corpus[21058] = sentence
def word_features(sentence, i, prev_tag):
    word = sentence[i][0]
    features = {
        'word': word,
        'is_first': i == 0,                              # first word of the sentence
        'is_last': i == len(sentence) - 1,               # last word of the sentence
        'is_capitalized': word[0].upper() == word[0],    # first letter is uppercase
        'is_all_caps': word.upper() == word,             # word is all uppercase
        'is_all_lower': word.lower() == word,            # word is all lowercase
        'prefix-1': word[0],
        'prefix-2': word[:2],
        'prefix-3': word[:3],
        'prefix-un': word[:2] == 'un',       # word starts with "un"
        'prefix-re': word[:2] == 're',       # word starts with "re"
        'prefix-over': word[:4] == 'over',   # word starts with "over"
        'prefix-dis': word[:3] == 'dis',     # word starts with "dis"
        'prefix-mis': word[:3] == 'mis',     # word starts with "mis"
        'prefix-pre': word[:3] == 'pre',     # word starts with "pre"
        'prefix-non': word[:3] == 'non',     # word starts with "non"
        'prefix-de': word[:2] == 'de',       # word starts with "de"
        'prefix-in': word[:2] == 'in',       # word starts with "in"
        'prefix-en': word[:2] == 'en',       # word starts with "en"
        'suffix-1': word[-1],
        'suffix-2': word[-2:],
        'suffix-3': word[-3:],
        'suffix-ed': word[-2:] == 'ed',      # word ends with "ed"
        'suffix-ing': word[-3:] == 'ing',    # word ends with "ing"
        'suffix-es': word[-2:] == 'es',      # word ends with "es"
        'suffix-s': word[-1] == 's',         # word ends with "s"
        'suffix-ly': word[-2:] == 'ly',      # word ends with "ly"
        'suffix-ment': word[-4:] == 'ment',  # word ends with "ment"
        'suffix-er': word[-2:] == 'er',      # word ends with "er"
        'prev_word': '' if i == 0 else sentence[i-1][0],
        'next_word': '' if i == len(sentence)-1 else sentence[i+1][0],
        'has_hyphen': '-' in word,           # word contains a hyphen
        'is_numeric': word.isdigit(),        # word is numeric
        'capitals_inside': word[1:].lower() != word[1:],
        'is_first_capital': word[0].upper() == word[0],  # first letter is uppercase
        'suffix-s_and-prev_tag_noun': word[-1] == 's' and prev_tag == 'NOUN',  # ends with "s" and previous tag is NOUN
        'prev_tag': prev_tag,
    }
    return features
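
# Optional sanity check (not in the original script): inspect the feature dict
# for one position of the hand-built example sentence above. 'dog' sits at
# index 1 and its left neighbour 'The' carries the gold tag DET.
_probe = word_features(sentence, 1, 'DET')
print(_probe['word'], _probe['prev_word'], _probe['prev_tag'], _probe['suffix-s'])
# expected output: dog The DET False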
X = []
y = []
for sentence in corpus:
    X_sentence = []
    y_sentence = []
    for i in range(len(sentence)):
        # During training, the gold tag of the previous token is passed in as prev_tag.
        X_sentence.append(word_features(sentence, i, '' if i == 0 else sentence[i-1][1]))
        y_sentence.append(sentence[i][1])
    X.append(X_sentence)
    y.append(y_sentence)

# Split the data into training and testing sets
split = int(0.8 * len(X))
X_train = X[:split]
y_train = y[:split]
X_test = X[split:]
y_test = y[split:]

# Train a CRF model on the training data
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_train, y_train)

# Make predictions on the test data and evaluate the performance
y_pred = crf.predict(X_test)
print(metrics.flat_accuracy_score(y_test, y_pred))
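
# Not in the original script: a per-tag breakdown is often more informative than
# flat accuracy. flat_classification_report comes from sklearn_crfsuite.metrics
# (already imported above) and crf.classes_ lists the tags seen during training.
print(metrics.flat_classification_report(y_test, y_pred, labels=list(crf.classes_), digits=3))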
def predict_tags(sentence):
    tokens = sentence.split()
    tokens2 = [(token, '') for token in tokens]
    features = []
    prev_tag = ''
    for i in range(len(tokens)):
        if i > 0:
            # Greedy left-to-right decoding: re-run the model on the features
            # built so far and use its tag for the previous token as prev_tag.
            prev_tag = crf.predict([features])[0][i-1]
        features.append(word_features(tokens2, i, prev_tag))
    predicted_tags = crf.predict([features])[0]
    return list(zip(tokens, predicted_tags))

# Example usage
new_sentence = "The dog walks over the car"
predicted_tags = predict_tags(new_sentence)
print(predicted_tags)

def tagging(text):
    # Insert a space before punctuation so it is split off as its own token.
    text = re.sub(r'(\S)([.,;:!?])', r'\1 \2', text.strip())
    tagged_list = predict_tags(text)
    return ' '.join(f"{word}[{tag}]" for word, tag in tagged_list)
interface = gr.Interface(
    fn=tagging,
    inputs=gr.Textbox(
        label="Input Sentence",
        placeholder="Enter your sentence here...",
    ),
    outputs=gr.Textbox(
        label="Tagged Output",
        placeholder="Tagged sentence appears here...",
    ),
    title="Conditional Random Field POS Tagger",
    description="CS626 Assignment 1B (Autumn 2024)",
    theme=gr.themes.Soft(),
)
interface.launch(inline=False)
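
A "Build error" status on a Space usually means dependency installation failed before app.py ever ran, so the fix is more often in requirements.txt than in the code above. A minimal requirements.txt covering every import used here would be (unpinned versions, listed as an assumption rather than taken from the original repo):

numpy
nltk
scikit-learn
sklearn-crfsuite
gradio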