Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -6,43 +6,28 @@ from transformers import T5Tokenizer, T5ForConditionalGeneration
|
|
6 |
import gradio as gr
|
7 |
import nltk
|
8 |
from nltk.tokenize import sent_tokenize, word_tokenize
|
9 |
-
from nltk.corpus import wordnet as wn
|
10 |
from difflib import SequenceMatcher
|
11 |
|
|
|
|
|
|
|
|
|
|
|
12 |
# Load a pre-trained T5 model specifically fine-tuned for grammar correction
|
13 |
tokenizer = T5Tokenizer.from_pretrained("prithivida/grammar_error_correcter_v1")
|
14 |
model = T5ForConditionalGeneration.from_pretrained("prithivida/grammar_error_correcter_v1")
|
15 |
|
16 |
-
# Function to get the base form (lemma) of a verb
|
17 |
-
def get_base_form(word, tag):
|
18 |
-
wn_tag = {'VBD': wn.VERB, 'VBG': wn.VERB, 'VBN': wn.VERB, 'VBP': wn.VERB, 'VBZ': wn.VERB, 'VB': wn.VERB}
|
19 |
-
if tag in wn_tag:
|
20 |
-
lemma = nltk.WordNetLemmatizer().lemmatize(word, wn_tag[tag])
|
21 |
-
return lemma
|
22 |
-
return word
|
23 |
-
|
24 |
-
# Function to extract verbs from a sentence
|
25 |
-
def extract_verbs(sentence):
|
26 |
-
words = word_tokenize(sentence)
|
27 |
-
tagged = nltk.pos_tag(words)
|
28 |
-
verbs = [(word, tag) for word, tag in tagged if tag.startswith('VB')]
|
29 |
-
return verbs
|
30 |
-
|
31 |
-
# Function to perform grammar correction and generate verb forms list
|
32 |
def grammar_check(text):
|
33 |
sentences = sent_tokenize(text)
|
34 |
corrected_sentences = []
|
35 |
-
original_verbs = []
|
36 |
-
corrected_verbs = []
|
37 |
|
38 |
for sentence in sentences:
|
39 |
-
original_verbs.extend(extract_verbs(sentence))
|
40 |
input_text = f"gec: {sentence}"
|
41 |
input_ids = tokenizer.encode(input_text, return_tensors="pt", max_length=512, truncation=True)
|
42 |
outputs = model.generate(input_ids, max_length=512, num_beams=4, early_stopping=True)
|
43 |
corrected_sentence = tokenizer.decode(outputs[0], skip_special_tokens=True)
|
44 |
corrected_sentences.append(corrected_sentence)
|
45 |
-
corrected_verbs.extend(extract_verbs(corrected_sentence))
|
46 |
|
47 |
# Function to underline and color revised parts
|
48 |
def underline_and_color_revisions(original, corrected):
|
@@ -61,23 +46,13 @@ def grammar_check(text):
|
|
61 |
underline_and_color_revisions(orig, corr) for orig, corr in zip(sentences, corrected_sentences)
|
62 |
)
|
63 |
|
64 |
-
|
65 |
-
verb_forms_list = []
|
66 |
-
for orig, corr in zip(original_verbs, corrected_verbs):
|
67 |
-
base_orig = get_base_form(orig[0], orig[1])
|
68 |
-
base_corr = get_base_form(corr[0], corr[1])
|
69 |
-
if base_orig != base_corr:
|
70 |
-
verb_forms_list.append(f"{base_orig} - {corr[0]} - {base_corr}")
|
71 |
-
|
72 |
-
verb_forms_str = "\n".join(verb_forms_list)
|
73 |
-
|
74 |
-
return corrected_text, verb_forms_str
|
75 |
|
76 |
# Create Gradio interface with a writing prompt
|
77 |
interface = gr.Interface(
|
78 |
fn=grammar_check,
|
79 |
inputs="text",
|
80 |
-
outputs=
|
81 |
title="Grammar Checker",
|
82 |
description=(
|
83 |
"Enter text to check for grammar mistakes.\n\n"
|
|
|
6 |
import gradio as gr
|
7 |
import nltk
|
8 |
from nltk.tokenize import sent_tokenize, word_tokenize
|
|
|
9 |
from difflib import SequenceMatcher
|
10 |
|
11 |
+
# Download necessary resources
|
12 |
+
nltk.download('punkt')
|
13 |
+
nltk.download('averaged_perceptron_tagger')
|
14 |
+
nltk.download('wordnet')
|
15 |
+
|
16 |
# Load a pre-trained T5 model specifically fine-tuned for grammar correction
|
17 |
tokenizer = T5Tokenizer.from_pretrained("prithivida/grammar_error_correcter_v1")
|
18 |
model = T5ForConditionalGeneration.from_pretrained("prithivida/grammar_error_correcter_v1")
|
19 |
|
20 |
+
# Function to perform grammar correction
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
21 |
def grammar_check(text):
|
22 |
sentences = sent_tokenize(text)
|
23 |
corrected_sentences = []
|
|
|
|
|
24 |
|
25 |
for sentence in sentences:
|
|
|
26 |
input_text = f"gec: {sentence}"
|
27 |
input_ids = tokenizer.encode(input_text, return_tensors="pt", max_length=512, truncation=True)
|
28 |
outputs = model.generate(input_ids, max_length=512, num_beams=4, early_stopping=True)
|
29 |
corrected_sentence = tokenizer.decode(outputs[0], skip_special_tokens=True)
|
30 |
corrected_sentences.append(corrected_sentence)
|
|
|
31 |
|
32 |
# Function to underline and color revised parts
|
33 |
def underline_and_color_revisions(original, corrected):
|
|
|
46 |
underline_and_color_revisions(orig, corr) for orig, corr in zip(sentences, corrected_sentences)
|
47 |
)
|
48 |
|
49 |
+
return corrected_text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
50 |
|
51 |
# Create Gradio interface with a writing prompt
|
52 |
interface = gr.Interface(
|
53 |
fn=grammar_check,
|
54 |
inputs="text",
|
55 |
+
outputs="html", # Output type is HTML
|
56 |
title="Grammar Checker",
|
57 |
description=(
|
58 |
"Enter text to check for grammar mistakes.\n\n"
|