Spaces:

teatwots
/

grammarchecking

Sleeping

App Files Files Community

teatwots committed on Jun 9

Commit

700ece2

•

1 Parent(s): 4f72f15

Update app.py

Browse files

Files changed (1) hide show

app.py +51 -21

app.py CHANGED Viewed

@@ -2,49 +2,79 @@
 from transformers import T5Tokenizer, T5ForConditionalGeneration
 import gradio as gr
 import nltk
-from nltk.tokenize import sent_tokenize
-import difflib
-# Download the punkt tokenizer for sentence splitting
 nltk.download('punkt')
 # Load a pre-trained T5 model specifically fine-tuned for grammar correction
 tokenizer = T5Tokenizer.from_pretrained("prithivida/grammar_error_correcter_v1")
 model = T5ForConditionalGeneration.from_pretrained("prithivida/grammar_error_correcter_v1")
-# Function to perform grammar correction
 def grammar_check(text):
-    # Split the text into sentences
     sentences = sent_tokenize(text)
     corrected_sentences = []
-    original_sentences = []
     for sentence in sentences:
-        original_sentences.append(sentence)
         input_text = f"gec: {sentence}"
         input_ids = tokenizer.encode(input_text, return_tensors="pt", max_length=512, truncation=True)
         outputs = model.generate(input_ids, max_length=512, num_beams=4, early_stopping=True)
         corrected_sentence = tokenizer.decode(outputs[0], skip_special_tokens=True)
         corrected_sentences.append(corrected_sentence)
     # Function to underline and color revised parts
     def underline_and_color_revisions(original, corrected):
-        diff = difflib.ndiff(original.split(), corrected.split())
         result = []
-        for word in diff:
-            if word.startswith("+ "):
-                result.append(f"<u style='color:red;'>{word[2:]}</u>")
-            elif word.startswith("- "):
-                continue
-            else:
-                result.append(word[2:])
         return " ".join(result)
-    # Join the corrected sentences back into a single string
     corrected_text = " ".join(
-        underline_and_color_revisions(orig, corr) for orig, corr in zip(original_sentences, corrected_sentences)
     )
-    return corrected_text
 # Create Gradio interface with a writing prompt
 interface = gr.Interface(
@@ -57,10 +87,10 @@ interface = gr.Interface(
         "Writing Prompt:\n"
         "In the story, Alex and his friends discovered an ancient treasure in Whispering Hollow and decided to donate the artifacts to the local museum.\n\n"
         "In the past, did you have a similar experience where you found something valuable or interesting? Tell the story. Describe what you found, what you did with it, and how you felt about your decision.\n\n"
-        "Remember to use past tense in your writing.\n"
-        "Sample text for testing: When I was 10, I find an old coin in my backyard. I kept it for a while and shows it to my friends. They was impressed and say it might be valuable. Later, I take it to a local antique shop, and the owner told me it was very old. I decided to give it to the museum in my town. The museum was happy and put it on display. I feel proud of my decision."
     )
 )
 # Launch the interface
-interface.launch()

 from transformers import T5Tokenizer, T5ForConditionalGeneration
 import gradio as gr
 import nltk
+from nltk.tokenize import sent_tokenize, word_tokenize
+from nltk.corpus import wordnet as wn
+from difflib import SequenceMatcher
+# Download necessary resources
 nltk.download('punkt')
+nltk.download('averaged_perceptron_tagger')
+nltk.download('wordnet')
 # Load a pre-trained T5 model specifically fine-tuned for grammar correction
 tokenizer = T5Tokenizer.from_pretrained("prithivida/grammar_error_correcter_v1")
 model = T5ForConditionalGeneration.from_pretrained("prithivida/grammar_error_correcter_v1")
+# Function to get the base form (lemma) of verbs
def get_base_form(word, tag):
    """Return the WordNet verb lemma of *word* when *tag* is a verb tag.

    Recognised tags are VB, VBD, VBG, VBN, VBP and VBZ; a word carrying any
    other tag is returned unchanged.
    """
    verb_pos = wn.VERB
    verb_tags = ('VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'VB')
    if tag not in verb_tags:
        return word
    lemmatizer = nltk.WordNetLemmatizer()
    return lemmatizer.lemmatize(word, verb_pos)
+# Function to extract verbs from a sentence
def extract_verbs(sentence):
    """Return the ``(word, tag)`` pairs of *sentence* whose POS tag starts with ``VB``."""
    verbs = []
    for token, pos in nltk.pos_tag(word_tokenize(sentence)):
        if pos.startswith('VB'):
            verbs.append((token, pos))
    return verbs
+# Function to perform grammar correction and generate verb forms list
 def grammar_check(text):
     """Correct grammar sentence by sentence and report revised verbs.

     Each sentence produced by ``sent_tokenize`` is sent to the T5 model with
     the ``gec:`` prefix. The corrected sentences are diffed word by word
     against the originals, and inserted or replaced words are underlined in
     red.

     Args:
         text: Text entered by the user.

     Returns:
         A string of HTML markup: the highlighted corrected text, a blank
         line, a bold "Revised Verb Forms:" heading, and one
         ``base-corrected-base`` line per verb pair whose lemmas differ.
     """
     import html  # local import so the fix does not depend on the file's import block

     sentences = sent_tokenize(text)
     corrected_sentences = []
     verb_forms_list = []
     for sentence in sentences:
         input_ids = tokenizer.encode(
             f"gec: {sentence}", return_tensors="pt", max_length=512, truncation=True
         )
         outputs = model.generate(input_ids, max_length=512, num_beams=4, early_stopping=True)
         corrected_sentence = tokenizer.decode(outputs[0], skip_special_tokens=True)
         corrected_sentences.append(corrected_sentence)

         # Pair verbs within the same sentence: a verb added or dropped by the
         # model can then only misalign this sentence, not every later one.
         verb_pairs = zip(extract_verbs(sentence), extract_verbs(corrected_sentence))
         for (orig_word, orig_tag), (corr_word, corr_tag) in verb_pairs:
             base_orig = get_base_form(orig_word, orig_tag)
             base_corr = get_base_form(corr_word, corr_tag)
             # NOTE(review): only pairs whose lemmas differ are listed, so a
             # correction that keeps the same lemma is not reported -- confirm
             # this is the intended rule.
             if base_orig != base_corr:
                 entry = f"{base_orig}-{corr_word}-{base_corr}"
                 verb_forms_list.append(html.escape(entry, quote=False))

     corrected_text = " ".join(
         _underline_and_color_revisions(orig, corr)
         for orig, corr in zip(sentences, corrected_sentences)
     )
     verb_forms_str = "\n".join(verb_forms_list)
     # Return combined result
     return f"{corrected_text}\n\n<b>Revised Verb Forms:</b>\n{verb_forms_str}"


 def _underline_and_color_revisions(original, corrected):
     """Return *corrected* as HTML with words that differ from *original* underlined in red."""
     import html

     # Split once; the diff and every slice below work on the same word lists.
     original_words = original.split()
     corrected_words = corrected.split()
     matcher = SequenceMatcher(None, original_words, corrected_words)
     result = []
     for tag, i1, i2, j1, j2 in matcher.get_opcodes():
         if tag in ("insert", "replace"):
             # Escape the words so user or model text cannot inject markup.
             revised = html.escape(" ".join(corrected_words[j1:j2]), quote=False)
             result.append(f"<u style='color:red;'>{revised}</u>")
         elif tag == "equal":
             result.append(html.escape(" ".join(original_words[i1:i2]), quote=False))
         # 'delete' spans are skipped: those words are absent from the corrected text.
     return " ".join(result)
 # Create Gradio interface with a writing prompt
 interface = gr.Interface(
         "Writing Prompt:\n"
         "In the story, Alex and his friends discovered an ancient treasure in Whispering Hollow and decided to donate the artifacts to the local museum.\n\n"
         "In the past, did you have a similar experience where you found something valuable or interesting? Tell the story. Describe what you found, what you did with it, and how you felt about your decision.\n\n"
+        "Remember to use past tense in your writing."
     )
 )
 # Launch the interface
+interface.launch()