Spaces:

pdltiet
/

OCR_demo

Sleeping

App Files Community

Ayushnangia committed on Jan 10

Commit

ea5f05b

•

1 Parent(s): b8b3256

updated with spell check and grammar

Browse files

Files changed (1) hide show

app.py +75 -44

app.py CHANGED Viewed

@@ -1,58 +1,89 @@
 import os
-os.environ['USE_TORCH'] = '1'
 from doctr.io import DocumentFile
 from doctr.models import ocr_predictor
 import gradio as gr
 from PIL import Image
-import base64
-from utils import HocrParser
-predictor = ocr_predictor(det_arch='db_mobilenet_v3_large', reco_arch='crnn_vgg16_bn',pretrained=True)
-title="DocTR OCR (PDL Demo)"
-description="Upload an image to get the OCR results !"
-def greet(img):
     img.save("out.jpg")
     doc = DocumentFile.from_images("out.jpg")
-    output=predictor(doc)
-    xml_outputs = output.export_as_xml()
-    parser = HocrParser()
-    res=""
     for obj in output.pages:
-      for obj1 in obj.blocks:
-        for obj2 in obj1.lines:
-          for obj3 in obj2.words:
-            res=res + " " + obj3.value
-        res=res + "\n"
-      res=res + "\n"
     _output_name = "RESULT_OCR.txt"
-    _output_name_pdf="RESULT_OCR.pdf"
-    open(_output_name, 'w').close() # clear file
-    with open(_output_name, "w", encoding="utf-8", errors="ignore") as f:
-        f.write(res)
-        print("Writing into file")
-    base64_encoded_pdfs = list()
-    for i, (xml, img) in enumerate(zip(xml_outputs, doc)):
-      xml_element_tree = xml[1]
-      parser.export_pdfa(_output_name_pdf,
-            hocr=xml_element_tree, image=img)
-      with open(_output_name_pdf, 'rb') as f:
-            base64_encoded_pdfs.append(base64.b64encode(f.read()))
-    return res, _output_name, _output_name_pdf
-demo = gr.Interface(fn=greet,
-                    inputs=gr.Image(type="pil"),
-                    outputs=["text", "file","file"],
-                    title=title,
-                    description=description,
-                    examples=[["Examples/Book.png"],["Examples/News.png"],["Examples/Manuscript.jpg"],["Examples/Files.jpg"]]
-                    )
-demo.launch(debug=True)

 import os
 from doctr.io import DocumentFile
 from doctr.models import ocr_predictor
 import gradio as gr
 from PIL import Image
+from happytransformer import HappyTextToText, TTSettings
+from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
+import re
+# OCR Predictor initialization
+predictor = ocr_predictor(det_arch='db_mobilenet_v3_large', reco_arch='crnn_vgg16_bn', pretrained=True)
+# Grammar Correction Model initialization
+happy_tt = HappyTextToText("T5", "vennify/t5-base-grammar-correction")
+grammar_args = TTSettings(num_beams=5, min_length=1)
+# Spell Check Model initialization
+tokenizer = AutoTokenizer.from_pretrained("Bhuvana/t5-base-spellchecker", use_fast=False)
+model = AutoModelForSeq2SeqLM.from_pretrained("Bhuvana/t5-base-spellchecker")
+def correct_spell(inputs):
+    input_ids = tokenizer.encode(inputs, return_tensors='pt')
+    sample_output = model.generate(
+        input_ids,
+        do_sample=True,
+        max_length=512,
+        top_p=0.99,
+        num_return_sequences=1
+    )
+    res = tokenizer.decode(sample_output[0], skip_special_tokens=True)
+    return res
+def process_text_in_chunks(text, process_function, max_chunk_size=256):
+    # Split text into sentences
+    sentences = re.split(r'(?<=[.!?])\s+', text)
+    processed_text = ""
+    for sentence in sentences:
+        # Further split long sentences into smaller chunks
+        chunks = [sentence[i:i + max_chunk_size] for i in range(0, len(sentence), max_chunk_size)]
+        for chunk in chunks:
+            processed_text += process_function(chunk)
+        processed_text += " "  # Add space after each processed sentence
+    return processed_text.strip()
+def greet(img, apply_grammar_correction, apply_spell_check):
     img.save("out.jpg")
     doc = DocumentFile.from_images("out.jpg")
+    output = predictor(doc)
+    res = ""
     for obj in output.pages:
+        for obj1 in obj.blocks:
+            for obj2 in obj1.lines:
+                for obj3 in obj2.words:
+                    res += " " + obj3.value
+            res += "\n"
+        res += "\n"
+    # Process in chunks for grammar correction
+    if apply_grammar_correction:
+        res = process_text_in_chunks(res, lambda x: happy_tt.generate_text("grammar: " + x, args=grammar_args).text)
+    # Process in chunks for spell check
+    if apply_spell_check:
+        res = process_text_in_chunks(res, correct_spell)
     _output_name = "RESULT_OCR.txt"
+    open(_output_name, 'w').write(res)
+    return res, _output_name
+# Gradio Interface
+title = "DocTR OCR with Grammar and Spell Check"
+description = "Upload an image to get the OCR results. Optionally, apply grammar and spell check."
+demo = gr.Interface(
+    fn=greet,
+    inputs=[
+        gr.Image(type="pil"),
+        gr.Checkbox(label="Apply Grammar Correction"),
+        gr.Checkbox(label="Apply Spell Check")
+    ],
+    outputs=["text", "file"],
+    title=title,
+    description=description,
+)
+demo.launch(debug=True)