OCR-image-to-text

Sleeping

Genzo1010 committed on Aug 20, 2024

Commit

81f2edf

verified ·

1 Parent(s): 2969c24

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -16,7 +16,35 @@ import datasets
 from datasets import load_dataset, Image
 from PIL import Image
 from paddleocr import PaddleOCR
-from pdf2image import convert_from_path
 """
 Paddle OCR
 """
@@ -69,11 +97,9 @@ def generate_ocr(Method, file):
         file = io.BytesIO(file)
     if file.name.endswith('.pdf'):
-        # Convert PDF to images
-        images = convert_from_path(file)
-        for img in images:
-            img_np = np.array(img)
-            text_output += generate_text_from_image(Method, img_np) + "\n"
     else:
         # Handle image file
         img_np = np.array(Image.open(file))

 from datasets import load_dataset, Image
 from PIL import Image
 from paddleocr import PaddleOCR
+from doctr.io import DocumentFile
+from doctr.models import ocr_predictor
+ocr_model = ocr_predictor(pretrained=True)
+"""
+Perform OCR with doctr
+"""
+def ocr_with_doctr(file):  # Returns the text doctr recognises in the given PDF as a single newline-delimited string
+    text_output = ''
+    # Load the PDF with doctr's DocumentFile.from_pdf; `file` is forwarded exactly as received. NOTE(review): the visible call site passes an object that has a `.name` attribute -- confirm from_pdf accepts that input type
+    doc = DocumentFile.from_pdf(file)
+    # Run the module-level pretrained `ocr_model` predictor over the loaded document
+    result = ocr_model(doc)
+    # Walk pages -> blocks -> lines of the result, in the order the result object exposes them
+    for page in result.pages:
+        for block in page.blocks:
+            for line in block.lines:
+                text_output += " ".join([word.value for word in line.words]) + "\n"  # one output line per OCR line: word values space-joined, "\n" appended
+    return text_output
 """
 Paddle OCR
 """
         file = io.BytesIO(file)
     if file.name.endswith('.pdf'):
+        # Perform OCR on the PDF using doctr
+        text_output = ocr_with_doctr(file)
     else:
         # Handle image file
         img_np = np.array(Image.open(file))