Spaces:

capitaletech
/

cv-to-json

Running

App Files Files Community

Essa20001 committed on Aug 16

Commit

c7d6bcb

•

1 Parent(s): 421905b

Upload ocr_functions.py with huggingface_hub

Browse files

Files changed (1) hide show

ocr_functions.py +68 -0

ocr_functions.py ADDED Viewed

	@@ -0,0 +1,68 @@

+from dotenv import load_dotenv
+import io
+import boto3
+from paddleocr import PaddleOCR
+import os
+import pytesseract
+from PIL import ImageFilter
+import numpy as np
+def textract_ocr(image, box):
+    load_dotenv()
+    x1, y1, x2, y2 = box
+    cropped_image = image.crop((x1, y1, x2, y2))
+    cropped_image = cropped_image.convert("L")
+    img_bytes = io.BytesIO()
+    cropped_image.save(img_bytes, format='PNG')
+    img_bytes = img_bytes.getvalue()
+    client = boto3.client('textract', region_name='eu-west-3', aws_access_key_id=os.getenv("aws_access_key_id"),
+                          aws_secret_access_key=os.getenv('aws_secret_access_key')
+    )
+    response = client.detect_document_text(Document={'Bytes': img_bytes})
+    blocks = response['Blocks']
+    texttract = ""
+    line_confidence = {}
+    for block in blocks:
+        if(block['BlockType'] == 'LINE'):
+            line_confidence[block['Text']] = block['Confidence']
+            texttract+= block['Text']+"\n"
+    return texttract
+def paddle_ocr(image,box):
+    x1, y1, x2, y2 = box
+    cropped_image = image.crop((x1, y1, x2, y2))
+    cropped_image = np.array(cropped_image)
+    ocr = PaddleOCR(use_angle_cls=False, lang='latin')
+    result = ocr.ocr(cropped_image, cls=False)
+    text= ""
+    if result [0] != None:
+        result.sort(key=lambda x: (x[0][0][1], x[0][0][0]))
+        text = [x[1][0] for x in result[0]]
+    return "\n".join(text)
+def tesseract_ocr(image, box):
+    target_dpi = 300
+    x1, y1, x2, y2 = box
+    cropped_image = image.crop((x1, y1, x2, y2))
+    cropped_image = cropped_image.convert("L")
+    current_dpi = cropped_image.info['dpi'][0] if 'dpi' in image.info else None
+    if current_dpi:
+        scale_factor = target_dpi / current_dpi
+    else:
+        scale_factor = 1.0
+    binarized_image = cropped_image.filter(ImageFilter.MedianFilter())
+    binarized_image = binarized_image.point(lambda p: p > 180 and 255)
+    text = pytesseract.image_to_string(binarized_image, config="--psm 6")
+    return text