Spaces:

capitaletech
/

cv-to-json

Running

Essa20001 committed on Aug 16

Commit

94b2105

•

1 Parent(s): 60b491f

Upload ocr_functions.py

Files changed (1) hide show

ocr_functions.py CHANGED Viewed

@@ -1,7 +1,7 @@
 from dotenv import load_dotenv
 import io
 import boto3
-from paddleocr import PaddleOCR
 import os
 import pytesseract
 from PIL import ImageFilter
@@ -32,37 +32,4 @@ def textract_ocr(image, box):
-def paddle_ocr(image,box):
-    x1, y1, x2, y2 = box
-    cropped_image = image.crop((x1, y1, x2, y2))
-    cropped_image = np.array(cropped_image)
-    ocr = PaddleOCR(use_angle_cls=False, lang='latin')
-    result = ocr.ocr(cropped_image, cls=False)
-    text= ""
-    if result [0] != None:
-        result.sort(key=lambda x: (x[0][0][1], x[0][0][0]))
-        text = [x[1][0] for x in result[0]]
-    return "\n".join(text)
-def tesseract_ocr(image, box):
-    target_dpi = 300
-    x1, y1, x2, y2 = box
-    cropped_image = image.crop((x1, y1, x2, y2))
-    cropped_image = cropped_image.convert("L")
-    current_dpi = cropped_image.info['dpi'][0] if 'dpi' in image.info else None
-    if current_dpi:
-        scale_factor = target_dpi / current_dpi
-    else:
-        scale_factor = 1.0
-    binarized_image = cropped_image.filter(ImageFilter.MedianFilter())
-    binarized_image = binarized_image.point(lambda p: p > 180 and 255)
-    text = pytesseract.image_to_string(binarized_image, config="--psm 6")
-    return text

 from dotenv import load_dotenv
 import io
 import boto3
 import os
 import pytesseract
 from PIL import ImageFilter