Spaces:
Sleeping
Sleeping
from dotenv import load_dotenv | |
import io | |
import boto3 | |
from paddleocr import PaddleOCR | |
import os | |
import pytesseract | |
from PIL import ImageFilter | |
import numpy as np | |
def textract_ocr(image, box):
    """Run Amazon Textract on a cropped region of ``image`` and return its text.

    Args:
        image: Image object exposing PIL-style ``crop``/``convert``/``save``.
        box: ``(x1, y1, x2, y2)`` rectangle passed to ``image.crop``.

    Returns:
        The text of every ``LINE`` block in the Textract response, each
        followed by a newline. Empty string when there are no such blocks.
    """
    # Load a .env file (if any) so the credentials read through os.getenv
    # below can be supplied from it.
    load_dotenv()

    x1, y1, x2, y2 = box
    cropped_image = image.crop((x1, y1, x2, y2)).convert("L")

    # Textract is given raw image bytes, so serialise the crop as PNG in memory.
    buffer = io.BytesIO()
    cropped_image.save(buffer, format='PNG')

    client = boto3.client(
        'textract',
        region_name='eu-west-3',
        aws_access_key_id=os.getenv("aws_access_key_id"),
        aws_secret_access_key=os.getenv('aws_secret_access_key'),
    )
    response = client.detect_document_text(Document={'Bytes': buffer.getvalue()})

    # Keep only LINE blocks; every line (including the last) ends with "\n".
    return "".join(
        block['Text'] + "\n"
        for block in response['Blocks']
        if block['BlockType'] == 'LINE'
    )
# Shared PaddleOCR engine, created lazily by _get_paddle_engine().
_PADDLE_ENGINE = None


def _get_paddle_engine():
    """Return the shared PaddleOCR engine, building it on first use."""
    global _PADDLE_ENGINE
    if _PADDLE_ENGINE is None:
        _PADDLE_ENGINE = PaddleOCR(use_angle_cls=False, lang='latin')
    return _PADDLE_ENGINE


def paddle_ocr(image, box):
    """Run PaddleOCR on a cropped region of ``image`` and return its text.

    Args:
        image: Image object exposing a PIL-style ``crop`` method.
        box: ``(x1, y1, x2, y2)`` rectangle passed to ``image.crop``.

    Returns:
        The recognised text lines joined with ``"\\n"``, in the order
        PaddleOCR reports them. Empty string when nothing is recognised.
    """
    x1, y1, x2, y2 = box
    cropped_image = np.array(image.crop((x1, y1, x2, y2)))

    # Reuse one engine across calls instead of constructing a new PaddleOCR
    # object for every box.
    result = _get_paddle_engine().ocr(cropped_image, cls=False)

    # result holds one entry per input image; that entry is None (or the
    # whole result is empty) when no text was found.
    if not result or result[0] is None:
        return ""

    # NOTE: the previous version called result.sort(...) on the outer,
    # one-element list, which never reordered anything. The lines are
    # therefore returned in the engine's own order, as before.
    # Each entry is (box_points, (text, confidence)); keep the text only.
    return "\n".join(line[1][0] for line in result[0])
def tesseract_ocr(image, box):
    """Run Tesseract on a cropped, binarised region of ``image``.

    Args:
        image: Image object exposing PIL-style ``crop``/``convert``/``filter``.
        box: ``(x1, y1, x2, y2)`` rectangle passed to ``image.crop``.

    Returns:
        The string produced by ``pytesseract.image_to_string`` for the
        preprocessed crop.
    """
    x1, y1, x2, y2 = box
    cropped_image = image.crop((x1, y1, x2, y2)).convert("L")

    # NOTE(review): an earlier revision derived a scale factor from the
    # image's 'dpi' metadata against a 300 DPI target but never applied it.
    # That dead computation is removed; no rescaling is performed here.

    # Median-filter the crop, then threshold: pixels brighter than 180
    # become 255, everything else becomes 0.
    denoised = cropped_image.filter(ImageFilter.MedianFilter())
    binarized_image = denoised.point(lambda p: 255 if p > 180 else 0)

    return pytesseract.image_to_string(binarized_image, config="--psm 6")