cv-to-json / ocr_functions.py
Essa20001's picture
Upload ocr_functions.py
94b2105 verified
raw
history blame
974 Bytes
from dotenv import load_dotenv
import io
import boto3
import os
import pytesseract
from PIL import ImageFilter
import numpy as np
def textract_ocr(image, box, *, region_name='eu-west-3'):
    """Run Amazon Textract text detection on one rectangular region of an image.

    The region is cropped out of ``image``, converted to mode ``"L"``,
    encoded as PNG in memory and sent to Textract's ``detect_document_text``
    operation. Only blocks whose ``BlockType`` is ``'LINE'`` contribute to
    the result.

    AWS credentials are read from the environment variables
    ``aws_access_key_id`` and ``aws_secret_access_key`` after
    ``load_dotenv()`` has been called.

    Args:
        image: Object exposing ``crop``; the crop result must expose
            ``convert`` and ``save`` (presumably a ``PIL.Image.Image`` --
            confirm against callers).
        box: Iterable of exactly four values ``(x1, y1, x2, y2)`` passed to
            ``image.crop``.
        region_name: AWS region used for the Textract client. Defaults to
            ``'eu-west-3'``, the value that was previously hard-coded.

    Returns:
        str: The text of every LINE block, in response order, each followed
        by a newline (so a non-empty result ends with ``"\\n"``); an empty
        string when the response contains no LINE blocks.

    Raises:
        ValueError: If ``box`` does not unpack into exactly four values.
    """
    load_dotenv()

    # Unpack explicitly so a malformed box fails here, before any network call.
    x1, y1, x2, y2 = box
    region = image.crop((x1, y1, x2, y2)).convert("L")

    # Textract receives raw image bytes, so encode the crop in memory.
    buffer = io.BytesIO()
    region.save(buffer, format='PNG')

    client = boto3.client(
        'textract',
        region_name=region_name,
        aws_access_key_id=os.getenv("aws_access_key_id"),
        aws_secret_access_key=os.getenv('aws_secret_access_key'),
    )
    response = client.detect_document_text(Document={'Bytes': buffer.getvalue()})

    # Keep only LINE blocks; each line is newline-terminated, matching the
    # previous incremental concatenation.
    return "".join(
        block['Text'] + "\n"
        for block in response['Blocks']
        if block['BlockType'] == 'LINE'
    )