Spaces:

capitaletech
/

cv-to-json

Running

File size: 974 Bytes

c7d6bcb
 
 
94b2105
c7d6bcb

from dotenv import load_dotenv
import io
import boto3

import os
import pytesseract
from PIL import ImageFilter
import numpy as np

def textract_ocr(image, box) -> str:
    """Run AWS Textract OCR on a rectangular region of an image.

    The region is cropped out of ``image``, converted to 8-bit grayscale,
    PNG-encoded in memory and sent to Textract's ``detect_document_text``
    endpoint in the ``eu-west-3`` region.

    Credentials are read from the ``aws_access_key_id`` and
    ``aws_secret_access_key`` environment variables; ``load_dotenv()`` is
    called first so they may also be supplied through a ``.env`` file.

    Args:
        image: Image object exposing ``crop`` (presumably a ``PIL.Image``
            -- confirm against callers).
        box: Iterable of exactly four values ``(x1, y1, x2, y2)`` giving the
            crop rectangle passed to ``image.crop``.

    Returns:
        The text of every ``LINE`` block in the Textract response, in
        response order, each followed by a newline. Empty string when the
        response contains no ``LINE`` blocks.

    Raises:
        ValueError: If ``box`` does not unpack into exactly four values.
    """
    load_dotenv()

    left, top, right, bottom = box
    region = image.crop((left, top, right, bottom)).convert("L")

    # Textract accepts raw image bytes, so encode the crop as PNG in memory.
    buffer = io.BytesIO()
    region.save(buffer, format='PNG')
    png_bytes = buffer.getvalue()

    client = boto3.client(
        'textract',
        region_name='eu-west-3',
        aws_access_key_id=os.getenv("aws_access_key_id"),
        aws_secret_access_key=os.getenv('aws_secret_access_key'),
    )
    response = client.detect_document_text(Document={'Bytes': png_bytes})

    # Only LINE blocks are kept; the response also carries PAGE/WORD blocks
    # that would duplicate the same text at a different granularity.
    return "".join(
        f"{block['Text']}\n"
        for block in response['Blocks']
        if block['BlockType'] == 'LINE'
    )