File size: 974 Bytes
c7d6bcb
 
 
94b2105
c7d6bcb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
from dotenv import load_dotenv
import io
import boto3

import os
import pytesseract
from PIL import ImageFilter
import numpy as np

def textract_ocr(image, box, region_name='eu-west-3'):
    """Run AWS Textract text detection on a cropped region of an image.

    The region given by *box* is cropped out of *image*, converted to
    grayscale, encoded as PNG in memory and sent to Textract's
    ``detect_document_text`` operation. Only blocks whose ``BlockType`` is
    ``LINE`` contribute to the result.

    Args:
        image: Object exposing ``crop``, ``convert`` and ``save`` -- presumably
            a ``PIL.Image.Image``; confirm against the caller.
        box: Iterable of exactly four values ``(x1, y1, x2, y2)`` passed to
            ``image.crop`` in that order.
        region_name: AWS region used for the Textract client. Defaults to
            ``'eu-west-3'``, the value that was previously hard-coded.

    Returns:
        str: The text of every detected line, each followed by ``"\n"``
        (so a non-empty result ends with a newline). Empty string when no
        ``LINE`` block is returned.

    Raises:
        ValueError: If *box* does not unpack into exactly four values.
        KeyError: If the response has no ``'Blocks'`` entry.
        Errors raised by boto3 / the Textract service propagate unchanged.
    """
    # Populate os.environ from a .env file so the credentials below resolve.
    load_dotenv()

    x1, y1, x2, y2 = box
    # "L" is Pillow's 8-bit grayscale mode.
    cropped_image = image.crop((x1, y1, x2, y2)).convert("L")

    # Encode in memory; Textract receives the raw PNG bytes directly.
    buffer = io.BytesIO()
    cropped_image.save(buffer, format='PNG')

    # NOTE(review): the environment variable names are lowercase, unlike the
    # conventional AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY. os.getenv returns
    # None when they are unset -- confirm the intended fallback behaviour.
    client = boto3.client(
        'textract',
        region_name=region_name,
        aws_access_key_id=os.getenv("aws_access_key_id"),
        aws_secret_access_key=os.getenv('aws_secret_access_key'),
    )

    response = client.detect_document_text(Document={'Bytes': buffer.getvalue()})

    # Keep only LINE blocks; other block types in the response are ignored.
    return "".join(
        block['Text'] + "\n"
        for block in response['Blocks']
        if block['BlockType'] == 'LINE'
    )