Spaces:

capitaletech
/

cv-to-json

Running

cv-to-json / ocr_functions.py

Upload ocr_functions.py

94b2105 verified 4 months ago

974 Bytes

	from dotenv import load_dotenv
	import io
	import boto3

	import os
	import pytesseract
	from PIL import ImageFilter
	import numpy as np

	def textract_ocr(image, box):
	    """Run Amazon Textract text detection on a rectangular region of an image.

	    The region given by ``box`` is cropped out of ``image``, converted to
	    8-bit grayscale, encoded as PNG in memory, and sent to Textract's
	    ``detect_document_text`` operation.

	    Args:
	        image: Object exposing ``crop``/``convert``/``save`` -- presumably a
	            ``PIL.Image.Image``; confirm against the callers.
	        box: Iterable of exactly four values ``(x1, y1, x2, y2)`` passed to
	            ``image.crop`` as the region to extract.

	    Returns:
	        The text of every ``LINE`` block in the Textract response, in
	        response order, each followed by a newline. An empty string when
	        the response contains no ``LINE`` blocks.

	    Raises:
	        ValueError: If ``box`` does not unpack into exactly four values.
	        Errors raised by the image operations or by the Textract client
	        are propagated unchanged.
	    """
	    # Load variables from a .env file so the credential lookups below can
	    # find them in the process environment.
	    load_dotenv()
	    left, top, right, bottom = box
	    # Crop to the requested region, then convert to grayscale ("L" mode).
	    region = image.crop((left, top, right, bottom)).convert("L")
	    # Encode in memory: the request below takes raw image bytes.
	    buffer = io.BytesIO()
	    region.save(buffer, format='PNG')
	    # NOTE(review): os.getenv returns None when a variable is unset;
	    # presumably boto3 then falls back to its default credential chain --
	    # confirm this is the intended behavior.
	    client = boto3.client(
	        'textract',
	        region_name='eu-west-3',
	        aws_access_key_id=os.getenv("aws_access_key_id"),
	        aws_secret_access_key=os.getenv('aws_secret_access_key'),
	    )
	    response = client.detect_document_text(Document={'Bytes': buffer.getvalue()})
	    # Keep only LINE blocks; other block types in the response are skipped.
	    return "".join(
	        block['Text'] + "\n"
	        for block in response['Blocks']
	        if block['BlockType'] == 'LINE'
	    )