Essa20001 committed on
Commit
c7d6bcb
1 Parent(s): 421905b

Upload ocr_functions.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. ocr_functions.py +68 -0
ocr_functions.py ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dotenv import load_dotenv
2
+ import io
3
+ import boto3
4
+ from paddleocr import PaddleOCR
5
+ import os
6
+ import pytesseract
7
+ from PIL import ImageFilter
8
+ import numpy as np
9
+
10
def textract_ocr(image, box):
    """Run AWS Textract on a cropped region of an image and return its text.

    The region is cropped, converted to greyscale, PNG-encoded in memory and
    sent to Textract's ``detect_document_text`` in the ``eu-west-3`` region.
    Credentials are read from the ``aws_access_key_id`` and
    ``aws_secret_access_key`` environment variables after ``load_dotenv()``
    has been called.

    Args:
        image: Image object exposing ``crop``, ``convert`` and ``save``
            (presumably a PIL image -- confirm against callers).
        box: Four values ``(x1, y1, x2, y2)`` passed to ``image.crop``.

    Returns:
        The text of every ``LINE`` block in response order, each followed by
        a newline. An empty string when the response has no ``LINE`` blocks.
    """
    load_dotenv()
    x1, y1, x2, y2 = box
    region = image.crop((x1, y1, x2, y2)).convert("L")

    # Textract takes the document as raw bytes, so encode the crop in memory.
    buffer = io.BytesIO()
    region.save(buffer, format='PNG')

    client = boto3.client(
        'textract',
        region_name='eu-west-3',
        aws_access_key_id=os.getenv("aws_access_key_id"),
        aws_secret_access_key=os.getenv('aws_secret_access_key'),
    )
    response = client.detect_document_text(Document={'Bytes': buffer.getvalue()})

    # Only LINE blocks contribute; every line keeps its trailing newline.
    return "".join(
        block['Text'] + "\n"
        for block in response['Blocks']
        if block['BlockType'] == 'LINE'
    )
32
+
33
+
34
+
35
def paddle_ocr(image, box):
    """Run PaddleOCR on a cropped region of an image and return its text.

    Args:
        image: Image object exposing ``crop`` whose result can be converted
            with ``np.array`` (presumably a PIL image -- confirm).
        box: Four values ``(x1, y1, x2, y2)`` passed to ``image.crop``.

    Returns:
        The recognised lines joined with newlines, ordered by the first
        corner of each detection (second coordinate, then first). An empty
        string when nothing was detected.
    """
    x1, y1, x2, y2 = box
    region = np.array(image.crop((x1, y1, x2, y2)))

    ocr = PaddleOCR(use_angle_cls=False, lang='latin')
    result = ocr.ocr(region, cls=False)

    # The detections for this image are in result[0]; it is None (or empty)
    # when nothing was found.
    if not result or not result[0]:
        return ""

    # Each detection is [corner_points, (text, ...)]. Sort the detections
    # themselves: the previous code sorted the outer per-image list, which
    # left the line order untouched.
    # NOTE(review): assumes corner_points[0] is the top-left corner as
    # [x, y], per PaddleOCR's documented result layout -- confirm.
    lines = sorted(result[0], key=lambda line: (line[0][0][1], line[0][0][0]))
    return "\n".join(line[1][0] for line in lines)
46
+
47
+
48
+
49
def tesseract_ocr(image, box):
    """Run Tesseract on a cropped, denoised and binarised region of an image.

    Args:
        image: Image object exposing ``crop``, ``convert``, ``filter`` and
            ``point`` (presumably a PIL image -- confirm against callers).
        box: Four values ``(x1, y1, x2, y2)`` passed to ``image.crop``.

    Returns:
        The string returned by ``pytesseract.image_to_string`` with
        ``--psm 6``.
    """
    x1, y1, x2, y2 = box
    region = image.crop((x1, y1, x2, y2)).convert("L")

    # NOTE(review): the previous code computed a DPI scale factor (target
    # 300) but never applied it, so no rescaling was ever performed. That
    # dead computation has been removed; add an explicit resize here if
    # upscaling is actually wanted.

    # Median filter to reduce noise, then a hard threshold: pixels brighter
    # than 180 become 255, everything else 0.
    denoised = region.filter(ImageFilter.MedianFilter())
    binarized = denoised.point(lambda p: 255 if p > 180 else 0)
    return pytesseract.image_to_string(binarized, config="--psm 6")
66
+
67
+
68
+