Essa20001 committed on
Commit
94b2105
1 Parent(s): 60b491f

Upload ocr_functions.py

Browse files
Files changed (1) hide show
  1. ocr_functions.py +1 -34
ocr_functions.py CHANGED
@@ -1,7 +1,7 @@
1
  from dotenv import load_dotenv
2
  import io
3
  import boto3
4
- from paddleocr import PaddleOCR
5
  import os
6
  import pytesseract
7
  from PIL import ImageFilter
@@ -32,37 +32,4 @@ def textract_ocr(image, box):
32
 
33
 
34
 
35
- def paddle_ocr(image,box):
36
- x1, y1, x2, y2 = box
37
- cropped_image = image.crop((x1, y1, x2, y2))
38
- cropped_image = np.array(cropped_image)
39
- ocr = PaddleOCR(use_angle_cls=False, lang='latin')
40
- result = ocr.ocr(cropped_image, cls=False)
41
- text= ""
42
- if result [0] != None:
43
- result.sort(key=lambda x: (x[0][0][1], x[0][0][0]))
44
- text = [x[1][0] for x in result[0]]
45
- return "\n".join(text)
46
-
47
-
48
-
49
- def tesseract_ocr(image, box):
50
- target_dpi = 300
51
- x1, y1, x2, y2 = box
52
- cropped_image = image.crop((x1, y1, x2, y2))
53
- cropped_image = cropped_image.convert("L")
54
-
55
- current_dpi = cropped_image.info['dpi'][0] if 'dpi' in image.info else None
56
-
57
- if current_dpi:
58
- scale_factor = target_dpi / current_dpi
59
- else:
60
-
61
- scale_factor = 1.0
62
- binarized_image = cropped_image.filter(ImageFilter.MedianFilter())
63
- binarized_image = binarized_image.point(lambda p: p > 180 and 255)
64
- text = pytesseract.image_to_string(binarized_image, config="--psm 6")
65
- return text
66
-
67
-
68
 
 
1
  from dotenv import load_dotenv
2
  import io
3
  import boto3
4
+
5
  import os
6
  import pytesseract
7
  from PIL import ImageFilter
 
32
 
33
 
34
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
35