Genzo1010 committed on
Commit
81f2edf
·
verified ·
1 Parent(s): 2969c24

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +32 -6
app.py CHANGED
@@ -16,7 +16,35 @@ import datasets
16
  from datasets import load_dataset, Image
17
  from PIL import Image
18
  from paddleocr import PaddleOCR
19
- from pdf2image import convert_from_path
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
  """
21
  Paddle OCR
22
  """
@@ -69,11 +97,9 @@ def generate_ocr(Method, file):
69
  file = io.BytesIO(file)
70
 
71
  if file.name.endswith('.pdf'):
72
- # Convert PDF to images
73
- images = convert_from_path(file)
74
- for img in images:
75
- img_np = np.array(img)
76
- text_output += generate_text_from_image(Method, img_np) + "\n"
77
  else:
78
  # Handle image file
79
  img_np = np.array(Image.open(file))
 
16
  from datasets import load_dataset, Image
17
  from PIL import Image
18
  from paddleocr import PaddleOCR
19
+ from doctr.io import DocumentFile
20
+ from doctr.models import ocr_predictor
21
+
22
+
23
+
24
+ ocr_model = ocr_predictor(pretrained=True)
25
+
26
+
27
+
28
+ """
29
+ Perform OCR with doctr
30
+ """
31
def ocr_with_doctr(file) -> str:
    """Run the module-level doctr predictor on a PDF and return its text.

    Args:
        file: PDF input handed unchanged to ``DocumentFile.from_pdf``.
            NOTE(review): the visible caller passes an ``io.BytesIO``;
            confirm ``from_pdf`` accepts a stream rather than a path or
            raw bytes.

    Returns:
        The recognized text with one OCR line per text line: the words of
        each line are joined with single spaces and every line is followed
        by a newline character. An empty string when the result contains
        no lines.
    """
    # Load the document
    doc = DocumentFile.from_pdf(file)

    # Perform OCR with the shared predictor created at import time
    result = ocr_model(doc)

    # Walk pages -> blocks -> lines and join once, instead of growing a
    # string with ``+=`` inside three nested loops.
    return "".join(
        " ".join(word.value for word in line.words) + "\n"
        for page in result.pages
        for block in page.blocks
        for line in block.lines
    )
47
+
48
  """
49
  Paddle OCR
50
  """
 
97
  file = io.BytesIO(file)
98
 
99
  if file.name.endswith('.pdf'):
100
+ # Perform OCR on the PDF using doctr
101
+ text_output = ocr_with_doctr(file)
102
+
 
 
103
  else:
104
  # Handle image file
105
  img_np = np.array(Image.open(file))