clfegg committed on
Commit
2a0cd88
1 Parent(s): ccb118a

Upload 2 files

Browse files
Files changed (2) hide show
  1. handler.py +40 -0
  2. requirements.txt +4 -0
handler.py ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Dict, Any, List
2
+ import os
3
+
4
# Redirect HF_HOME to the process working directory. This is set before the
# marker imports that follow so the variable is already in place when those
# modules are imported (presumably so model downloads are cached here --
# confirm against the deployment environment).
current_dir = os.getcwd()
# A single-argument os.path.join() returns its argument unchanged, so the
# directory is assigned directly.
os.environ["HF_HOME"] = current_dir
6
+
7
+ from marker.convert import convert_single_pdf
8
+ from marker.logger import configure_logging
9
+ from marker.models import load_all_models
10
+ from marker.output import save_markdown
11
+ from io import BytesIO
12
class EndpointHandler:
    """Callable handler that converts an uploaded PDF to text with marker.

    The marker models are loaded once when the handler is constructed and are
    reused for every call. Each request's PDF is written to
    ``self.file_location`` before it is converted.
    """

    def __init__(self, path=""):
        """Load the marker models and create the spool directory.

        Args:
            path: Accepted for interface compatibility; not used here.
        """
        # Initialize the OCR model
        self.models = load_all_models()
        self.file_location = "input/temp.pdf"
        # Derive the directory from the spool path so the two cannot drift.
        os.makedirs(os.path.dirname(self.file_location), exist_ok=True)

    def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
        """Convert the PDF carried in ``data`` and return its extracted text.

        Args:
            data: Request payload with the following keys:
                file: The PDF, either as a binary file-like object exposing
                    ``read()`` or as raw bytes. Required.
                max_pages: Optional maximum number of pages to process;
                    ``None`` (the default) is passed through unchanged.
                langs: Optional language code or list of codes passed to the
                    converter. Defaults to ``["vi"]``.

        Returns:
            A one-element list holding a dict whose ``"extracted_text"`` key
            maps to the first value returned by ``convert_single_pdf``.

        Raises:
            KeyError: If ``data`` has no ``"file"`` entry.
        """
        # Get inputs
        self.upload_file(data["file"])
        max_pages = data.get("max_pages")
        langs = data.get("langs") or ["vi"]
        if isinstance(langs, str):
            # Tolerate a single code given as a bare string.
            langs = [langs]

        # Perform OCR on the input PDF; the two trailing return values are
        # not needed by this endpoint and are discarded.
        extracted_text, _, _ = convert_single_pdf(
            self.file_location,
            self.models,
            max_pages=max_pages,
            langs=langs,
        )
        # Return the extracted text
        return [{"extracted_text": extracted_text}]

    def upload_file(self, file: BytesIO, max_pages: int = None):
        """Write the uploaded PDF to ``self.file_location``.

        Args:
            file: A binary file-like object exposing ``read()``, or a
                bytes-like object containing the PDF content.
            max_pages: Ignored; kept so existing callers remain valid.

        Returns:
            ``True`` once the content has been written.
        """
        # Payloads may arrive as a stream or as already-read bytes; only call
        # read() when the object actually provides it.
        payload = file.read() if hasattr(file, "read") else file
        with open(self.file_location, "wb") as spool:
            spool.write(payload)
        return True
requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ marker-pdf==0.3.10
2
+ Pillow
3
+ torchvision
4
+ transformers