dinhquangson committed on
Commit
274c53e
1 Parent(s): 224565b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +52 -3
app.py CHANGED
@@ -2,8 +2,11 @@ from fastapi import FastAPI, UploadFile, File
2
  from fastapi.responses import FileResponse
3
  from datasets import load_dataset
4
  from fastapi.middleware.cors import CORSMiddleware
 
 
5
  # Loading
6
  import os
 
7
  import shutil
8
  from os import makedirs,getcwd
9
  from os.path import join,exists,dirname
@@ -43,9 +46,43 @@ document_store = QdrantDocumentStore(
43
  use_sparse_embeddings=True,
44
  embedding_dim = 384
45
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
46
 
47
  @app.post("/uploadfile/")
48
- async def create_upload_file(text_field: str, file: UploadFile = File(...)):
49
  # Imports
50
  import time
51
  from haystack import Document, Pipeline
@@ -75,7 +112,19 @@ async def create_upload_file(text_field: str, file: UploadFile = File(...)):
75
  obj = json.loads(line)
76
  document = Document(content=obj[text_field], meta=obj)
77
  documents.append(document)
78
-
 
 
 
 
 
 
 
 
 
 
 
 
79
  else:
80
  raise NotImplementedError("This feature is not supported yet")
81
 
@@ -178,7 +227,7 @@ async def create_upload_file(file: UploadFile = File(...)):
178
  ocr_text = pytesseract.image_to_string(image,lang='vie')
179
  text=text+ocr_text+'\n'
180
 
181
- return ocr_text
182
@app.get("/")
def api_home():
    """Serve the static welcome payload at the API root path."""
    welcome_payload = {'detail': 'Welcome to FastAPI Qdrant importer!'}
    return welcome_payload
 
2
  from fastapi.responses import FileResponse
3
  from datasets import load_dataset
4
  from fastapi.middleware.cors import CORSMiddleware
5
+ import pdfplumber
6
+
7
  # Loading
8
  import os
9
+ import zipfile
10
  import shutil
11
  from os import makedirs,getcwd
12
  from os.path import join,exists,dirname
 
46
  use_sparse_embeddings=True,
47
  embedding_dim = 384
48
  )
49
+
50
def extract_zip(zip_path, target_folder):
    """
    Extracts all files from a ZIP archive and returns a list of their paths.

    Args:
        zip_path (str): Path to the ZIP file.
        target_folder (str): Folder where the files will be extracted.

    Returns:
        List[str]: Paths of the regular files written to disk, in archive
            order. Directory entries are extracted but not listed.
    """
    extracted_files = []
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        for member in zip_ref.infolist():
            # extract() returns the path it actually wrote. zipfile strips
            # leading slashes and ".." components from member names, so
            # joining target_folder with the raw archive name could point at
            # a location where nothing was created.
            written_path = zip_ref.extract(member, target_folder)
            # Directory entries are not files; callers only want file paths.
            if not member.is_dir():
                extracted_files.append(written_path)
    return extracted_files
67
+
68
+
69
def extract_text_from_pdf(pdf_path):
    """
    Extracts the embedded text of a PDF using pdfplumber.

    Args:
        pdf_path (str): Path to the PDF file.

    Returns:
        str: Text of all pages concatenated in page order, with no separator
            inserted between pages.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # Depending on the pdfplumber version, extract_text() may return None
        # for a page without a text layer (e.g. a scanned image); treat that
        # as empty text instead of failing on str + None.
        return "".join(page.extract_text() or "" for page in pdf.pages)
75
+
76
def extract_ocr_text_from_pdf(pdf_path):
    """
    Extracts text from a PDF by rendering its pages to images and running OCR.

    Args:
        pdf_path (str): Path to the PDF file.

    Returns:
        str: OCR output of all pages concatenated in page order, with no
            separator inserted between pages.
    """
    # Imported locally, like pdf2image, so this helper works regardless of
    # where the rest of the module imports pytesseract.
    import pytesseract
    from pdf2image import convert_from_path

    images = convert_from_path(pdf_path)

    # join() starts from an empty string, so the result is well defined even
    # for the first page (and for a PDF that renders to zero images).
    return "".join(
        pytesseract.image_to_string(image, lang='vie') for image in images
    )
83
 
84
  @app.post("/uploadfile/")
85
+ async def create_upload_file(text_field: str, file: UploadFile = File(...), ocr:bool=False):
86
  # Imports
87
  import time
88
  from haystack import Document, Pipeline
 
112
  obj = json.loads(line)
113
  document = Document(content=obj[text_field], meta=obj)
114
  documents.append(document)
115
+
116
+ elif '.zip' in file_savePath:
117
+ extracted_files_list = extract_zip(file_savePath, temp_path)
118
+ print("Extracted files:")
119
+ for file_path in extracted_files_list:
120
+ if '.pdf' in file_path:
121
+ if ocr:
122
+ text = extract_ocr_text_from_pdf(file_path)
123
+ else:
124
+ text = extract_text_from_pdf(file_path)
125
+ obj = {text_field=text,file_path=file_path}
126
+ document = Document(content=obj[text_field], meta=obj)
127
+ documents.append(document)
128
  else:
129
  raise NotImplementedError("This feature is not supported yet")
130
 
 
227
  ocr_text = pytesseract.image_to_string(image,lang='vie')
228
  text=text+ocr_text+'\n'
229
 
230
+ return text
231
@app.get("/")
def api_home():
    """Serve the static welcome payload at the API root path."""
    welcome_payload = {'detail': 'Welcome to FastAPI Qdrant importer!'}
    return welcome_payload