QDrantRAG9

Sleeping

App Files Files Community

dinhquangson committed on Jun 23

Commit

274c53e

•

1 Parent(s): 224565b

Update app.py

Browse files

Files changed (1) hide show

app.py +52 -3

app.py CHANGED Viewed

@@ -2,8 +2,11 @@ from fastapi import FastAPI, UploadFile, File
 from fastapi.responses import FileResponse
 from datasets import load_dataset
 from fastapi.middleware.cors import CORSMiddleware
 # Loading
 import os
 import shutil
 from os import makedirs,getcwd
 from os.path import join,exists,dirname
@@ -43,9 +46,43 @@ document_store = QdrantDocumentStore(
         use_sparse_embeddings=True,
         embedding_dim = 384
     )
 @app.post("/uploadfile/")
-async def create_upload_file(text_field: str, file: UploadFile = File(...)):
     # Imports
     import time
     from haystack import Document, Pipeline
@@ -75,7 +112,19 @@ async def create_upload_file(text_field: str, file: UploadFile = File(...)):
                 obj = json.loads(line)
                 document = Document(content=obj[text_field], meta=obj)
                 documents.append(document)
     else:
         raise NotImplementedError("This feature is not supported yet")
@@ -178,7 +227,7 @@ async def create_upload_file(file: UploadFile = File(...)):
         ocr_text = pytesseract.image_to_string(image,lang='vie')
         text=text+ocr_text+'\n'
-    return ocr_text
 @app.get("/")
 def api_home():
     """Return the static welcome payload for the importer API."""
     welcome_payload = {'detail': 'Welcome to FastAPI Qdrant importer!'}
     return welcome_payload

 from fastapi.responses import FileResponse
 from datasets import load_dataset
 from fastapi.middleware.cors import CORSMiddleware
+import pdfplumber
 # Loading
 import os
+import zipfile
 import shutil
 from os import makedirs,getcwd
 from os.path import join,exists,dirname
         use_sparse_embeddings=True,
         embedding_dim = 384
     )
def extract_zip(zip_path, target_folder):
    """
    Extracts all files from a ZIP archive and returns a list of their paths.
    Args:
        zip_path (str): Path to the ZIP file.
        target_folder (str): Folder where the files will be extracted.
    Returns:
        List[str]: Paths of the extracted files as written on disk, in
        archive order. Directory entries are created but not listed.
    """
    extracted_files = []
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        for member in zip_ref.infolist():
            # zipfile sanitises member names on extraction (drops "..",
            # leading slashes, ...), so the path actually written can differ
            # from join(target_folder, member.filename). extract() returns
            # the real on-disk path, which is what callers need to open.
            written_path = zip_ref.extract(member, target_folder)
            if member.is_dir():
                # A directory entry is not a file the caller can process.
                continue
            extracted_files.append(written_path)
    return extracted_files
def extract_text_from_pdf(pdf_path):
    """
    Returns the text of a PDF, concatenated page by page.
    Args:
        pdf_path (str): Path to the PDF file.
    Returns:
        str: Text of all pages joined in page order with no separator;
        an empty string when no page yields any text.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # extract_text() may return None for a page without a text layer
        # (e.g. a scanned image); treat that as empty instead of letting
        # string concatenation raise TypeError.
        return "".join(page.extract_text() or "" for page in pdf.pages)
def extract_ocr_text_from_pdf(pdf_path):
    """
    Returns the OCR text of a PDF by rasterising it and running Tesseract.
    Args:
        pdf_path (str): Path to the PDF file.
    Returns:
        str: OCR output of every page image concatenated in order, using
        the Vietnamese ('vie') language data; an empty string when the
        PDF produces no images.
    """
    from pdf2image import convert_from_path
    # Imported locally, like pdf2image above, so this helper does not rely
    # on a module-level pytesseract import being present.
    import pytesseract

    images = convert_from_path(pdf_path)
    # Build the result with join: the previous version appended to a
    # variable that was never initialised and raised UnboundLocalError.
    return "".join(
        pytesseract.image_to_string(image, lang='vie') for image in images
    )
 @app.post("/uploadfile/")
+async def create_upload_file(text_field: str, file: UploadFile = File(...), ocr:bool=False):
     # Imports
     import time
     from haystack import Document, Pipeline
                 obj = json.loads(line)
                 document = Document(content=obj[text_field], meta=obj)
                 documents.append(document)
+    elif '.zip' in file_savePath:
+        extracted_files_list = extract_zip(file_savePath, temp_path)
+        print("Extracted files:")
+        for file_path in extracted_files_list:
+            if '.pdf' in file_path:
+                if ocr:
+                    text = extract_ocr_text_from_pdf(file_path)
+                else:
+                    text = extract_text_from_pdf(file_path)
+                obj = {text_field=text,file_path=file_path}
+                document = Document(content=obj[text_field], meta=obj)
+                documents.append(document)
     else:
         raise NotImplementedError("This feature is not supported yet")
         ocr_text = pytesseract.image_to_string(image,lang='vie')
         text=text+ocr_text+'\n'
+    return text
 @app.get("/")
 def api_home():
     """Return the static welcome payload for the importer API."""
     welcome_payload = {'detail': 'Welcome to FastAPI Qdrant importer!'}
     return welcome_payload