Spaces:
Sleeping
Sleeping
dinhquangson
committed on
Commit
•
274c53e
1
Parent(s):
224565b
Update app.py
Browse files
app.py
CHANGED
@@ -2,8 +2,11 @@ from fastapi import FastAPI, UploadFile, File
|
|
2 |
from fastapi.responses import FileResponse
|
3 |
from datasets import load_dataset
|
4 |
from fastapi.middleware.cors import CORSMiddleware
|
|
|
|
|
5 |
# Loading
|
6 |
import os
|
|
|
7 |
import shutil
|
8 |
from os import makedirs,getcwd
|
9 |
from os.path import join,exists,dirname
|
@@ -43,9 +46,43 @@ document_store = QdrantDocumentStore(
|
|
43 |
use_sparse_embeddings=True,
|
44 |
embedding_dim = 384
|
45 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
46 |
|
47 |
@app.post("/uploadfile/")
|
48 |
-
async def create_upload_file(text_field: str, file: UploadFile = File(...)):
|
49 |
# Imports
|
50 |
import time
|
51 |
from haystack import Document, Pipeline
|
@@ -75,7 +112,19 @@ async def create_upload_file(text_field: str, file: UploadFile = File(...)):
|
|
75 |
obj = json.loads(line)
|
76 |
document = Document(content=obj[text_field], meta=obj)
|
77 |
documents.append(document)
|
78 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
79 |
else:
|
80 |
raise NotImplementedError("This feature is not supported yet")
|
81 |
|
@@ -178,7 +227,7 @@ async def create_upload_file(file: UploadFile = File(...)):
|
|
178 |
ocr_text = pytesseract.image_to_string(image,lang='vie')
|
179 |
text=text+ocr_text+'\n'
|
180 |
|
181 |
-
return
|
182 |
@app.get("/")
def api_home():
    """Serve the welcome payload for the API root path."""
    welcome_payload = {'detail': 'Welcome to FastAPI Qdrant importer!'}
    return welcome_payload
|
|
|
2 |
from fastapi.responses import FileResponse
|
3 |
from datasets import load_dataset
|
4 |
from fastapi.middleware.cors import CORSMiddleware
|
5 |
+
import pdfplumber
|
6 |
+
|
7 |
# Loading
|
8 |
import os
|
9 |
+
import zipfile
|
10 |
import shutil
|
11 |
from os import makedirs,getcwd
|
12 |
from os.path import join,exists,dirname
|
|
|
46 |
use_sparse_embeddings=True,
|
47 |
embedding_dim = 384
|
48 |
)
|
49 |
+
|
50 |
+
def extract_zip(zip_path, target_folder):
    """
    Extracts all files from a ZIP archive and returns a list of their paths.

    Directory entries stored in the archive are still created on disk, but
    they are left out of the returned list so that callers only ever receive
    paths of regular files.

    Args:
        zip_path (str): Path to the ZIP file.
        target_folder (str): Folder where the files will be extracted.

    Returns:
        List[str]: List of extracted file paths, in archive order.
    """
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        # NOTE(review): the archive comes from an upload; extractall() strips
        # absolute paths and ".." components, but the joined paths returned
        # below are built from the raw member names.
        zip_ref.extractall(target_folder)
        return [
            os.path.join(target_folder, member.filename)
            for member in zip_ref.infolist()
            if not member.is_dir()  # folders are not files to be parsed
        ]
|
67 |
+
|
68 |
+
|
69 |
+
def extract_text_from_pdf(pdf_path):
    """
    Extracts the embedded text layer of every page of a PDF.

    Args:
        pdf_path (str): Path to the PDF file.

    Returns:
        str: Text of all pages concatenated in page order, with no separator
        added between pages.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # extract_text() may yield None for a page without a text layer
        # (e.g. a scanned page); treat that as an empty string instead of
        # failing on str + None.
        return "".join(page.extract_text() or "" for page in pdf.pages)
|
75 |
+
|
76 |
+
def extract_ocr_text_from_pdf(pdf_path):
    """
    Rasterizes a PDF and runs Tesseract OCR on each page image.

    Args:
        pdf_path (str): Path to the PDF file.

    Returns:
        str: OCR output of all pages concatenated in page order; an empty
        string when the PDF yields no page images.
    """
    # Imported locally, following the file's existing function-scope import
    # of pdf2image; pytesseract is already used elsewhere in this file.
    import pytesseract
    from pdf2image import convert_from_path

    images = convert_from_path(pdf_path)

    # Build the result from an explicit empty start: the accumulator was
    # previously used before assignment, which raised on the first page.
    return "".join(
        pytesseract.image_to_string(image, lang='vie') for image in images
    )
|
83 |
|
84 |
@app.post("/uploadfile/")
|
85 |
+
async def create_upload_file(text_field: str, file: UploadFile = File(...), ocr:bool=False):
|
86 |
# Imports
|
87 |
import time
|
88 |
from haystack import Document, Pipeline
|
|
|
112 |
obj = json.loads(line)
|
113 |
document = Document(content=obj[text_field], meta=obj)
|
114 |
documents.append(document)
|
115 |
+
|
116 |
+
elif '.zip' in file_savePath:
|
117 |
+
extracted_files_list = extract_zip(file_savePath, temp_path)
|
118 |
+
print("Extracted files:")
|
119 |
+
for file_path in extracted_files_list:
|
120 |
+
if '.pdf' in file_path:
|
121 |
+
if ocr:
|
122 |
+
text = extract_ocr_text_from_pdf(file_path)
|
123 |
+
else:
|
124 |
+
text = extract_text_from_pdf(file_path)
|
125 |
+
obj = {text_field=text,file_path=file_path}
|
126 |
+
document = Document(content=obj[text_field], meta=obj)
|
127 |
+
documents.append(document)
|
128 |
else:
|
129 |
raise NotImplementedError("This feature is not supported yet")
|
130 |
|
|
|
227 |
ocr_text = pytesseract.image_to_string(image,lang='vie')
|
228 |
text=text+ocr_text+'\n'
|
229 |
|
230 |
+
return text
|
231 |
@app.get("/")
def api_home():
    """Serve the welcome payload for the API root path."""
    welcome_payload = {'detail': 'Welcome to FastAPI Qdrant importer!'}
    return welcome_payload
|