Spaces:
Sleeping
Sleeping
viboognesh-doaz
committed on
Commit
·
4b993b3
1
Parent(s):
5e32aa7
create new vectorstore each time
Browse files - pdf_processing.py +32 -32
pdf_processing.py
CHANGED
|
@@ -11,7 +11,7 @@ import os
|
|
| 11 |
from llama_index.core.indices import MultiModalVectorStoreIndex
|
| 12 |
from llama_index.vector_stores.qdrant import QdrantVectorStore
|
| 13 |
from llama_index.core import SimpleDirectoryReader, StorageContext
|
| 14 |
-
from awsfunctions import upload_folder_to_s3, check_file_exists_in_s3, download_folder_from_s3
|
| 15 |
import qdrant_client
|
| 16 |
import streamlit as st
|
| 17 |
|
|
@@ -111,38 +111,38 @@ def process_pdf(pdf_file):
|
|
| 111 |
username = "ptchecker"
|
| 112 |
aws_prefix_path = os.path.join(os.getenv("FOLDER_PREFIX"), username, "FILES", os.path.splitext(pdf_file.name)[0])
|
| 113 |
if check_file_exists_in_s3(os.path.join(aws_prefix_path, pdf_file.name)):
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
|
| 128 |
|
| 129 |
-
|
| 130 |
-
|
| 131 |
-
|
| 132 |
-
|
| 133 |
|
| 134 |
-
|
| 135 |
-
|
| 136 |
-
|
| 137 |
|
| 138 |
-
|
| 139 |
-
|
| 140 |
-
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
|
| 144 |
-
|
| 145 |
-
|
| 146 |
-
|
| 147 |
|
| 148 |
-
|
|
|
|
| 11 |
from llama_index.core.indices import MultiModalVectorStoreIndex
|
| 12 |
from llama_index.vector_stores.qdrant import QdrantVectorStore
|
| 13 |
from llama_index.core import SimpleDirectoryReader, StorageContext
|
| 14 |
+
from awsfunctions import upload_folder_to_s3, check_file_exists_in_s3, download_folder_from_s3, delete_s3_folder
|
| 15 |
import qdrant_client
|
| 16 |
import streamlit as st
|
| 17 |
|
|
|
|
| 111 |
username = "ptchecker"
|
| 112 |
aws_prefix_path = os.path.join(os.getenv("FOLDER_PREFIX"), username, "FILES", os.path.splitext(pdf_file.name)[0])
|
| 113 |
if check_file_exists_in_s3(os.path.join(aws_prefix_path, pdf_file.name)):
|
| 114 |
+
delete_s3_folder(aws_prefix_path)
|
| 115 |
+
# temp_dir = tempfile.mkdtemp()
|
| 116 |
+
# download_folder_from_s3(local_folder=temp_dir, aws_folder_prefix=os.path.join(aws_prefix_path, "qdrant"))
|
| 117 |
+
# client = qdrant_client.QdrantClient(path=os.path.join(temp_dir, "qdrant"))
|
| 118 |
+
# image_store = QdrantVectorStore(client = client , collection_name=f"image_collection")
|
| 119 |
+
# text_store = QdrantVectorStore(client = client , collection_name=f"text_collection")
|
| 120 |
+
# index = MultiModalVectorStoreIndex.from_vector_store(vector_store=text_store, image_store=image_store)
|
| 121 |
+
# retriever_engine = index.as_retriever(similarity_top_k=1, image_similarity_top_k=1)
|
| 122 |
+
# shutil.rmtree(temp_dir)
|
| 123 |
+
# return retriever_engine
|
| 124 |
+
temp_dir = tempfile.mkdtemp()
|
| 125 |
+
temp_pdf_path = os.path.join(temp_dir, pdf_file.name)
|
| 126 |
+
with open(temp_pdf_path, "wb") as f:
|
| 127 |
+
f.write(pdf_file.getvalue())
|
| 128 |
|
| 129 |
+
data_path = os.path.join(temp_dir, "data")
|
| 130 |
+
os.makedirs(data_path , exist_ok=True)
|
| 131 |
+
img_save_path = os.path.join(temp_dir, "images")
|
| 132 |
+
os.makedirs(img_save_path , exist_ok=True)
|
| 133 |
|
| 134 |
+
extracted_text = extract_text_from_pdf(temp_pdf_path)
|
| 135 |
+
with open(os.path.join(data_path, "content.txt"), "w") as file:
|
| 136 |
+
file.write(extracted_text)
|
| 137 |
|
| 138 |
+
extract_images_from_pdf(temp_pdf_path, img_save_path)
|
| 139 |
+
moved_count = move_images(img_save_path, data_path)
|
| 140 |
+
print("Images moved count : ", moved_count)
|
| 141 |
+
remove_low_size_images(data_path)
|
| 142 |
+
remove_duplicate_images(data_path)
|
| 143 |
+
shutil.rmtree(img_save_path)
|
| 144 |
+
retriever_engine = initialize_qdrant(temp_dir=temp_dir, aws_prefix=aws_prefix_path) # os.path.join("folder" , os.path.splitext(pdf_file.name)[0] , unique_folder_name)
|
| 145 |
+
upload_folder_to_s3(temp_dir, aws_prefix_path)
|
| 146 |
+
shutil.rmtree(temp_dir)
|
| 147 |
|
| 148 |
+
return retriever_engine
|