Spaces:
Sleeping
Sleeping
viboognesh
committed on
Commit
•
26969e6
1
Parent(s):
9b53cea
Upload folder using huggingface_hub
Browse files- .gitattributes +2 -0
- app.py +7 -9
- qdrant_mm_db_pipeline_tmp0uzyg0nb_construction_pdf/.lock +1 -0
- qdrant_mm_db_pipeline_tmp0uzyg0nb_construction_pdf/collection/image_collection_pipeline_tmp0uzyg0nb_construction_pdf/storage.sqlite +3 -0
- qdrant_mm_db_pipeline_tmp0uzyg0nb_construction_pdf/collection/text_collection_pipeline_tmp0uzyg0nb_construction_pdf/storage.sqlite +3 -0
- qdrant_mm_db_pipeline_tmp0uzyg0nb_construction_pdf/meta.json +1 -0
.gitattributes
CHANGED
@@ -34,3 +34,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
36 |
qdrant_mm_db_pipeline/collection/text_collection_pipeline/storage.sqlite filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
36 |
qdrant_mm_db_pipeline/collection/text_collection_pipeline/storage.sqlite filter=lfs diff=lfs merge=lfs -text
|
37 |
+
qdrant_mm_db_pipeline_tmp0uzyg0nb_construction_pdf/collection/image_collection_pipeline_tmp0uzyg0nb_construction_pdf/storage.sqlite filter=lfs diff=lfs merge=lfs -text
|
38 |
+
qdrant_mm_db_pipeline_tmp0uzyg0nb_construction_pdf/collection/text_collection_pipeline_tmp0uzyg0nb_construction_pdf/storage.sqlite filter=lfs diff=lfs merge=lfs -text
|
app.py
CHANGED
@@ -34,8 +34,6 @@ from dotenv import load_dotenv
|
|
34 |
load_dotenv()
|
35 |
|
36 |
|
37 |
-
|
38 |
-
|
39 |
def extract_text_from_pdf(pdf_path):
|
40 |
reader = PdfReader(pdf_path)
|
41 |
full_text = ''
|
@@ -117,7 +115,6 @@ def remove_duplicate_images(data_path) :
|
|
117 |
# from langchain_chroma import Chroma
|
118 |
# import chromadb
|
119 |
def initialize_qdrant(temp_dir , file_name , user):
|
120 |
-
|
121 |
client = qdrant_client.QdrantClient(path=f"qdrant_mm_db_pipeline_{user}_{file_name}")
|
122 |
# client = qdrant_client.QdrantClient(url = "http://localhost:2452")
|
123 |
# client = qdrant_client.QdrantClient(url="4b0af7be-d5b3-47ac-b215-128ebd6aa495.europe-west3-0.gcp.cloud.qdrant.io:6333", api_key="CO1sNGLmC6R_Q45qSIUxBSX8sxwHud4MCm4as_GTI-vzQqdUs-bXqw",)
|
@@ -182,16 +179,17 @@ def retrieve_and_query(query, retriever_engine):
|
|
182 |
retrieved_image_path_list.append(node.metadata['file_path'])
|
183 |
|
184 |
return response, retrieved_image_path_list
|
185 |
-
|
186 |
-
def process_pdf(pdf_file , user):
|
187 |
temp_dir = tempfile.TemporaryDirectory()
|
|
|
188 |
temp_pdf_path = os.path.join(temp_dir.name, pdf_file.name)
|
189 |
with open(temp_pdf_path, "wb") as f:
|
190 |
f.write(pdf_file.getvalue())
|
191 |
|
192 |
-
data_path = os.path.join(temp_dir.name, f"my_own_data_{
|
193 |
os.makedirs(data_path , exist_ok=True)
|
194 |
-
img_save_path = os.path.join(temp_dir.name, f"extracted_images_{
|
195 |
os.makedirs(img_save_path , exist_ok=True)
|
196 |
|
197 |
extracted_text = extract_text_from_pdf(temp_pdf_path)
|
@@ -202,7 +200,7 @@ def process_pdf(pdf_file , user):
|
|
202 |
moved_count = move_images(img_save_path, data_path)
|
203 |
remove_low_size_images(data_path)
|
204 |
remove_duplicate_images(data_path)
|
205 |
-
retriever_engine = initialize_qdrant(temp_dir.name , os.path.splitext(pdf_file.name)[0] , user)
|
206 |
|
207 |
return temp_dir, retriever_engine
|
208 |
|
@@ -222,7 +220,7 @@ def main():
|
|
222 |
st.info(f"Uploaded PDF: {uploaded_file.name}")
|
223 |
if st.button("Process PDF"):
|
224 |
with st.spinner("Processing PDF..."):
|
225 |
-
temp_dir, st.session_state.retriever_engine = process_pdf(uploaded_file
|
226 |
|
227 |
st.success("PDF processed successfully!")
|
228 |
|
|
|
34 |
load_dotenv()
|
35 |
|
36 |
|
|
|
|
|
37 |
def extract_text_from_pdf(pdf_path):
|
38 |
reader = PdfReader(pdf_path)
|
39 |
full_text = ''
|
|
|
115 |
# from langchain_chroma import Chroma
|
116 |
# import chromadb
|
117 |
def initialize_qdrant(temp_dir , file_name , user):
|
|
|
118 |
client = qdrant_client.QdrantClient(path=f"qdrant_mm_db_pipeline_{user}_{file_name}")
|
119 |
# client = qdrant_client.QdrantClient(url = "http://localhost:2452")
|
120 |
# client = qdrant_client.QdrantClient(url="4b0af7be-d5b3-47ac-b215-128ebd6aa495.europe-west3-0.gcp.cloud.qdrant.io:6333", api_key="CO1sNGLmC6R_Q45qSIUxBSX8sxwHud4MCm4as_GTI-vzQqdUs-bXqw",)
|
|
|
179 |
retrieved_image_path_list.append(node.metadata['file_path'])
|
180 |
|
181 |
return response, retrieved_image_path_list
|
182 |
+
#tmpnimvp35m , tmpnimvp35m
|
183 |
+
def process_pdf(pdf_file):
|
184 |
temp_dir = tempfile.TemporaryDirectory()
|
185 |
+
unique_folder_name = temp_dir.name.split('/')[-1]
|
186 |
temp_pdf_path = os.path.join(temp_dir.name, pdf_file.name)
|
187 |
with open(temp_pdf_path, "wb") as f:
|
188 |
f.write(pdf_file.getvalue())
|
189 |
|
190 |
+
data_path = os.path.join(temp_dir.name, f"my_own_data_{unique_folder_name}_{os.path.splitext(pdf_file.name)[0]}")
|
191 |
os.makedirs(data_path , exist_ok=True)
|
192 |
+
img_save_path = os.path.join(temp_dir.name, f"extracted_images_{unique_folder_name}_{os.path.splitext(pdf_file.name)[0]}")
|
193 |
os.makedirs(img_save_path , exist_ok=True)
|
194 |
|
195 |
extracted_text = extract_text_from_pdf(temp_pdf_path)
|
|
|
200 |
moved_count = move_images(img_save_path, data_path)
|
201 |
remove_low_size_images(data_path)
|
202 |
remove_duplicate_images(data_path)
|
203 |
+
retriever_engine = initialize_qdrant(temp_dir.name , os.path.splitext(pdf_file.name)[0] , unique_folder_name)
|
204 |
|
205 |
return temp_dir, retriever_engine
|
206 |
|
|
|
220 |
st.info(f"Uploaded PDF: {uploaded_file.name}")
|
221 |
if st.button("Process PDF"):
|
222 |
with st.spinner("Processing PDF..."):
|
223 |
+
temp_dir, st.session_state.retriever_engine = process_pdf(uploaded_file)
|
224 |
|
225 |
st.success("PDF processed successfully!")
|
226 |
|
qdrant_mm_db_pipeline_tmp0uzyg0nb_construction_pdf/.lock
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
tmp lock file
|
qdrant_mm_db_pipeline_tmp0uzyg0nb_construction_pdf/collection/image_collection_pipeline_tmp0uzyg0nb_construction_pdf/storage.sqlite
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:716020e47d41d39ea7e3e302ac0687b1a62e2b94991bc700fb5bcbaf6722380f
|
3 |
+
size 1585152
|
qdrant_mm_db_pipeline_tmp0uzyg0nb_construction_pdf/collection/text_collection_pipeline_tmp0uzyg0nb_construction_pdf/storage.sqlite
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:c077e65278dfde8d472135f12860bf39fedcf018d44a00d54fc04b2f4eb6afb0
|
3 |
+
size 2998272
|
qdrant_mm_db_pipeline_tmp0uzyg0nb_construction_pdf/meta.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"collections": {"text_collection_pipeline_tmp0uzyg0nb_construction_pdf": {"vectors": {"size": 1536, "distance": "Cosine", "hnsw_config": null, "quantization_config": null, "on_disk": null, "datatype": null, "multivector_config": null}, "shard_number": null, "sharding_method": null, "replication_factor": null, "write_consistency_factor": null, "on_disk_payload": null, "hnsw_config": null, "wal_config": null, "optimizers_config": null, "init_from": null, "quantization_config": null, "sparse_vectors": null}, "image_collection_pipeline_tmp0uzyg0nb_construction_pdf": {"vectors": {"size": 512, "distance": "Cosine", "hnsw_config": null, "quantization_config": null, "on_disk": null, "datatype": null, "multivector_config": null}, "shard_number": null, "sharding_method": null, "replication_factor": null, "write_consistency_factor": null, "on_disk_payload": null, "hnsw_config": null, "wal_config": null, "optimizers_config": null, "init_from": null, "quantization_config": null, "sparse_vectors": null}}, "aliases": {}}
|