Spaces:

DoazInc
/

ptchecker

Sleeping

App Files Files Community

viboognesh committed on Sep 4

Commit

26969e6

•

1 Parent(s): 9b53cea

Upload folder using huggingface_hub

Browse files

Files changed (6) hide show

.gitattributes +2 -0
app.py +7 -9
qdrant_mm_db_pipeline_tmp0uzyg0nb_construction_pdf/.lock +1 -0
qdrant_mm_db_pipeline_tmp0uzyg0nb_construction_pdf/collection/image_collection_pipeline_tmp0uzyg0nb_construction_pdf/storage.sqlite +3 -0
qdrant_mm_db_pipeline_tmp0uzyg0nb_construction_pdf/collection/text_collection_pipeline_tmp0uzyg0nb_construction_pdf/storage.sqlite +3 -0
qdrant_mm_db_pipeline_tmp0uzyg0nb_construction_pdf/meta.json +1 -0

.gitattributes CHANGED Viewed

@@ -34,3 +34,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
 qdrant_mm_db_pipeline/collection/text_collection_pipeline/storage.sqlite filter=lfs diff=lfs merge=lfs -text

 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
 qdrant_mm_db_pipeline/collection/text_collection_pipeline/storage.sqlite filter=lfs diff=lfs merge=lfs -text
+qdrant_mm_db_pipeline_tmp0uzyg0nb_construction_pdf/collection/image_collection_pipeline_tmp0uzyg0nb_construction_pdf/storage.sqlite filter=lfs diff=lfs merge=lfs -text
+qdrant_mm_db_pipeline_tmp0uzyg0nb_construction_pdf/collection/text_collection_pipeline_tmp0uzyg0nb_construction_pdf/storage.sqlite filter=lfs diff=lfs merge=lfs -text

app.py CHANGED Viewed

@@ -34,8 +34,6 @@ from dotenv import load_dotenv
 load_dotenv()
 def extract_text_from_pdf(pdf_path):
     reader = PdfReader(pdf_path)
     full_text = ''
@@ -117,7 +115,6 @@ def remove_duplicate_images(data_path) :
 # from langchain_chroma import Chroma
 # import chromadb
 def initialize_qdrant(temp_dir , file_name , user):
     client = qdrant_client.QdrantClient(path=f"qdrant_mm_db_pipeline_{user}_{file_name}")
     # client = qdrant_client.QdrantClient(url = "http://localhost:2452")
     # client = qdrant_client.QdrantClient(url="4b0af7be-d5b3-47ac-b215-128ebd6aa495.europe-west3-0.gcp.cloud.qdrant.io:6333", api_key="CO1sNGLmC6R_Q45qSIUxBSX8sxwHud4MCm4as_GTI-vzQqdUs-bXqw",)
@@ -182,16 +179,17 @@ def retrieve_and_query(query, retriever_engine):
                 retrieved_image_path_list.append(node.metadata['file_path'])
     return response, retrieved_image_path_list
-def process_pdf(pdf_file , user):
     temp_dir = tempfile.TemporaryDirectory()
     temp_pdf_path = os.path.join(temp_dir.name, pdf_file.name)
     with open(temp_pdf_path, "wb") as f:
         f.write(pdf_file.getvalue())
-    data_path = os.path.join(temp_dir.name, f"my_own_data_{user}_{os.path.splitext(pdf_file.name)[0]}")
     os.makedirs(data_path , exist_ok=True)
-    img_save_path = os.path.join(temp_dir.name, f"extracted_images_{user}_{os.path.splitext(pdf_file.name)[0]}")
     os.makedirs(img_save_path , exist_ok=True)
     extracted_text = extract_text_from_pdf(temp_pdf_path)
@@ -202,7 +200,7 @@ def process_pdf(pdf_file , user):
     moved_count = move_images(img_save_path, data_path)
     remove_low_size_images(data_path)
     remove_duplicate_images(data_path)
-    retriever_engine = initialize_qdrant(temp_dir.name , os.path.splitext(pdf_file.name)[0] , curr_user)
     return temp_dir, retriever_engine
@@ -222,7 +220,7 @@ def main():
         st.info(f"Uploaded PDF: {uploaded_file.name}")
         if st.button("Process PDF"):
             with st.spinner("Processing PDF..."):
-                temp_dir, st.session_state.retriever_engine = process_pdf(uploaded_file , curr_user)
                 st.success("PDF processed successfully!")

 load_dotenv()
 def extract_text_from_pdf(pdf_path):
     reader = PdfReader(pdf_path)
     full_text = ''
 # from langchain_chroma import Chroma
 # import chromadb
 def initialize_qdrant(temp_dir , file_name , user):
     client = qdrant_client.QdrantClient(path=f"qdrant_mm_db_pipeline_{user}_{file_name}")
     # client = qdrant_client.QdrantClient(url = "http://localhost:2452")
     # client = qdrant_client.QdrantClient(url="4b0af7be-d5b3-47ac-b215-128ebd6aa495.europe-west3-0.gcp.cloud.qdrant.io:6333", api_key="CO1sNGLmC6R_Q45qSIUxBSX8sxwHud4MCm4as_GTI-vzQqdUs-bXqw",)
                 retrieved_image_path_list.append(node.metadata['file_path'])
     return response, retrieved_image_path_list
+#tmpnimvp35m , tmpnimvp35m
+def process_pdf(pdf_file):
     temp_dir = tempfile.TemporaryDirectory()
+    unique_folder_name = temp_dir.name.split('/')[-1]
     temp_pdf_path = os.path.join(temp_dir.name, pdf_file.name)
     with open(temp_pdf_path, "wb") as f:
         f.write(pdf_file.getvalue())
+    data_path = os.path.join(temp_dir.name, f"my_own_data_{unique_folder_name}_{os.path.splitext(pdf_file.name)[0]}")
     os.makedirs(data_path , exist_ok=True)
+    img_save_path = os.path.join(temp_dir.name, f"extracted_images_{unique_folder_name}_{os.path.splitext(pdf_file.name)[0]}")
     os.makedirs(img_save_path , exist_ok=True)
     extracted_text = extract_text_from_pdf(temp_pdf_path)
     moved_count = move_images(img_save_path, data_path)
     remove_low_size_images(data_path)
     remove_duplicate_images(data_path)
+    retriever_engine = initialize_qdrant(temp_dir.name , os.path.splitext(pdf_file.name)[0] , unique_folder_name)
     return temp_dir, retriever_engine
         st.info(f"Uploaded PDF: {uploaded_file.name}")
         if st.button("Process PDF"):
             with st.spinner("Processing PDF..."):
+                temp_dir, st.session_state.retriever_engine = process_pdf(uploaded_file)
                 st.success("PDF processed successfully!")

qdrant_mm_db_pipeline_tmp0uzyg0nb_construction_pdf/.lock ADDED Viewed

	@@ -0,0 +1 @@


1	+ tmp lock file

qdrant_mm_db_pipeline_tmp0uzyg0nb_construction_pdf/collection/image_collection_pipeline_tmp0uzyg0nb_construction_pdf/storage.sqlite ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:716020e47d41d39ea7e3e302ac0687b1a62e2b94991bc700fb5bcbaf6722380f
+size 1585152

qdrant_mm_db_pipeline_tmp0uzyg0nb_construction_pdf/collection/text_collection_pipeline_tmp0uzyg0nb_construction_pdf/storage.sqlite ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c077e65278dfde8d472135f12860bf39fedcf018d44a00d54fc04b2f4eb6afb0
+size 2998272

qdrant_mm_db_pipeline_tmp0uzyg0nb_construction_pdf/meta.json ADDED Viewed

	@@ -0,0 +1 @@

+ {"collections": {"text_collection_pipeline_tmp0uzyg0nb_construction_pdf": {"vectors": {"size": 1536, "distance": "Cosine", "hnsw_config": null, "quantization_config": null, "on_disk": null, "datatype": null, "multivector_config": null}, "shard_number": null, "sharding_method": null, "replication_factor": null, "write_consistency_factor": null, "on_disk_payload": null, "hnsw_config": null, "wal_config": null, "optimizers_config": null, "init_from": null, "quantization_config": null, "sparse_vectors": null}, "image_collection_pipeline_tmp0uzyg0nb_construction_pdf": {"vectors": {"size": 512, "distance": "Cosine", "hnsw_config": null, "quantization_config": null, "on_disk": null, "datatype": null, "multivector_config": null}, "shard_number": null, "sharding_method": null, "replication_factor": null, "write_consistency_factor": null, "on_disk_payload": null, "hnsw_config": null, "wal_config": null, "optimizers_config": null, "init_from": null, "quantization_config": null, "sparse_vectors": null}}, "aliases": {}}