viboognesh committed on
Commit
26969e6
1 Parent(s): 9b53cea

Upload folder using huggingface_hub

Browse files
.gitattributes CHANGED
@@ -34,3 +34,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
  qdrant_mm_db_pipeline/collection/text_collection_pipeline/storage.sqlite filter=lfs diff=lfs merge=lfs -text
 
 
 
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
  qdrant_mm_db_pipeline/collection/text_collection_pipeline/storage.sqlite filter=lfs diff=lfs merge=lfs -text
37
+ qdrant_mm_db_pipeline_tmp0uzyg0nb_construction_pdf/collection/image_collection_pipeline_tmp0uzyg0nb_construction_pdf/storage.sqlite filter=lfs diff=lfs merge=lfs -text
38
+ qdrant_mm_db_pipeline_tmp0uzyg0nb_construction_pdf/collection/text_collection_pipeline_tmp0uzyg0nb_construction_pdf/storage.sqlite filter=lfs diff=lfs merge=lfs -text
app.py CHANGED
@@ -34,8 +34,6 @@ from dotenv import load_dotenv
34
  load_dotenv()
35
 
36
 
37
-
38
-
39
  def extract_text_from_pdf(pdf_path):
40
  reader = PdfReader(pdf_path)
41
  full_text = ''
@@ -117,7 +115,6 @@ def remove_duplicate_images(data_path) :
117
  # from langchain_chroma import Chroma
118
  # import chromadb
119
  def initialize_qdrant(temp_dir , file_name , user):
120
-
121
  client = qdrant_client.QdrantClient(path=f"qdrant_mm_db_pipeline_{user}_{file_name}")
122
  # client = qdrant_client.QdrantClient(url = "http://localhost:2452")
123
  # client = qdrant_client.QdrantClient(url="4b0af7be-d5b3-47ac-b215-128ebd6aa495.europe-west3-0.gcp.cloud.qdrant.io:6333", api_key="CO1sNGLmC6R_Q45qSIUxBSX8sxwHud4MCm4as_GTI-vzQqdUs-bXqw",)
@@ -182,16 +179,17 @@ def retrieve_and_query(query, retriever_engine):
182
  retrieved_image_path_list.append(node.metadata['file_path'])
183
 
184
  return response, retrieved_image_path_list
185
-
186
- def process_pdf(pdf_file , user):
187
  temp_dir = tempfile.TemporaryDirectory()
 
188
  temp_pdf_path = os.path.join(temp_dir.name, pdf_file.name)
189
  with open(temp_pdf_path, "wb") as f:
190
  f.write(pdf_file.getvalue())
191
 
192
- data_path = os.path.join(temp_dir.name, f"my_own_data_{user}_{os.path.splitext(pdf_file.name)[0]}")
193
  os.makedirs(data_path , exist_ok=True)
194
- img_save_path = os.path.join(temp_dir.name, f"extracted_images_{user}_{os.path.splitext(pdf_file.name)[0]}")
195
  os.makedirs(img_save_path , exist_ok=True)
196
 
197
  extracted_text = extract_text_from_pdf(temp_pdf_path)
@@ -202,7 +200,7 @@ def process_pdf(pdf_file , user):
202
  moved_count = move_images(img_save_path, data_path)
203
  remove_low_size_images(data_path)
204
  remove_duplicate_images(data_path)
205
- retriever_engine = initialize_qdrant(temp_dir.name , os.path.splitext(pdf_file.name)[0] , curr_user)
206
 
207
  return temp_dir, retriever_engine
208
 
@@ -222,7 +220,7 @@ def main():
222
  st.info(f"Uploaded PDF: {uploaded_file.name}")
223
  if st.button("Process PDF"):
224
  with st.spinner("Processing PDF..."):
225
- temp_dir, st.session_state.retriever_engine = process_pdf(uploaded_file , curr_user)
226
 
227
  st.success("PDF processed successfully!")
228
 
 
34
  load_dotenv()
35
 
36
 
 
 
37
  def extract_text_from_pdf(pdf_path):
38
  reader = PdfReader(pdf_path)
39
  full_text = ''
 
115
  # from langchain_chroma import Chroma
116
  # import chromadb
117
  def initialize_qdrant(temp_dir , file_name , user):
 
118
  client = qdrant_client.QdrantClient(path=f"qdrant_mm_db_pipeline_{user}_{file_name}")
119
  # client = qdrant_client.QdrantClient(url = "http://localhost:2452")
120
  # client = qdrant_client.QdrantClient(url="4b0af7be-d5b3-47ac-b215-128ebd6aa495.europe-west3-0.gcp.cloud.qdrant.io:6333", api_key="CO1sNGLmC6R_Q45qSIUxBSX8sxwHud4MCm4as_GTI-vzQqdUs-bXqw",)
 
179
  retrieved_image_path_list.append(node.metadata['file_path'])
180
 
181
  return response, retrieved_image_path_list
182
+ #tmpnimvp35m , tmpnimvp35m
183
+ def process_pdf(pdf_file):
184
  temp_dir = tempfile.TemporaryDirectory()
185
+ unique_folder_name = temp_dir.name.split('/')[-1]
186
  temp_pdf_path = os.path.join(temp_dir.name, pdf_file.name)
187
  with open(temp_pdf_path, "wb") as f:
188
  f.write(pdf_file.getvalue())
189
 
190
+ data_path = os.path.join(temp_dir.name, f"my_own_data_{unique_folder_name}_{os.path.splitext(pdf_file.name)[0]}")
191
  os.makedirs(data_path , exist_ok=True)
192
+ img_save_path = os.path.join(temp_dir.name, f"extracted_images_{unique_folder_name}_{os.path.splitext(pdf_file.name)[0]}")
193
  os.makedirs(img_save_path , exist_ok=True)
194
 
195
  extracted_text = extract_text_from_pdf(temp_pdf_path)
 
200
  moved_count = move_images(img_save_path, data_path)
201
  remove_low_size_images(data_path)
202
  remove_duplicate_images(data_path)
203
+ retriever_engine = initialize_qdrant(temp_dir.name , os.path.splitext(pdf_file.name)[0] , unique_folder_name)
204
 
205
  return temp_dir, retriever_engine
206
 
 
220
  st.info(f"Uploaded PDF: {uploaded_file.name}")
221
  if st.button("Process PDF"):
222
  with st.spinner("Processing PDF..."):
223
+ temp_dir, st.session_state.retriever_engine = process_pdf(uploaded_file)
224
 
225
  st.success("PDF processed successfully!")
226
 
qdrant_mm_db_pipeline_tmp0uzyg0nb_construction_pdf/.lock ADDED
@@ -0,0 +1 @@
 
 
1
+ tmp lock file
qdrant_mm_db_pipeline_tmp0uzyg0nb_construction_pdf/collection/image_collection_pipeline_tmp0uzyg0nb_construction_pdf/storage.sqlite ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:716020e47d41d39ea7e3e302ac0687b1a62e2b94991bc700fb5bcbaf6722380f
3
+ size 1585152
qdrant_mm_db_pipeline_tmp0uzyg0nb_construction_pdf/collection/text_collection_pipeline_tmp0uzyg0nb_construction_pdf/storage.sqlite ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c077e65278dfde8d472135f12860bf39fedcf018d44a00d54fc04b2f4eb6afb0
3
+ size 2998272
qdrant_mm_db_pipeline_tmp0uzyg0nb_construction_pdf/meta.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"collections": {"text_collection_pipeline_tmp0uzyg0nb_construction_pdf": {"vectors": {"size": 1536, "distance": "Cosine", "hnsw_config": null, "quantization_config": null, "on_disk": null, "datatype": null, "multivector_config": null}, "shard_number": null, "sharding_method": null, "replication_factor": null, "write_consistency_factor": null, "on_disk_payload": null, "hnsw_config": null, "wal_config": null, "optimizers_config": null, "init_from": null, "quantization_config": null, "sparse_vectors": null}, "image_collection_pipeline_tmp0uzyg0nb_construction_pdf": {"vectors": {"size": 512, "distance": "Cosine", "hnsw_config": null, "quantization_config": null, "on_disk": null, "datatype": null, "multivector_config": null}, "shard_number": null, "sharding_method": null, "replication_factor": null, "write_consistency_factor": null, "on_disk_payload": null, "hnsw_config": null, "wal_config": null, "optimizers_config": null, "init_from": null, "quantization_config": null, "sparse_vectors": null}}, "aliases": {}}