sanjeevl10 committed on
Commit
8b05ebc
1 Parent(s): be508b8

Removed unused versions, cleaned up Dockerfile

Browse files
Files changed (2) hide show
  1. app.py +9 -11
  2. requirements.txt +2 -1
app.py CHANGED
@@ -5,7 +5,7 @@ from operator import itemgetter
5
  from langchain_huggingface import HuggingFaceEndpoint
6
  from langchain_community.document_loaders import TextLoader
7
  from langchain_text_splitters import RecursiveCharacterTextSplitter
8
- from langchain.document_loaders import PyMuPDFLoader
9
  from langchain_huggingface import HuggingFaceEndpointEmbeddings
10
  from langchain_core.prompts import PromptTemplate
11
  from langchain.schema.runnable.config import RunnableConfig
@@ -38,7 +38,7 @@ HF_TOKEN = os.environ["HF_TOKEN"]
38
  4. Index Files if they do not exist, otherwise load the vectorstore
39
  """
40
  #Load the Pdf Documents from airbnb-10k
41
- documents = PyMuPDFLoader("data/airbnb-10k.pdf").load()
42
 
43
  ### 2. CREATE TEXT SPLITTER AND SPLIT DOCUMENTS
44
  text_splitter = RecursiveCharacterTextSplitter(
@@ -56,9 +56,7 @@ hf_embeddings = HuggingFaceEndpointEmbeddings(
56
  huggingfacehub_api_token=HF_TOKEN,
57
  )
58
 
59
- vectordbdir = "./data"
60
- vectordbfile = os.path.join(vectordbdir, "/vectorstore")
61
-
62
 
63
  if os.path.exists(vectordbfile):
64
  vectorstore = Qdrant.from_existing_collection(
@@ -70,12 +68,12 @@ if os.path.exists(vectordbfile):
70
  else:
71
  print("Indexing Files")
72
  os.makedirs(vectordbfile, exist_ok=True)
73
- for i in range(0, len(split_documents), 32):
74
- if i == 0:
75
- vectorstore = Qdrant.from_documents(split_documents[i:i+32], hf_embeddings)
76
- continue
77
- vectorstore.add_documents(split_documents[i:i+32])
78
- vectorstore.save_local(vectordbfile)
79
  hf_retriever = vectorstore.as_retriever()
80
 
81
  ### 4. INDEX FILES
 
5
  from langchain_huggingface import HuggingFaceEndpoint
6
  from langchain_community.document_loaders import TextLoader
7
  from langchain_text_splitters import RecursiveCharacterTextSplitter
8
+ from langchain.document_loaders import UnstructuredPDFLoader
9
  from langchain_huggingface import HuggingFaceEndpointEmbeddings
10
  from langchain_core.prompts import PromptTemplate
11
  from langchain.schema.runnable.config import RunnableConfig
 
38
  4. Index Files if they do not exist, otherwise load the vectorstore
39
  """
40
  #Load the Pdf Documents from airbnb-10k
41
+ documents = UnstructuredPDFLoader("data/airbnb-10k.pdf").load()
42
 
43
  ### 2. CREATE TEXT SPLITTER AND SPLIT DOCUMENTS
44
  text_splitter = RecursiveCharacterTextSplitter(
 
56
  huggingfacehub_api_token=HF_TOKEN,
57
  )
58
 
59
+ vectordbfile = "./data/vectorstore"
 
 
60
 
61
  if os.path.exists(vectordbfile):
62
  vectorstore = Qdrant.from_existing_collection(
 
68
  else:
69
  print("Indexing Files")
70
  os.makedirs(vectordbfile, exist_ok=True)
71
+ vectorstore = Qdrant.from_documents(
72
+ documents=split_documents,
73
+ embedding=hf_embeddings,
74
+ path=vectordbfile,
75
+ collection_name="airbnb-10k",
76
+ )
77
  hf_retriever = vectorstore.as_retriever()
78
 
79
  ### 4. INDEX FILES
requirements.txt CHANGED
@@ -6,4 +6,5 @@ langchain_huggingface==0.0.3
6
  langchain_text_splitters==0.2.1
7
  python-dotenv==1.0.1
8
  pymupdf==1.24.5
9
- qdrant-client==1.9.2
 
 
6
  langchain_text_splitters==0.2.1
7
  python-dotenv==1.0.1
8
  pymupdf==1.24.5
9
+ qdrant-client==1.9.2
10
+ unstructured==0.5.6