sanjeevl10 committed on
Commit
53b1dd3
1 Parent(s): d154644

build the qdrant

Browse files
Files changed (2) hide show
  1. app.py +18 -18
  2. requirements.txt +2 -1
app.py CHANGED
@@ -10,6 +10,9 @@ from langchain.document_loaders import PyMuPDFLoader
10
  from langchain_huggingface import HuggingFaceEndpointEmbeddings
11
  from langchain_core.prompts import PromptTemplate
12
  from langchain.schema.runnable.config import RunnableConfig
 
 
 
13
 
14
  # GLOBAL SCOPE - ENTIRE APPLICATION HAS ACCESS TO VALUES SET IN THIS SCOPE #
15
  # ---- ENV VARIABLES ---- #
@@ -36,13 +39,11 @@ HF_TOKEN = os.environ["HF_TOKEN"]
36
  3. Load HuggingFace Embeddings (remember to use the URL we set above)
37
  4. Index Files if they do not exist, otherwise load the vectorstore
38
  """
39
- # Loop through all the pdf documents in the folder data
40
- def load_pdfdocuments(self,path: str):
41
- self.documents = []
42
- return PyMuPDFLoader("data/airbnb-10k.pdf").load()
43
-
44
  #Load the Pdf Documents from airbnb-10k
45
- documents = load_pdfdocuments()
 
 
 
46
 
47
  ### 2. CREATE TEXT SPLITTER AND SPLIT DOCUMENTS
48
  text_splitter = RecursiveCharacterTextSplitter(
@@ -60,27 +61,26 @@ hf_embeddings = HuggingFaceEndpointEmbeddings(
60
  huggingfacehub_api_token=HF_TOKEN,
61
  )
62
 
63
- vectordb = os.path.join("./data", "vectorstore")
64
- vectordbfile = os.path.join(vectordb, "index.faiss")
65
 
66
 
67
  if os.path.exists(vectordbfile):
68
- vectorstore = FAISS.load_local(
69
- vectordb,
70
- hf_embeddings,
71
- allow_dangerous_deserialization=True # this is necessary to load the vectorstore from disk as it's stored as a `.pkl` file.
72
  )
73
  hf_retriever = vectorstore.as_retriever()
74
- print("Loaded Vectorstore")
75
  else:
76
  print("Indexing Files")
77
- os.makedirs(vectordb, exist_ok=True)
78
  for i in range(0, len(split_documents), 32):
79
  if i == 0:
80
- vectorstore = FAISS.from_documents(split_documents[i:i+32], hf_embeddings)
81
  continue
82
  vectorstore.add_documents(split_documents[i:i+32])
83
- vectorstore.save_local(vectordb)
84
  hf_retriever = vectorstore.as_retriever()
85
 
86
  ### 4. INDEX FILES
@@ -132,10 +132,10 @@ def rename(original_author: str):
132
  """
133
  This function can be used to rename the 'author' of a message.
134
 
135
- In this case, we're overriding the 'Assistant' author to be 'AirBnb LLM Assistant'.
136
  """
137
  rename_dict = {
138
- "Assistant" : "AirBnB LLM Assitant"
139
  }
140
  return rename_dict.get(original_author, original_author)
141
 
 
10
  from langchain_huggingface import HuggingFaceEndpointEmbeddings
11
  from langchain_core.prompts import PromptTemplate
12
  from langchain.schema.runnable.config import RunnableConfig
13
+ from langchain_openai.embeddings import OpenAIEmbeddings
14
+ from langchain_community.vectorstores import Qdrant
15
+
16
 
17
  # GLOBAL SCOPE - ENTIRE APPLICATION HAS ACCESS TO VALUES SET IN THIS SCOPE #
18
  # ---- ENV VARIABLES ---- #
 
39
  3. Load HuggingFace Embeddings (remember to use the URL we set above)
40
  4. Index Files if they do not exist, otherwise load the vectorstore
41
  """
 
 
 
 
 
42
  #Load the Pdf Documents from airbnb-10k
43
+ documents = PyMuPDFLoader("data/airbnb-10k.pdf").load()
44
+
45
+ #use the embedding model
46
+ embedding_model = OpenAIEmbeddings(model="text-embedding-3-small")
47
 
48
  ### 2. CREATE TEXT SPLITTER AND SPLIT DOCUMENTS
49
  text_splitter = RecursiveCharacterTextSplitter(
 
61
  huggingfacehub_api_token=HF_TOKEN,
62
  )
63
 
64
+ vectordbdir = "./data"
65
+ vectordbfile = os.path.join(vectordbdir, "/vectorstore")
66
 
67
 
68
  if os.path.exists(vectordbfile):
69
+ vectorstore = Qdrant.from_existing_collection(
70
+ embedding=hf_embeddings,
71
+ path=vectordbfile,
72
+ collection_name="airbnb-10k",
73
  )
74
  hf_retriever = vectorstore.as_retriever()
 
75
  else:
76
  print("Indexing Files")
77
+ os.makedirs(vectordbfile, exist_ok=True)
78
  for i in range(0, len(split_documents), 32):
79
  if i == 0:
80
+ vectorstore = Qdrant.from_documents(split_documents[i:i+32], hf_embeddings)
81
  continue
82
  vectorstore.add_documents(split_documents[i:i+32])
83
+ vectorstore.save_local(vectordbfile)
84
  hf_retriever = vectorstore.as_retriever()
85
 
86
  ### 4. INDEX FILES
 
132
  """
133
  This function can be used to rename the 'author' of a message.
134
 
135
+ In this case, we're overriding the 'Assistant' author to be 'AirBnB Stock Analyzer'.
136
  """
137
  rename_dict = {
138
+ "Assistant" : "AirBnB Stock Analyzer"
139
  }
140
  return rename_dict.get(original_author, original_author)
141
 
requirements.txt CHANGED
@@ -5,4 +5,5 @@ langchain_core==0.2.9
5
  langchain_huggingface==0.0.3
6
  langchain_text_splitters==0.2.1
7
  python-dotenv==1.0.1
8
- faiss-cpu
 
 
5
  langchain_huggingface==0.0.3
6
  langchain_text_splitters==0.2.1
7
  python-dotenv==1.0.1
8
+ faiss-cpu
9
+ camelot-py[cv]==0.10.1