Spaces:
Runtime error
Runtime error
sanjeevl10
committed on
Commit
•
53b1dd3
1
Parent(s):
d154644
build the qdrant
Browse files
- app.py +18 -18
- requirements.txt +2 -1
app.py
CHANGED
@@ -10,6 +10,9 @@ from langchain.document_loaders import PyMuPDFLoader
|
|
10 |
from langchain_huggingface import HuggingFaceEndpointEmbeddings
|
11 |
from langchain_core.prompts import PromptTemplate
|
12 |
from langchain.schema.runnable.config import RunnableConfig
|
|
|
|
|
|
|
13 |
|
14 |
# GLOBAL SCOPE - ENTIRE APPLICATION HAS ACCESS TO VALUES SET IN THIS SCOPE #
|
15 |
# ---- ENV VARIABLES ---- #
|
@@ -36,13 +39,11 @@ HF_TOKEN = os.environ["HF_TOKEN"]
|
|
36 |
3. Load HuggingFace Embeddings (remember to use the URL we set above)
|
37 |
4. Index Files if they do not exist, otherwise load the vectorstore
|
38 |
"""
|
39 |
-
# Loop through all the pdf documents in the folder data
|
40 |
-
def load_pdfdocuments(self,path: str):
|
41 |
-
self.documents = []
|
42 |
-
return PyMuPDFLoader("data/airbnb-10k.pdf").load()
|
43 |
-
|
44 |
#Load the Pdf Documents from airbnb-10k
|
45 |
-
documents =
|
|
|
|
|
|
|
46 |
|
47 |
### 2. CREATE TEXT SPLITTER AND SPLIT DOCUMENTS
|
48 |
text_splitter = RecursiveCharacterTextSplitter(
|
@@ -60,27 +61,26 @@ hf_embeddings = HuggingFaceEndpointEmbeddings(
|
|
60 |
huggingfacehub_api_token=HF_TOKEN,
|
61 |
)
|
62 |
|
63 |
-
|
64 |
-
vectordbfile = os.path.join(
|
65 |
|
66 |
|
67 |
if os.path.exists(vectordbfile):
|
68 |
-
vectorstore =
|
69 |
-
|
70 |
-
|
71 |
-
|
72 |
)
|
73 |
hf_retriever = vectorstore.as_retriever()
|
74 |
-
print("Loaded Vectorstore")
|
75 |
else:
|
76 |
print("Indexing Files")
|
77 |
-
os.makedirs(
|
78 |
for i in range(0, len(split_documents), 32):
|
79 |
if i == 0:
|
80 |
-
vectorstore =
|
81 |
continue
|
82 |
vectorstore.add_documents(split_documents[i:i+32])
|
83 |
-
vectorstore.save_local(
|
84 |
hf_retriever = vectorstore.as_retriever()
|
85 |
|
86 |
### 4. INDEX FILES
|
@@ -132,10 +132,10 @@ def rename(original_author: str):
|
|
132 |
"""
|
133 |
This function can be used to rename the 'author' of a message.
|
134 |
|
135 |
-
In this case, we're overriding the 'Assistant' author to be 'AirBnb
|
136 |
"""
|
137 |
rename_dict = {
|
138 |
-
"Assistant" : "AirBnB
|
139 |
}
|
140 |
return rename_dict.get(original_author, original_author)
|
141 |
|
|
|
10 |
from langchain_huggingface import HuggingFaceEndpointEmbeddings
|
11 |
from langchain_core.prompts import PromptTemplate
|
12 |
from langchain.schema.runnable.config import RunnableConfig
|
13 |
+
from langchain_openai.embeddings import OpenAIEmbeddings
|
14 |
+
from langchain_community.vectorstores import Qdrant
|
15 |
+
|
16 |
|
17 |
# GLOBAL SCOPE - ENTIRE APPLICATION HAS ACCESS TO VALUES SET IN THIS SCOPE #
|
18 |
# ---- ENV VARIABLES ---- #
|
|
|
39 |
3. Load HuggingFace Embeddings (remember to use the URL we set above)
|
40 |
4. Index Files if they do not exist, otherwise load the vectorstore
|
41 |
"""
|
|
|
|
|
|
|
|
|
|
|
42 |
#Load the Pdf Documents from airbnb-10k
|
43 |
+
documents = PyMuPDFLoader("data/airbnb-10k.pdf").load()
|
44 |
+
|
45 |
+
#use the embedding model
|
46 |
+
embedding_model = OpenAIEmbeddings(model="text-embedding-3-small")
|
47 |
|
48 |
### 2. CREATE TEXT SPLITTER AND SPLIT DOCUMENTS
|
49 |
text_splitter = RecursiveCharacterTextSplitter(
|
|
|
61 |
huggingfacehub_api_token=HF_TOKEN,
|
62 |
)
|
63 |
|
64 |
+
vectordbdir = "./data"
|
65 |
+
vectordbfile = os.path.join(vectordbdir, "/vectorstore")
|
66 |
|
67 |
|
68 |
if os.path.exists(vectordbfile):
|
69 |
+
vectorstore = Qdrant.from_existing_collection(
|
70 |
+
embedding=hf_embeddings,
|
71 |
+
path=vectordbfile,
|
72 |
+
collection_name="airbnb-10k",
|
73 |
)
|
74 |
hf_retriever = vectorstore.as_retriever()
|
|
|
75 |
else:
|
76 |
print("Indexing Files")
|
77 |
+
os.makedirs(vectordbfile, exist_ok=True)
|
78 |
for i in range(0, len(split_documents), 32):
|
79 |
if i == 0:
|
80 |
+
vectorstore = Qdrant.from_documents(split_documents[i:i+32], hf_embeddings)
|
81 |
continue
|
82 |
vectorstore.add_documents(split_documents[i:i+32])
|
83 |
+
vectorstore.save_local(vectordbfile)
|
84 |
hf_retriever = vectorstore.as_retriever()
|
85 |
|
86 |
### 4. INDEX FILES
|
|
|
132 |
"""
|
133 |
This function can be used to rename the 'author' of a message.
|
134 |
|
135 |
+
In this case, we're overriding the 'Assistant' author to be 'AirBnb Stock Analyzer'.
|
136 |
"""
|
137 |
rename_dict = {
|
138 |
+
"Assistant" : "AirBnB Stock Analyzer"
|
139 |
}
|
140 |
return rename_dict.get(original_author, original_author)
|
141 |
|
requirements.txt
CHANGED
@@ -5,4 +5,5 @@ langchain_core==0.2.9
|
|
5 |
langchain_huggingface==0.0.3
|
6 |
langchain_text_splitters==0.2.1
|
7 |
python-dotenv==1.0.1
|
8 |
-
faiss-cpu
|
|
|
|
5 |
langchain_huggingface==0.0.3
|
6 |
langchain_text_splitters==0.2.1
|
7 |
python-dotenv==1.0.1
|
8 |
+
faiss-cpu
|
9 |
+
camelot-py[cv]==0.10.1
|