sanjeevl10 committed
Commit f63f886
1 Parent(s): 768e225

added local path for vectordb errors

Files changed (2):
  1. app.py +15 -5
  2. solution_app.py +0 -155
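
The likely failure mode this commit addresses: the old app.py treated the existence of the ./data/vectorstore directory as proof that a saved FAISS index was on disk, but the indexing branch calls os.makedirs before embedding and saving finish. An interrupted first run therefore leaves an empty directory behind, and every later start takes the load path and fails inside FAISS.load_local. Testing for the index.faiss file that FAISS.save_local actually writes closes that gap. A minimal sketch of the pattern (the names VECTORSTORE_DIR and INDEX_FILE here are illustrative, not the exact ones used in app.py):

    import os

    VECTORSTORE_DIR = os.path.join("./data", "vectorstore")
    INDEX_FILE = os.path.join(VECTORSTORE_DIR, "index.faiss")  # file written by FAISS.save_local

    if os.path.exists(INDEX_FILE):
        print("index present, safe to load")  # a previous run finished saving
    else:
        # the directory alone may be left over from an interrupted run; rebuild
        os.makedirs(VECTORSTORE_DIR, exist_ok=True)
        print("index missing, rebuilding")
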
app.py CHANGED
@@ -11,6 +11,11 @@ from langchain_core.prompts import PromptTemplate
 from langchain.schema.output_parser import StrOutputParser
 from langchain.schema.runnable import RunnablePassthrough
 from langchain.schema.runnable.config import RunnableConfig
+from pathlib import Path
+
+DATA_DIR = "./data"
+VECTORSTORE_DIR = os.path.join(DATA_DIR, "vectorstore")
+VECTORSTORE_PATH = os.path.join(VECTORSTORE_DIR, "index.faiss")
 
 # GLOBAL SCOPE - ENTIRE APPLICATION HAS ACCESS TO VALUES SET IN THIS SCOPE #
 # ---- ENV VARIABLES ---- #
@@ -58,9 +63,13 @@ hf_embeddings = HuggingFaceEndpointEmbeddings(
     huggingfacehub_api_token=HF_TOKEN,
 )
 
-if os.path.exists("./data/vectorstore"):
+vectordb = os.path.join("./data", "vectorstore")
+vectordbfile = os.path.join(VECTORSTORE_DIR, "index.faiss")
+
+
+if os.path.exists(vectordbfile):
     vectorstore = FAISS.load_local(
-        "./data/vectorstore",
+        vectordb,
         hf_embeddings,
         allow_dangerous_deserialization=True # this is necessary to load the vectorstore from disk as it's stored as a `.pkl` file.
     )
@@ -68,18 +77,19 @@ if os.path.exists("./data/vectorstore"):
     print("Loaded Vectorstore")
 else:
     print("Indexing Files")
-    os.makedirs("./data/vectorstore", exist_ok=True)
+    os.makedirs(vectordb, exist_ok=True)
     for i in range(0, len(split_documents), 32):
         if i == 0:
             vectorstore = FAISS.from_documents(split_documents[i:i+32], hf_embeddings)
             continue
         vectorstore.add_documents(split_documents[i:i+32])
-    vectorstore.save_local("./data/vectorstore")
+    vectorstore.save_local(vectordb)
+    hf_retriever = vectorstore.as_retriever()
 
 ### 4. INDEX FILES
 ### NOTE: REMEMBER TO BATCH THE DOCUMENTS WITH MAXIMUM BATCH SIZE = 32
 
-hf_retriever = vectorstore.as_retriever()
+
 
 # -- AUGMENTED -- #
 """
solution_app.py DELETED
@@ -1,155 +0,0 @@
-import os
-import chainlit as cl
-from dotenv import load_dotenv
-from operator import itemgetter
-from langchain_huggingface import HuggingFaceEndpoint
-from langchain_community.document_loaders import TextLoader
-from langchain_text_splitters import RecursiveCharacterTextSplitter
-from langchain_community.vectorstores import FAISS
-from langchain_huggingface import HuggingFaceEndpointEmbeddings
-from langchain_core.prompts import PromptTemplate
-from langchain.schema.output_parser import StrOutputParser
-from langchain.schema.runnable import RunnablePassthrough
-from langchain.schema.runnable.config import RunnableConfig
-
-# GLOBAL SCOPE - ENTIRE APPLICATION HAS ACCESS TO VALUES SET IN THIS SCOPE #
-# ---- ENV VARIABLES ---- #
-"""
-This function will load our environment file (.env) if it is present.
-
-NOTE: Make sure that .env is in your .gitignore file - it is by default, but please ensure it remains there.
-"""
-load_dotenv()
-
-"""
-We will load our environment variables here.
-"""
-HF_LLM_ENDPOINT = os.environ["HF_LLM_ENDPOINT"]
-HF_EMBED_ENDPOINT = os.environ["HF_EMBED_ENDPOINT"]
-HF_TOKEN = os.environ["HF_TOKEN"]
-
-# ---- GLOBAL DECLARATIONS ---- #
-
-# -- RETRIEVAL -- #
-"""
-1. Load Documents from Text File
-2. Split Documents into Chunks
-3. Load HuggingFace Embeddings (remember to use the URL we set above)
-4. Index Files if they do not exist, otherwise load the vectorstore
-"""
-document_loader = TextLoader("./data/paul_graham_essays.txt")
-documents = document_loader.load()
-
-text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=30)
-split_documents = text_splitter.split_documents(documents)
-
-hf_embeddings = HuggingFaceEndpointEmbeddings(
-    model=HF_EMBED_ENDPOINT,
-    task="feature-extraction",
-    huggingfacehub_api_token=HF_TOKEN,
-)
-
-if os.path.exists("./data/vectorstore"):
-    vectorstore = FAISS.load_local(
-        "./data/vectorstore",
-        hf_embeddings,
-        allow_dangerous_deserialization=True # this is necessary to load the vectorstore from disk as it's stored as a `.pkl` file.
-    )
-    hf_retriever = vectorstore.as_retriever()
-    print("Loaded Vectorstore")
-else:
-    print("Indexing Files")
-    os.makedirs("./data/vectorstore", exist_ok=True)
-    for i in range(0, len(split_documents), 32):
-        if i == 0:
-            vectorstore = FAISS.from_documents(split_documents[i:i+32], hf_embeddings)
-            continue
-        vectorstore.add_documents(split_documents[i:i+32])
-    vectorstore.save_local("./data/vectorstore")
-
-hf_retriever = vectorstore.as_retriever()
-
-# -- AUGMENTED -- #
-"""
-1. Define a String Template
-2. Create a Prompt Template from the String Template
-"""
-RAG_PROMPT_TEMPLATE = """\
-<|start_header_id|>system<|end_header_id|>
-You are a helpful assistant. You answer user questions based on provided context. If you can't answer the question with the provided context, say you don't know.<|eot_id|>
-
-<|start_header_id|>user<|end_header_id|>
-User Query:
-{query}
-
-Context:
-{context}<|eot_id|>
-
-<|start_header_id|>assistant<|end_header_id|>
-"""
-
-rag_prompt = PromptTemplate.from_template(RAG_PROMPT_TEMPLATE)
-
-# -- GENERATION -- #
-"""
-1. Create a HuggingFaceEndpoint for the LLM
-"""
-hf_llm = HuggingFaceEndpoint(
-    endpoint_url=HF_LLM_ENDPOINT,
-    max_new_tokens=512,
-    top_k=10,
-    top_p=0.95,
-    temperature=0.3,
-    repetition_penalty=1.15,
-    huggingfacehub_api_token=HF_TOKEN,
-)
-
-@cl.author_rename
-def rename(original_author: str):
-    """
-    This function can be used to rename the 'author' of a message.
-
-    In this case, we're overriding the 'Assistant' author to be 'Paul Graham Essay Bot'.
-    """
-    rename_dict = {
-        "Assistant" : "Paul Graham Essay Bot"
-    }
-    return rename_dict.get(original_author, original_author)
-
-@cl.on_chat_start
-async def start_chat():
-    """
-    This function will be called at the start of every user session.
-
-    We will build our LCEL RAG chain here, and store it in the user session.
-
-    The user session is a dictionary that is unique to each user session, and is stored in the memory of the server.
-    """
-
-    lcel_rag_chain = (
-        {"context": itemgetter("query") | hf_retriever, "query": itemgetter("query")}
-        | rag_prompt | hf_llm
-    )
-
-    cl.user_session.set("lcel_rag_chain", lcel_rag_chain)
-
-@cl.on_message
-async def main(message: cl.Message):
-    """
-    This function will be called every time a message is received from a session.
-
-    We will use the LCEL RAG chain to generate a response to the user query.
-
-    The LCEL RAG chain is stored in the user session, and is unique to each user session - this is why we can access it here.
-    """
-    lcel_rag_chain = cl.user_session.get("lcel_rag_chain")
-
-    msg = cl.Message(content="")
-
-    for chunk in await cl.make_async(lcel_rag_chain.stream)(
-        {"query": message.content},
-        config=RunnableConfig(callbacks=[cl.LangchainCallbackHandler()]),
-    ):
-        await msg.stream_token(chunk)
-
-    await msg.send()
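
The deleted file still documents the shape of the chain the app serves: a runnable dict sends the user query through the retriever to become context and passes it through unchanged as query, then pipes the filled prompt into the endpoint LLM, so the chain's only input is a {"query": ...} mapping. A sketch of exercising that chain outside Chainlit, assuming hf_retriever, rag_prompt, and hf_llm are constructed as in the file above:

    from operator import itemgetter

    # same chain the deleted file built inside start_chat()
    lcel_rag_chain = (
        {"context": itemgetter("query") | hf_retriever, "query": itemgetter("query")}
        | rag_prompt
        | hf_llm
    )

    # .invoke returns the full completion; .stream yields chunks as they arrive
    print(lcel_rag_chain.invoke({"query": "What does Paul Graham say about startups?"}))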