captain-awesome committed on
Commit
a5eb8c4
1 Parent(s): f26dba9

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +287 -0
  2. reqirements.rtf +32 -0
app.py ADDED
@@ -0,0 +1,287 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """main.py
3
+
4
+ Automatically generated by Colaboratory.
5
+
6
+ Original file is located at
7
+ https://colab.research.google.com/drive/1DPJ6tc2bCveBZyHSX02h_fbBS0fzzMrC
8
+ """
9
+
10
+
11
import os

import torch
from langchain import PromptTemplate
from langchain.chains import ConversationalRetrievalChain
from langchain.chains.question_answering import load_qa_chain
from langchain.document_loaders import (
    CSVLoader,
    DirectoryLoader,
    GitLoader,
    NotebookLoader,
    OnlinePDFLoader,
    PyPDFLoader,
    PythonLoader,
    TextLoader,
    UnstructuredEPubLoader,
    UnstructuredFileLoader,
    UnstructuredHTMLLoader,
    UnstructuredMarkdownLoader,
    UnstructuredODTLoader,
    UnstructuredPDFLoader,
    UnstructuredPowerPointLoader,
    UnstructuredWordDocumentLoader,
    WebBaseLoader,
)
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import HuggingFacePipeline
from langchain.memory import ConversationBufferMemory, ConversationTokenBufferMemory
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    GenerationConfig,
    StoppingCriteria,
    StoppingCriteriaList,
    TextStreamer,
    pipeline,
)
from transformers import BitsAndBytesConfig
45
+
46
def load_model(
    model_path="vilsonrodrigues/falcon-7b-instruct-sharded"
):
    """
    Load a 4-bit quantized causal language model and wrap it for LangChain.

    Args:
        model_path (str): Model identifier handed to ``from_pretrained`` for
            both the model and its tokenizer. The default is a Hugging Face
            Hub repository id; a local directory should work as well.

    Returns:
        HuggingFacePipeline: LangChain LLM wrapping a "text-generation"
        transformers pipeline built from the quantized model.
    """
    # NOTE: there is deliberately no os.path.exists(model_path) guard here.
    # The default value is a Hub repository id, not a filesystem path, so such
    # a guard rejects the default; from_pretrained reports unknown models itself.
    quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
    )

    model_4bit = AutoModelForCausalLM.from_pretrained(
        model_path,
        device_map="auto",
        quantization_config=quantization_config,
    )

    tokenizer = AutoTokenizer.from_pretrained(model_path)

    # The result must not be bound to the name `pipeline`: that would make
    # `pipeline` a local variable of this function and the call below would
    # raise UnboundLocalError instead of reaching transformers.pipeline.
    text_generation_pipeline = pipeline(
        "text-generation",
        model=model_4bit,
        tokenizer=tokenizer,
        use_cache=True,
        device_map="auto",
        max_length=700,
        do_sample=True,
        top_k=5,
        num_return_sequences=1,
        eos_token_id=tokenizer.eos_token_id,
        # The EOS token doubles as the padding token.
        pad_token_id=tokenizer.eos_token_id,
    )

    return HuggingFacePipeline(pipeline=text_generation_pipeline)
84
+
85
def create_vector_database():
    """
    Creates a vector database using document loaders and embeddings.

    This function loads every supported file type found under the 'data/'
    directory, splits the loaded documents into chunks, transforms them into
    embeddings using HuggingFace, and finally persists the embeddings into a
    Chroma vector database stored in a 'db' directory next to this file.

    Returns:
        Chroma: The vector store built from the chunked documents.
    """
    # The database lives beside this source file, independent of the current
    # working directory.
    db_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "db")

    # File extension -> loader class, in the order the files are loaded.
    loader_classes = {
        "pdf": PyPDFLoader,
        "md": UnstructuredMarkdownLoader,
        "txt": TextLoader,
        "csv": CSVLoader,
        "py": PythonLoader,
        "epub": UnstructuredEPubLoader,
        "html": UnstructuredHTMLLoader,
        "ppt": UnstructuredPowerPointLoader,
        "pptx": UnstructuredPowerPointLoader,
        "doc": UnstructuredWordDocumentLoader,
        "docx": UnstructuredWordDocumentLoader,
        "odt": UnstructuredODTLoader,
        "ipynb": NotebookLoader,
    }

    # Load documents from all loaders
    loaded_documents = []
    for extension, loader_cls in loader_classes.items():
        loader = DirectoryLoader(
            "data/", glob=f"**/*.{extension}", loader_cls=loader_cls
        )
        loaded_documents.extend(loader.load())

    # Split loaded documents into chunks
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=40)
    chunked_documents = text_splitter.split_documents(loaded_documents)

    # Initialize HuggingFace embeddings
    embeddings = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2"
    )

    # Create and persist a Chroma vector database from the chunked documents
    db = Chroma.from_documents(
        documents=chunked_documents,
        embedding=embeddings,
        persist_directory=db_dir,
    )
    db.persist()
    return db
135
+
136
def set_custom_prompt_condense():
    """
    Build the prompt that rewrites a follow-up question as a standalone one.

    The template takes two variables: ``chat_history`` and ``question``.

    Returns:
        PromptTemplate: The condense-question prompt.
    """
    condense_template = """Given the following conversation and a follow up question, rephrase the follow up question to be a standalone question, in its original language.

Chat History:
{chat_history}
Follow Up Input: {question}
Standalone question:"""
    return PromptTemplate.from_template(condense_template)
145
+
146
def set_custom_prompt():
    """
    Build the answer prompt used when combining retrieved documents.

    The template takes two variables: ``context`` (the retrieved sources) and
    ``question``.

    Returns:
        PromptTemplate: The question-answering prompt.
    """
    qa_template = """<Instructions>
Important:
Answer with the facts listed in the list of sources below. If there isn't enough information below, say you don't know.
If asking a clarifying question to the user would help, ask the question.
ALWAYS return a "SOURCES" part in your answer, except for small-talk conversations.

Question: {question}

{context}


Question: {question}
Helpful Answer:

---------------------------
---------------------------
Sources:
"""
    expected_variables = ["context", "question"]
    return PromptTemplate(template=qa_template, input_variables=expected_variables)
173
+
174
def create_chain(llm, prompt, CONDENSE_QUESTION_PROMPT, db):
    """
    Creates a conversational retrieval question-answering chain.

    This function initializes a ConversationalRetrievalChain with a "stuff"
    combine-documents chain and a token-limited conversation memory, and
    returns this chain. The retriever is set up to return the top 3 results
    (k=3), and the chain also returns the source documents it used.

    Args:
        llm (any): The language model used by the chain and by the memory.
        prompt (PromptTemplate): The prompt used to combine retrieved documents.
        CONDENSE_QUESTION_PROMPT (PromptTemplate): The prompt used to rephrase
            a follow-up question into a standalone question.
        db (any): The vector store; its ``as_retriever`` provides the retriever.

    Returns:
        ConversationalRetrievalChain: The initialized conversational chain.
    """
    memory = ConversationTokenBufferMemory(
        llm=llm,
        memory_key="chat_history",
        return_messages=True,
        input_key="question",
        # With return_source_documents=True the chain emits both "answer" and
        # "source_documents"; the memory must be told which one to record.
        output_key="answer",
        max_token_limit=1000,
    )
    chain = ConversationalRetrievalChain.from_llm(
        llm=llm,
        chain_type="stuff",
        retriever=db.as_retriever(search_kwargs={"k": 3}),
        return_source_documents=True,
        combine_docs_chain_kwargs={"prompt": prompt},
        condense_question_prompt=CONDENSE_QUESTION_PROMPT,
        memory=memory,
    )
    return chain
200
+
201
def create_retrieval_qa_bot():
    """
    Build the complete conversational QA chain.

    Loads the language model, builds the answer and condense-question prompts,
    creates the vector database, and wires everything together with
    ``create_chain``.

    Returns:
        The chain object returned by ``create_chain``.

    Raises:
        RuntimeError: If any of the steps fails. The message names the failed
            step and the original exception is chained as ``__cause__``.
    """
    # No pre-check for an existing database directory: the vector database is
    # (re)built below by create_vector_database().
    try:
        llm = load_model()
    except Exception as e:
        raise RuntimeError(f"Failed to load model: {str(e)}") from e

    try:
        prompt = set_custom_prompt()
    except Exception as e:
        raise RuntimeError(f"Failed to get prompt: {str(e)}") from e

    try:
        CONDENSE_QUESTION_PROMPT = set_custom_prompt_condense()
    except Exception as e:
        raise RuntimeError(f"Failed to get condense prompt: {str(e)}") from e

    try:
        db = create_vector_database()
    except Exception as e:
        raise RuntimeError(f"Failed to get database: {str(e)}") from e

    try:
        qa = create_chain(
            llm=llm,
            prompt=prompt,
            CONDENSE_QUESTION_PROMPT=CONDENSE_QUESTION_PROMPT,
            db=db,
        )
    except Exception as e:
        raise RuntimeError(f"Failed to create retrieval QA chain: {str(e)}") from e

    return qa
233
+
234
def retrieve_bot_answer(query):
    """
    Retrieves the answer to a given query using a QA bot.

    This function creates an instance of a QA bot, passes the query to it,
    and returns the bot's response. A new bot is built on every call.

    Args:
        query (str): The question to be answered by the QA bot.

    Returns:
        dict: The QA bot's response, typically a dictionary with response details.
    """
    qa_bot_instance = create_retrieval_qa_bot()
    # The chain's memory is configured with input_key='question', so the
    # input must be supplied under "question" rather than "query".
    bot_response = qa_bot_instance({"question": query})
    return bot_response
250
+
251
import streamlit as st
# load_model, set_custom_prompt, set_custom_prompt_condense,
# create_vector_database and retrieve_bot_answer are defined earlier in this
# file, so they are used directly instead of being imported from a
# placeholder module.
253
+
254
def main():
    """
    Streamlit entry point for the Docuverse document chat UI.

    Lets the user upload documents, saves them under ``data/`` (the directory
    scanned by ``create_vector_database``), and answers a typed question by
    calling ``retrieve_bot_answer``. Any exception raised while answering is
    shown in the UI instead of crashing the app.
    """
    st.title("Docuverse")

    # Upload files
    uploaded_files = st.file_uploader(
        "Upload your documents",
        type=["pdf", "md", "txt", "csv", "py", "epub", "html", "ppt", "pptx", "doc", "docx", "odt", "ipynb"],
        accept_multiple_files=True,
    )

    if not uploaded_files:
        return

    # Persist the uploads so the vector database build can index them.
    os.makedirs("data", exist_ok=True)
    for uploaded_file in uploaded_files:
        # basename() drops any directory components from the client-supplied
        # file name so the file cannot be written outside data/.
        target_path = os.path.join("data", os.path.basename(uploaded_file.name))
        with open(target_path, "wb") as target:
            target.write(uploaded_file.getbuffer())
        st.write(f"Uploaded: {uploaded_file.name}")

    st.write("Chat with the Document:")
    query = st.text_input("Ask a question:")

    if not st.button("Get Answer"):
        return

    if not query:
        st.warning("Please enter a question.")
        return

    try:
        # retrieve_bot_answer builds the model, prompts and vector database
        # itself, so they are not constructed a second time here.
        response = retrieve_bot_answer(query)

        # Display bot response
        st.write("Bot Response:")
        st.write(response)
    except Exception as e:
        st.error(f"An error occurred: {str(e)}")
285
+
286
# Start the Streamlit UI only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
reqirements.rtf ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {\rtf1\ansi\ansicpg1252\cocoartf2709
2
+ \cocoatextscaling0\cocoaplatform0{\fonttbl\f0\fmodern\fcharset0 Courier;}
3
+ {\colortbl;\red255\green255\blue255;\red0\green0\blue0;\red255\green255\blue255;\red255\green255\blue255;
4
+ \red203\green203\blue202;\red202\green202\blue202;\red203\green203\blue202;}
5
+ {\*\expandedcolortbl;;\cssrgb\c0\c0\c0;\cssrgb\c100000\c100000\c100000\c0;\cssrgb\c100000\c100000\c99956;
6
+ \cssrgb\c83320\c83320\c83112;\cssrgb\c83229\c83229\c83125;\cssrgb\c83411\c83411\c83099;}
7
+ \margl1440\margr1440\vieww11520\viewh8400\viewkind0
8
+ \deftab720
9
+ \pard\pardeftab720\partightenfactor0
10
+
11
+ \f0\fs28 \cf2 \cb3 \expnd0\expndtw0\kerning0
12
+ \outl0\strokewidth0 \strokec4 langchain\
13
+ PyPDF2\
14
+ streamlit\
15
+ #openai\
16
+ faiss-cpu\
17
+ \pard\pardeftab720\partightenfactor0
18
+ \cf2 \strokec5 safetensors\strokec4 \
19
+ \pard\pardeftab720\partightenfactor0
20
+ \cf2 \strokec4 huggingface-hub\
21
+ InstructorEmbedding\
22
+ sentence-transformers\
23
+ \pard\pardeftab720\partightenfactor0
24
+ \cf2 \strokec5 torch\
25
+ sentence_transformers\
26
+ einops\strokec4 \
27
+ \pard\pardeftab720\partightenfactor0
28
+ \cf2 \strokec5 bitsandbytes\
29
+ accelerate\
30
+ peft\cb1 \strokec6 \
31
+ \cb3 \strokec7 transformers\cb1 \strokec6 \
32
+ }