captain-awesome
committed on
Commit
•
a5eb8c4
1
Parent(s):
f26dba9
Upload 2 files
Browse files- app.py +287 -0
- reqirements.rtf +32 -0
app.py
ADDED
@@ -0,0 +1,287 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# -*- coding: utf-8 -*-
|
2 |
+
"""main.py
|
3 |
+
|
4 |
+
Automatically generated by Colaboratory.
|
5 |
+
|
6 |
+
Original file is located at
|
7 |
+
https://colab.research.google.com/drive/1DPJ6tc2bCveBZyHSX02h_fbBS0fzzMrC
|
8 |
+
"""
|
9 |
+
|
10 |
+
|
11 |
+
from langchain.chains import ConversationalRetrievalChain
|
12 |
+
from langchain.chains.question_answering import load_qa_chain
|
13 |
+
from langchain.memory import ConversationBufferMemory
|
14 |
+
from langchain.llms import HuggingFacePipeline
|
15 |
+
from langchain import PromptTemplate
|
16 |
+
from langchain.embeddings import HuggingFaceEmbeddings
|
17 |
+
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
18 |
+
from langchain.vectorstores import Chroma
|
19 |
+
from langchain.document_loaders import (
|
20 |
+
CSVLoader,
|
21 |
+
DirectoryLoader,
|
22 |
+
GitLoader,
|
23 |
+
NotebookLoader,
|
24 |
+
OnlinePDFLoader,
|
25 |
+
PythonLoader,
|
26 |
+
TextLoader,
|
27 |
+
UnstructuredFileLoader,
|
28 |
+
UnstructuredHTMLLoader,
|
29 |
+
UnstructuredPDFLoader,
|
30 |
+
UnstructuredWordDocumentLoader,
|
31 |
+
WebBaseLoader,
|
32 |
+
)
|
33 |
+
from transformers import (
|
34 |
+
AutoModelForCausalLM,
|
35 |
+
AutoTokenizer,
|
36 |
+
StoppingCriteria,
|
37 |
+
StoppingCriteriaList,
|
38 |
+
pipeline,
|
39 |
+
GenerationConfig,
|
40 |
+
TextStreamer,
|
41 |
+
pipeline
|
42 |
+
)
|
43 |
+
import torch
|
44 |
+
from transformers import BitsAndBytesConfig
|
45 |
+
|
46 |
+
def load_model(
    model_path="vilsonrodrigues/falcon-7b-instruct-sharded"
):
    """Load a 4-bit quantized causal language model wrapped for LangChain.

    Args:
        model_path: Hugging Face Hub repository id or path to a local model
            directory. It is handed straight to ``from_pretrained``, which
            resolves both forms, so no filesystem existence check is done
            here (such a check would reject every Hub repository id,
            including the default).

    Returns:
        HuggingFacePipeline: LangChain LLM wrapper around a text-generation
        pipeline built from the quantized model and its tokenizer.
    """
    quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
    )

    model_4bit = AutoModelForCausalLM.from_pretrained(
        model_path,
        device_map="auto",
        quantization_config=quantization_config,
    )

    tokenizer = AutoTokenizer.from_pretrained(model_path)

    # Bind the result to a distinct name: assigning to `pipeline` here would
    # make it a local variable and the call itself would raise
    # UnboundLocalError before the transformers factory is ever reached.
    text_generation_pipeline = pipeline(
        "text-generation",
        model=model_4bit,
        tokenizer=tokenizer,
        use_cache=True,
        device_map="auto",
        max_length=700,
        do_sample=True,
        top_k=5,
        num_return_sequences=1,
        eos_token_id=tokenizer.eos_token_id,
        # Reuse the EOS token id for padding.
        pad_token_id=tokenizer.eos_token_id,
    )

    return HuggingFacePipeline(pipeline=text_generation_pipeline)
|
84 |
+
|
85 |
+
def create_vector_database():
    """
    Creates a vector database using document loaders and embeddings.

    This function loads PDF, markdown, text, CSV, Python, EPUB, HTML,
    PowerPoint, Word, ODT and Jupyter notebook files from the 'data/'
    directory, splits the loaded documents into chunks, transforms them into
    embeddings using HuggingFace, and finally persists the embeddings into a
    Chroma vector database stored in a 'db' directory next to this file.

    Returns:
        Chroma: The persisted vector store built from the chunked documents.
    """
    # Imported locally because the module-level import block does not bring
    # these names into scope.
    import os

    from langchain.document_loaders import (
        PyPDFLoader,
        UnstructuredEPubLoader,
        UnstructuredMarkdownLoader,
        UnstructuredODTLoader,
        UnstructuredPowerPointLoader,
    )

    # Persist the database alongside this source file.
    abs_path = os.path.dirname(os.path.abspath(__file__))
    db_dir = os.path.join(abs_path, "db")

    # One (glob pattern, loader class) pair per supported file type.
    loader_specs = [
        ("**/*.pdf", PyPDFLoader),
        ("**/*.md", UnstructuredMarkdownLoader),
        ("**/*.txt", TextLoader),
        ("**/*.csv", CSVLoader),
        ("**/*.py", PythonLoader),
        ("**/*.epub", UnstructuredEPubLoader),
        ("**/*.html", UnstructuredHTMLLoader),
        ("**/*.ppt", UnstructuredPowerPointLoader),
        ("**/*.pptx", UnstructuredPowerPointLoader),
        ("**/*.doc", UnstructuredWordDocumentLoader),
        ("**/*.docx", UnstructuredWordDocumentLoader),
        ("**/*.odt", UnstructuredODTLoader),
        ("**/*.ipynb", NotebookLoader),
    ]

    # Load documents from all loaders
    loaded_documents = []
    for pattern, loader_cls in loader_specs:
        loader = DirectoryLoader("data/", glob=pattern, loader_cls=loader_cls)
        loaded_documents.extend(loader.load())

    # Split loaded documents into chunks
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=40)
    chunked_documents = text_splitter.split_documents(loaded_documents)

    # Initialize HuggingFace embeddings
    embeddings = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2"
    )

    # Create and persist a Chroma vector database from the chunked documents
    db = Chroma.from_documents(
        documents=chunked_documents,
        embedding=embeddings,
        persist_directory=db_dir,
    )
    db.persist()
    return db
|
135 |
+
|
136 |
+
def set_custom_prompt_condense():
    """Build the prompt that condenses a follow-up into a standalone question.

    Returns:
        PromptTemplate: Template with the ``chat_history`` and ``question``
        placeholders, created via ``PromptTemplate.from_template``.
    """
    # Assembled from adjacent literals; the resulting text is the same
    # multi-line template, with one blank line after the instruction.
    condense_template = (
        "Given the following conversation and a follow up question, rephrase the follow up question to be a standalone question, in its original language.\n"
        "\n"
        "Chat History:\n"
        "{chat_history}\n"
        "Follow Up Input: {question}\n"
        "Standalone question:"
    )
    return PromptTemplate.from_template(condense_template)
|
145 |
+
|
146 |
+
def set_custom_prompt():
    """
    Build the answer prompt used when combining retrieved documents.

    Returns:
        PromptTemplate: Template with the ``context`` and ``question``
        input variables.
    """
    # One entry per template line; the trailing empty entry yields the final
    # newline after "Sources:".
    template_lines = [
        "<Instructions>",
        "Important:",
        "Answer with the facts listed in the list of sources below. If there isn't enough information below, say you don't know.",
        "If asking a clarifying question to the user would help, ask the question.",
        'ALWAYS return a "SOURCES" part in your answer, except for small-talk conversations.',
        "",
        "Question: {question}",
        "",
        "{context}",
        "",
        "",
        "Question: {question}",
        "Helpful Answer:",
        "",
        "---------------------------",
        "---------------------------",
        "Sources:",
        "",
    ]
    answer_template = "\n".join(template_lines)

    return PromptTemplate(
        template=answer_template,
        input_variables=["context", "question"],
    )
|
173 |
+
|
174 |
+
def create_chain(llm, prompt, CONDENSE_QUESTION_PROMPT, db):
    """
    Creates a conversational retrieval QA chain using a given language model, prompts, and database.

    This function initializes a ConversationalRetrievalChain object with the "stuff" chain type,
    token-limited conversation memory, and a retriever set up to return the top 3 results (k=3).

    Args:
        llm (any): The language model used by the chain and by the memory's token accounting.
        prompt (PromptTemplate): The prompt used to combine retrieved documents into an answer.
        CONDENSE_QUESTION_PROMPT (PromptTemplate): The prompt used to rewrite a follow-up
            question into a standalone question.
        db (any): The vector store to be used as the retriever.

    Returns:
        ConversationalRetrievalChain: The initialized conversational chain.
    """
    # Imported locally: the module-level imports only bring in
    # ConversationBufferMemory, not the token-limited variant used here.
    from langchain.memory import ConversationTokenBufferMemory

    memory = ConversationTokenBufferMemory(
        llm=llm,
        memory_key="chat_history",
        return_messages=True,
        input_key="question",
        # The chain returns both "answer" and "source_documents"; the memory
        # must be told which one to record or it rejects the output with
        # "One output key expected".
        output_key="answer",
        max_token_limit=1000,
    )
    chain = ConversationalRetrievalChain.from_llm(
        llm=llm,
        chain_type="stuff",
        retriever=db.as_retriever(search_kwargs={"k": 3}),
        return_source_documents=True,
        combine_docs_chain_kwargs={"prompt": prompt},
        condense_question_prompt=CONDENSE_QUESTION_PROMPT,
        memory=memory,
    )
    return chain
|
200 |
+
|
201 |
+
def create_retrieval_qa_bot():
    """Assemble the conversational retrieval QA chain end to end.

    Loads the language model, builds the answer and condense-question
    prompts, creates the vector database, and wires them together with
    ``create_chain``.

    Returns:
        The chain returned by ``create_chain``.

    Raises:
        RuntimeError: If any stage fails. The message names the failing
            stage and the underlying exception is chained as ``__cause__``.
    """
    # No existence check on the persistence directory: the vector database
    # is built by create_vector_database() below, so it is not a
    # precondition of this function.
    try:
        llm = load_model()
    except Exception as e:
        raise RuntimeError(f"Failed to load model: {str(e)}") from e

    try:
        prompt = set_custom_prompt()
    except Exception as e:
        raise RuntimeError(f"Failed to get prompt: {str(e)}") from e

    try:
        CONDENSE_QUESTION_PROMPT = set_custom_prompt_condense()
    except Exception as e:
        raise RuntimeError(f"Failed to get condense prompt: {str(e)}") from e

    try:
        db = create_vector_database()
    except Exception as e:
        raise RuntimeError(f"Failed to get database: {str(e)}") from e

    try:
        qa = create_chain(
            llm=llm,
            prompt=prompt,
            CONDENSE_QUESTION_PROMPT=CONDENSE_QUESTION_PROMPT,
            db=db,
        )
    except Exception as e:
        raise RuntimeError(f"Failed to create retrieval QA chain: {str(e)}") from e

    return qa
|
233 |
+
|
234 |
+
def retrieve_bot_answer(query):
    """
    Retrieves the answer to a given query using a QA bot.

    This function creates an instance of a QA bot, passes the query to it,
    and returns the bot's response. A new bot is built on every call via
    ``create_retrieval_qa_bot``.

    Args:
        query (str): The question to be answered by the QA bot.

    Returns:
        dict: The QA bot's response, typically a dictionary with response details.
    """
    qa_bot_instance = create_retrieval_qa_bot()
    # ConversationalRetrievalChain reads the user input from the "question"
    # key ("query" is the RetrievalQA key and is rejected as a missing input).
    bot_response = qa_bot_instance({"question": query})
    return bot_response
|
250 |
+
|
251 |
+
import streamlit as st
|
252 |
+
from your_module import load_model, set_custom_prompt, set_custom_prompt_condense, create_vector_database, retrieve_bot_answer
|
253 |
+
|
254 |
+
def main():
    """Render the Docuverse Streamlit page.

    Lets the user upload documents, stores them in the ``data/`` directory
    (the directory the vector database is built from), and answers a typed
    question through ``retrieve_bot_answer``. Errors raised while answering
    are shown on the page rather than propagated.
    """
    import os

    st.title("Docuverse")

    # Upload files
    uploaded_files = st.file_uploader("Upload your documents", type=["pdf", "md", "txt", "csv", "py", "epub", "html", "ppt", "pptx", "doc", "docx", "odt", "ipynb"], accept_multiple_files=True)

    if uploaded_files:
        # Persist uploads where the document loaders look for them;
        # otherwise the uploaded content never reaches the vector database.
        os.makedirs("data", exist_ok=True)
        for uploaded_file in uploaded_files:
            # basename() drops any directory components from the
            # client-supplied file name so writes stay inside data/.
            target_path = os.path.join("data", os.path.basename(uploaded_file.name))
            with open(target_path, "wb") as out_file:
                out_file.write(uploaded_file.getvalue())
            st.write(f"Uploaded: {uploaded_file.name}")

    st.write("Chat with the Document:")
    query = st.text_input("Ask a question:")

    if st.button("Get Answer"):
        if query:
            try:
                # retrieve_bot_answer() builds the model, prompts and vector
                # database itself; building them here as well would only
                # repeat that work and discard the result.
                response = retrieve_bot_answer(query)

                # Display bot response
                st.write("Bot Response:")
                st.write(response)
            except Exception as e:
                st.error(f"An error occurred: {str(e)}")
        else:
            st.warning("Please enter a question.")
|
285 |
+
|
286 |
+
# Start the app only when this file is run as a script, not when imported.
if __name__ == "__main__":
    main()
|
reqirements.rtf
ADDED
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{\rtf1\ansi\ansicpg1252\cocoartf2709
|
2 |
+
\cocoatextscaling0\cocoaplatform0{\fonttbl\f0\fmodern\fcharset0 Courier;}
|
3 |
+
{\colortbl;\red255\green255\blue255;\red0\green0\blue0;\red255\green255\blue255;\red255\green255\blue255;
|
4 |
+
\red203\green203\blue202;\red202\green202\blue202;\red203\green203\blue202;}
|
5 |
+
{\*\expandedcolortbl;;\cssrgb\c0\c0\c0;\cssrgb\c100000\c100000\c100000\c0;\cssrgb\c100000\c100000\c99956;
|
6 |
+
\cssrgb\c83320\c83320\c83112;\cssrgb\c83229\c83229\c83125;\cssrgb\c83411\c83411\c83099;}
|
7 |
+
\margl1440\margr1440\vieww11520\viewh8400\viewkind0
|
8 |
+
\deftab720
|
9 |
+
\pard\pardeftab720\partightenfactor0
|
10 |
+
|
11 |
+
\f0\fs28 \cf2 \cb3 \expnd0\expndtw0\kerning0
|
12 |
+
\outl0\strokewidth0 \strokec4 langchain\
|
13 |
+
PyPDF2\
|
14 |
+
streamlit\
|
15 |
+
#openai\
|
16 |
+
faiss-cpu\
|
17 |
+
\pard\pardeftab720\partightenfactor0
|
18 |
+
\cf2 \strokec5 safetensors\strokec4 \
|
19 |
+
\pard\pardeftab720\partightenfactor0
|
20 |
+
\cf2 \strokec4 huggingface-hub\
|
21 |
+
InstructorEmbedding\
|
22 |
+
sentence-transformers\
|
23 |
+
\pard\pardeftab720\partightenfactor0
|
24 |
+
\cf2 \strokec5 torch\
|
25 |
+
sentence_transformers\
|
26 |
+
einops\strokec4 \
|
27 |
+
\pard\pardeftab720\partightenfactor0
|
28 |
+
\cf2 \strokec5 bitsandbytes\
|
29 |
+
accelerate\
|
30 |
+
peft\cb1 \strokec6 \
|
31 |
+
\cb3 \strokec7 transformers\cb1 \strokec6 \
|
32 |
+
}
|