Create app.py
app.py ADDED
import time

import torch
import PyPDF2
import gradio as gr

from langchain.prompts import PromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain.memory import ConversationBufferMemory
from langchain.llms.huggingface_pipeline import HuggingFacePipeline

from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, BitsAndBytesConfig
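# NOTE: the imports above assume the pre-0.1 LangChain package layout
# (langchain.embeddings, langchain.vectorstores, ...); on newer releases these
# classes moved to langchain_community and the import paths would need updating.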
CHUNK_SIZE = 1000

# Using HuggingFaceEmbeddings with the chosen embedding model
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-mpnet-base-v2",
    model_kwargs={"device": "cuda"},
)

# transformer quantization configuration
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,  # the bnb_4bit_* options only take effect with 4-bit loading enabled
    bnb_4bit_compute_dtype=torch.bfloat16,
)
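# load_llm wraps DeciLM-6b-instruct in a transformers text-generation pipeline
# and exposes it to LangChain. trust_remote_code=True is needed because the
# model ships custom modeling code on the Hub; beam search with a small
# max_new_tokens budget keeps answers short.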
def load_llm():
    model_id = "Deci/DeciLM-6b-instruct"
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        trust_remote_code=True,
        device_map="auto",
        quantization_config=quant_config,
    )
    pipe = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        temperature=0,
        num_beams=5,
        no_repeat_ngram_size=4,
        early_stopping=True,
        max_new_tokens=50,
    )

    llm = HuggingFacePipeline(pipeline=pipe)

    return llm
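# --- Gradio callbacks: add_text posts the user's question to the chat history
# as an empty [question, answer] pair; generate_bot_response (further below)
# fills the answer in character by character.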
def add_text(history, text):
    if not text:
        raise gr.Error('Enter text')
    # use a list, not a tuple, so the streamed answer can be appended in place
    history = history + [[text, '']]

    return history


def upload_file(file):
    return file
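# process_file implements the retrieval pipeline: extract raw text from the
# uploaded PDFs, split it into overlapping chunks, embed the chunks into a
# FAISS index, and wrap the index in a RetrievalQA chain that carries the
# conversation in a ConversationBufferMemory.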
def process_file(files):
    # extract raw text from every uploaded PDF
    pdf_text = ""
    for file in files:
        pdf = PyPDF2.PdfReader(file.name)
        for page in pdf.pages:
            pdf_text += page.extract_text()

    # split into smaller chunks
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=200)
    splits = text_splitter.create_documents([pdf_text])

    # embed the chunks and store them in a FAISS vector store
    vectorstore_db = FAISS.from_documents(splits, embeddings)

    # create a custom prompt
    custom_prompt_template = """Given the uploaded files, generate a precise answer to the question asked by the user.
If you don't know the answer, just say that you don't know; don't try to make up an answer.
Context = {context}
History = {history}
Question = {question}
Helpful Answer:
"""
    prompt = PromptTemplate(template=custom_prompt_template,
                            input_variables=["question", "context", "history"])

    # set up the QA chain with memory
    qa_chain_with_memory = RetrievalQA.from_chain_type(
        llm=load_llm(),
        chain_type='stuff',
        return_source_documents=True,
        retriever=vectorstore_db.as_retriever(),
        chain_type_kwargs={"verbose": True,
                           "prompt": prompt,
                           "memory": ConversationBufferMemory(
                               input_key="question",
                               memory_key="history",
                               return_messages=True)})

    return qa_chain_with_memory
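# Design note: generate_bot_response below calls process_file on every
# question, so the FAISS index and the LLM are rebuilt per query; caching the
# chain after the first upload would avoid that repeated cost.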
def generate_bot_response(history, query, btn):
    if not btn:
        raise gr.Error(message='Upload a PDF')

    qa_chain_with_memory = process_file(btn)

    bot_response = qa_chain_with_memory({"query": query})

    # stream the answer into the last chat turn, one character at a time
    for char in bot_response['result']:
        history[-1][-1] += char
        time.sleep(0.05)
        yield history, ''
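# --- UI layout: chat window, PDF file panel with an upload button, and a
# question box with an Ask button ---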
with gr.Blocks() as demo:
    with gr.Row():
        with gr.Row():
            chatbot = gr.Chatbot(label="DeciLM-6b-instruct bot", value=[], elem_id='chatbot')
        with gr.Row():
            file_output = gr.File(label="Your PDFs")
            with gr.Column():
                btn = gr.UploadButton("📁 Upload a PDF(s)", file_types=[".pdf"], file_count="multiple")

    with gr.Column():
        with gr.Column():
            txt = gr.Text(show_label=False, placeholder="Enter question")

        with gr.Column():
            submit_btn = gr.Button('Ask')
    # Event handler for uploading a PDF
    btn.upload(fn=upload_file, inputs=[btn], outputs=[file_output])
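    # Asking a question chains three steps: add_text posts it to the chat,
    # generate_bot_response streams the answer, and upload_file refreshes the
    # file panel.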
    submit_btn.click(
        fn=add_text,
        inputs=[chatbot, txt],
        outputs=[chatbot],
        queue=False
    ).success(
        fn=generate_bot_response,
        inputs=[chatbot, txt, btn],
        outputs=[chatbot, txt]
    ).success(
        fn=upload_file,
        inputs=[btn],
        outputs=[file_output]
    )

if __name__ == "__main__":
    demo.launch()
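# Assumed runtime dependencies (typically pinned in the Space's
# requirements.txt): torch, transformers, accelerate, bitsandbytes, langchain,
# faiss-gpu (or faiss-cpu), PyPDF2, sentence-transformers, gradio.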