dfgnota committed on
Commit
ea4ad8a
•
1 Parent(s): 6091497

add gpt-doc-mem app

Browse files
Files changed (2) hide show
  1. app.py +206 -0
  2. requirements.txt +5 -0
app.py ADDED
@@ -0,0 +1,206 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Import necessary modules
2
+ import os
3
+ import re
4
+ import time
5
+ from io import BytesIO
6
+ from typing import Any, Dict, List
7
+
8
+ import openai
9
+ import streamlit as st
10
+ from langchain import LLMChain, OpenAI
11
+ from langchain.agents import AgentExecutor, Tool, ZeroShotAgent
12
+ from langchain.chains import RetrievalQA
13
+ from langchain.chains.question_answering import load_qa_chain
14
+ from langchain.docstore.document import Document
15
+ from langchain.document_loaders import PyPDFLoader
16
+ from langchain.embeddings.openai import OpenAIEmbeddings
17
+ from langchain.llms import OpenAI
18
+ from langchain.memory import ConversationBufferMemory
19
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
20
+ from langchain.vectorstores import VectorStore
21
+ from langchain.vectorstores.faiss import FAISS
22
+ from pypdf import PdfReader
23
+
24
+
25
# Define a function to parse a PDF file and extract its text content
@st.cache_data
def parse_pdf(file: BytesIO) -> List[str]:
    """Read a PDF and return the cleaned-up text of each page.

    Args:
        file: Binary file-like object containing the PDF.

    Returns:
        A list with one string per page, in page order.
    """
    # A word split by a hyphen at a line end: "exam-\nple" -> "example".
    hyphen_break = re.compile(r"(\w+)-\n(\w+)")
    # A single newline that is not adjacent to a blank-line boundary.
    soft_newline = re.compile(r"(?<!\n\s)\n(?!\s\n)")
    # Two newlines with optional whitespace between them.
    blank_run = re.compile(r"\n\s*\n")

    cleaned_pages = []
    for pdf_page in PdfReader(file).pages:
        raw = pdf_page.extract_text()
        joined = hyphen_break.sub(r"\1\2", raw)
        # Mid-sentence line breaks become spaces; the page text is trimmed first.
        flowed = soft_newline.sub(" ", joined.strip())
        # Normalise every paragraph gap to exactly one blank line.
        cleaned_pages.append(blank_run.sub("\n\n", flowed))
    return cleaned_pages
40
+
41
+
42
# Define a function to convert text content to a list of documents
@st.cache_data
def text_to_docs(text: "str | List[str]") -> List[Document]:
    """Split page text into chunk-sized Documents tagged with their origin.

    Args:
        text: Either a single string (treated as one page) or a list of
            strings, one per page.

    Returns:
        One Document per chunk. Each carries the metadata keys ``"page"``
        (1-based position of the page in the input), ``"chunk"`` (0-based
        index of the chunk within that page) and ``"source"``
        (``"<page>-<chunk>"``).
    """
    if isinstance(text, str):
        # Take a single string as one page
        text = [text]

    # The splitter configuration never changes, so build it once instead of
    # once per page.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=4000,
        separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""],
        chunk_overlap=0,
    )

    doc_chunks = []
    # Distinct loop names keep the page number from being read back out of a
    # previously built chunk, as happened when the loop variable was rebound.
    for page_number, page_text in enumerate(text, start=1):
        for chunk_index, chunk in enumerate(text_splitter.split_text(page_text)):
            doc_chunks.append(
                Document(
                    page_content=chunk,
                    metadata={
                        "page": page_number,
                        "chunk": chunk_index,
                        # Add sources as metadata
                        "source": f"{page_number}-{chunk_index}",
                    },
                )
            )
    return doc_chunks
74
+
75
+
76
# Define a function for the embeddings
@st.cache_resource
def _build_faiss_index(_docs, api_key, content_key):
    """Embed ``_docs`` with OpenAI embeddings and return the FAISS index.

    ``_docs`` has a leading underscore so Streamlit leaves it out of the
    cache key; ``content_key`` (the chunk texts) and ``api_key`` form the
    key instead, so a new document or a new key triggers a rebuild.
    """
    embeddings = OpenAIEmbeddings(openai_api_key=api_key)
    return FAISS.from_documents(_docs, embeddings)


def test_embed(docs=None, api_key=None):
    """Return a FAISS index for the current document chunks.

    Args:
        docs: Documents to index. Defaults to the module-level ``pages``.
        api_key: OpenAI API key. Defaults to the module-level ``api``.

    Returns:
        The FAISS vector store built from ``docs``.

    The index is cached per (document text, API key). Caching the
    zero-argument function directly gave one cache entry for the whole app,
    so a second upload would silently reuse the first document's index.
    """
    docs = pages if docs is None else docs
    api_key = api if api_key is None else api_key
    content_key = tuple(doc.page_content for doc in docs)
    # Indexing
    # Save in a Vector DB
    with st.spinner("It's indexing..."):
        index = _build_faiss_index(docs, api_key, content_key)
    st.success("Embeddings done.", icon="✅")
    return index
86
+
87
+
88
# Set up the Streamlit app
st.title("🤖 Document AI with Memory 🧠 ")
# Link targets are written without surrounding quotes: in Markdown the quotes
# would become part of the URL and the links would not resolve.
st.markdown(
    """
#### 🗨️ Chat with your PDF files 📄 + `Conversational Buffer Memory`
> *powered by [LangChain](https://langchain.readthedocs.io/en/latest/modules/memory.html#memory) +
[OpenAI](https://platform.openai.com/docs/models/gpt-3-5) + [HuggingFace](https://www.huggingface.co/)*
"""
)

# Show the main libraries the app is built on, followed by a divider.
st.markdown(
    """
`openai`
`langchain`
`tiktoken`
`pypdf`
`faiss-cpu`

---------
"""
)

# Set up the sidebar
st.sidebar.markdown(
    """
### Steps:
1. Upload PDF File
2. Enter Your Secret Key for Embeddings
3. Perform Q&A

**Note : File content and API key not stored in any form.**
"""
)
121
+
122
# Allow the user to upload a PDF file
uploaded_file = st.file_uploader("**Upload Your PDF File**", type=["pdf"])

if uploaded_file:
    name_of_file = uploaded_file.name
    doc = parse_pdf(uploaded_file)
    pages = text_to_docs(doc)
    if pages:
        # Allow the user to select a page and view its content
        # (entries of `pages` are the chunks returned by text_to_docs).
        with st.expander("Show Page Content", expanded=False):
            page_sel = st.number_input(
                label="Select Page", min_value=1, max_value=len(pages), step=1
            )
            st.write(pages[page_sel - 1])
        # Use OpenAI API key from environment or allow the user to enter it
        api = os.environ.get("OPENAI_API_KEY") or st.text_input(
            "**Enter OpenAI API Key**",
            type="password",
            placeholder="sk-",
            help="https://platform.openai.com/account/api-keys",
        )
        if api:
            # Test the embeddings and save the index in a vector database
            index = test_embed()
            # Set up the question-answering system
            qa = RetrievalQA.from_chain_type(
                llm=OpenAI(openai_api_key=api),
                chain_type="stuff",
                retriever=index.as_retriever(),
            )
            # Set up the conversational agent
            tools = [
                Tool(
                    name="PDF QA System",
                    func=qa.run,
                    description="Useful for when you need to answer questions about the aspects asked. Input may be a partial or fully formed question.",
                )
            ]
            prefix = """Have a conversation with a human, answering the following questions as best you can based on the context and memory available.
You have access to a single tool:"""
            # The stray double quote that used to follow "Begin!" was a typo
            # and has been dropped from the prompt.
            suffix = """Begin!

{chat_history}
Question: {input}
{agent_scratchpad}"""

            prompt = ZeroShotAgent.create_prompt(
                tools,
                prefix=prefix,
                suffix=suffix,
                input_variables=["input", "chat_history", "agent_scratchpad"],
            )

            # Start a fresh conversation whenever a different file is uploaded,
            # so answers about one PDF do not leak into the chat about another.
            file_key = (uploaded_file.name, uploaded_file.size)
            if st.session_state.get("memory_file") != file_key:
                st.session_state.memory = ConversationBufferMemory(
                    memory_key="chat_history"
                )
                st.session_state.memory_file = file_key

            # NOTE(review): `OpenAI` is used here with the chat model
            # "gpt-3.5-turbo" — confirm this is supported by the installed
            # LangChain version or whether a chat-model wrapper is required.
            llm_chain = LLMChain(
                llm=OpenAI(
                    temperature=0, openai_api_key=api, model_name="gpt-3.5-turbo"
                ),
                prompt=prompt,
            )
            agent = ZeroShotAgent(llm_chain=llm_chain, tools=tools, verbose=True)
            agent_chain = AgentExecutor.from_agent_and_tools(
                agent=agent, tools=tools, verbose=True, memory=st.session_state.memory
            )

            # Allow the user to enter a query and generate a response
            query = st.text_input(
                "**What's on your mind?**",
                placeholder=f"Ask me anything from {name_of_file}",
            )

            if query:
                with st.spinner(f"Generating Answer to your Query : `{query}` "):
                    res = agent_chain.run(query)
                    st.info(res, icon="🤖")

            # Allow the user to view the conversation history and other information stored in the agent's memory
            with st.expander("History/Memory"):
                st.write(st.session_state.memory)
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ langchain
2
+ openai
3
+ tiktoken
4
+ faiss-cpu
5
+ pypdf