rajesh1729 committed (verified)
Commit: c316c4f · Parent: bca9228

Update app.py

Files changed (1):
  app.py  +72 -43
app.py CHANGED
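
For orientation: the hunks below call PyPDFLoader, RecursiveCharacterTextSplitter, Chroma, OpenAIEmbeddings, ChatOpenAI, ConversationalRetrievalChain, and ConversationBufferMemory, but the file's import block never appears in the diff context. A hypothetical header consistent with those names, assuming the classic single-package langchain layout (newer releases move these into langchain_community and langchain_openai), would be:

# Hypothetical import block inferred from the identifiers in the diff;
# the actual imports in app.py are not part of this commit's context.
import os

import streamlit as st
from langchain.chains import ConversationalRetrievalChain
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.memory import ConversationBufferMemory
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma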
@@ -13,8 +13,8 @@ if "messages" not in st.session_state:
     st.session_state.messages = []
 if "chain" not in st.session_state:
     st.session_state.chain = None
-if "processed_pdfs" not in st.session_state:
-    st.session_state.processed_pdfs = False
+if "vectorstore" not in st.session_state:  # Added vectorstore to session state
+    st.session_state.vectorstore = None
 
 def create_sidebar():
     with st.sidebar:
@@ -35,46 +35,62 @@ def create_sidebar():
     return api_key
 
 def process_pdfs(papers, api_key):
-    if papers and not st.session_state.processed_pdfs:
-        with st.spinner("Processing PDFs..."):
-            texts = []
-            for paper in papers:
-                try:
-                    file_path = os.path.join('./uploads', paper.name)
-                    os.makedirs('./uploads', exist_ok=True)
-                    with open(file_path, "wb") as f:
-                        f.write(paper.getbuffer())
-
-                    loader = PyPDFLoader(file_path)
-                    documents = loader.load()
-                    text_splitter = RecursiveCharacterTextSplitter(
-                        chunk_size=1000,
-                        chunk_overlap=200,
-                        length_function=len,
-                        is_separator_regex=False,
-                    )
-                    texts.extend(text_splitter.split_documents(documents))
-                    os.remove(file_path)
-                except Exception as e:
-                    st.error(f"Error processing {paper.name}: {str(e)}")
+    """Process PDFs and return whether processing was successful"""
+    if not papers:
+        return False
+
+    with st.spinner("Processing PDFs..."):
+        try:
+            # Create embeddings instance
+            embeddings = OpenAIEmbeddings(openai_api_key=api_key)
 
-            if texts:
-                embedding = OpenAIEmbeddings(openai_api_key=api_key)
-                vectorstore = Chroma(embedding_function=embedding, persist_directory="db")
-                vectorstore.add_documents(texts)
+            # Process all PDFs
+            all_texts = []
+            for paper in papers:
+                # Save and load PDF
+                file_path = os.path.join('./uploads', paper.name)
+                os.makedirs('./uploads', exist_ok=True)
+                with open(file_path, "wb") as f:
+                    f.write(paper.getbuffer())
 
-                st.session_state.chain = ConversationalRetrievalChain.from_llm(
-                    ChatOpenAI(temperature=0, model_name="gpt-3.5-turbo", openai_api_key=api_key),
-                    vectorstore.as_retriever(),
-                    memory=ConversationBufferMemory(
-                        memory_key="chat_history",
-                        return_messages=True
-                    )
+                # Load and split the PDF
+                loader = PyPDFLoader(file_path)
+                documents = loader.load()
+                text_splitter = RecursiveCharacterTextSplitter(
+                    chunk_size=1000,
+                    chunk_overlap=200,
                 )
-                st.session_state.processed_pdfs = True
-                st.success("PDFs processed successfully!")
-            return texts
-    return []
+                texts = text_splitter.split_documents(documents)
+                all_texts.extend(texts)
+
+                # Cleanup
+                os.remove(file_path)
+
+            # Create new vectorstore
+            st.session_state.vectorstore = Chroma.from_documents(
+                documents=all_texts,
+                embedding=embeddings,
+            )
+
+            # Create chain
+            st.session_state.chain = ConversationalRetrievalChain.from_llm(
+                llm=ChatOpenAI(temperature=0, model_name="gpt-3.5-turbo", openai_api_key=api_key),
+                retriever=st.session_state.vectorstore.as_retriever(
+                    search_kwargs={"k": 3}  # Retrieve top 3 most relevant chunks
+                ),
+                memory=ConversationBufferMemory(
+                    memory_key="chat_history",
+                    return_messages=True,
+                ),
+                return_source_documents=True,  # Include source documents in response
+            )
+
+            st.success(f"Processed {len(papers)} PDF(s) successfully!")
+            return True
+
+        except Exception as e:
+            st.error(f"Error processing PDFs: {str(e)}")
+            return False
 
 def main():
     st.set_page_config(page_title="PDF Chat")
@@ -91,8 +107,10 @@ def main():
     # File uploader
     papers = st.file_uploader("Upload PDFs", type=["pdf"], accept_multiple_files=True)
 
-    # Process PDFs if needed
-    texts = process_pdfs(papers, api_key)
+    # Process PDFs button
+    if papers:
+        if st.button("Process PDFs"):
+            process_pdfs(papers, api_key)
 
     # Display chat messages from history
     for message in st.session_state.messages:
@@ -110,12 +128,23 @@ def main():
 
         # Generate and display assistant response
        with st.chat_message("assistant"):
-            if not st.session_state.processed_pdfs:
-                response = "Please upload a PDF first."
+            if st.session_state.chain is None:
+                response = "Please upload and process a PDF first."
             else:
                 with st.spinner("Thinking..."):
+                    # Get response with source documents
                     result = st.session_state.chain({"question": prompt})
                     response = result["answer"]
+
+                    # Optionally show sources
+                    if "source_documents" in result:
+                        sources = result["source_documents"]
+                        if sources:
+                            response += "\n\nSources:"
+                            for i, doc in enumerate(sources, 1):
+                                # Add page numbers if available
+                                page_info = f" (Page {doc.metadata['page'] + 1})" if 'page' in doc.metadata else ""
+                                response += f"\n{i}.{page_info} {doc.page_content[:200]}..."
 
             st.markdown(response)
             st.session_state.messages.append({"role": "assistant", "content": response})
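
With return_source_documents=True, the chain call in main() now returns a dict rather than a bare answer. A minimal sketch of the shape the new code relies on (the question string is illustrative, not from the commit):

# Illustrative call, mirroring the chain usage added above.
result = st.session_state.chain({"question": "What does the paper conclude?"})
answer = result["answer"]                       # shown via st.markdown
for doc in result.get("source_documents", []):  # the k=3 retrieved chunks
    page = doc.metadata.get("page")             # PyPDFLoader pages are 0-based, hence the +1
    snippet = doc.page_content[:200]            # what the app appends under "Sources:"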
 
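One caveat worth flagging for this revision: in a number of langchain releases, pairing ConversationBufferMemory with return_source_documents=True gives the chain two output keys, and saving to memory then fails with a multiple-output-keys error unless the memory is told which key to store. If that error appears, a hedged fix is to construct the memory with an explicit output_key:

# Possible follow-up fix, not part of this commit: pin the key the memory stores.
memory = ConversationBufferMemory(
    memory_key="chat_history",
    return_messages=True,
    output_key="answer",  # avoids ambiguity between "answer" and "source_documents"
)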