rajesh1729 committed (verified)
Commit: c316c4f · Parent: bca9228

Update app.py

Files changed (1):
  app.py  +72 -43
app.py CHANGED
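
For orientation: the hunks below call PyPDFLoader, RecursiveCharacterTextSplitter, Chroma, OpenAIEmbeddings, ChatOpenAI, ConversationalRetrievalChain, and ConversationBufferMemory, but the file's import block never appears in the diff context. A hypothetical header consistent with those names, assuming the classic single-package langchain layout (newer releases move these into langchain_community and langchain_openai), would be:

# Hypothetical import block inferred from the identifiers in the diff;
# the actual imports in app.py are not part of this commit's context.
import os

import streamlit as st
from langchain.chains import ConversationalRetrievalChain
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.memory import ConversationBufferMemory
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma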
@@ -13,8 +13,8 @@ if "messages" not in st.session_state:
     st.session_state.messages = []
 if "chain" not in st.session_state:
     st.session_state.chain = None
-if "processed_pdfs" not in st.session_state:
-    st.session_state.processed_pdfs = False
+if "vectorstore" not in st.session_state:  # Added vectorstore to session state
+    st.session_state.vectorstore = None
 
 def create_sidebar():
     with st.sidebar:
@@ -35,46 +35,62 @@ def create_sidebar():
     return api_key
 
 def process_pdfs(papers, api_key):
-    if papers and not st.session_state.processed_pdfs:
-        with st.spinner("Processing PDFs..."):
-            texts = []
-            for paper in papers:
-                try:
-                    file_path = os.path.join('./uploads', paper.name)
-                    os.makedirs('./uploads', exist_ok=True)
-                    with open(file_path, "wb") as f:
-                        f.write(paper.getbuffer())
-
-                    loader = PyPDFLoader(file_path)
-                    documents = loader.load()
-                    text_splitter = RecursiveCharacterTextSplitter(
-                        chunk_size=1000,
-                        chunk_overlap=200,
-                        length_function=len,
-                        is_separator_regex=False,
-                    )
-                    texts.extend(text_splitter.split_documents(documents))
-                    os.remove(file_path)
-                except Exception as e:
-                    st.error(f"Error processing {paper.name}: {str(e)}")
+    """Process PDFs and return whether processing was successful"""
+    if not papers:
+        return False
+
+    with st.spinner("Processing PDFs..."):
+        try:
+            # Create embeddings instance
+            embeddings = OpenAIEmbeddings(openai_api_key=api_key)
 
-            if texts:
-                embedding = OpenAIEmbeddings(openai_api_key=api_key)
-                vectorstore = Chroma(embedding_function=embedding, persist_directory="db")
-                vectorstore.add_documents(texts)
+            # Process all PDFs
+            all_texts = []
+            for paper in papers:
+                # Save and load PDF
+                file_path = os.path.join('./uploads', paper.name)
+                os.makedirs('./uploads', exist_ok=True)
+                with open(file_path, "wb") as f:
+                    f.write(paper.getbuffer())
 
-                st.session_state.chain = ConversationalRetrievalChain.from_llm(
-                    ChatOpenAI(temperature=0, model_name="gpt-3.5-turbo", openai_api_key=api_key),
-                    vectorstore.as_retriever(),
-                    memory=ConversationBufferMemory(
-                        memory_key="chat_history",
-                        return_messages=True
-                    )
+                # Load and split the PDF
+                loader = PyPDFLoader(file_path)
+                documents = loader.load()
+                text_splitter = RecursiveCharacterTextSplitter(
+                    chunk_size=1000,
+                    chunk_overlap=200,
                 )
-                st.session_state.processed_pdfs = True
-                st.success("PDFs processed successfully!")
-            return texts
-    return []
+                texts = text_splitter.split_documents(documents)
+                all_texts.extend(texts)
+
+                # Cleanup
+                os.remove(file_path)
+
+            # Create new vectorstore
+            st.session_state.vectorstore = Chroma.from_documents(
+                documents=all_texts,
+                embedding=embeddings,
+            )
+
+            # Create chain
+            st.session_state.chain = ConversationalRetrievalChain.from_llm(
+                llm=ChatOpenAI(temperature=0, model_name="gpt-3.5-turbo", openai_api_key=api_key),
+                retriever=st.session_state.vectorstore.as_retriever(
+                    search_kwargs={"k": 3}  # Retrieve top 3 most relevant chunks
+                ),
+                memory=ConversationBufferMemory(
+                    memory_key="chat_history",
+                    return_messages=True,
+                ),
+                return_source_documents=True,  # Include source documents in response
+            )
+
+            st.success(f"Processed {len(papers)} PDF(s) successfully!")
+            return True
+
+        except Exception as e:
+            st.error(f"Error processing PDFs: {str(e)}")
+            return False
 
 def main():
     st.set_page_config(page_title="PDF Chat")
@@ -91,8 +107,10 @@ def main():
     # File uploader
     papers = st.file_uploader("Upload PDFs", type=["pdf"], accept_multiple_files=True)
 
-    # Process PDFs if needed
-    texts = process_pdfs(papers, api_key)
+    # Process PDFs button
+    if papers:
+        if st.button("Process PDFs"):
+            process_pdfs(papers, api_key)
 
     # Display chat messages from history
     for message in st.session_state.messages:
@@ -110,12 +128,23 @@ def main():
 
         # Generate and display assistant response
        with st.chat_message("assistant"):
-            if not st.session_state.processed_pdfs:
-                response = "Please upload a PDF first."
+            if st.session_state.chain is None:
+                response = "Please upload and process a PDF first."
             else:
                 with st.spinner("Thinking..."):
+                    # Get response with source documents
                     result = st.session_state.chain({"question": prompt})
                     response = result["answer"]
+
+                    # Optionally show sources
+                    if "source_documents" in result:
+                        sources = result["source_documents"]
+                        if sources:
+                            response += "\n\nSources:"
+                            for i, doc in enumerate(sources, 1):
+                                # Add page numbers if available
+                                page_info = f" (Page {doc.metadata['page'] + 1})" if 'page' in doc.metadata else ""
+                                response += f"\n{i}.{page_info} {doc.page_content[:200]}..."
 
             st.markdown(response)
             st.session_state.messages.append({"role": "assistant", "content": response})
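
With return_source_documents=True, the chain call in main() now returns a dict rather than a bare answer. A minimal sketch of the shape the new code relies on (the question string is illustrative, not from the commit):

# Illustrative call, mirroring the chain usage added above.
result = st.session_state.chain({"question": "What does the paper conclude?"})
answer = result["answer"]                       # shown via st.markdown
for doc in result.get("source_documents", []):  # the k=3 retrieved chunks
    page = doc.metadata.get("page")             # PyPDFLoader pages are 0-based, hence the +1
    snippet = doc.page_content[:200]            # what the app appends under "Sources:"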
 
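One caveat worth flagging for this revision: in a number of langchain releases, pairing ConversationBufferMemory with return_source_documents=True gives the chain two output keys, and saving to memory then fails with a multiple-output-keys error unless the memory is told which key to store. If that error appears, a hedged fix is to construct the memory with an explicit output_key:

# Possible follow-up fix, not part of this commit: pin the key the memory stores.
memory = ConversationBufferMemory(
    memory_key="chat_history",
    return_messages=True,
    output_key="answer",  # avoids ambiguity between "answer" and "source_documents"
)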