HareemFatima committed on
Commit 0fd5dbd · verified · 1 Parent(s): 847496f

Create app.py

Files changed (1)
  1. app.py +220 -0
app.py ADDED
@@ -0,0 +1,220 @@
+ import os
+ import uuid
+ import json
+ import re
+ from bs4 import BeautifulSoup
+ import requests
+ import streamlit as st
+ from PyPDF2 import PdfReader
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ from langchain_community.vectorstores import FAISS
+ from langchain_community.llms import Ollama
+ from langchain.chains.question_answering import load_qa_chain
+ from langchain.prompts import PromptTemplate
+ from dotenv import load_dotenv
+ from langchain_community.embeddings import HuggingFaceEmbeddings
+ import nltk
+ from urllib.parse import urljoin, urlparse
+ from langchain.memory import ConversationBufferMemory
+
+ # Load environment variables (if needed for API keys)
+ load_dotenv()
+
+ # Initialize HuggingFace Embeddings
+ embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
+
+ # Download NLTK stopwords
+ nltk.download('stopwords')
+ from nltk.corpus import stopwords
+ STOPWORDS = set(stopwords.words('english'))
+
+ # Text Preprocessing Function
+ def preprocess_text(text):
+     text = re.sub(r'[^A-Za-z\s]', '', text)  # Remove special characters
+     text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces
+     text = text.lower()  # Convert to lowercase
+     tokens = text.split()
+     cleaned_text = " ".join([word for word in tokens if word not in STOPWORDS])  # Remove stopwords
+     return cleaned_text
+
+ # Function to Save Processed Data to a Document
+ def save_data_to_document(data, filename="processed_data.json"):
+     with open(filename, 'w') as f:
+         json.dump(data, f, indent=4)
+     st.success(f"Data has been saved to {filename}")
+
+ # Scrape Website with BeautifulSoup
+ def scrape_website(url):
+     visited_urls = set()
+     scraped_data = {}
+
+     def scrape_page(url):
+         if url in visited_urls:
+             return
+         visited_urls.add(url)
+
+         try:
+             headers = {
+                 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
+                 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
+                 'Accept-Language': 'en-US,en;q=0.5',
+                 'Connection': 'keep-alive',
+             }
+             response = requests.get(url, headers=headers, timeout=10)  # Timeout so one slow page cannot hang the app
+
+         except requests.RequestException as e:
+             st.error(f"Failed to retrieve {url}: {e}")
+             return
+
+         soup = BeautifulSoup(response.content, 'html.parser')
+
+         # Extract relevant content
+         relevant_tags = ['p', 'strong', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li', 'span', 'div']
+         content = []
+         for tag in relevant_tags:
+             for element in soup.find_all(tag):
+                 text = element.get_text(strip=True)
+                 if text:
+                     content.append(text)
+
+         if content:
+             scraped_data[url] = " ".join(content)
+
+         # Find and process all internal links on the page
+         for link in soup.find_all('a', href=True):
+             next_url = urljoin(url, link['href'])
+             if urlparse(next_url).netloc == urlparse(url).netloc and next_url not in visited_urls:
+                 scrape_page(next_url)
+
+     scrape_page(url)
+     return scraped_data
+
+ # PDF Text Extraction
+ def get_pdf_text(pdf_docs):
+     text = ""
+     for pdf in pdf_docs:
+         pdf_reader = PdfReader(pdf)
+         for page in pdf_reader.pages:
+             text += page.extract_text() or ""  # Handle None
+     return preprocess_text(text)
+
+ # Split Text into Manageable Chunks
+ def get_text_chunks(text):
+     text_splitter = RecursiveCharacterTextSplitter(chunk_size=15000, chunk_overlap=1000)
+     chunks = text_splitter.split_text(text)
+     return chunks
+
+ # Create FAISS Vector Store with UUID
+ def create_faiss_with_uuid(text_chunks):
+     # Generate a unique UUID for this document
+     unique_id = str(uuid.uuid4())  # Generate unique identifier
+
+     # Create a new FAISS index for the document
+     vector_store = FAISS.from_texts(text_chunks, embeddings)  # Create FAISS from chunks
+
+     # Define a directory to store the FAISS index (using the UUID as part of the directory name)
+     faiss_directory = f'./faiss_index_{unique_id}'
+     os.makedirs(faiss_directory, exist_ok=True)
+
+     # Save the FAISS index in a directory with the UUID
+     vector_store.save_local(faiss_directory)  # Save locally with a unique directory name
+
+     return unique_id, faiss_directory  # Return the UUID and the directory path
+
+ # Build Conversational Chain
+ def get_conversational_chain(memory):
+     prompt_template = """
+     Answer the question as detailed as possible from the provided context. If the answer is not in
+     provided context, just say, "answer is not available in the context." Don't provide the wrong answer.\n\n
+     Context:\n {context}\n
+     Question: \n{question}\n
+
+     Answer:
+     """
+     model = Ollama(model="phi")  # Initialize the local Ollama model (phi)
+     prompt = PromptTemplate(template=prompt_template, input_variables=["context", "question"])
+
+     # Attach memory so each exchange is recorded (the prompt above does not reference chat_history, so stored turns are not injected into it)
+     chain = load_qa_chain(model, chain_type="stuff", prompt=prompt, memory=memory)
+
+     return chain
+
+ # Handle User Input and Process Questions with UUID-based FAISS Index
+ def user_input(user_question, faiss_directory, memory):
+     # Load the FAISS index based on the given directory (UUID-based)
+     new_db = FAISS.load_local(faiss_directory, embeddings, allow_dangerous_deserialization=True)
+
+     # Perform similarity search and answer the user's question
+     docs = new_db.similarity_search(user_question)
+     chain = get_conversational_chain(memory)
+
+     # Run the chain; the memory attached to the chain records this question/answer exchange automatically
+     response = chain({"input_documents": docs, "question": user_question}, return_only_outputs=True)
+
+     st.write("Reply: ", response["output_text"])
+
+ # Main Function for Streamlit App
+ def main():
+     st.set_page_config("Chat PDF & URL", layout="wide")
+     st.header("Chat with PDF or URL using Ollama 💁")
+
+     # Keep conversation memory in session state so it survives Streamlit reruns; input_key marks the user's question as the turn to record
+     if "memory" not in st.session_state:
+         st.session_state["memory"] = ConversationBufferMemory(memory_key="chat_history", input_key="question", return_messages=True)
+     memory = st.session_state["memory"]
+
+     user_question = st.text_input("Ask a Question from the Processed Data")
+
+     if user_question and 'faiss_directory' in st.session_state:
+         faiss_directory = st.session_state['faiss_directory']
+         user_input(user_question, faiss_directory, memory)
+
+     with st.sidebar:
+         st.title("Menu:")
+         # User selects between PDF or URL
+         option = st.radio("Choose input type:", ("PDF", "URL"))
+
+         if option == "PDF":
+             pdf_docs = st.file_uploader("Upload PDF Files:", accept_multiple_files=True)
+             if st.button("Submit & Process"):
+                 with st.spinner("Processing..."):
+                     if pdf_docs:
+                         raw_text = get_pdf_text(pdf_docs)
+                         text_chunks = get_text_chunks(raw_text)
+                         unique_id, faiss_directory = create_faiss_with_uuid(text_chunks)
+                         st.session_state['faiss_directory'] = faiss_directory
+
+                         # Save the cleaned PDF data to a document
+                         save_data_to_document({"pdf_data": raw_text}, f"pdf_data_{unique_id}.json")
+
+                         st.success("PDF data is ready for queries!")
+                     else:
+                         st.error("No PDF files were uploaded.")
+
+         elif option == "URL":
+             url_input = st.text_input("Enter a URL to scrape text:")
+             if st.button("Submit & Process"):
+                 with st.spinner("Processing..."):
+                     if url_input:
+                         try:
+                             # Run BeautifulSoup and get scraped data
+                             scraped_data = scrape_website(url_input)
+
+                             # Combine and preprocess scraped data
+                             raw_text = preprocess_text(" ".join(scraped_data.values()))
+
+                             # Split text into chunks and index in FAISS
+                             text_chunks = get_text_chunks(raw_text)
+                             unique_id, faiss_directory = create_faiss_with_uuid(text_chunks)
+                             st.session_state['faiss_directory'] = faiss_directory
+
+                             # Save the cleaned URL data to a document
+                             save_data_to_document({"url_data": scraped_data}, f"url_data_{unique_id}.json")
+
+                             st.success("Scraped data is ready for queries!")
+                         except Exception as e:
+                             st.error(f"Failed to scrape or process data: {e}")
+                     else:
+                         st.error("No URL was provided.")
+
+ if __name__ == "__main__":
+     main()
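
A minimal usage sketch (not part of this commit) showing how the indexing and querying helpers in app.py compose outside the Streamlit UI. It assumes a local Ollama server with the phi model pulled, network access to download the sentence-transformers/all-mpnet-base-v2 embeddings, and that app.py is importable from the working directory; the sample text and question are placeholders.

# Hypothetical usage of the helpers defined in app.py (illustrative only).
from langchain.memory import ConversationBufferMemory
from langchain_community.vectorstores import FAISS

from app import preprocess_text, get_text_chunks, create_faiss_with_uuid, get_conversational_chain, embeddings

# Index a small placeholder document.
raw_text = preprocess_text("FAISS is a library for efficient similarity search over dense vectors.")
chunks = get_text_chunks(raw_text)
unique_id, faiss_dir = create_faiss_with_uuid(chunks)

# Reload the per-document index and retrieve chunks relevant to a question.
db = FAISS.load_local(faiss_dir, embeddings, allow_dangerous_deserialization=True)
question = "What is FAISS used for?"
docs = db.similarity_search(question)

# Answer with the Ollama-backed QA chain; input_key="question" tells the memory which input to record.
memory = ConversationBufferMemory(memory_key="chat_history", input_key="question", return_messages=True)
chain = get_conversational_chain(memory)
result = chain({"input_documents": docs, "question": question}, return_only_outputs=True)
print(result["output_text"])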