Spaces:

tahirsher
/

GenAI_Lawyers_Guide

Sleeping

App Files Files Community

tahirsher committed on Nov 8, 2024

Commit

ed3b297

verified ·

1 Parent(s): 3cf9170

Update app.py

Browse files

Files changed (1) hide show

app.py +29 -6

app.py CHANGED Viewed

@@ -1,21 +1,28 @@
 import os
 from PyPDF2 import PdfReader
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain_community.vectorstores import FAISS
-import streamlit as st
-import requests
-from io import BytesIO
 # Set up Groq API key
 GROQ_API_KEY = os.getenv("GROQ_Api_Key")
 # List of GitHub PDF URLs
 PDF_URLS = [
-    "https://github.com/TahirSher/GenAI_Lawyers_Guide/blob/main/bi%20pat%20graphs.pdf",
     "https://github.com/TahirSher/GenAI_Lawyers_Guide/blob/main/bi-partite.pdf",
     # Add more document links as needed
 ]
 def fetch_pdf_text_from_github(urls):
     text = ""
     for url in urls:
@@ -34,16 +41,30 @@ def fetch_pdf_text_from_github(urls):
             st.error(f"Failed to fetch PDF from URL: {url}")
     return text
 @st.cache_data
 def get_text_chunks(text):
     text_splitter = RecursiveCharacterTextSplitter(chunk_size=10000, chunk_overlap=1000)
     chunks = text_splitter.split_text(text)
     return chunks
 @st.cache_resource
 def load_or_create_vector_store(text_chunks):
-    embeddings = FAISS.get_default_embeddings()
-    vector_store = FAISS.from_texts(text_chunks, embedding=embeddings)
     return vector_store
 # Call Groq API for generating summary based on the query and retrieved text
@@ -66,11 +87,13 @@ def generate_summary_with_groq(query, retrieved_text):
         st.error("Failed to generate summary with Groq API")
         return "Error in Groq API response"
 def user_input(user_question, vector_store):
     docs = vector_store.similarity_search(user_question)
     context_text = " ".join([doc.page_content for doc in docs])
     return generate_summary_with_groq(user_question, context_text)
 def main():
     st.set_page_config(page_title="RAG-based PDF Chat", layout="centered", page_icon="📄")
     st.title("📄 Query PDF Documents on GitHub")

 import os
+import requests
+import streamlit as st
+from io import BytesIO
 from PyPDF2 import PdfReader
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain_community.vectorstores import FAISS
+from transformers import AutoModel, AutoTokenizer
+import torch
 # Set up Groq API key
 GROQ_API_KEY = os.getenv("GROQ_Api_Key")
+# Initialize embedding model (using sentence-transformers model)
+tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
+embedding_model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
 # List of GitHub PDF URLs
 PDF_URLS = [
     "https://github.com/TahirSher/GenAI_Lawyers_Guide/blob/main/bi-partite.pdf",
+    "https://github.com/TahirSher/GenAI_Lawyers_Guide/blob/main/bi%20pat%20graphs.pdf",
     # Add more document links as needed
 ]
+# Fetch and extract text from PDF files hosted on GitHub
 def fetch_pdf_text_from_github(urls):
     text = ""
     for url in urls:
             st.error(f"Failed to fetch PDF from URL: {url}")
     return text
+# Split text into manageable chunks
 @st.cache_data
 def get_text_chunks(text):
     text_splitter = RecursiveCharacterTextSplitter(chunk_size=10000, chunk_overlap=1000)
     chunks = text_splitter.split_text(text)
     return chunks
+# Compute embeddings for text chunks
+def compute_embeddings(text_chunks):
+    embeddings = []
+    for text in text_chunks:
+        inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
+        with torch.no_grad():
+            model_output = embedding_model(**inputs)
+        embeddings.append(model_output.last_hidden_state.mean(dim=1).squeeze().numpy())
+    return embeddings
+# Create a FAISS vector store with embeddings
 @st.cache_resource
 def load_or_create_vector_store(text_chunks):
+    # Compute embeddings for text chunks
+    embeddings = compute_embeddings(text_chunks)
+    # Create FAISS vector store
+    vector_store = FAISS.from_texts(text_chunks, embeddings)
     return vector_store
 # Call Groq API for generating summary based on the query and retrieved text
         st.error("Failed to generate summary with Groq API")
         return "Error in Groq API response"
+# Generate response for user query
 def user_input(user_question, vector_store):
     docs = vector_store.similarity_search(user_question)
     context_text = " ".join([doc.page_content for doc in docs])
     return generate_summary_with_groq(user_question, context_text)
+# Main function to run the Streamlit app
 def main():
     st.set_page_config(page_title="RAG-based PDF Chat", layout="centered", page_icon="📄")
     st.title("📄 Query PDF Documents on GitHub")