Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -15,18 +15,25 @@ GROQ_API_KEY = os.getenv("GROQ_Api_Key")
|
|
15 |
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
|
16 |
embedding_model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
|
17 |
|
18 |
-
# List of
|
19 |
PDF_URLS = [
|
20 |
-
"https://
|
21 |
-
"https://
|
22 |
# Add more document links as needed
|
23 |
]
|
24 |
|
25 |
-
#
|
26 |
-
def
|
|
|
|
|
|
|
|
|
|
|
|
|
27 |
text = ""
|
28 |
for url in urls:
|
29 |
-
|
|
|
30 |
if response.status_code == 200:
|
31 |
pdf_file = BytesIO(response.content)
|
32 |
try:
|
@@ -61,15 +68,13 @@ def compute_embeddings(text_chunks):
|
|
61 |
# Create a FAISS vector store with embeddings
|
62 |
@st.cache_resource
|
63 |
def load_or_create_vector_store(text_chunks):
|
64 |
-
# Compute embeddings for text chunks
|
65 |
embeddings = compute_embeddings(text_chunks)
|
66 |
-
# Create FAISS vector store
|
67 |
vector_store = FAISS.from_texts(text_chunks, embeddings)
|
68 |
return vector_store
|
69 |
|
70 |
# Call Groq API for generating summary based on the query and retrieved text
|
71 |
def generate_summary_with_groq(query, retrieved_text):
|
72 |
-
url = "https://api.groq.com/v1/chat/completions"
|
73 |
headers = {
|
74 |
"Authorization": f"Bearer {GROQ_API_KEY}",
|
75 |
"Content-Type": "application/json"
|
@@ -96,10 +101,10 @@ def user_input(user_question, vector_store):
|
|
96 |
# Main function to run the Streamlit app
|
97 |
def main():
|
98 |
st.set_page_config(page_title="RAG-based PDF Chat", layout="centered", page_icon="π")
|
99 |
-
st.title("π Query PDF Documents on
|
100 |
|
101 |
-
# Load documents from
|
102 |
-
raw_text =
|
103 |
text_chunks = get_text_chunks(raw_text)
|
104 |
vector_store = load_or_create_vector_store(text_chunks)
|
105 |
|
|
|
15 |
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
|
16 |
embedding_model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
|
17 |
|
18 |
+
# Source PDF documents (Hugging Face "blob" URLs; converted to direct
# download links at fetch time).
PDF_URLS = [
    "https://huggingface.co/spaces/tahirsher/GenAI_Lawyers_Guide/blob/main/bi-partite.pdf",
    "https://huggingface.co/spaces/tahirsher/GenAI_Lawyers_Guide/blob/main/bi%20pat%20graphs.pdf",
    # Add more document links as needed
]
|
24 |
|
25 |
+
# Helper function to convert Hugging Face blob URLs to direct download URLs
def get_huggingface_raw_url(url):
    """Return the direct-download form of a Hugging Face blob URL.

    A link of the form ``.../blob/...`` on huggingface.co points at the
    HTML viewer page; swapping ``/blob/`` for ``/resolve/`` yields the raw
    file. Any other URL is returned unchanged.
    """
    is_hf_blob_link = "huggingface.co" in url and "/blob/" in url
    return url.replace("/blob/", "/resolve/") if is_hf_blob_link else url
|
30 |
+
|
31 |
+
# Fetch and extract text from PDF files hosted on Hugging Face
|
32 |
+
def fetch_pdf_text_from_huggingface(urls):
|
33 |
text = ""
|
34 |
for url in urls:
|
35 |
+
raw_url = get_huggingface_raw_url(url) # Convert to direct download link
|
36 |
+
response = requests.get(raw_url)
|
37 |
if response.status_code == 200:
|
38 |
pdf_file = BytesIO(response.content)
|
39 |
try:
|
|
|
68 |
# Create a FAISS vector store with embeddings
@st.cache_resource
def load_or_create_vector_store(text_chunks):
    """Build a FAISS vector store over *text_chunks*, cached by Streamlit.

    ``st.cache_resource`` memoizes the store across reruns so the
    embeddings are only computed once per session/input.
    """
    # NOTE(review): assumes compute_embeddings returns an object accepted
    # by FAISS.from_texts as its embedding function — confirm against the
    # compute_embeddings definition elsewhere in this file.
    return FAISS.from_texts(text_chunks, compute_embeddings(text_chunks))
|
74 |
|
75 |
# Call Groq API for generating summary based on the query and retrieved text
|
76 |
def generate_summary_with_groq(query, retrieved_text):
|
77 |
+
url = "https://api.groq.com/v1/chat/completions"
|
78 |
headers = {
|
79 |
"Authorization": f"Bearer {GROQ_API_KEY}",
|
80 |
"Content-Type": "application/json"
|
|
|
101 |
# Main function to run the Streamlit app
|
102 |
def main():
|
103 |
st.set_page_config(page_title="RAG-based PDF Chat", layout="centered", page_icon="π")
|
104 |
+
st.title("π Query PDF Documents on Hugging Face")
|
105 |
|
106 |
+
# Load documents from Hugging Face
|
107 |
+
raw_text = fetch_pdf_text_from_huggingface(PDF_URLS)
|
108 |
text_chunks = get_text_chunks(raw_text)
|
109 |
vector_store = load_or_create_vector_store(text_chunks)
|
110 |
|