Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,21 +1,28 @@
|
|
| 1 |
import os
|
|
|
|
|
|
|
|
|
|
| 2 |
from PyPDF2 import PdfReader
|
| 3 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
| 4 |
from langchain_community.vectorstores import FAISS
|
| 5 |
-
import
|
| 6 |
-
import
|
| 7 |
-
from io import BytesIO
|
| 8 |
|
| 9 |
# Groq API key, read from the environment (None when the variable is unset).
GROQ_API_KEY = os.environ.get("GROQ_Api_Key")
|
| 11 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12 |
# List of GitHub PDF URLs
|
| 13 |
PDF_URLS = [
|
| 14 |
-
"https://github.com/TahirSher/GenAI_Lawyers_Guide/blob/main/bi%20pat%20graphs.pdf",
|
| 15 |
"https://github.com/TahirSher/GenAI_Lawyers_Guide/blob/main/bi-partite.pdf",
|
|
|
|
| 16 |
# Add more document links as needed
|
| 17 |
]
|
| 18 |
|
|
|
|
| 19 |
def fetch_pdf_text_from_github(urls):
|
| 20 |
text = ""
|
| 21 |
for url in urls:
|
|
@@ -34,16 +41,30 @@ def fetch_pdf_text_from_github(urls):
|
|
| 34 |
st.error(f"Failed to fetch PDF from URL: {url}")
|
| 35 |
return text
|
| 36 |
|
|
|
|
| 37 |
@st.cache_data
def get_text_chunks(text):
    """Split ``text`` into overlapping chunks.

    Uses a ``RecursiveCharacterTextSplitter`` configured with a chunk size of
    10000 and an overlap of 1000.

    Args:
        text: The full text to split.

    Returns:
        The list of chunks produced by the splitter.
    """
    splitter = RecursiveCharacterTextSplitter(chunk_size=10000, chunk_overlap=1000)
    return splitter.split_text(text)
|
| 42 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 43 |
@st.cache_resource
|
| 44 |
def load_or_create_vector_store(text_chunks):
|
| 45 |
-
embeddings
|
| 46 |
-
|
|
|
|
|
|
|
| 47 |
return vector_store
|
| 48 |
|
| 49 |
# Call Groq API for generating summary based on the query and retrieved text
|
|
@@ -66,11 +87,13 @@ def generate_summary_with_groq(query, retrieved_text):
|
|
| 66 |
st.error("Failed to generate summary with Groq API")
|
| 67 |
return "Error in Groq API response"
|
| 68 |
|
|
|
|
| 69 |
def user_input(user_question, vector_store):
    """Answer ``user_question`` from the documents held in ``vector_store``.

    Runs a similarity search for the question, joins the page content of the
    returned documents with single spaces, and passes the question plus that
    context to ``generate_summary_with_groq``.

    Args:
        user_question: The question text entered by the user.
        vector_store: Object exposing ``similarity_search(query)``.

    Returns:
        Whatever ``generate_summary_with_groq`` returns for the question and
        the retrieved context.
    """
    matches = vector_store.similarity_search(user_question)
    context_text = " ".join(match.page_content for match in matches)
    return generate_summary_with_groq(user_question, context_text)
|
| 73 |
|
|
|
|
| 74 |
def main():
|
| 75 |
st.set_page_config(page_title="RAG-based PDF Chat", layout="centered", page_icon="📄")
|
| 76 |
st.title("📄 Query PDF Documents on GitHub")
|
|
|
|
| 1 |
import os
|
| 2 |
+
import requests
|
| 3 |
+
import streamlit as st
|
| 4 |
+
from io import BytesIO
|
| 5 |
from PyPDF2 import PdfReader
|
| 6 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
| 7 |
from langchain_community.vectorstores import FAISS
|
| 8 |
+
from transformers import AutoModel, AutoTokenizer
|
| 9 |
+
import torch
|
|
|
|
| 10 |
|
| 11 |
# Groq API key, read from the environment (None when the variable is unset).
GROQ_API_KEY = os.getenv("GROQ_Api_Key")

# Tokenizer and encoder for the embedding checkpoint. Both are loaded once at
# import time and used as module-level globals by compute_embeddings.
EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
tokenizer = AutoTokenizer.from_pretrained(EMBEDDING_MODEL_NAME)
embedding_model = AutoModel.from_pretrained(EMBEDDING_MODEL_NAME)
|
| 17 |
+
|
| 18 |
# List of GitHub PDF URLs
|
| 19 |
# NOTE(review): these are github.com "blob" page URLs; confirm that the fetch
# step resolves them to the raw PDF bytes rather than the HTML viewer page.
PDF_URLS = [
    "https://github.com/TahirSher/GenAI_Lawyers_Guide/blob/main/bi-partite.pdf",
    "https://github.com/TahirSher/GenAI_Lawyers_Guide/blob/main/bi%20pat%20graphs.pdf",
    # Add more document links as needed
]
|
| 24 |
|
| 25 |
+
# Fetch and extract text from PDF files hosted on GitHub
|
| 26 |
def fetch_pdf_text_from_github(urls):
|
| 27 |
text = ""
|
| 28 |
for url in urls:
|
|
|
|
| 41 |
st.error(f"Failed to fetch PDF from URL: {url}")
|
| 42 |
return text
|
| 43 |
|
| 44 |
+
# Split text into manageable chunks
|
| 45 |
@st.cache_data
def get_text_chunks(text):
    """Split ``text`` into overlapping chunks.

    Uses a ``RecursiveCharacterTextSplitter`` configured with a chunk size of
    10000 and an overlap of 1000.

    Args:
        text: The full text to split.

    Returns:
        The list of chunks produced by the splitter.
    """
    splitter = RecursiveCharacterTextSplitter(chunk_size=10000, chunk_overlap=1000)
    return splitter.split_text(text)
|
| 50 |
|
| 51 |
+
# Compute embeddings for text chunks
|
| 52 |
+
def compute_embeddings(text_chunks):
    """Embed each chunk with the module-level tokenizer and encoder.

    Every chunk is tokenized on its own (with truncation enabled), run through
    ``embedding_model`` without gradient tracking, and reduced to a single
    vector by averaging ``last_hidden_state`` over dimension 1.

    Args:
        text_chunks: Iterable of strings to embed.

    Returns:
        A list with one NumPy array per input chunk, in input order.
    """
    vectors = []
    for chunk in text_chunks:
        # NOTE(review): truncation=True presumably drops tokens past the
        # model's maximum length, so long chunks are only partly embedded.
        encoded = tokenizer(chunk, return_tensors="pt", padding=True, truncation=True)
        with torch.no_grad():
            hidden_states = embedding_model(**encoded).last_hidden_state
            pooled = hidden_states.mean(dim=1)
        vectors.append(pooled.squeeze().numpy())
    return vectors
|
| 60 |
+
|
| 61 |
+
# Create a FAISS vector store with embeddings
|
| 62 |
@st.cache_resource
def load_or_create_vector_store(text_chunks):
    """Build an in-memory FAISS vector store over ``text_chunks``.

    The chunk vectors come from ``compute_embeddings`` and are handed to
    ``FAISS.from_embeddings`` as (text, vector) pairs. ``FAISS.from_texts``
    cannot be used with precomputed vectors: its second argument must be an
    embeddings object, so passing a list of arrays fails as soon as the store
    tries to call ``embed_documents`` on it.

    Args:
        text_chunks: Sequence of text chunks to index.

    Returns:
        A FAISS vector store whose queries are embedded with the same model
        as the indexed chunks.

    Raises:
        ValueError: If ``text_chunks`` is empty.
    """
    texts = list(text_chunks)
    if not texts:
        raise ValueError("Cannot build a vector store from an empty list of text chunks")

    vectors = compute_embeddings(texts)

    def embed_query(query):
        # Queries must go through the same encoder as the chunks, otherwise
        # distances between query and chunk vectors are meaningless.
        return compute_embeddings([query])[0].tolist()

    # NOTE(review): FAISS accepts a plain callable as the query embedder but
    # logs a warning that an Embeddings object is preferred; consider an
    # adapter class if that warning becomes a hard error in a later release.
    return FAISS.from_embeddings(
        text_embeddings=list(zip(texts, vectors)),
        embedding=embed_query,
    )
|
| 69 |
|
| 70 |
# Call Groq API for generating summary based on the query and retrieved text
|
|
|
|
| 87 |
st.error("Failed to generate summary with Groq API")
|
| 88 |
return "Error in Groq API response"
|
| 89 |
|
| 90 |
+
# Generate response for user query
|
| 91 |
def user_input(user_question, vector_store):
    """Answer ``user_question`` from the documents held in ``vector_store``.

    Runs a similarity search for the question, joins the page content of the
    returned documents with single spaces, and passes the question plus that
    context to ``generate_summary_with_groq``.

    Args:
        user_question: The question text entered by the user.
        vector_store: Object exposing ``similarity_search(query)``.

    Returns:
        Whatever ``generate_summary_with_groq`` returns for the question and
        the retrieved context.
    """
    matches = vector_store.similarity_search(user_question)
    context_text = " ".join(match.page_content for match in matches)
    return generate_summary_with_groq(user_question, context_text)
|
| 95 |
|
| 96 |
+
# Main function to run the Streamlit app
|
| 97 |
def main():
|
| 98 |
st.set_page_config(page_title="RAG-based PDF Chat", layout="centered", page_icon="📄")
|
| 99 |
st.title("📄 Query PDF Documents on GitHub")
|