Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -15,18 +15,25 @@ GROQ_API_KEY = os.getenv("GROQ_Api_Key")
|
|
15 |
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
|
16 |
embedding_model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
|
17 |
|
18 |
-
# List of
|
19 |
PDF_URLS = [
|
20 |
-
"https://
|
21 |
-
"https://
|
22 |
# Add more document links as needed
|
23 |
]
|
24 |
|
25 |
-
#
|
26 |
-
def
|
|
|
|
|
|
|
|
|
|
|
|
|
27 |
text = ""
|
28 |
for url in urls:
|
29 |
-
|
|
|
30 |
if response.status_code == 200:
|
31 |
pdf_file = BytesIO(response.content)
|
32 |
try:
|
@@ -61,15 +68,13 @@ def compute_embeddings(text_chunks):
|
|
61 |
# Create a FAISS vector store with embeddings
|
62 |
@st.cache_resource
|
63 |
def load_or_create_vector_store(text_chunks):
|
64 |
-
# Compute embeddings for text chunks
|
65 |
embeddings = compute_embeddings(text_chunks)
|
66 |
-
# Create FAISS vector store
|
67 |
vector_store = FAISS.from_texts(text_chunks, embeddings)
|
68 |
return vector_store
|
69 |
|
70 |
# Call Groq API for generating summary based on the query and retrieved text
|
71 |
def generate_summary_with_groq(query, retrieved_text):
|
72 |
-
url = "https://api.groq.com/v1/chat/completions"
|
73 |
headers = {
|
74 |
"Authorization": f"Bearer {GROQ_API_KEY}",
|
75 |
"Content-Type": "application/json"
|
@@ -96,10 +101,10 @@ def user_input(user_question, vector_store):
|
|
96 |
# Main function to run the Streamlit app
|
97 |
def main():
|
98 |
st.set_page_config(page_title="RAG-based PDF Chat", layout="centered", page_icon="π")
|
99 |
-
st.title("π Query PDF Documents on
|
100 |
|
101 |
-
# Load documents from
|
102 |
-
raw_text =
|
103 |
text_chunks = get_text_chunks(raw_text)
|
104 |
vector_store = load_or_create_vector_store(text_chunks)
|
105 |
|
|
|
15 |
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
|
16 |
embedding_model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
|
17 |
|
18 |
+
# Source PDF documents (Hugging Face "blob" URLs; converted to direct
# download links at fetch time).
PDF_URLS = [
    "https://huggingface.co/spaces/tahirsher/GenAI_Lawyers_Guide/blob/main/bi-partite.pdf",
    "https://huggingface.co/spaces/tahirsher/GenAI_Lawyers_Guide/blob/main/bi%20pat%20graphs.pdf",
    # Add more document links as needed
]
|
24 |
|
25 |
+
# Helper function to convert Hugging Face blob URLs to direct download URLs
def get_huggingface_raw_url(url):
    """Return the direct-download form of a Hugging Face blob URL.

    A link of the form ``.../blob/...`` on huggingface.co points at the
    HTML viewer page; swapping ``/blob/`` for ``/resolve/`` yields the raw
    file. Any other URL is returned unchanged.
    """
    is_hf_blob_link = "huggingface.co" in url and "/blob/" in url
    return url.replace("/blob/", "/resolve/") if is_hf_blob_link else url
|
30 |
+
|
31 |
+
# Fetch and extract text from PDF files hosted on Hugging Face
|
32 |
+
def fetch_pdf_text_from_huggingface(urls):
|
33 |
text = ""
|
34 |
for url in urls:
|
35 |
+
raw_url = get_huggingface_raw_url(url) # Convert to direct download link
|
36 |
+
response = requests.get(raw_url)
|
37 |
if response.status_code == 200:
|
38 |
pdf_file = BytesIO(response.content)
|
39 |
try:
|
|
|
68 |
# Create a FAISS vector store with embeddings
@st.cache_resource
def load_or_create_vector_store(text_chunks):
    """Build a FAISS vector store over *text_chunks*, cached by Streamlit.

    ``st.cache_resource`` memoizes the store across reruns so the
    embeddings are only computed once per session/input.
    """
    # NOTE(review): assumes compute_embeddings returns an object accepted
    # by FAISS.from_texts as its embedding function — confirm against the
    # compute_embeddings definition elsewhere in this file.
    return FAISS.from_texts(text_chunks, compute_embeddings(text_chunks))
|
74 |
|
75 |
# Call Groq API for generating summary based on the query and retrieved text
|
76 |
def generate_summary_with_groq(query, retrieved_text):
|
77 |
+
url = "https://api.groq.com/v1/chat/completions"
|
78 |
headers = {
|
79 |
"Authorization": f"Bearer {GROQ_API_KEY}",
|
80 |
"Content-Type": "application/json"
|
|
|
101 |
# Main function to run the Streamlit app
|
102 |
def main():
|
103 |
st.set_page_config(page_title="RAG-based PDF Chat", layout="centered", page_icon="π")
|
104 |
+
st.title("π Query PDF Documents on Hugging Face")
|
105 |
|
106 |
+
# Load documents from Hugging Face
|
107 |
+
raw_text = fetch_pdf_text_from_huggingface(PDF_URLS)
|
108 |
text_chunks = get_text_chunks(raw_text)
|
109 |
vector_store = load_or_create_vector_store(text_chunks)
|
110 |
|