Spaces:
Sleeping
Sleeping
File size: 3,010 Bytes
fb115f3 73e00ea 8df49ab 73e00ea cabb4c3 8df49ab 73e00ea 8df49ab 73e00ea cabb4c3 8df49ab cabb4c3 73e00ea cabb4c3 73e00ea cabb4c3 73e00ea cabb4c3 73e00ea 8df49ab 73e00ea cabb4c3 73e00ea cabb4c3 8df49ab 73e00ea cabb4c3 73e00ea cabb4c3 73e00ea cabb4c3 73e00ea cabb4c3 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 |
import os
from io import BytesIO

import requests
import streamlit as st
from PyPDF2 import PdfReader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
# The Groq SDK exposes a `Groq` client class; there is no `groq.client`
# module and no `configure()` function, so the previous
# `import groq.client as client` / `client.configure(...)` raised at import.
from groq import Groq

# Set up Groq API key (read from the environment; never hard-code secrets).
GROQ_API_KEY = os.getenv("Groq_Api_Key")
client = Groq(api_key=GROQ_API_KEY)
# List of GitHub PDF URLs
# List of GitHub PDF URLs.
# NOTE: the previous `github.com/.../blob/...` links return an HTML viewer
# page, not the PDF bytes, so PdfReader would fail on the download.
# raw.githubusercontent.com serves the actual file content.
PDF_URLS = [
    "https://raw.githubusercontent.com/TahirSher/GenAI_Lawyers_Guide/main/bi-partite.pdf",
    "https://raw.githubusercontent.com/TahirSher/GenAI_Lawyers_Guide/main/bi%20pat%20graphs.pdf",
    # Add more document links as needed
]
def fetch_pdf_text_from_github(urls):
    """Download each PDF and return the concatenated text of all pages.

    Args:
        urls: iterable of direct-download PDF URLs.

    Returns:
        str: extracted text of every page of every successfully fetched
        PDF; pages with no extractable text are skipped. Failed downloads
        are reported via st.error and otherwise ignored (best-effort).
    """
    parts = []
    for url in urls:
        try:
            # requests has NO default timeout; without one a stalled
            # connection hangs the whole Streamlit app.
            response = requests.get(url, timeout=30)
        except requests.RequestException:
            st.error(f"Failed to fetch PDF from URL: {url}")
            continue
        if response.status_code == 200:
            pdf_reader = PdfReader(BytesIO(response.content))
            for page in pdf_reader.pages:
                page_text = page.extract_text()
                if page_text:
                    parts.append(page_text)
        else:
            st.error(f"Failed to fetch PDF from URL: {url}")
    # Join once instead of repeated `+=` (quadratic string concatenation).
    return "".join(parts)
@st.cache_data
def get_text_chunks(text):
    """Split *text* into large overlapping chunks for embedding/retrieval."""
    splitter = RecursiveCharacterTextSplitter(chunk_size=10000, chunk_overlap=1000)
    return splitter.split_text(text)
@st.cache_resource
def load_or_create_vector_store(text_chunks):
    """Build (and cache for the session) a FAISS vector store over the chunks.

    FAISS has no `get_default_embeddings()` attribute — the previous call
    raised AttributeError at runtime. An embedding model must be supplied
    explicitly; HuggingFaceEmbeddings uses a local sentence-transformers
    model by default, so no extra API key is needed.
    """
    embeddings = HuggingFaceEmbeddings()
    return FAISS.from_texts(text_chunks, embedding=embeddings)
# Call Groq API for generating summary based on the query and retrieved text
def generate_summary_with_groq(query, retrieved_text):
    """Ask the Groq LLM to answer *query*, given *retrieved_text* as context."""
    prompt = f"{query}\n\nRelated information:\n{retrieved_text}"
    response = client.chat.completions.create(
        model="llama3-8b-8192",
        messages=[{"role": "user", "content": prompt}],
    )
    return response.choices[0].message.content
def user_input(user_question, vector_store):
    """Retrieve chunks similar to the question and summarize them via Groq."""
    matches = vector_store.similarity_search(user_question)
    context_text = " ".join(doc.page_content for doc in matches)
    return generate_summary_with_groq(user_question, context_text)
def main():
    """Streamlit entry point: index the GitHub PDFs, then answer questions."""
    st.set_page_config(page_title="RAG-based PDF Chat", layout="centered", page_icon="π")
    st.title("π Query PDF Documents on GitHub")

    # Load documents from GitHub and build the (cached) vector index.
    raw_text = fetch_pdf_text_from_github(PDF_URLS)
    vector_store = load_or_create_vector_store(get_text_chunks(raw_text))

    # User question input
    user_question = st.text_input("Ask a Question:", placeholder="Type your question here...")
    if st.button("Get Response"):
        if user_question:
            with st.spinner("Generating response..."):
                answer = user_input(user_question, vector_store)
                st.markdown(f"**π€ AI:** {answer}")
        else:
            st.warning("Please enter a question before submitting.")


if __name__ == "__main__":
    main()
|