import os
from PyPDF2 import PdfReader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
import streamlit as st
import requests
from io import BytesIO
from groq import Groq  # official Groq SDK (pip install groq)

# Set up the Groq client; the SDK is configured by instantiating Groq directly
GROQ_API_KEY = os.getenv("GROQ_API_KEY")
client = Groq(api_key=GROQ_API_KEY)
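
# Fail fast if the key is missing (a minimal guard; assumes the key is supplied
# via the environment or Streamlit secrets). A plain exception is raised so no
# Streamlit element is emitted before st.set_page_config() runs in main().
if not GROQ_API_KEY:
    raise RuntimeError("GROQ_API_KEY is not set; export it before launching the app.")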

# Raw GitHub URLs (blob/ pages return HTML, not the PDF bytes)
PDF_URLS = [
    "https://raw.githubusercontent.com/TahirSher/GenAI_Lawyers_Guide/main/bi-partite.pdf",
    "https://raw.githubusercontent.com/TahirSher/GenAI_Lawyers_Guide/main/bi%20pat%20graphs.pdf",
    # Add more document links as needed
]

@st.cache_data
def fetch_pdf_text_from_github(urls):
    text = ""
    for url in urls:
        response = requests.get(url, timeout=30)
        if response.status_code == 200:
            pdf_file = BytesIO(response.content)
            pdf_reader = PdfReader(pdf_file)
            for page in pdf_reader.pages:
                page_text = page.extract_text()
                if page_text:
                    text += page_text
        else:
            st.error(f"Failed to fetch PDF from URL: {url}")
    return text

@st.cache_data
def get_text_chunks(text):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=10000, chunk_overlap=1000)
    chunks = text_splitter.split_text(text)
    return chunks

@st.cache_resource
def load_or_create_vector_store(text_chunks):
    # FAISS has no default embeddings; a concrete embedding model must be supplied.
    # all-MiniLM-L6-v2 is one small, widely used choice; swap in any embedding class.
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    vector_store = FAISS.from_texts(text_chunks, embedding=embeddings)
    return vector_store

# Call the Groq API to generate a summary based on the query and the retrieved text
def generate_summary_with_groq(query, retrieved_text):
    chat_completion = client.chat.completions.create(
        messages=[
            {"role": "user", "content": f"{query}\n\nRelated information:\n{retrieved_text}"}
        ],
        model="llama3-8b-8192",
    )
    return chat_completion.choices[0].message.content

def user_input(user_question, vector_store):
    docs = vector_store.similarity_search(user_question)
    context_text = " ".join([doc.page_content for doc in docs])
    return generate_summary_with_groq(user_question, context_text)

def main():
    st.set_page_config(page_title="RAG-based PDF Chat", layout="centered", page_icon="📄")
    st.title("📄 Query PDF Documents on GitHub")

    # Load documents from GitHub
    raw_text = fetch_pdf_text_from_github(PDF_URLS)
    text_chunks = get_text_chunks(raw_text)
    vector_store = load_or_create_vector_store(text_chunks)

    # User question input
    user_question = st.text_input("Ask a Question:", placeholder="Type your question here...")

    if st.button("Get Response"):
        if not user_question:
            st.warning("Please enter a question before submitting.")
        else:
            with st.spinner("Generating response..."):
                answer = user_input(user_question, vector_store)
                st.markdown(f"**πŸ€– AI:** {answer}")

if __name__ == "__main__":
    main()
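
# To run locally (a sketch, assuming this file is saved as app.py and
# GROQ_API_KEY is exported in the environment):
#   pip install streamlit PyPDF2 langchain langchain-community faiss-cpu sentence-transformers groq requests
#   streamlit run app.py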