File size: 4,468 Bytes
fb115f3
ed3b297
 
 
73e00ea
 
ed56b0d
 
ed3b297
 
73e00ea
8df49ab
8d5edcc
73e00ea
ed3b297
 
 
 
dd56502
cabb4c3
293af25
 
cabb4c3
 
73e00ea
dd56502
 
 
 
 
 
 
 
73e00ea
cabb4c3
dd56502
 
cabb4c3
 
3cf9170
 
 
 
 
 
 
 
cabb4c3
 
73e00ea
 
ed3b297
73e00ea
 
 
 
 
 
ed56b0d
 
ed3b297
 
73e00ea
 
ed56b0d
73e00ea
 
8df49ab
 
dd56502
0fa729c
 
 
 
 
 
8df49ab
 
0fa729c
 
 
 
 
 
 
 
73e00ea
ed3b297
cabb4c3
73e00ea
cabb4c3
8df49ab
73e00ea
ed3b297
73e00ea
cabb4c3
dd56502
73e00ea
dd56502
 
cabb4c3
 
73e00ea
 
 
 
 
 
 
 
 
cabb4c3
 
73e00ea
 
cabb4c3
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
import os
import requests
import streamlit as st
from io import BytesIO
from PyPDF2 import PdfReader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from transformers import AutoModel, AutoTokenizer
import torch

# Groq API key, read from the "LawersGuideAPIKey" environment variable.
# os.getenv returns None when the variable is unset, so this may be None.
GROQ_API_KEY = os.getenv("LawersGuideAPIKey")

# Load the tokenizer and transformer weights for all-MiniLM-L6-v2 at import time.
# NOTE(review): neither `tokenizer` nor `embedding_model` is referenced anywhere
# else in the visible file (embeddings go through HuggingFaceEmbeddings below) --
# confirm whether these two loads are still needed.
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
embedding_model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")

# PDFs to index, given as Hugging Face "/blob/" page URLs.
# get_huggingface_raw_url() rewrites them to "/resolve/" URLs before download.
PDF_URLS = [
    "https://huggingface.co/spaces/tahirsher/GenAI_Lawyers_Guide/blob/main/administrator92ada0936848e501425591b4ad0cd417.pdf",
    "https://huggingface.co/spaces/tahirsher/GenAI_Lawyers_Guide/blob/main/Pakistan%20Penal%20Code.pdf",
    # Add more document links as needed
]

# Helper function to convert Hugging Face blob URLs to direct download URLs
def get_huggingface_raw_url(url):
    """Return the direct-download form of a Hugging Face file URL.

    A URL that contains both "huggingface.co" and a "/blob/" segment has
    that segment rewritten to "/resolve/". Any other URL is returned
    unchanged.

    Args:
        url: The URL string to convert.

    Returns:
        The converted URL, or *url* itself when no conversion applies.
    """
    if "huggingface.co" in url and "/blob/" in url:
        # Rewrite only the first "/blob/": a later occurrence belongs to the
        # file path inside the repository and must be left intact.
        return url.replace("/blob/", "/resolve/", 1)
    return url

# Fetch and extract text from PDF files hosted on Hugging Face
def fetch_pdf_text_from_huggingface(urls):
    """Download each PDF in *urls* and return the extracted text of all pages.

    Every URL is first passed through get_huggingface_raw_url(). A document
    that cannot be downloaded or parsed is reported with st.error and skipped;
    it never aborts the remaining downloads. Pages whose extraction yields no
    text are ignored.

    Args:
        urls: Iterable of PDF URL strings.

    Returns:
        The text of all successfully read pages, concatenated in order with
        no separator. An empty string when nothing could be read.
    """
    page_texts = []
    for url in urls:
        raw_url = get_huggingface_raw_url(url)  # Convert to direct download link
        try:
            # Without a timeout an unresponsive host would block the app forever.
            response = requests.get(raw_url, timeout=30)
        except requests.RequestException as e:
            st.error(f"Failed to fetch PDF from URL: {url} ({e})")
            continue
        if response.status_code != 200:
            st.error(f"Failed to fetch PDF from URL: {url}")
            continue
        try:
            pdf_reader = PdfReader(BytesIO(response.content))
            for page in pdf_reader.pages:
                page_text = page.extract_text()
                if page_text:
                    page_texts.append(page_text)
        except Exception as e:
            # Deliberately broad: a malformed PDF should be reported and
            # skipped, keeping whatever pages were already extracted.
            st.error(f"Failed to read PDF from URL {url}: {e}")
    return "".join(page_texts)

# Split text into manageable chunks
@st.cache_data
def get_text_chunks(text, chunk_size=1000, chunk_overlap=100):
    """Split *text* into overlapping chunks for embedding and retrieval.

    The defaults are sized for the rest of this pipeline. The embedding model
    (all-MiniLM-L6-v2) truncates long inputs, per its model card, so very
    large chunks are only partially represented in the index. Several
    retrieved chunks are also concatenated into a single prompt for a model
    with a limited context window, so each chunk has to stay small.

    Args:
        text: The full document text to split.
        chunk_size: Maximum number of characters per chunk.
        chunk_overlap: Number of characters shared between adjacent chunks.

    Returns:
        A list of chunk strings, as produced by
        RecursiveCharacterTextSplitter.split_text.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_text(text)

# Embedding backend passed to FAISS when the vector store is built.
# Created once at module level with the all-MiniLM-L6-v2 sentence-transformers model.
embedding_function = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Build the FAISS index used for similarity search
@st.cache_resource
def load_or_create_vector_store(text_chunks):
    """Embed *text_chunks* and return a FAISS vector store built from them.

    Despite the name, this function has no load-from-disk path: it always
    calls FAISS.from_texts with the module-level ``embedding_function``.
    It is wrapped in ``st.cache_resource``.

    Args:
        text_chunks: List of text strings to index.

    Returns:
        The FAISS vector store produced by FAISS.from_texts.
    """
    return FAISS.from_texts(text_chunks, embedding=embedding_function)

# Call Groq API for generating summary based on the query and retrieved text
def generate_summary_with_groq(query, retrieved_text):
    """Ask the Groq chat-completions API to answer *query* using *retrieved_text*.

    The query and the retrieved passages are sent as one user message to the
    "llama3-8b-8192" model. Every failure (missing API key, network error,
    non-200 status, unexpected response body) is reported with st.error and
    yields the same fallback string, so callers always receive a string.

    Args:
        query: The user's question.
        retrieved_text: Context passages to append to the question.

    Returns:
        The content of the first returned choice, or
        "Error in Groq API response" on any failure.
    """
    failure_message = "Error in Groq API response"

    if not GROQ_API_KEY:
        # Avoid sending "Bearer None" and getting an opaque auth failure.
        st.error("Groq API key is not configured (environment variable 'LawersGuideAPIKey')")
        return failure_message

    # Groq serves its OpenAI-compatible endpoints under the /openai/v1 prefix.
    url = "https://api.groq.com/openai/v1/chat/completions"
    headers = {
        "Authorization": f"Bearer {GROQ_API_KEY}",
        "Content-Type": "application/json",
    }
    payload = {
        "messages": [
            {"role": "user", "content": f"{query}\n\nRelated information:\n{retrieved_text}"}
        ],
        "model": "llama3-8b-8192",
    }

    try:
        # Without a timeout a stalled connection would block the app forever.
        response = requests.post(url, headers=headers, json=payload, timeout=60)
    except requests.RequestException as e:
        st.error(f"Failed to generate summary with Groq API: {e}")
        return failure_message

    if response.status_code != 200:
        st.error(f"Failed to generate summary with Groq API (HTTP {response.status_code})")
        return failure_message

    try:
        return response.json()["choices"][0]["message"]["content"]
    except (ValueError, KeyError, IndexError, TypeError) as e:
        # A 200 response with a body that is not the expected JSON shape.
        st.error(f"Failed to generate summary with Groq API: unexpected response ({e})")
        return failure_message

# Answer a user question from the indexed documents
def user_input(user_question, vector_store):
    """Retrieve passages similar to *user_question* and have Groq answer from them.

    Args:
        user_question: The question text entered by the user.
        vector_store: Object exposing ``similarity_search(query)`` whose
            results carry a ``page_content`` attribute.

    Returns:
        Whatever generate_summary_with_groq returns for the question and the
        space-joined contents of the retrieved passages.
    """
    matches = vector_store.similarity_search(user_question)
    passages = [match.page_content for match in matches]
    joined_context = " ".join(passages)
    return generate_summary_with_groq(user_question, joined_context)

# Main function to run the Streamlit app
def main():
    """Render the Streamlit page: index the configured PDFs and answer questions.

    The PDFs listed in PDF_URLS are fetched, split into chunks and indexed in
    a vector store. The user can then type a question and press
    "Get Response" to see an answer generated from the retrieved passages.
    If no text could be extracted from any document, an error is shown and
    the question form is not rendered.
    """
    st.set_page_config(page_title="RAG-based PDF Chat", layout="centered", page_icon="📄")
    st.title("📄 Query PDF Documents on Hugging Face")

    # Load documents from Hugging Face
    raw_text = fetch_pdf_text_from_huggingface(PDF_URLS)
    text_chunks = get_text_chunks(raw_text)
    if not text_chunks:
        # With nothing to index, stop with a readable message instead of
        # failing inside vector-store construction.
        st.error("No text could be extracted from the configured PDF documents.")
        return
    vector_store = load_or_create_vector_store(text_chunks)

    # User question input
    user_question = st.text_input("Ask a Question:", placeholder="Type your question here...")

    if st.button("Get Response"):
        # Treat whitespace-only input the same as no input.
        if not user_question or not user_question.strip():
            st.warning("Please enter a question before submitting.")
        else:
            with st.spinner("Generating response..."):
                answer = user_input(user_question, vector_store)
                st.markdown(f"**🤖 AI:** {answer}")

# Run the app only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()