import streamlit as st
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import SentenceTransformerEmbeddings
from langchain import hub
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_google_genai import ChatGoogleGenerativeAI
import os
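
# Assumed dependencies (install before running):
#   pip install streamlit langchain langchain-community langchain-text-splitters \
#       langchain-google-genai langchainhub chromadb sentence-transformers pypdf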

# Set up the directories for data and vector DB
DATA_DIR = "/content/MyData"
DB_DIR = "/content/VectorDB"

# Create directories if they don't exist
os.makedirs(DATA_DIR, exist_ok=True)
os.makedirs(DB_DIR, exist_ok=True)

# Initialize the embeddings model
embeddings_model = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

# Load and process PDF documents into a persistent Chroma vector store.
# Cached so the PDFs are not re-split and re-embedded on every Streamlit rerun.
@st.cache_resource
def load_data():
    loader = PyPDFDirectoryLoader(DATA_DIR)
    data_on_pdf = loader.load()
    if not data_on_pdf:
        return None  # No PDFs indexed yet
    text_splitter = RecursiveCharacterTextSplitter(
        separators=["\n\n", "\n", ". ", " ", ""],
        chunk_size=1000,
        chunk_overlap=200
    )
    splits = text_splitter.split_documents(data_on_pdf)
    vectorstore = Chroma.from_documents(documents=splits, embedding=embeddings_model, persist_directory=DB_DIR)
    return vectorstore

# Set up the generative AI model; the API key is read from the GOOGLE_API_KEY environment variable
llm = ChatGoogleGenerativeAI(model="gemini-1.5-pro", google_api_key=os.environ.get("GOOGLE_API_KEY"))

# Load the vector store (None until at least one PDF has been indexed)
vectorstore = load_data()

# Streamlit interface
st.title("RAG App: Question-Answering with PDFs")

# File uploader for PDF documents
uploaded_files = st.file_uploader("Upload PDF files", accept_multiple_files=True, type=["pdf"])

if uploaded_files:
    for uploaded_file in uploaded_files:
        with open(os.path.join(DATA_DIR, uploaded_file.name), "wb") as f:
            f.write(uploaded_file.getbuffer())
    st.success("PDF files uploaded successfully!")

    # Clear the cache and rebuild the vector store so the new files are indexed
    load_data.clear()
    vectorstore = load_data()

# User input for question
question = st.text_input("Ask a question about the documents:")

if st.button("Submit"):
    if not question:
        st.warning("Please enter a question.")
    elif vectorstore is None:
        st.warning("Please upload at least one PDF document first.")
    else:
        retriever = vectorstore.as_retriever()
        # Pull a standard RAG prompt from the LangChain Hub (requires langchainhub)
        prompt = hub.pull("rlm/rag-prompt")

        # Concatenate retrieved chunks into a single context string
        def format_docs(docs):
            return "\n\n".join(doc.page_content for doc in docs)

        # Build the RAG chain: retrieve, format context, fill prompt, generate, parse
        rag_chain = (
            {"context": retriever | format_docs, "question": RunnablePassthrough()}
            | prompt
            | llm
            | StrOutputParser()
        )

        response = rag_chain.invoke(question)
        st.markdown(response)
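
# To run locally (assuming this script is saved as app.py):
#   streamlit run app.py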