import streamlit as st
from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain import hub
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_google_genai import ChatGoogleGenerativeAI
import os

# Set up the directories for the source PDFs and the persistent vector DB
DATA_DIR = "/content/MyData"
DB_DIR = "/content/VectorDB"

# Create the directories if they don't exist
os.makedirs(DATA_DIR, exist_ok=True)
os.makedirs(DB_DIR, exist_ok=True)

# Initialize the embeddings model
embeddings_model = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")


# Load the PDFs, split them into overlapping chunks, and index the chunks
# in a persistent Chroma vector store
def load_data():
    loader = PyPDFDirectoryLoader(DATA_DIR)
    data_on_pdf = loader.load()
    text_splitter = RecursiveCharacterTextSplitter(
        separators=["\n\n", "\n", ". ", " ", ""],
        chunk_size=1000,
        chunk_overlap=200,
    )
    splits = text_splitter.split_documents(data_on_pdf)
    vectorstore = Chroma.from_documents(
        documents=splits,
        embedding=embeddings_model,
        persist_directory=DB_DIR,
    )
    return vectorstore


# Set up the generative AI model (replace the placeholder with a real key)
llm = ChatGoogleGenerativeAI(model="gemini-1.5-pro", google_api_key="YOUR_GOOGLE_API_KEY")

# Build the vector store from whatever is already in DATA_DIR
vectorstore = load_data()

# Streamlit interface
st.title("RAG App: Question-Answering with PDFs")

# File uploader for PDF documents
uploaded_files = st.file_uploader("Upload PDF files", accept_multiple_files=True, type=["pdf"])

if uploaded_files:
    for uploaded_file in uploaded_files:
        with open(os.path.join(DATA_DIR, uploaded_file.name), "wb") as f:
            f.write(uploaded_file.getbuffer())
    st.success("PDF files uploaded successfully!")
    # Rebuild the vector store so the new files become searchable
    vectorstore = load_data()

# User input for the question
question = st.text_input("Ask a question about the documents:")

if st.button("Submit"):
    if question:
        retriever = vectorstore.as_retriever()
        prompt = hub.pull("rlm/rag-prompt")

        # Join the retrieved chunks into a single context string for the prompt
        def format_docs(docs):
            return "\n\n".join(doc.page_content for doc in docs)

        rag_chain = (
            {"context": retriever | format_docs, "question": RunnablePassthrough()}
            | prompt
            | llm
            | StrOutputParser()
        )
        response = rag_chain.invoke(question)
        st.markdown(response)
    else:
        st.warning("Please enter a question.")
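
# ---------------------------------------------------------------------------
# Note on import paths (an assumption about your LangChain version, not part
# of the original script): the `langchain.document_loaders`,
# `langchain.vectorstores`, and `langchain.embeddings` paths above are the
# pre-0.2 layout. On LangChain >= 0.2 the same classes are imported from the
# community packages instead:
#
#     from langchain_community.document_loaders import PyPDFDirectoryLoader
#     from langchain_community.vectorstores import Chroma
#     from langchain_community.embeddings import SentenceTransformerEmbeddings
#     from langchain_text_splitters import RecursiveCharacterTextSplitter
# ---------------------------------------------------------------------------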
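
# ---------------------------------------------------------------------------
# Optional: cache the vector store between reruns (a sketch, not part of the
# flow above). Streamlit re-executes the whole script on every interaction,
# so the top-level load_data() call re-reads and re-embeds every PDF on each
# rerun, and each rebuild appends the same chunks to the persisted Chroma
# collection again, which can duplicate entries. Assuming Streamlit >= 1.18
# (which provides st.cache_resource), one way to avoid the recomputation:
#
#     @st.cache_resource
#     def get_vectorstore():
#         return load_data()
#
#     vectorstore = get_vectorstore()
#
#     # ...and after writing newly uploaded PDFs into DATA_DIR:
#     get_vectorstore.clear()          # drop the cached store
#     vectorstore = get_vectorstore()  # re-index, including the new files
# ---------------------------------------------------------------------------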
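
# ---------------------------------------------------------------------------
# How to run (assumed setup; package names are inferred from the imports
# above, not pinned by the original):
#
#     pip install streamlit langchain langchainhub langchain-google-genai \
#                 chromadb sentence-transformers pypdf
#     streamlit run app.py   # or whatever this file is named
#
# hub.pull("rlm/rag-prompt") needs network access and the langchainhub
# package; PyPDFDirectoryLoader relies on pypdf to parse the documents. The
# /content/... paths suggest this was written for Google Colab, so adjust
# DATA_DIR and DB_DIR when running locally.
# ---------------------------------------------------------------------------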