import os

import streamlit as st
from langchain import hub
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain_community.embeddings import SentenceTransformerEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Directories for the source PDFs and the persisted vector DB.
# Kept separate so Chroma's index files don't land in the PDF folder.
DATA_DIR = "MyData"
DB_DIR = "MyVectorDB"

# Embeddings model used to index the document chunks
embeddings_model = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")


# Load the PDFs, split them into overlapping chunks, and index them in Chroma.
# Cached as a resource so Streamlit doesn't rebuild the index on every rerun.
@st.cache_resource
def load_data():
    loader = PyPDFDirectoryLoader(DATA_DIR)
    data_on_pdf = loader.load()
    text_splitter = RecursiveCharacterTextSplitter(
        separators=["\n\n", "\n", ". ", " ", ""],
        chunk_size=2000,
        chunk_overlap=200,
    )
    splits = text_splitter.split_documents(data_on_pdf)
    vectorstore = Chroma.from_documents(
        documents=splits,
        embedding=embeddings_model,
        persist_directory=DB_DIR,
    )
    return vectorstore


# Set up the generative AI model. The API key is read from the environment
# rather than hardcoded -- never commit API keys to source code.
llm = ChatGoogleGenerativeAI(
    model="gemini-1.5-pro",
    google_api_key=os.environ["GOOGLE_API_KEY"],
)

# Load (or build) the vector store
vectorstore = load_data()

# Streamlit interface
st.title("RAG App: Question-Answering with PDFs")

# User input for the question
question = st.text_input("Ask a question about the documents:")

if st.button("Submit"):
    if question:
        retriever = vectorstore.as_retriever()
        prompt = hub.pull("rlm/rag-prompt")

        def format_docs(docs):
            return "\n\n".join(doc.page_content for doc in docs)

        # RAG chain: retrieve relevant chunks, format them into the prompt's
        # context slot, generate an answer with Gemini, parse to a string.
        rag_chain = (
            {"context": retriever | format_docs, "question": RunnablePassthrough()}
            | prompt
            | llm
            | StrOutputParser()
        )
        response = rag_chain.invoke(question)
        st.markdown(response)
    else:
        st.warning("Please enter a question.")
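
# Usage sketch (the filename app.py and the package list are assumptions,
# not from the original; exact packages depend on your LangChain version):
#
#   pip install streamlit langchain langchain-community langchain-core \
#       langchain-google-genai langchainhub chromadb sentence-transformers pypdf
#   export GOOGLE_API_KEY="your-key-here"
#   streamlit run app.py
#
# Place your PDFs in ./MyData before launching; the first run builds the
# Chroma index, and @st.cache_resource reuses it on subsequent interactions.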