import streamlit as st | |
from langchain.document_loaders import PyPDFDirectoryLoader | |
from langchain.text_splitter import RecursiveCharacterTextSplitter | |
from langchain.vectorstores import Chroma | |
from langchain.embeddings import SentenceTransformerEmbeddings | |
from langchain import hub | |
from langchain_core.output_parsers import StrOutputParser | |
from langchain_core.runnables import RunnablePassthrough | |
from langchain_google_genai import ChatGoogleGenerativeAI | |
import os | |
# Filesystem locations: DATA_DIR holds the PDF files, DB_DIR holds the
# persisted vector database.
DATA_DIR = "/content/MyData"
DB_DIR = "/content/VectorDB"

# Make sure both working directories exist (no error if already present).
for _directory in (DATA_DIR, DB_DIR):
    os.makedirs(_directory, exist_ok=True)

# Sentence-transformer embedding model shared by the rest of the script.
embeddings_model = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
# Load and process PDF documents
def load_data():
    """Build the Chroma vector store from the PDF files in ``DATA_DIR``.

    Every PDF found by ``PyPDFDirectoryLoader`` is loaded, split into
    overlapping chunks (1000 characters, 200 overlap), embedded with
    ``embeddings_model`` and written to a Chroma store persisted in
    ``DB_DIR``.

    Returns:
        Chroma: The vector store backed by ``DB_DIR``. When the PDFs yield
        no chunks (for example, nothing has been uploaded yet), the store at
        ``DB_DIR`` is opened without indexing anything instead of calling
        ``Chroma.from_documents`` with an empty document list.
    """
    loader = PyPDFDirectoryLoader(DATA_DIR)
    data_on_pdf = loader.load()

    text_splitter = RecursiveCharacterTextSplitter(
        separators=["\n\n", "\n", ". ", " ", ""],
        chunk_size=1000,
        chunk_overlap=200,
    )
    splits = text_splitter.split_documents(data_on_pdf)

    if not splits:
        # Nothing to index: an empty document list makes from_documents fail,
        # which would crash the app before the user can upload any file.
        return Chroma(
            persist_directory=DB_DIR,
            embedding_function=embeddings_model,
        )

    return Chroma.from_documents(
        documents=splits,
        embedding=embeddings_model,
        persist_directory=DB_DIR,
    )
# Set up the generative AI model.
# The API key is taken from the GOOGLE_API_KEY environment variable so the
# real credential does not have to be written into this file; the original
# placeholder string is kept as the fallback value.
llm = ChatGoogleGenerativeAI(
    model="gemini-1.5-pro",
    google_api_key=os.environ.get("GOOGLE_API_KEY", "YOUR_GOOGLE_API_KEY"),
)

# Load vector store
vectorstore = load_data()
# Streamlit interface
st.title("RAG App: Question-Answering with PDFs")

# File uploader for PDF documents
uploaded_files = st.file_uploader(
    "Upload PDF files", accept_multiple_files=True, type=["pdf"]
)
if uploaded_files:
    for uploaded_file in uploaded_files:
        # The file name is supplied by the client: keep only its final path
        # component so the upload is always written inside DATA_DIR.
        safe_name = os.path.basename(uploaded_file.name)
        with open(os.path.join(DATA_DIR, safe_name), "wb") as f:
            f.write(uploaded_file.getbuffer())
    st.success("PDF files uploaded successfully!")
    # Reload vector store after uploading new files
    vectorstore = load_data()
# User input for question
question = st.text_input("Ask a question about the documents:")
if st.button("Submit"):
    # Treat whitespace-only input as empty so a blank query is never sent to
    # the retriever and the LLM.
    cleaned_question = (question or "").strip()
    if cleaned_question:
        retriever = vectorstore.as_retriever()
        prompt = hub.pull("rlm/rag-prompt")

        def format_docs(docs):
            """Join the page content of the retrieved documents into one string."""
            return "\n\n".join(doc.page_content for doc in docs)

        # Retrieval fills "context"; the question itself is passed through
        # unchanged, then prompt -> LLM -> plain-string output.
        rag_chain = (
            {"context": retriever | format_docs, "question": RunnablePassthrough()}
            | prompt
            | llm
            | StrOutputParser()
        )
        response = rag_chain.invoke(cleaned_question)
        st.markdown(response)
    else:
        st.warning("Please enter a question.")