"""Streamlit app: ask questions about an uploaded PDF.

Pipeline: extract the PDF text, split it into chunks, embed each chunk with a
MiniLM encoder, index the embeddings in FAISS, retrieve the chunks closest to
the question, and run an extractive question-answering model over them.
"""

import io

import faiss
import numpy as np
import streamlit as st
import torch
from langchain.text_splitter import CharacterTextSplitter
from pypdf import PdfReader
from transformers import AutoModel, AutoTokenizer, pipeline

EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
QA_MODEL_NAME = "distilbert-base-uncased-distilled-squad"
# Number of chunks embedded per forward pass when indexing a document.
EMBED_BATCH_SIZE = 32


@st.cache_resource(show_spinner=False)
def _load_models():
    """Load the tokenizer, embedding encoder and QA pipeline once per process.

    Streamlit re-executes the whole script on every widget interaction;
    caching keeps the models from being re-instantiated on each rerun.
    """
    tok = AutoTokenizer.from_pretrained(EMBEDDING_MODEL_NAME)
    encoder = AutoModel.from_pretrained(EMBEDDING_MODEL_NAME)
    reader = pipeline("question-answering", model=QA_MODEL_NAME)
    return tok, encoder, reader


# Module-level handles used by the functions below.
tokenizer, model, qa_pipeline = _load_models()


def extract_text_from_pdf(pdf_file):
    """Return the text of every page of a PDF, one page per line group.

    Args:
        pdf_file: Anything ``PdfReader`` accepts (path or binary stream).

    Returns:
        The page texts joined with a newline. Pages that yield no text
        contribute an empty string.
    """
    reader = PdfReader(pdf_file)
    # Join with a newline so the last word of a page is not fused with the
    # first word of the next; ``or ""`` tolerates pages with no text layer.
    return "\n".join(page.extract_text() or "" for page in reader.pages)


def split_text_into_chunks(text, chunk_size=500, overlap=50):
    """Split ``text`` into overlapping chunks with ``CharacterTextSplitter``.

    Args:
        text: The text to split.
        chunk_size: Target chunk size passed to the splitter.
        overlap: Overlap between consecutive chunks passed to the splitter.

    Returns:
        A list of chunk strings.
    """
    # NOTE(review): the splitter is used with its default separator, so text
    # lacking that separator may come back as oversized chunks - confirm
    # against real PDFs.
    splitter = CharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=overlap)
    return splitter.split_text(text)


def embed_text(text):
    """Embed one string or a list of strings by mean-pooling token states.

    Args:
        text: A string, or a list of strings to embed as one padded batch.

    Returns:
        A NumPy array with one row per input string.
    """
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():
        token_states = model(**inputs).last_hidden_state
        # Weight by the attention mask so padding tokens do not dilute the
        # mean; for a single unpadded string this equals a plain mean.
        mask = inputs["attention_mask"].unsqueeze(-1).to(token_states.dtype)
        pooled = (token_states * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1e-9)
    return pooled.numpy()


def create_faiss_index(embeddings):
    """Build an exact L2 FAISS index over a 2-D array of embeddings.

    Args:
        embeddings: Array of shape ``(n_vectors, dimension)``.

    Returns:
        A ``faiss.IndexFlatL2`` containing all rows of ``embeddings``.
    """
    # FAISS expects C-contiguous float32 input.
    vectors = np.ascontiguousarray(embeddings, dtype=np.float32)
    index = faiss.IndexFlatL2(vectors.shape[1])
    index.add(vectors)
    return index


def answer_question(question, index, chunks, top_k=3):
    """Answer ``question`` from the chunks nearest to it in ``index``.

    Args:
        question: The user's question.
        index: FAISS index whose row ``i`` corresponds to ``chunks[i]``.
        chunks: The chunk strings that were indexed.
        top_k: Maximum number of chunks to use as context.

    Returns:
        The answer string from the QA pipeline, or ``""`` when there is no
        chunk to use as context.
    """
    # Never ask for more neighbours than exist: FAISS pads missing results
    # with -1, which would otherwise index ``chunks`` from the end.
    k = min(top_k, len(chunks))
    if k <= 0:
        return ""
    question_embedding = embed_text(question)
    _, indices = index.search(question_embedding, k)
    hits = [chunks[i] for i in indices[0] if 0 <= i < len(chunks)]
    if not hits:
        return ""
    result = qa_pipeline(question=question, context=" ".join(hits))
    return result["answer"]


@st.cache_resource(show_spinner=False, max_entries=4)
def _index_pdf(pdf_bytes):
    """Extract, chunk, embed and index a PDF given as raw bytes.

    Cached on the file content so that reruns triggered by typing a question
    do not re-embed the whole document.

    Returns:
        ``(chunks, index)``; ``([], None)`` when the PDF has no usable text.
    """
    text = extract_text_from_pdf(io.BytesIO(pdf_bytes))
    chunks = split_text_into_chunks(text) if text.strip() else []
    if not chunks:
        return [], None
    embeddings = np.vstack(
        [
            embed_text(chunks[start:start + EMBED_BATCH_SIZE])
            for start in range(0, len(chunks), EMBED_BATCH_SIZE)
        ]
    )
    return chunks, create_faiss_index(embeddings)


# Streamlit app layout
st.title("PDF Question-Answering Chatbot with RAG")
st.write("Upload a PDF, and ask questions based on its content.")

# File uploader
pdf_file = st.file_uploader("Upload PDF", type="pdf")

if pdf_file is not None:
    # Extract, split, embed and index the PDF (cached per file content).
    with st.spinner("Processing PDF..."):
        chunks, index = _index_pdf(pdf_file.getvalue())

    if not chunks:
        # Scanned/image-only or empty PDFs produce nothing to index.
        st.error("No extractable text was found in this PDF.")
    else:
        st.success("PDF processed and indexed successfully!")
        st.write("You can now ask questions based on the content of the PDF.")

        # Input for user question
        question = st.text_input("Ask a question:")

        if question.strip():
            with st.spinner("Searching for the answer..."):
                answer = answer_question(question, index, chunks)
            st.write("**Answer:**", answer)