import streamlit as st
import faiss
import numpy as np
import torch
from pypdf import PdfReader
from transformers import AutoTokenizer, AutoModel, pipeline
from langchain.text_splitter import CharacterTextSplitter

# Load embedding and QA models
@st.cache_resource
def _load_models():
    """Load the embedding tokenizer/model and the extractive QA pipeline once.

    Streamlit re-executes this script from the top on every widget
    interaction; caching the loaded objects as a shared resource avoids
    re-instantiating the models for each question the user types.

    Returns:
        A ``(tokenizer, model, qa_pipeline)`` tuple.
    """
    embedding_model_name = 'sentence-transformers/all-MiniLM-L6-v2'
    loaded_tokenizer = AutoTokenizer.from_pretrained(embedding_model_name)
    loaded_model = AutoModel.from_pretrained(embedding_model_name)
    loaded_qa = pipeline('question-answering', model="distilbert-base-uncased-distilled-squad")
    return loaded_tokenizer, loaded_model, loaded_qa


# Module-level names used by embed_text() and answer_question().
tokenizer, model, qa_pipeline = _load_models()

# PDF text extraction and text chunking
def extract_text_from_pdf(pdf_file):
    """Return the text of every page of a PDF as one string.

    Args:
        pdf_file: Whatever ``PdfReader`` accepts; the app passes the object
            returned by ``st.file_uploader``.

    Returns:
        The extracted text of all pages, in page order, separated by a
        newline so the last word of one page is not fused with the first
        word of the next. Pages without extractable text contribute an
        empty string.
    """
    reader = PdfReader(pdf_file)
    # `or ""` guards against a page yielding no text object at all;
    # str.join avoids repeated string reallocation on large documents.
    return "\n".join(page.extract_text() or "" for page in reader.pages)

def split_text_into_chunks(text, chunk_size=500, overlap=50):
    """Split *text* into overlapping chunks with ``CharacterTextSplitter``.

    Args:
        text: The full document text.
        chunk_size: Target chunk size passed to the splitter.
        overlap: Overlap between consecutive chunks passed to the splitter.

    Returns:
        The list of chunk strings produced by the splitter.

    NOTE(review): the splitter is built with its default separator, so text
    that rarely contains that separator may produce chunks longer than
    ``chunk_size`` -- confirm against typical PDF output.
    """
    splitter_options = {"chunk_size": chunk_size, "chunk_overlap": overlap}
    return CharacterTextSplitter(**splitter_options).split_text(text)

# Function to embed text using the embedding model
def embed_text(text):
    """Embed a string (or a list of strings) with the embedding model.

    Token embeddings are averaged using the attention mask, so padding
    tokens added when batching several texts do not dilute the result.
    For a single string there is no padding and this equals a plain mean
    over all tokens.

    Args:
        text: A string or a list of strings; inputs are truncated by the
            tokenizer (``truncation=True``).

    Returns:
        A NumPy array with one row per input text.
    """
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():
        token_embeddings = model(**inputs).last_hidden_state
        # (batch, seq, 1) mask: 1 for real tokens, 0 for padding.
        mask = inputs["attention_mask"].unsqueeze(-1).to(token_embeddings.dtype)
        summed = (token_embeddings * mask).sum(dim=1)
        # Clamp avoids division by zero for a (theoretical) all-padding row.
        token_counts = mask.sum(dim=1).clamp(min=1e-9)
        embeddings = summed / token_counts
    return embeddings.numpy()

# Function to create FAISS index
def create_faiss_index(embeddings):
    """Build a flat (exact) L2 FAISS index over *embeddings*.

    Args:
        embeddings: 2-D array-like with one vector per row.

    Returns:
        A ``faiss.IndexFlatL2`` containing every row of *embeddings*, in
        the same order (row ``i`` gets id ``i``).
    """
    # FAISS expects C-contiguous float32 data; coerce so that float64 or
    # non-contiguous inputs are accepted instead of failing inside add().
    vectors = np.ascontiguousarray(embeddings, dtype=np.float32)
    index = faiss.IndexFlatL2(vectors.shape[1])
    index.add(vectors)
    return index

# Function to answer questions based on retrieved context
def answer_question(question, index, chunks, top_k=3):
    """Answer *question* from the chunks most similar to it.

    The question is embedded, the ``top_k`` nearest chunks are looked up in
    *index*, their text is joined into one context string, and the QA
    pipeline extracts an answer from that context.

    Args:
        question: The user's question.
        index: FAISS index whose row ids correspond to positions in *chunks*.
        chunks: List of chunk strings that were indexed.
        top_k: Maximum number of chunks to retrieve.

    Returns:
        The ``'answer'`` field of the QA pipeline result.

    Raises:
        ValueError: If *chunks* is empty, so there is nothing to search.
    """
    if not chunks:
        raise ValueError("No text chunks are available to answer the question.")

    # Never ask for more neighbours than exist: FAISS pads missing results
    # with -1, which would silently index chunks[-1] (the last chunk).
    k = min(top_k, len(chunks))
    question_embedding = embed_text(question)
    _, indices = index.search(question_embedding, k)

    # Drop any padded/out-of-range ids defensively before indexing.
    retrieved = [chunks[i] for i in indices[0] if 0 <= i < len(chunks)]
    context = " ".join(retrieved)
    result = qa_pipeline(question=question, context=context)
    return result['answer']

# Streamlit app layout
st.title("PDF Question-Answering Chatbot with RAG")
st.write("Upload a PDF, and ask questions based on its content.")

# File uploader
pdf_file = st.file_uploader("Upload PDF", type="pdf")
if pdf_file is not None:
    # Extract, split, embed and index the PDF text.
    # NOTE: Streamlit reruns this script on each interaction, so this work is
    # repeated for every question asked about the same upload.
    index = None
    with st.spinner("Processing PDF..."):
        text = extract_text_from_pdf(pdf_file)
        chunks = split_text_into_chunks(text)

        # A PDF with no extractable text (e.g. scanned images) yields no
        # chunks; np.vstack([]) would raise ValueError, so skip indexing.
        if chunks:
            embeddings = np.vstack([embed_text(chunk) for chunk in chunks])
            index = create_faiss_index(embeddings)

    if index is None:
        st.error("No extractable text was found in this PDF.")
    else:
        st.success("PDF processed and indexed successfully!")
        st.write("You can now ask questions based on the content of the PDF.")

        # Input for user question; ignore blank / whitespace-only input.
        question = st.text_input("Ask a question:")
        if question and question.strip():
            with st.spinner("Searching for the answer..."):
                answer = answer_question(question.strip(), index, chunks)
                st.write("**Answer:**", answer)