AlmasKanwal19 committed
Commit ababda9 • 1 Parent(s): 8bff4e6
Create app.py
app.py ADDED
@@ -0,0 +1,72 @@
import streamlit as st
import faiss
import numpy as np
import torch
from pypdf import PdfReader
from transformers import AutoTokenizer, AutoModel, pipeline
from langchain.text_splitter import CharacterTextSplitter

# Load embedding and QA models
tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
model = AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
qa_pipeline = pipeline('question-answering', model="distilbert-base-uncased-distilled-squad")

# PDF text extraction and text chunking
def extract_text_from_pdf(pdf_file):
    reader = PdfReader(pdf_file)
    text = ""
    for page in reader.pages:
        # extract_text() can return None for pages with no extractable text
        text += page.extract_text() or ""
    return text

def split_text_into_chunks(text, chunk_size=500, overlap=50):
    splitter = CharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=overlap)
    return splitter.split_text(text)

# Function to embed text using the embedding model (mean-pooled last hidden state)
def embed_text(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():
        embeddings = model(**inputs).last_hidden_state.mean(dim=1)
    return embeddings.numpy()

# Function to create a FAISS index over the chunk embeddings
def create_faiss_index(embeddings):
    dimension = embeddings.shape[1]
    index = faiss.IndexFlatL2(dimension)
    index.add(embeddings)
    return index

# Function to answer questions based on retrieved context
def answer_question(question, index, chunks, top_k=3):
    question_embedding = embed_text(question)
    # Don't ask FAISS for more neighbors than there are chunks
    _, indices = index.search(question_embedding, min(top_k, len(chunks)))
    context = " ".join([chunks[i] for i in indices[0]])
    result = qa_pipeline(question=question, context=context)
    return result['answer']

# Streamlit app layout
st.title("PDF Question-Answering Chatbot with RAG")
st.write("Upload a PDF, and ask questions based on its content.")

# File uploader
pdf_file = st.file_uploader("Upload PDF", type="pdf")
if pdf_file is not None:
    # Extract and split text from PDF
    with st.spinner("Processing PDF..."):
        text = extract_text_from_pdf(pdf_file)
        chunks = split_text_into_chunks(text)

        # Embed and index the chunks
        embeddings = np.vstack([embed_text(chunk) for chunk in chunks])
        index = create_faiss_index(embeddings)

    st.success("PDF processed and indexed successfully!")
    st.write("You can now ask questions based on the content of the PDF.")

    # Input for user question
    question = st.text_input("Ask a question:")
    if question:
        with st.spinner("Searching for the answer..."):
            answer = answer_question(question, index, chunks)
        st.write("**Answer:**", answer)