AlmasKanwal19 committed on
Commit
ababda9
1 Parent(s): 8bff4e6

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +72 -0
app.py ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import faiss
3
+ import numpy as np
4
+ import torch
5
+ from pypdf import PdfReader
6
+ from transformers import AutoTokenizer, AutoModel, pipeline
7
+ from langchain.text_splitter import CharacterTextSplitter
8
+
9
# Load embedding and QA models
EMBEDDING_MODEL_NAME = 'sentence-transformers/all-MiniLM-L6-v2'
QA_MODEL_NAME = "distilbert-base-uncased-distilled-squad"


@st.cache_resource
def _load_models():
    """Load the embedding tokenizer/model and the extractive QA pipeline once.

    Streamlit re-executes the whole script on every widget interaction.
    ``st.cache_resource`` keeps the loaded objects alive across those reruns
    instead of instantiating both models again each time.

    Returns:
        A ``(tokenizer, model, qa_pipeline)`` tuple.
    """
    return (
        AutoTokenizer.from_pretrained(EMBEDDING_MODEL_NAME),
        AutoModel.from_pretrained(EMBEDDING_MODEL_NAME),
        pipeline('question-answering', model=QA_MODEL_NAME),
    )


# Module-level names are kept so the functions below can keep using them.
tokenizer, model, qa_pipeline = _load_models()
13
+
14
+ # PDF text extraction and text chunking
15
def extract_text_from_pdf(pdf_file):
    """Extract the text of every page of a PDF as a single string.

    Args:
        pdf_file: Whatever ``PdfReader`` accepts; the app passes the object
            returned by the Streamlit file uploader.

    Returns:
        The text of all pages joined by newlines, so the last word of one
        page does not fuse with the first word of the next.
    """
    reader = PdfReader(pdf_file)
    # `or ""` keeps the join safe if a page yields no text at all.
    return "\n".join(page.extract_text() or "" for page in reader.pages)
21
+
22
def split_text_into_chunks(text, chunk_size=500, overlap=50):
    """Split text into overlapping chunks of roughly ``chunk_size`` characters.

    A recursive splitter is used because ``CharacterTextSplitter`` only splits
    on its single default separator (a blank line). Text extracted from PDFs
    often has no blank lines, which leaves the whole document as one oversized
    chunk. The recursive splitter falls back to finer separators until the
    pieces fit.

    Args:
        text: The text to split.
        chunk_size: Target maximum chunk length, in characters.
        overlap: Number of characters shared between consecutive chunks.

    Returns:
        A list of chunk strings (empty when ``text`` has no content).
    """
    # Imported locally so this function does not rely on changes to the
    # file's top-level import block; it lives in the module already imported.
    from langchain.text_splitter import RecursiveCharacterTextSplitter

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=overlap,
    )
    return splitter.split_text(text)
25
+
26
+ # Function to embed text using the embedding model
27
def embed_text(text):
    """Embed text with the module-level tokenizer and embedding model.

    Token embeddings are averaged using the attention mask, so padding
    positions do not dilute the result when several texts of different
    lengths are embedded together. For a single string there is no padding
    and the result equals a plain mean over tokens.

    Args:
        text: A string, or a list of strings to embed as one batch.

    Returns:
        A NumPy array with one row per input text.
    """
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():
        token_embeddings = model(**inputs).last_hidden_state

    # Broadcast the (batch, seq) mask over the hidden dimension.
    mask = inputs["attention_mask"].unsqueeze(-1).to(token_embeddings.dtype)
    summed = (token_embeddings * mask).sum(dim=1)
    # Clamp guards against division by zero for a fully masked row.
    token_counts = mask.sum(dim=1).clamp(min=1e-9)
    return (summed / token_counts).numpy()
32
+
33
+ # Function to create FAISS index
34
def create_faiss_index(embeddings):
    """Build a flat L2 FAISS index populated with the given vectors.

    Args:
        embeddings: 2-D array with one vector per row; its second dimension
            sets the index dimensionality. Assumed to be float32, as produced
            by ``embed_text`` -- TODO confirm for other callers.

    Returns:
        The ``faiss.IndexFlatL2`` containing every row of ``embeddings``.
    """
    vector_index = faiss.IndexFlatL2(embeddings.shape[1])
    vector_index.add(embeddings)
    return vector_index
39
+
40
+ # Function to answer questions based on retrieved context
41
def answer_question(question, index, chunks, top_k=3):
    """Answer a question from the chunks most similar to it.

    The question is embedded, the nearest chunks are looked up in the FAISS
    index, and their concatenation is handed to the extractive QA pipeline.

    Args:
        question: The user's question.
        index: FAISS index whose row ``i`` corresponds to ``chunks[i]``.
        chunks: The text chunks that were indexed.
        top_k: Maximum number of chunks to use as context.

    Returns:
        The answer string from the QA pipeline, or an empty string when there
        is no context to search (no chunks, or ``top_k`` <= 0).
    """
    # Never request more neighbours than there are chunks.
    k = min(top_k, len(chunks))
    if k <= 0:
        return ""

    question_embedding = embed_text(question)
    _, indices = index.search(question_embedding, k)

    # FAISS pads missing results with -1; without this filter a -1 would
    # silently select the last chunk through Python's negative indexing.
    hits = [chunks[i] for i in indices[0] if 0 <= i < len(chunks)]
    if not hits:
        return ""

    context = " ".join(hits)
    result = qa_pipeline(question=question, context=context)
    return result['answer']
47
+
48
# Streamlit app layout
st.title("PDF Question-Answering Chatbot with RAG")
st.write("Upload a PDF, and ask questions based on its content.")

# File uploader
pdf_file = st.file_uploader("Upload PDF", type="pdf")
if pdf_file is not None:
    # Streamlit re-executes this script on every widget interaction (including
    # each question typed), so the processed PDF is kept in session state and
    # only rebuilt when a different file is uploaded. Name + size is used as a
    # cheap identity for the upload.
    pdf_key = (pdf_file.name, pdf_file.size)
    if st.session_state.get("rag_pdf_key") != pdf_key:
        # Extract and split text from PDF
        with st.spinner("Processing PDF..."):
            text = extract_text_from_pdf(pdf_file)
            chunks = split_text_into_chunks(text)

            # Embed and index the chunks. np.vstack raises on an empty list,
            # so a PDF with no extractable text gets no index at all.
            if chunks:
                embeddings = np.vstack([embed_text(chunk) for chunk in chunks])
                index = create_faiss_index(embeddings)
            else:
                index = None

        st.session_state["rag_pdf_key"] = pdf_key
        st.session_state["rag_chunks"] = chunks
        st.session_state["rag_index"] = index

    chunks = st.session_state["rag_chunks"]
    index = st.session_state["rag_index"]

    if index is None:
        st.error("No extractable text was found in this PDF.")
    else:
        st.success("PDF processed and indexed successfully!")
        st.write("You can now ask questions based on the content of the PDF.")

        # Input for user question
        question = st.text_input("Ask a question:")
        if question:
            with st.spinner("Searching for the answer..."):
                answer = answer_question(question, index, chunks)
            st.write("**Answer:**", answer)