Spaces:
Sleeping
Sleeping
muhammadshaheryar
commited on
Update app.py
Browse files
app.py
CHANGED
@@ -0,0 +1,93 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Dependencies (install outside the app, e.g. via requirements.txt):
#   transformers faiss-cpu PyMuPDF streamlit sentence-transformers
# NOTE(review): the original file used IPython's `!pip install ...`, which is a
# SyntaxError in a plain .py script — replaced with this comment.

import os  # NOTE(review): currently unused in the visible code; kept for compatibility

import faiss  # efficient vector similarity search
import fitz  # PyMuPDF, for PDF text extraction
import numpy as np
import streamlit as st
from sentence_transformers import SentenceTransformer
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    RagRetriever,
    RagSequenceForGeneration,
    RagTokenizer,
    pipeline,
)

# Load the pre-trained RAG model and tokenizer.
# You can change this to a different open-source RAG checkpoint if needed.
model_name = "facebook/rag-token-nq"
tokenizer = RagTokenizer.from_pretrained(model_name)
model = RagSequenceForGeneration.from_pretrained(model_name)

# Sentence-transformer used to embed both document chunks and user queries.
embedder = SentenceTransformer('all-MiniLM-L6-v2')
19 |
+
|
20 |
+
# Function to extract text from a PDF file
|
21 |
+
def extract_text_from_pdf(pdf_file):
|
22 |
+
pdf_document = fitz.open(pdf_file)
|
23 |
+
text = ""
|
24 |
+
for page_num in range(pdf_document.page_count):
|
25 |
+
page = pdf_document.load_page(page_num)
|
26 |
+
text += page.get_text("text")
|
27 |
+
return text
|
28 |
+
|
29 |
+
# Function to create embeddings from text data
|
30 |
+
def create_embeddings(text_data):
|
31 |
+
embeddings = embedder.encode(text_data, convert_to_tensor=True)
|
32 |
+
return embeddings
|
33 |
+
|
34 |
+
# Function to create a FAISS index from the embeddings
|
35 |
+
def create_faiss_index(embeddings):
|
36 |
+
index = faiss.IndexFlatL2(embeddings.shape[1]) # Using L2 distance metric
|
37 |
+
index.add(embeddings)
|
38 |
+
return index
|
39 |
+
|
40 |
+
# Function to retrieve the most relevant text using FAISS
|
41 |
+
def retrieve_relevant_text(query, faiss_index, texts, top_k=3):
|
42 |
+
query_embedding = embedder.encode([query], convert_to_tensor=True)
|
43 |
+
D, I = faiss_index.search(query_embedding, top_k) # D: distances, I: indices
|
44 |
+
return [texts[i] for i in I[0]]
|
45 |
+
|
46 |
+
# Main function to answer questions based on uploaded PDF
|
47 |
+
def get_answer_from_pdf(pdf_file, query):
|
48 |
+
# Step 1: Extract text from the uploaded PDF file
|
49 |
+
document_text = extract_text_from_pdf(pdf_file)
|
50 |
+
|
51 |
+
# Step 2: Split the document text into chunks (optional but recommended for large docs)
|
52 |
+
text_chunks = document_text.split('\n')
|
53 |
+
|
54 |
+
# Step 3: Create embeddings for each chunk of text
|
55 |
+
embeddings = create_embeddings(text_chunks)
|
56 |
+
|
57 |
+
# Step 4: Create a FAISS index for efficient retrieval
|
58 |
+
faiss_index = create_faiss_index(embeddings)
|
59 |
+
|
60 |
+
# Step 5: Retrieve relevant text from the document based on the query
|
61 |
+
relevant_texts = retrieve_relevant_text(query, faiss_index, text_chunks)
|
62 |
+
|
63 |
+
# Step 6: Combine the relevant text and pass it to the RAG model
|
64 |
+
context = " ".join(relevant_texts)
|
65 |
+
inputs = tokenizer([query], return_tensors="pt", padding=True, truncation=True)
|
66 |
+
context_inputs = tokenizer(context, return_tensors="pt", padding=True, truncation=True)
|
67 |
+
|
68 |
+
# Generate the answer
|
69 |
+
outputs = model.generate(input_ids=inputs["input_ids"], context_input_ids=context_inputs["input_ids"])
|
70 |
+
answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
|
71 |
+
|
72 |
+
return answer
|
73 |
+
|
74 |
+
# Streamlit UI
|
75 |
+
def main():
|
76 |
+
st.title("RAG Application - PDF Q&A")
|
77 |
+
|
78 |
+
# Upload PDF file
|
79 |
+
uploaded_file = st.file_uploader("Upload a PDF Document", type="pdf")
|
80 |
+
|
81 |
+
if uploaded_file is not None:
|
82 |
+
# Ask a question from the uploaded PDF
|
83 |
+
question = st.text_input("Ask a question based on the document:")
|
84 |
+
|
85 |
+
if question:
|
86 |
+
# Get the answer from the PDF document
|
87 |
+
answer = get_answer_from_pdf(uploaded_file, question)
|
88 |
+
|
89 |
+
# Display the answer
|
90 |
+
st.write("Answer: ", answer)
|
91 |
+
|
92 |
+
if __name__ == "__main__":
|
93 |
+
main()
|