Spaces:

Shankarm08
/

pdfcsvdatarag

Sleeping

App Files Files Community

Shankarm08 committed on Oct 6

Commit

93a3da9

•

1 Parent(s): 80bf310

Update app.py

Browse files

Files changed (1) hide show

app.py +46 -93

app.py CHANGED Viewed

@@ -1,120 +1,73 @@
 import streamlit as st
 import pandas as pd
 import pdfplumber
-import torch
-import faiss
-import numpy as np
-from transformers import pipeline
-from sentence_transformers import SentenceTransformer
-# Load the Sentence Transformer model for embeddings
-@st.cache_resource
-def load_embedder():
-    return SentenceTransformer('all-MiniLM-L6-v2')
-embedder = load_embedder()
-# Load a generative model for answer generation
-@st.cache_resource
-def load_generator():
-    return pipeline('text-generation', model='gpt2', tokenizer='gpt2', device=0 if torch.cuda.is_available() else -1)
-generator = load_generator()
-# Function to extract text from PDF
 def extract_text_from_pdf(pdf_file):
-    text = ""
     with pdfplumber.open(pdf_file) as pdf:
         for page in pdf.pages:
             page_text = page.extract_text()
-            if page_text:
                 text += page_text + "\n"
-    return text
-# Function to split text into chunks
-def split_text(text, chunk_size=500):
-    sentences = text.split('. ')
-    chunks = []
-    current_chunk = ""
-    for sentence in sentences:
-        if len(current_chunk) + len(sentence) <= chunk_size:
-            current_chunk += sentence + ". "
-        else:
-            chunks.append(current_chunk.strip())
-            current_chunk = sentence + ". "
-    if current_chunk:
-        chunks.append(current_chunk.strip())
-    return chunks
-# Function to build FAISS index
-def build_faiss_index(chunks):
-    embeddings = embedder.encode(chunks)
-    embeddings = np.array(embeddings).astype('float32')
-    index = faiss.IndexFlatL2(embeddings.shape[1])
-    index.add(embeddings)
-    return index, embeddings
-# Streamlit app
-st.title("PDF and CSV Chatbot with RAG")
-# Upload CSV file
 csv_file = st.file_uploader("Upload a CSV file", type=["csv"])
-csv_text = ""
 if csv_file:
     csv_data = pd.read_csv(csv_file)
-    st.write("### CSV Data:")
     st.write(csv_data)
-    csv_text = csv_data.to_csv(index=False)
-# Upload PDF file
 pdf_file = st.file_uploader("Upload a PDF file", type=["pdf"])
-pdf_text = ""
 if pdf_file:
     pdf_text = extract_text_from_pdf(pdf_file)
-    if pdf_text.strip():
-        st.write("### PDF Text:")
-        st.write(pdf_text)
     else:
         st.warning("No extractable text found in the PDF.")
-# Combine texts
-combined_text = csv_text + "\n" + pdf_text
-if combined_text.strip():
-    # Split text into chunks
-    chunks = split_text(combined_text)
-    # Build FAISS index
-    index, embeddings = build_faiss_index(chunks)
-    # Prepare for user input
-    user_input = st.text_input("Ask a question about the uploaded data:")
-    if st.button("Get Response"):
-        if user_input.strip():
-            # Get embedding of user question
-            question_embedding = embedder.encode([user_input])
-            question_embedding = np.array(question_embedding).astype('float32')
-            # Search FAISS index
-            k = 3  # number of nearest neighbors
-            distances, indices = index.search(question_embedding, k)
-            # Retrieve the most relevant chunks
-            retrieved_chunks = [chunks[idx] for idx in indices[0]]
-            # Combine retrieved chunks
-            context = " ".join(retrieved_chunks)
-            # Generate answer
-            prompt = context + "\n\nQuestion: " + user_input + "\nAnswer:"
-            response = generator(prompt, max_length=200, num_return_sequences=1)
-            # Display response
             st.write("### Response:")
-            st.write(response[0]['generated_text'].split("Answer:")[1].strip())
-        else:
-            st.warning("Please enter a question.")
-else:
-    st.info("Please upload a CSV file or a PDF file to proceed.")

 import streamlit as st
+import torch
+from transformers import RagTokenizer, RagRetriever, RagSequenceForGeneration
+from datasets import load_dataset
 import pandas as pd
 import pdfplumber
+# Load the RAG tokenizer, retriever (built with use_dummy_dataset=True), and model
+tokenizer = RagTokenizer.from_pretrained("facebook/rag-sequence-nq")
+retriever = RagRetriever.from_pretrained("facebook/rag-sequence-nq", use_dummy_dataset=True)
+model = RagSequenceForGeneration.from_pretrained("facebook/rag-sequence-nq", retriever=retriever)
+# Function to generate an answer with the RAG model (returns the first decoded string, not embeddings)
+def get_rag_embeddings(question, context):
+    inputs = tokenizer(question, context, return_tensors="pt", truncation=True)
+    with torch.no_grad():
+        output = model.generate(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'])
+    return tokenizer.batch_decode(output, skip_special_tokens=True)[0]
+# Extract text from PDF
 def extract_text_from_pdf(pdf_file):
     with pdfplumber.open(pdf_file) as pdf:
+        text = ""
         for page in pdf.pages:
             page_text = page.extract_text()
+            if page_text:  # Check if the page has extractable text
                 text += page_text + "\n"
+    return text.strip()  # Return stripped text for better formatting
+# Defaults for the extracted PDF text and the loaded CSV data (used when nothing is uploaded)
+pdf_text = ""
+csv_data = None
+# Streamlit app UI
+st.title("RAG-Powered PDF & CSV Chatbot")
+# CSV file upload
 csv_file = st.file_uploader("Upload a CSV file", type=["csv"])
 if csv_file:
     csv_data = pd.read_csv(csv_file)
+    st.write("CSV file loaded successfully!")
     st.write(csv_data)
+# PDF file upload
 pdf_file = st.file_uploader("Upload a PDF file", type=["pdf"])
 if pdf_file:
     pdf_text = extract_text_from_pdf(pdf_file)
+    if pdf_text:
+        st.success("PDF loaded successfully!")
+        st.text_area("Extracted Text from PDF", pdf_text, height=200)
     else:
         st.warning("No extractable text found in the PDF.")
+# User input for chatbot
+user_input = st.text_input("Ask a question related to the PDF or CSV:")
+# Get response on button click
+if st.button("Get Response"):
+    if not pdf_text and csv_data is None:
+        st.warning("Please upload a PDF or CSV file first.")
+    else:
+        # Combine PDF text and CSV content for context in RAG
+        combined_context = pdf_text
+        if csv_data is not None:
+            combined_context += "\n" + csv_data.to_string()
+        # Get RAG-generated response
+        try:
+            response = get_rag_embeddings(user_input, combined_context)
             st.write("### Response:")
+            st.write(response)
+        except Exception as e:
+            st.error(f"Error while processing the question: {e}")