Spaces:
Sleeping
Sleeping
File size: 2,358 Bytes
663e818 93a3da9 a52a9bb 663e818 34d53c3 4995935 34d53c3 663e818 2ceb5b6 663e818 2ceb5b6 663e818 a52a9bb 663e818 f7f091e 663e818 f7f091e a52a9bb 2ceb5b6 93a3da9 663e818 93a3da9 663e818 f7f091e 663e818 93a3da9 663e818 93a3da9 663e818 f7f091e 663e818 93a3da9 a52a9bb 663e818 93a3da9 4995935 93a3da9 f7f091e b80a1ef f7f091e 34d53c3 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 |
import os

import pandas as pd
import pdfplumber
import streamlit as st
import torch
from transformers import RagTokenizer, RagRetriever, RagSequenceForGeneration
# Hugging Face access token. It is read from the environment rather than
# hard-coded: a placeholder string would be sent as a (rejected) credential,
# whereas None lets the libraries fall back to the credentials stored by
# `huggingface-cli login`.
token = os.environ.get("HF_TOKEN") or None

# The tokenizer, retriever and generator are all loaded from the same RAG
# checkpoint so that their configurations agree with each other.
RAG_CHECKPOINT = "facebook/rag-sequence-nq"


@st.cache_resource
def _load_rag_components(auth_token):
    """Load the RAG tokenizer, retriever and model, cached across reruns.

    Streamlit re-executes this script on every widget interaction; caching the
    loaded objects avoids re-instantiating the model each time.

    Args:
        auth_token: Hugging Face token string, or None to rely on locally
            stored credentials.

    Returns:
        A ``(tokenizer, retriever, model)`` tuple.
    """
    rag_tokenizer = RagTokenizer.from_pretrained(
        RAG_CHECKPOINT, use_auth_token=auth_token
    )
    # The retriever is configured from the RAG checkpoint itself. The dummy
    # dataset keeps start-up light; set use_dummy_dataset=False to retrieve
    # from the full Wikipedia index (a very large download).
    rag_retriever = RagRetriever.from_pretrained(
        RAG_CHECKPOINT,
        index_name="exact",
        use_dummy_dataset=True,
        use_auth_token=auth_token,
    )
    rag_model = RagSequenceForGeneration.from_pretrained(
        RAG_CHECKPOINT, retriever=rag_retriever, use_auth_token=auth_token
    )
    return rag_tokenizer, rag_retriever, rag_model


# Load the tokenizer and model for RAG
tokenizer, retriever, model = _load_rag_components(token)
# Function to extract text from a PDF file
def extract_text_from_pdf(pdf_file):
    """Return the concatenated text of all pages in *pdf_file*.

    The file is opened with ``pdfplumber``. Pages whose ``extract_text()``
    result is falsy (``None`` or an empty string) are skipped; the remaining
    page texts are separated by newlines and the final string has leading and
    trailing whitespace stripped. An empty string is returned when no page
    yields any text.
    """
    with pdfplumber.open(pdf_file) as document:
        extracted = (page.extract_text() for page in document.pages)
        # Materialize inside the `with` so pages are read while the PDF is open.
        page_texts = [chunk for chunk in extracted if chunk]
    return "\n".join(page_texts).strip()
# Streamlit app: upload a CSV and/or PDF, then ask the RAG model a question.
st.title("RAG-Powered PDF & CSV Chatbot")

# CSV file upload
csv_file = st.file_uploader("Upload a CSV file", type=["csv"])
csv_data = None
if csv_file:
    try:
        csv_data = pd.read_csv(csv_file)
    except (pd.errors.EmptyDataError, pd.errors.ParserError, UnicodeDecodeError) as exc:
        # Report unreadable uploads instead of aborting the script run;
        # csv_data stays None, so the app behaves as if no CSV was uploaded.
        st.error(f"Could not read the CSV file: {exc}")
    else:
        st.write("CSV file loaded successfully!")
        st.write(csv_data)

# PDF file upload
pdf_file = st.file_uploader("Upload a PDF file", type=["pdf"])
pdf_text = ""
if pdf_file:
    pdf_text = extract_text_from_pdf(pdf_file)
    if pdf_text:
        st.success("PDF loaded successfully!")
        st.text_area("Extracted Text from PDF", pdf_text, height=200)
    else:
        st.warning("No extractable text found in the PDF.")

# User input for chatbot
user_input = st.text_input("Ask a question related to the PDF or CSV:")

# Get response on button click
if st.button("Get Response"):
    question = user_input.strip()
    if not pdf_text and csv_data is None:
        st.warning("Please upload a PDF or CSV file first.")
    elif not question:
        # Do not run generation on an empty or whitespace-only prompt.
        st.warning("Please enter a question.")
    else:
        combined_context = pdf_text
        if csv_data is not None:
            combined_context += "\n" + csv_data.to_string()
        # NOTE(review): combined_context is assembled but never passed to the
        # model. The generate() call below receives only the tokenized
        # question, so the uploaded PDF/CSV content does not influence the
        # answer. TODO: feed the uploaded text to the model (for example by
        # indexing it for the retriever) -- needs a design decision.

        # Generate response using RAG
        inputs = tokenizer(question, return_tensors="pt", truncation=True)
        with torch.no_grad():  # inference only, no gradients needed
            output = model.generate(
                input_ids=inputs["input_ids"],
                attention_mask=inputs["attention_mask"],
            )
        # generate() returns a batch; a single question was submitted.
        response = tokenizer.batch_decode(output, skip_special_tokens=True)[0]
        st.write("### Response:")
        st.write(response)
|