"""Streamlit demo: ask a question about an uploaded TXT, PDF or DOCX file.

The text of the uploaded document is extracted, converted into the
``word_boxes`` input accepted by the Hugging Face
``document-question-answering`` pipeline, and the top answer is displayed.
"""
from typing import List, Optional, Tuple

import docx
import streamlit as st
from PyPDF2 import PdfReader
from transformers import pipeline

DOCX_MIME_TYPE = (
    "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
)

# Bounding boxes handed to the pipeline are normalized to the range 0..1000.
BOX_SCALE = 1000


@st.cache_resource
def _load_qa_pipeline():
    """Build the document-QA pipeline once and reuse it across reruns.

    Streamlit re-executes the whole script on every widget interaction;
    without caching the model would be re-instantiated each time.
    """
    return pipeline(
        "document-question-answering",
        model="impira/layoutlm-document-qa",
    )


# Initialize the NLP pipeline
nlp = _load_qa_pipeline()

# Set the title of the app
st.title("LayoutLM Example")

# Create a file uploader that accepts various document formats
uploaded_file = st.file_uploader(
    "Drag and drop a document here",
    type=["txt", "pdf", "docx"],
)

# Create a text box for user input
question = st.text_area("What would you like to know?")


def extract_text_from_file(uploaded_file) -> Optional[str]:
    """Return the plain text of an uploaded document.

    Args:
        uploaded_file: The object returned by ``st.file_uploader``; its
            ``type`` attribute (MIME type) selects the extraction method.

    Returns:
        The extracted text (possibly empty), or ``None`` when the MIME type
        is not plain text, PDF or DOCX.
    """
    # Rewind first: the upload is a file-like buffer, and an earlier read
    # would otherwise leave the cursor at the end and yield no content.
    uploaded_file.seek(0)
    mime_type = uploaded_file.type

    if mime_type == "text/plain":
        # Replace undecodable bytes instead of crashing on non-UTF-8 files.
        return uploaded_file.read().decode("utf-8", errors="replace")

    if mime_type == "application/pdf":
        reader = PdfReader(uploaded_file)
        # Guard against pages that yield no text, and separate pages with a
        # newline so words at page boundaries are not glued together.
        return "\n".join(page.extract_text() or "" for page in reader.pages)

    if mime_type == DOCX_MIME_TYPE:
        document = docx.Document(uploaded_file)
        return "\n".join(paragraph.text for paragraph in document.paragraphs)

    return None


def _text_to_word_boxes(text: str) -> List[Tuple[str, List[int]]]:
    """Turn plain text into ``(word, [x0, y0, x1, y1])`` pairs.

    Each non-empty line becomes one horizontal band of the 0..1000 page and
    its words are spread left to right in proportion to their length.

    NOTE(review): the boxes are synthetic. Text extraction discards the real
    page layout, so this only approximates reading order; answers may be
    less accurate than with a real page image plus OCR.
    """
    rows = [words for words in (line.split() for line in text.splitlines()) if words]
    row_count = len(rows)

    word_boxes = []
    for row_index, words in enumerate(rows):
        top = BOX_SCALE * row_index // row_count
        bottom = BOX_SCALE * (row_index + 1) // row_count
        # Line width in characters, counting one space between words.
        line_width = sum(len(word) for word in words) + len(words) - 1
        cursor = 0
        for word in words:
            left = BOX_SCALE * cursor // line_width
            right = BOX_SCALE * (cursor + len(word)) // line_width
            word_boxes.append((word, [left, top, right, bottom]))
            cursor += len(word) + 1
    return word_boxes


if uploaded_file is not None and question.strip():
    # Extract text from the uploaded document
    document_text = extract_text_from_file(uploaded_file)
    word_boxes = _text_to_word_boxes(document_text) if document_text else []

    if word_boxes:
        # The document-QA pipeline needs an image or word boxes; a
        # {"context", "question"} dict is the input of the plain
        # question-answering pipeline and is rejected here.
        result = nlp({"question": question, "word_boxes": word_boxes})

        # The pipeline is documented to return a dict or a list of dicts.
        if isinstance(result, list):
            result = result[0] if result else None

        # Display the answer
        st.write("Answer:")
        if result:
            st.write(result["answer"])
        else:
            st.write("No answer found in the document.")
    else:
        st.write(
            "Unsupported file type or failed to extract text from the document."
        )