"""Streamlit page that extracts plain text from uploaded PDF, TXT and DOCX files."""

import streamlit as st

# set_page_config has to be the first Streamlit command that runs on the
# page, which is why it sits ahead of the remaining imports.
st.set_page_config(layout="centered")

from utils.footer import cust_footer
import docx2txt
import requests  # noqa: F401 -- kept from the original import list
import pdfplumber

_DOCX_MIME = (
    "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
)
_SUPPORTED_EXTENSIONS = ("pdf", "txt", "docx")


def _extract_pdf_text(document):
    """Return the text of every page of *document*, each followed by a newline."""
    with pdfplumber.open(document) as pdf:
        # extract_text() may give None for a page without a text layer;
        # treat that as an empty page instead of failing on None + str.
        return "".join(
            (page.extract_text() or "") + "\n" for page in pdf.pages
        )


def _rerun():
    """Restart the script run, using whichever rerun API this Streamlit has."""
    rerun = getattr(st, "rerun", None) or st.experimental_rerun
    rerun()


def _reset_and_rerun():
    """Clear the page's session flags and restart the script run."""
    st.session_state.button_enter_doc = False
    st.session_state.extracted_doc = False
    _rerun()


def run_function(documents):
    """Extract and display the text of the uploaded documents.

    Args:
        documents: Iterable of uploaded-file objects exposing ``name``,
            ``type`` (MIME type), ``size`` and ``read()``, or ``None`` when
            nothing was uploaded.

    Returns:
        A ``(status, data)`` tuple. ``status`` is ``False`` only when
        ``documents`` is ``None``; ``data`` is the concatenated text of all
        documents that could be read (``""`` when there is none).
    """
    if documents is None:
        st.error("Error: No documents were uploaded.")
        return False, ""

    parts = []
    for document in documents:
        st.write(
            {
                "filename": document.name,
                "filetype": document.type,
                "filesize": document.size,
            }
        )

        if document.type == "text/plain":
            # Undecodable bytes become U+FFFD rather than aborting the page.
            parts.append(document.read().decode("utf-8", errors="replace"))
        elif document.type == "application/pdf":
            try:
                parts.append(_extract_pdf_text(document))
            except Exception as exc:
                # UI boundary: a single unreadable PDF is reported and
                # skipped so the remaining uploads are still processed.
                st.error(f"Could not read PDF '{document.name}': {exc}")
        elif document.type == _DOCX_MIME:
            parts.append(docx2txt.process(document))
        # Any other MIME type contributes no text.

    data = "".join(parts)
    st.text_area("Extracted Text", value=data, height=200)
    return True, data


def main():
    """Render the upload page: uploader, Enter/Save/Clear buttons and footer."""
    st.subheader("Extract Data from Documents")

    documents = st.file_uploader(
        "", type=["pdf", "txt", "docx"], accept_multiple_files=True
    )

    # Session flags survive Streamlit's script reruns between interactions.
    if "button_enter_doc" not in st.session_state:
        st.session_state.button_enter_doc = False
    if "extracted_doc" not in st.session_state:
        st.session_state.extracted_doc = False

    data = ""

    if st.button("Enter"):
        st.session_state.button_enter_doc = True

    if st.session_state.button_enter_doc:
        if not documents:
            # An empty upload list is passed on as None so that
            # run_function reports it as "nothing to extract".
            documents = None
        else:
            for doc in documents:
                extension = doc.name.split(".")[-1].lower()
                if extension not in _SUPPORTED_EXTENSIONS:
                    st.error("Unsupported file: " + doc.name)

        st.session_state.extracted_doc, data = run_function(documents)

        if st.session_state.extracted_doc:
            col1, col2 = st.columns([0.5, 0.5])
            with col1:
                saved_button = st.download_button(label="Save", data=data)
            with col2:
                if st.button("Clear"):
                    _reset_and_rerun()

            if saved_button:
                # Confirmation message
                st.success("File saved successfully.")
        else:
            st.warning("Data not extracted")
            if st.button("clear"):
                _reset_and_rerun()

    # Add a success message to the sidebar
    st.sidebar.success("Select a page above.")

    # Custom footer provided by utils.footer
    cust_footer()


if __name__ == "__main__":
    main()