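# Spaces: Runtime error
# NOTE(review): the line above appears to be page-status text captured along
# with the source, not part of the program; kept as a comment so the file parses.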
import streamlit as st
# setting page config. for centered mode
# NOTE(review): this call is kept ahead of the remaining imports, as in the
# original. Streamlit expects set_page_config to run before other Streamlit
# commands; presumably utils.footer is imported afterwards for that reason.
st.set_page_config(layout="centered")
from utils.footer import cust_footer
import docx2txt
import requests
import pdfplumber
# function to run the enter button
def run_function(documents):
    """Extract the text of every uploaded document and show it on the page.

    For each document a small details dict (name, type, size) is written to
    the page, then its text is extracted according to its MIME type: plain
    text, PDF (via pdfplumber) or DOCX (via docx2txt). Documents of any
    other type are listed but contribute no text.

    Args:
        documents: Iterable of uploaded file objects exposing ``name``,
            ``type``, ``size`` and ``read()`` (presumably Streamlit
            ``UploadedFile`` instances -- confirm against the caller), or
            ``None`` when nothing was uploaded.

    Returns:
        A ``(status, data)`` tuple. ``status`` is ``False`` only when
        ``documents`` is ``None``; ``data`` is the concatenated text of all
        documents that could be read (``""`` when there is none).
    """
    if documents is None:
        st.error("Error: An error occurred while fetching content.")
        # return extract status, and the data extracted
        return False, ""

    # Collect the pieces and join once at the end instead of repeated `+=`.
    chunks = []
    for document in documents:
        document_details = {
            "filename": document.name,
            "filetype": document.type,
            "filesize": document.size,
        }
        st.write(document_details)

        # Extract content from the txt file
        if document.type == "text/plain":
            # read() yields bytes; decode them as UTF-8
            chunks.append(str(document.read(), "utf-8"))

        # Extract content from the pdf file
        elif document.type == "application/pdf":
            try:
                with pdfplumber.open(document) as pdf:
                    # extract_text() can yield None/empty for a page with no
                    # extractable text (depends on the pdfplumber version);
                    # treat that as an empty page instead of failing on
                    # `None + "\n"`.
                    page_texts = [
                        (page.extract_text() or "") + "\n"
                        for page in pdf.pages
                    ]
            except Exception as exc:
                # The original handler caught requests' RequestException,
                # which local PDF parsing never raises. PDF parsing can fail
                # with several unrelated exception types, so report the
                # failure for this file and carry on with the others.
                st.error(
                    f"Could not extract text from {document.name}: {exc}"
                )
            else:
                chunks.extend(page_texts)

        # Extract content from the docx file
        elif document.type == (
            "application/vnd.openxmlformats-officedocument"
            ".wordprocessingml.document"
        ):
            chunks.append(docx2txt.process(document))

    data = "".join(chunks)
    # Display the extracted text content from file
    st.text_area("Extracted Text", value=data, height=200)
    # return extract status, and the data extracted
    return True, data
def _reset_doc_state():
    """Clear the extraction flags in session state and restart the script."""
    st.session_state.button_enter_doc = False
    st.session_state.extracted_doc = False
    # Newer Streamlit releases expose st.rerun and no longer ship
    # st.experimental_rerun; prefer the new name and fall back to the old one
    # so the page works on either side of that change.
    rerun = getattr(st, "rerun", None) or st.experimental_rerun
    rerun()


def main():
    """Render the "Extract Data from Documents" page.

    Shows a multi-file uploader and an "Enter" button. Once "Enter" has been
    pressed (remembered in ``st.session_state.button_enter_doc``), the
    uploaded documents are passed to ``run_function``; on success a "Save"
    download button and a "Clear" button are shown, otherwise a warning and
    a "clear" button. The sidebar message and the custom footer are rendered
    on every run.
    """
    st.subheader("Extract Data from Documents")
    documents = st.file_uploader(
        "", type=["pdf", "txt", "docx"], accept_multiple_files=True
    )

    # Initialise the flags that persist across script reruns.
    if "button_enter_doc" not in st.session_state:
        st.session_state.button_enter_doc = False
    if "extracted_doc" not in st.session_state:
        st.session_state.extracted_doc = False

    data = ""
    # the enter button
    if st.button("Enter"):
        st.session_state.button_enter_doc = True

    if st.session_state.button_enter_doc:
        if not documents:
            # Nothing uploaded: run_function treats None as "no input".
            documents = None
        else:
            # Report files whose extension is not one of the supported
            # types. They are still handed to run_function, as in the
            # original flow.
            for doc in documents:
                extension = doc.name.split(".")[-1].lower()
                if extension not in ("pdf", "txt", "docx"):
                    st.error("Unsupported file: " + doc.name)

        st.session_state.extracted_doc, data = run_function(documents)

        if st.session_state.extracted_doc:
            col1, col2 = st.columns([0.5, 0.5])
            with col1:
                # download_button returns True on the run in which it was
                # clicked.
                saved_button = st.download_button(label="Save", data=data)
            with col2:
                if st.button("Clear"):
                    _reset_doc_state()
            if saved_button:
                # Confirmation message
                st.success("File saved successfully.")
        else:
            st.warning("Data not extracted")
            if st.button("clear"):
                _reset_doc_state()

    # Add a success message to the sidebar
    st.sidebar.success("Select a page above.")
    # importing the custom footer from utils
    cust_footer()
# Run the page when the file is executed as a script.
if __name__ == "__main__":
    main()