Spaces:
Runtime error
Runtime error
File size: 4,077 Bytes
26998f0 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 |
import streamlit as st
# Configure the page to use Streamlit's centered layout.
# NOTE(review): this call sits ahead of the remaining imports, presumably so it
# runs before any other Streamlit command (for example one issued while
# importing utils.footer) - confirm before reordering.
st.set_page_config(layout="centered")
from utils.footer import cust_footer
import docx2txt
import requests
import pdfplumber
# function to run the enter button
def run_function(documents):
    """Extract the text of every uploaded document and display it.

    For each document a small details dict (name, MIME type, size) is written
    to the page, then the content is extracted according to the MIME type:
    plain text is decoded as UTF-8, PDFs are read page by page with
    pdfplumber, and .docx files are handed to docx2txt. Documents with any
    other MIME type contribute no text.

    Args:
        documents: Iterable of uploaded-file objects exposing ``name``,
            ``type``, ``size`` and a file-like interface, or ``None`` when
            there is nothing to process.

    Returns:
        A ``(status, data)`` tuple. ``status`` is ``False`` only when
        ``documents`` is ``None``; ``data`` is the concatenated text of all
        documents (an empty string when nothing could be extracted).
    """
    if documents is None:
        st.error("Error: An error occurred while fetching content.")
        # return extract status, and the data extracted
        return False, ""

    extracted_parts = []
    for document in documents:
        document_details = {
            "filename": document.name,
            "filetype": document.type,
            "filesize": document.size,
        }
        st.write(document_details)

        # Extract content from the txt file
        if document.type == "text/plain":
            # The upload is read as bytes; undecodable bytes are replaced so a
            # single non-UTF-8 file does not abort the whole extraction.
            extracted_parts.append(str(document.read(), "utf-8", "replace"))

        # Extract content from the pdf file
        elif document.type == "application/pdf":
            try:
                with pdfplumber.open(document) as pdf:
                    # extract_text() can come back empty/None for a page with
                    # no text layer (version dependent), so normalise to "".
                    page_texts = [page.extract_text() or "" for page in pdf.pages]
            except Exception as exc:
                # Per-file boundary: pdfplumber does not raise requests'
                # exceptions, so the handler has to be broad to keep one
                # unreadable PDF from crashing the page. Nothing from the
                # failed file is added to the result.
                st.error(f"Could not extract text from {document.name}: {exc}")
            else:
                extracted_parts.append("".join(text + "\n" for text in page_texts))

        # Extract content from the docx file
        elif document.type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
            extracted_parts.append(docx2txt.process(document))

    data = "".join(extracted_parts)

    # Display the extracted text content from file (once, after all documents
    # have been processed).
    st.text_area("Extracted Text", value=data, height=200)
    # return extract status, and the data extracted
    return True, data
def _reset_document_state():
    """Clear the page's session flags and rerun the script from the top."""
    st.session_state.button_enter_doc = False
    st.session_state.extracted_doc = False
    # st.experimental_rerun is deprecated in favour of st.rerun; prefer the
    # new name and fall back only when running on an older Streamlit.
    rerun = getattr(st, "rerun", None) or st.experimental_rerun
    rerun()


def main():
    """Render the "Extract Data from Documents" page.

    Shows a multi-file uploader and an "Enter" button. Once "Enter" has been
    pressed (remembered in ``st.session_state.button_enter_doc``) the uploaded
    documents are passed to ``run_function``. On success "Save" and "Clear"
    buttons are offered; otherwise a warning and a "clear" button are shown.
    The sidebar hint and the custom footer are always rendered.
    """
    allowed_extensions = ["pdf", "txt", "docx"]

    st.subheader("Extract Data from Documents")

    # NOTE(review): Streamlit's documentation discourages an empty widget
    # label; consider a real label hidden via label_visibility if the
    # deployed Streamlit version supports it.
    documents = st.file_uploader(
        "", type=allowed_extensions, accept_multiple_files=True
    )

    if "button_enter_doc" not in st.session_state:
        st.session_state.button_enter_doc = False

    if "extracted_doc" not in st.session_state:
        st.session_state.extracted_doc = False

    data = ""

    if st.button("Enter"):
        st.session_state.button_enter_doc = True

    # the enter button
    if st.session_state.button_enter_doc:
        # An empty upload list is normalised to None so that run_function
        # reports it as a failed extraction.
        if not documents:
            documents = None
        else:
            for doc in documents:
                if doc.name.split(".")[-1].lower() not in allowed_extensions:
                    # the uploaded file is not one of the supported types
                    st.error("Unsupported file: " + doc.name)

        st.session_state.extracted_doc, data = run_function(documents)

        if st.session_state.extracted_doc:
            col1, col2 = st.columns([0.5, 0.5])
            with col1:
                # download_button reports whether it was clicked on this run
                saved_button = st.download_button(label="Save", data=data)
            with col2:
                if st.button("Clear"):
                    _reset_document_state()
            if saved_button:
                # Confirmation message
                st.success("File saved successfully.")
        else:
            st.warning("Data not extracted")
            if st.button("clear"):
                _reset_document_state()

    # Add a success message to the sidebar
    st.sidebar.success("Select a page above.")

    # render the custom footer imported from utils
    cust_footer()
# Render the page only when this file is executed as the entry script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|