import streamlit as st
import glob, os, sys; sys.path.append('/src')
#import helper
import preprocessing as pre
import cleaning as clean
def app() -> None:
    """Render the 'Analyse Policy Document' page.

    Sets the sidebar title and, inside a container, shows the page heading
    and a file uploader that accepts ``pdf``, ``docx`` and ``txt`` files.
    When a file has been uploaded, its name is written to the page and the
    file is passed to ``pre.load_document`` and then ``clean.preprocessing``.
    When no file has been uploaded, a placeholder message is shown instead.
    """
    # Sidebar
    st.sidebar.title('Analyse Policy Document')

    # Container
    with st.container():
        # A quoted (non-triple) string cannot span physical lines, so the
        # line breaks of the heading text are spelled as explicit "\n".
        st.markdown(
            "\nSDSN X GIZ Policy Tracing\n",
            unsafe_allow_html=True,
        )

        # NOTE(review): the label and the empty-state message below say
        # "PDF", but docx and txt uploads are accepted as well -- confirm
        # whether the wording should be generalized.
        file = st.file_uploader('Upload PDF File', type=['pdf', 'docx', 'txt'])

        if file is not None:
            st.write("Filename: ", file.name)

            # load document
            docs = pre.load_document(file)

            # preprocess document; the three results are not used any
            # further inside this function yet
            docs_processed, df, all_text = clean.preprocessing(docs)

            st.write('... ')
        else:
            # Two blank writes act as vertical spacing before the message.
            st.write(' ')
            st.write(' ')
            st.markdown(
                "no PDF uploaded ...\n",
                unsafe_allow_html=True,
            )