"""Streamlit page that reports the most frequent keywords of a text as its topic."""

import numpy as np
import regex as re
import streamlit as st
from scipy import linalg
from sklearn.feature_extraction.text import CountVectorizer
from streamlit.components.v1 import html

from configs.db_configs import add_one_item
from configs.download_files import FileDownloader
from configs.html_features import set_image

# Raw string on purpose: '\d' inside a plain literal is an invalid escape
# sequence and triggers a warning on recent Python versions.
_DIGITS = re.compile(r'\d+')


def preprocess_text(text):
    """Vectorize *text* as a single document and factorize it with SVD.

    The text is turned into a one-row term-count matrix (English stop words
    removed) which is then decomposed with a reduced SVD.

    Args:
        text: The raw input document.

    Returns:
        A tuple ``(vocab, U, s, Vh)`` where ``vocab`` is the array of feature
        names and ``U``, ``s``, ``Vh`` are the factors returned by
        ``scipy.linalg.svd`` with ``full_matrices=False``.

    Raises:
        ValueError: Raised by ``CountVectorizer`` when no token survives
            tokenization and stop-word removal (empty vocabulary).
    """
    vectorizer = CountVectorizer(stop_words='english')
    # toarray() yields a plain ndarray; todense() would give the discouraged
    # np.matrix type.
    counts = vectorizer.fit_transform([text]).toarray()
    vocab = np.array(vectorizer.get_feature_names_out())
    U, s, Vh = linalg.svd(counts, full_matrices=False)
    return vocab, U, s, Vh


def show_topics(text, num_top_words):
    """Return the top keywords of *text* as one space-separated string.

    Args:
        text: The raw input document.
        num_top_words: Maximum number of keywords to select.

    Returns:
        The selected keywords, highest weight first, with digit runs removed.
        Keywords that consist only of digits are dropped, so the result may
        hold fewer than ``num_top_words`` words.

    Raises:
        ValueError: Propagated from ``preprocess_text`` when the text yields
            an empty vocabulary.
    """
    vocab, _, _, Vh = preprocess_text(text)

    # Singular vectors are only defined up to sign, so rank by magnitude;
    # ranking the raw values would invert the order if the sign came back
    # negative.
    weights = np.abs(Vh[0])
    top_indices = np.argsort(weights)[::-1][:num_top_words]

    stripped = (_DIGITS.sub('', vocab[i]) for i in top_indices)
    # Skip tokens that became empty so the output has no stray double spaces.
    return ' '.join(word for word in stripped if word)


def main():
    """Render the topic-modeling page and handle the 'Find Topic' action."""
    st.title('Topic Modeling by Top Keywords')

    # Three columns are used only to centre the banner in the wide middle one.
    _, centre, _ = st.columns([1, 5.3, 1])
    with centre:
        url = "https://i.postimg.cc/jdF1hPng/combined.png"
        html(set_image(url), height=400, width=400)

    text = st.text_area(
        'Find Topic',
        placeholder='Enter your input text here ...',
        height=200,
        label_visibility='hidden',
    )
    num_top_words = st.sidebar.slider(
        'Number of Top Keywords',
        min_value=5,
        max_value=20,
        step=1,
        value=10,
    )

    if not st.button('Find Topic') or not text:
        return

    with st.expander('Original Text'):
        st.write(text)
        add_one_item(text, 'Topic Modeling')

    try:
        topic_words = show_topics(text, num_top_words)
    except ValueError:
        # Blank or stop-word-only input leaves the vectorizer with an empty
        # vocabulary; report it instead of crashing the page.
        st.warning('No keywords could be extracted from the input text.')
        return

    with st.expander(f'Show Topic by {num_top_words} Top Keywords'):
        st.write(topic_words)

    with st.expander('Download Topic words'):
        FileDownloader(data=topic_words, file_ext='txt').download()


if __name__ == '__main__':
    main()