# Uploaded by: amirhoseinsedaghati
# Commit message: "Update pages/Find_Topic.py"
# Commit: afd2097 (verified)
import streamlit as st
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from scipy import linalg
import regex as re
from configs.db_configs import add_one_item
from streamlit.components.v1 import html
from configs.html_features import set_image
from configs.download_files import FileDownloader
def preprocess_text(text: str) -> tuple:
    """Vectorize *text* as a single document and factorize its count matrix.

    The text is tokenized with a ``CountVectorizer`` that drops English stop
    words. Because exactly one document is fitted, the count matrix has a
    single row, so the reduced SVD yields a 1x1 ``U``, one singular value and
    a single right singular vector with one entry per vocabulary word.

    Args:
        text: The raw input text.

    Returns:
        A tuple ``(vocab, U, s, Vh)`` where ``vocab`` is a NumPy array of the
        vocabulary words (in feature order) and ``U``, ``s``, ``Vh`` are the
        factors returned by ``scipy.linalg.svd(..., full_matrices=False)``.

    Raises:
        ValueError: Propagated from ``CountVectorizer`` when the text yields
            an empty vocabulary (e.g. it is blank or only stop words).
    """
    vectorizer = CountVectorizer(stop_words='english')
    # ``toarray`` gives a plain ndarray; ``todense`` would produce the legacy
    # ``np.matrix`` type, which NumPy no longer recommends.
    counts = vectorizer.fit_transform([text]).toarray()
    vocab = np.array(vectorizer.get_feature_names_out())
    U, s, Vh = linalg.svd(counts, full_matrices=False)
    return vocab, U, s, Vh
def show_topics(text, num_top_words):
    """Return the top keywords of *text* as one space-separated string.

    Words are ranked by the magnitude of their weight in the first right
    singular vector produced by ``preprocess_text``. Digits are stripped from
    every selected word, and words that consist only of digits are dropped.

    Args:
        text: The raw input text.
        num_top_words: Maximum number of vocabulary words to select. If the
            vocabulary is smaller, every word is used.

    Returns:
        The selected words, highest weight first, joined by single spaces.

    Raises:
        ValueError: Propagated from ``preprocess_text`` when no vocabulary
            can be built from *text*.
    """
    vocab, _, _, Vh = preprocess_text(text)

    # The sign of a singular vector is arbitrary (v and -v are both valid).
    # For a single non-negative count row all entries share one sign, so the
    # magnitudes give the intended ranking whichever sign the backend picks.
    weights = np.abs(Vh[0])
    ranked = np.argsort(weights)[::-1][:num_top_words]

    digits = re.compile(r'\d+')
    stripped = (digits.sub('', str(vocab[i])) for i in ranked)
    # Purely numeric tokens become empty after stripping; skipping them
    # avoids stray double spaces in the joined result.
    return ' '.join(word for word in stripped if word)
def main():
    """Render the 'Topic Modeling by Top Keywords' Streamlit page.

    Shows a banner image, a text area and a sidebar slider for the number of
    keywords. When the 'Find Topic' button is pressed with usable text, the
    text is displayed and passed to ``add_one_item``, the top keywords are
    computed with ``show_topics`` and shown, and a download widget for the
    keywords is offered. Blank input, or input from which no keywords can be
    extracted, produces a warning instead of an uncaught exception.
    """
    st.title('Topic Modeling by Top Keywords')

    # Three columns are created but only the wide middle one receives
    # content; the two outer columns are left empty.
    _, image_col, _ = st.columns([1, 5.3, 1])
    with image_col:
        url = "https://i.postimg.cc/jdF1hPng/combined.png"
        html(set_image(url), height=400, width=400)

    text = st.text_area('Find Topic', placeholder='Enter your input text here ...', height=200, label_visibility='hidden')
    num_top_words = st.sidebar.slider('Number of Top Keywords', min_value=5, max_value=20, step=1, value=10)

    if not st.button('Find Topic'):
        return

    # Whitespace-only input has no tokens and would make the vectorizer fail.
    if not text.strip():
        st.warning('Please enter some text first.')
        return

    with st.expander('Original Text'):
        st.write(text)
        add_one_item(text, 'Topic Modeling')

    try:
        topic_words = show_topics(text, num_top_words)
    except ValueError as err:
        # Raised e.g. when the text contains only stop words, so the
        # vocabulary is empty.
        st.warning(f'Could not extract keywords: {err}')
        return

    with st.expander(f'Show Topic by {num_top_words} Top Keywords'):
        st.write(topic_words)

    with st.expander('Download Topic words'):
        FileDownloader(data=topic_words, file_ext='txt').download()
# Render the page only when this file is executed as a script.
if __name__ == "__main__":
    main()