import streamlit as st
import streamlit.components.v1 as stc

import docx2txt
import neattext as nt  # nt.TextFrame() is used for word statistics below

import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag

from collections import Counter

import seaborn as sns
import matplotlib.pyplot as plt

from app_utils import *
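
# word_tokenize and pos_tag rely on NLTK's "punkt" tokenizer and
# "averaged_perceptron_tagger" models, which a fresh environment may not
# have. A best-effort fetch at import time (assumes network access the
# first time the app runs):
for resource in ("punkt", "averaged_perceptron_tagger"):
    nltk.download(resource, quiet=True)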
HTML_BANNER = """
<div style="background-color:green;padding:10px;border-radius:10px">
    <h1 style="color:white;text-align:center;">Text Analysis App</h1>
</div>
"""

def text_analysis():
    """Render the text-analysis page: direct text entry or file upload."""
    stc.html(HTML_BANNER)
    menu = ['Text-analysis', 'Upload_Files']

    choice = st.sidebar.selectbox('Menu', menu)
    if choice == 'Text-analysis':
        st.subheader('Analyse Text')
        text = st.text_area("Enter the text to analyze")
        if st.button("Analyze"):
            st.success("Success")
            with st.expander('Original Text'):
                st.write(text)
            with st.expander('Text Analysis'):
                token_analysis = nlp_analysis(text)
                st.dataframe(token_analysis)
            with st.expander('Entities'):
                entity_result = find_entities(text)
                stc.html(entity_result, height=100, scrolling=True)

            col1, col2 = st.columns(2)

            with col1:
                with st.expander("Word Stats"):
                    st.info("Word Statistics")
                    docx = nt.TextFrame(text)
                    st.write(docx.word_stats())

                with st.expander("Top keywords"):
                    keywords = get_most_common_tokens(text)
                    st.write(keywords)

                with st.expander('Tagged Keywords'):
                    data = pos_tag(word_tokenize(text))
                    st.dataframe(data)
                    visualize_tags = tag_visualize(data)
                    stc.html(visualize_tags, scrolling=True)

                with st.expander("Sentiment"):
                    sent_result = get_semantics(text)
                    st.write(sent_result)

            with col2:
                with st.expander("Plot word freq"):
                    try:
                        fig, ax = plt.subplots()
                        # keep only the most frequent tokens so the plot stays legible
                        most_common_tokens = dict(token_analysis["Token"].value_counts().head(10))
                        sns.countplot(data=token_analysis[token_analysis["Token"].isin(most_common_tokens)], x="Token", ax=ax)
                        ax.set_xlabel('Token')
                        ax.set_ylabel('Frequency')
                        ax.tick_params(axis='x', rotation=45)
                        st.pyplot(fig)
                    except Exception:
                        st.warning('Insufficient data')

                with st.expander("Plot part of speech"):
                    try:
                        fig, ax = plt.subplots()
                        most_common_tokens = dict(token_analysis["Position"].value_counts())
                        sns.countplot(data=token_analysis[token_analysis["Position"].isin(most_common_tokens)], x="Position", ax=ax)
                        ax.set_xlabel('PoS')
                        ax.set_ylabel('Frequency')
                        ax.tick_params(axis='x', rotation=45)
                        st.pyplot(fig)
                    except Exception:
                        st.warning('Insufficient data')

                with st.expander("Plot word cloud"):
                    try:
                        plot_wordcloud(text)
                    except Exception:
                        st.warning('Insufficient data')

                with st.expander('Download Results'):
                    file_download(token_analysis)
    elif choice == 'Upload_Files':
        # accept .txt as well so the text/plain branch below is reachable
        text_file = st.file_uploader('Upload Files', type=['docx', 'txt'])
        if text_file is not None:
            if text_file.type == 'text/plain':
                text = str(text_file.read(), "utf-8")
            else:
                text = docx2txt.process(text_file)

            if st.button("Analyze"):
                with st.expander('Original Text'):
                    st.write(text)
                with st.expander('Text Analysis'):
                    token_analysis = nlp_analysis(text)
                    st.dataframe(token_analysis)
                with st.expander('Entities'):
                    entity_result = find_entities(text)
                    stc.html(entity_result, height=100, scrolling=True)

                col1, col2 = st.columns(2)

                with col1:
                    with st.expander("Word Stats"):
                        st.info("Word Statistics")
                        docx = nt.TextFrame(text)
                        st.write(docx.word_stats())

                    with st.expander("Top keywords"):
                        keywords = get_most_common_tokens(text)
                        st.write(keywords)

                    with st.expander("Sentiment"):
                        sent_result = get_semantics(text)
                        st.write(sent_result)

                with col2:
                    with st.expander("Plot word freq"):
                        fig, ax = plt.subplots()
                        num_tokens = 10
                        most_common_tokens = dict(token_analysis["Token"].value_counts().head(num_tokens))
                        sns.countplot(data=token_analysis[token_analysis["Token"].isin(most_common_tokens)], x="Token", ax=ax)
                        ax.set_xlabel('Token')
                        ax.set_ylabel('Frequency')
                        ax.tick_params(axis='x', rotation=45)
                        st.pyplot(fig)

                    with st.expander("Plot part of speech"):
                        fig, ax = plt.subplots()
                        most_common_tokens = dict(token_analysis["Position"].value_counts())
                        sns.countplot(data=token_analysis[token_analysis["Position"].isin(most_common_tokens)], x="Position", ax=ax)
                        ax.set_xlabel('PoS')
                        ax.set_ylabel('Frequency')
                        ax.tick_params(axis='x', rotation=45)
                        st.pyplot(fig)

                    with st.expander("Plot word cloud"):
                        plot_wordcloud(text)

                    with st.expander('Download Results'):
                        file_download(token_analysis)
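
# Streamlit re-executes this script top to bottom on every interaction;
# the guard below makes the module usable as a standalone entry point
# (assuming it is launched via `streamlit run <this file>`), while still
# allowing text_analysis() to be imported from a multi-page app.
if __name__ == '__main__':
    text_analysis()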