File size: 7,277 Bytes
2a97daa 571aae6 2a97daa 4ca5e8f f156672 2a97daa ee2f3fb 2a97daa f156672 f053e24 f156672 f053e24 f156672 f053e24 2a97daa f156672 2a97daa 304ea47 2a97daa f156672 2a97daa f156672 2a97daa |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 |
import streamlit as st
from streamlit.components.v1 import html
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
from wordcloud.wordcloud import WordCloud
from configs.db_configs import add_one_item
from configs.html_features import set_image, HTML_WRAPPER
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from torch.nn.functional import softmax
from spacy import displacy
import spacy
nlp = spacy.load('en_core_web_sm')
from collections import Counter
import neattext as nt
import neattext.functions as nfx
from textblob import TextBlob
import nltk
def get_tokens_analysis(text):
doc_obj = nlp(text)
tokens_stats = [(token.text, token.shape_, token.pos_, token.tag_, token.lemma_, token.is_alpha, token.is_stop) for token in doc_obj]
tokens_stats_df = pd.DataFrame(tokens_stats, columns=['Token', 'Shape', 'Part-of-Speech', 'Part-of-Speech Tag', 'Root', 'IsAlpha', 'IsStop'])
return tokens_stats_df
def get_entities_tokens(text):
doc_obj = nlp(text)
options = {'colors' : {'MONEY' : '#3480f3'}}
html = displacy.render(doc_obj, style='ent', options=options)
html = html.replace('\n\n', '\n')
entities_tokens_html = HTML_WRAPPER.format(html)
return entities_tokens_html
def get_word_stats(text):
text_frame_obj = nt.TextFrame(text)
word_stats = text_frame_obj.word_stats()
word_length_freq = text_frame_obj.word_length_freq()
word_length_df = pd.DataFrame(word_length_freq.items(), columns=['word length', 'frequency'])
word_length_df['word length'] = word_length_df['word length'].astype(str)
word_length_df['word length'] = 'length ' + word_length_df['word length']
custom_color = px.colors.sequential.Blues_r
figure = px.pie(word_length_df, names='word length', values='frequency', title='Word Percentage Frequency by length', width=400, height=400, color_discrete_sequence=custom_color)
return word_stats, figure
def plot_top_keywords_frequencies(text, n_top_keywords):
preprocessed_text = nfx.remove_stopwords(text)
try:
blob = TextBlob(preprocessed_text)
words = blob.words
except:
# These corpora are commonly used by TextBlob for various natural language processing tasks.
nltk.download('brown')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('conll2000')
nltk.download('movie_reviews')
blob = TextBlob(preprocessed_text)
words = blob.words
finally:
top_keywords = Counter(words).most_common(n_top_keywords)
top_keywords_df = pd.DataFrame(top_keywords, columns=['words', 'frequency'])
figure = px.bar(top_keywords_df, x='words', y='frequency', color='frequency', title=f'the frequency of {n_top_keywords} top keywords', width=400, height=400, color_continuous_scale='Blues')
return figure
def get_sentence_stats(text):
blob = TextBlob(text)
sentences = [str(sentence) for sentence in blob.sentences]
noun_phrases = list(blob.noun_phrases)
sentence_stats = {
'Number of Sentences' : len(sentences),
'Number of Noun Phrases' : len(noun_phrases)
}
sentence_stats_df = pd.DataFrame(sentence_stats, index=[0])
return sentences, noun_phrases, sentence_stats_df
def plot_tokens_pos(tokens_stats_df):
pos_df = tokens_stats_df['Part-of-Speech'].value_counts().to_frame().reset_index()
pos_df.columns = ['Part-of-Speech', 'Frequency']
figure = px.bar(pos_df, x='Part-of-Speech', y='Frequency', color='Frequency', title=f'The Frequency of Tokens Part of speech', width=400, height=400, color_continuous_scale='Blues')
return figure
def get_sentiment_analysis_res(text):
tokenizer = AutoTokenizer.from_pretrained('stevhliu/my_awesome_model')
inputs = tokenizer(text, return_tensors='pt')
model = AutoModelForSequenceClassification.from_pretrained('stevhliu/my_awesome_model')
with torch.no_grad():
logits = model(**inputs).logits
predicted_class_id = logits.argmax().item()
model.config.id2label = {0:'Negative', 1:'Positive'}
label = model.config.id2label[predicted_class_id]
score = float(softmax(logits, dim=1)[0][predicted_class_id])
sentiment_df = pd.DataFrame([[label, score]], columns=['Text Polarity', 'Belonging Probability'])
return sentiment_df
def plot_word_frequency(text):
wc = WordCloud(width=600, height=500).generate(text)
fig = plt.figure()
plt.imshow(wc, interpolation='bilinear')
plt.axis('off')
return fig
def main():
st.title('Text Analyzer')
im1, im2, im3 = st.columns([1, 5.3, 1])
with im1:
pass
with im2:
url = "https://i.postimg.cc/jdF1hPng/combined.png"
html(set_image(url), height=400, width=400)
with im3:
pass
text = st.text_area('Text Analyzer', placeholder='Enter your input text here ...', height=200, label_visibility='hidden')
n_top_keywords = st.sidebar.slider('n Top keywords', 5, 15, 5, 1)
if st.button('Analyze it'):
if text != '':
with st.expander('Original Text'):
st.write(text)
add_one_item(text, 'Text Analyzer')
with st.expander('Text Analysis'):
tokens_stats_df = get_tokens_analysis(text)
st.dataframe(tokens_stats_df)
with st.expander('Text Entities'):
entities_tokens_html = get_entities_tokens(text)
html(entities_tokens_html, height=300, scrolling=True)
col11, col12 = st.columns(2)
with col11:
with st.expander('Word Statistics'):
word_stats_json, figure = get_word_stats(text)
st.json(word_stats_json)
st.plotly_chart(figure)
with col12:
with st.expander(f'The Frequency of {n_top_keywords} Top Keywords'):
figure = plot_top_keywords_frequencies(text, n_top_keywords)
st.plotly_chart(figure)
col21, col22 = st.columns(2)
with col21:
with st.expander('Sentence Statistics'):
sentences, noun_phrases, sentence_stats_df = get_sentence_stats(text)
st.dataframe(sentence_stats_df)
st.write('Sentences:\n', sentences)
st.write('Noun Phrases:\n', noun_phrases)
with col22:
with st.expander('The Distribution of different Parts of Speech'):
figure = plot_tokens_pos(tokens_stats_df)
st.plotly_chart(figure)
col31, col32 = st.columns(2)
with col31:
with st.expander('Sentiment Analysis'):
sentiment_df = get_sentiment_analysis_res(text)
st.dataframe(sentiment_df)
with col32:
with st.expander('Word Frequency'):
fig = plot_word_frequency(text)
st.pyplot(fig)
else:
st.error('Please enter a non-empty text.')
if __name__ == '__main__':
main()
|