Update pages/Analyze_Text.py
Browse files- pages/Analyze_Text.py +27 -22
pages/Analyze_Text.py
CHANGED
@@ -6,29 +6,19 @@ import plotly.express as px
|
|
6 |
from wordcloud.wordcloud import WordCloud
|
7 |
from configs.db_configs import add_one_item
|
8 |
from configs.html_features import set_image, HTML_WRAPPER
|
9 |
-
|
10 |
from transformers import AutoTokenizer, AutoModelForSequenceClassification
|
11 |
-
|
12 |
import torch
|
13 |
from torch.nn.functional import softmax
|
14 |
-
|
15 |
from spacy import displacy
|
16 |
import spacy
|
17 |
nlp = spacy.load('en_core_web_sm')
|
18 |
-
|
19 |
from collections import Counter
|
20 |
import neattext as nt
|
21 |
import neattext.functions as nfx
|
22 |
from textblob import TextBlob
|
23 |
import nltk
|
24 |
|
25 |
-
|
26 |
-
nltk.download('brown')
|
27 |
-
nltk.download('punkt')
|
28 |
-
nltk.download('wordnet')
|
29 |
-
nltk.download('averaged_perceptron_tagger')
|
30 |
-
nltk.download('conll2000')
|
31 |
-
nltk.download('movie_reviews')
|
32 |
|
33 |
def get_tokens_analysis(text):
|
34 |
doc_obj = nlp(text)
|
@@ -39,7 +29,6 @@ def get_tokens_analysis(text):
|
|
39 |
|
40 |
def get_entities_tokens(text):
|
41 |
doc_obj = nlp(text)
|
42 |
-
|
43 |
html = displacy.render(doc_obj, style='ent')
|
44 |
html = html.replace('\n\n', '\n')
|
45 |
entities_tokens_html = HTML_WRAPPER.format(html)
|
@@ -69,15 +58,29 @@ def plot_top_keywords_frequencies(text, n_top_keywords):
|
|
69 |
|
70 |
|
71 |
def get_sentence_stats(text):
|
72 |
-
|
73 |
-
|
74 |
-
|
75 |
-
|
76 |
-
|
77 |
-
|
78 |
-
|
79 |
-
|
80 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
81 |
|
82 |
|
83 |
def plot_tokens_pos(tokens_stats_df):
|
@@ -109,6 +112,7 @@ def plot_word_frequency(text):
|
|
109 |
plt.axis('off')
|
110 |
return fig
|
111 |
|
|
|
112 |
def main():
|
113 |
st.title('Text Analyzer')
|
114 |
im1, im2, im3 = st.columns([1, 5.3, 1])
|
@@ -122,6 +126,7 @@ def main():
|
|
122 |
|
123 |
text = st.text_area('Text Analyzer', placeholder='Enter your input text here ...', height=200, label_visibility='hidden')
|
124 |
n_top_keywords = st.sidebar.slider('n Top keywords', 5, 15, 5, 1)
|
|
|
125 |
if st.button('Analyze it'):
|
126 |
if text != '':
|
127 |
with st.expander('Original Text'):
|
@@ -157,7 +162,7 @@ def main():
|
|
157 |
st.write('Noun Phrases:\n', noun_phrases)
|
158 |
|
159 |
with col22:
|
160 |
-
with st.expander('The
|
161 |
figure = plot_tokens_pos(tokens_stats_df)
|
162 |
st.plotly_chart(figure)
|
163 |
|
|
|
6 |
from wordcloud.wordcloud import WordCloud
|
7 |
from configs.db_configs import add_one_item
|
8 |
from configs.html_features import set_image, HTML_WRAPPER
|
|
|
9 |
from transformers import AutoTokenizer, AutoModelForSequenceClassification
|
|
|
10 |
import torch
|
11 |
from torch.nn.functional import softmax
|
|
|
12 |
from spacy import displacy
|
13 |
import spacy
|
14 |
nlp = spacy.load('en_core_web_sm')
|
|
|
15 |
from collections import Counter
|
16 |
import neattext as nt
|
17 |
import neattext.functions as nfx
|
18 |
from textblob import TextBlob
|
19 |
import nltk
|
20 |
|
21 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
22 |
|
23 |
def get_tokens_analysis(text):
|
24 |
doc_obj = nlp(text)
|
|
|
29 |
|
30 |
def get_entities_tokens(text):
|
31 |
doc_obj = nlp(text)
|
|
|
32 |
html = displacy.render(doc_obj, style='ent')
|
33 |
html = html.replace('\n\n', '\n')
|
34 |
entities_tokens_html = HTML_WRAPPER.format(html)
|
|
|
58 |
|
59 |
|
60 |
def _extract_sentences_and_noun_phrases(text):
    """Return ``(sentences, noun_phrases)`` extracted from *text* by TextBlob."""
    blob = TextBlob(text)
    sentences = [str(sentence) for sentence in blob.sentences]
    noun_phrases = list(blob.noun_phrases)
    return sentences, noun_phrases


def get_sentence_stats(text):
    """Split *text* into sentences and noun phrases and summarise their counts.

    The NLTK corpora TextBlob relies on are downloaded lazily: extraction is
    attempted first, and only when it fails because a corpus is missing are
    the corpora fetched and the extraction retried once.

    Args:
        text: The raw input text to analyse.

    Returns:
        A tuple ``(sentences, noun_phrases, sentence_stats_df)`` where
        ``sentences`` is a list of sentence strings, ``noun_phrases`` is a
        list of noun phrases, and ``sentence_stats_df`` is a one-row
        DataFrame with the columns 'Number of Sentences' and
        'Number of Noun Phrases'.

    Raises:
        Exception: Any error raised by TextBlob that is not a missing-corpus
            error, or any error raised by the retry after the download, is
            propagated to the caller instead of being silently replaced.
    """
    # Local import: keeps this block self-contained; textblob is already a
    # dependency of this module.
    from textblob.exceptions import MissingCorpusError

    try:
        sentences, noun_phrases = _extract_sentences_and_noun_phrases(text)
    except (LookupError, MissingCorpusError):
        # Only a missing corpus justifies a download. NLTK signals it with
        # LookupError and TextBlob wraps that in MissingCorpusError.
        for corpus in (
            'brown',
            'punkt',
            'wordnet',
            'averaged_perceptron_tagger',
            'conll2000',
            'movie_reviews',
        ):
            nltk.download(corpus)
        sentences, noun_phrases = _extract_sentences_and_noun_phrases(text)

    # Built after the try statement (not in a ``finally``) so that a failed
    # retry surfaces its real exception rather than a NameError or a
    # swallowed error.
    sentence_stats = {
        'Number of Sentences': len(sentences),
        'Number of Noun Phrases': len(noun_phrases),
    }
    sentence_stats_df = pd.DataFrame(sentence_stats, index=[0])
    return sentences, noun_phrases, sentence_stats_df
|
84 |
|
85 |
|
86 |
def plot_tokens_pos(tokens_stats_df):
|
|
|
112 |
plt.axis('off')
|
113 |
return fig
|
114 |
|
115 |
+
|
116 |
def main():
|
117 |
st.title('Text Analyzer')
|
118 |
im1, im2, im3 = st.columns([1, 5.3, 1])
|
|
|
126 |
|
127 |
text = st.text_area('Text Analyzer', placeholder='Enter your input text here ...', height=200, label_visibility='hidden')
|
128 |
n_top_keywords = st.sidebar.slider('n Top keywords', 5, 15, 5, 1)
|
129 |
+
|
130 |
if st.button('Analyze it'):
|
131 |
if text != '':
|
132 |
with st.expander('Original Text'):
|
|
|
162 |
st.write('Noun Phrases:\n', noun_phrases)
|
163 |
|
164 |
with col22:
|
165 |
+
with st.expander('The Distribution of different Parts of Speech'):
|
166 |
figure = plot_tokens_pos(tokens_stats_df)
|
167 |
st.plotly_chart(figure)
|
168 |
|