"""Helpers for a Streamlit text-analysis app.

Provides extractive (sumy) and abstractive (BART / T5) summarizers, ROUGE
evaluation, token-level NLP tables, named-entity extraction, sentiment,
word-cloud plotting, POS-tag colouring and a CSV download link.
"""

import base64
import functools
import html
import time
from collections import Counter

import matplotlib.pyplot as plt
import nltk  # NLP package used for text analysis
import pandas as pd
import seaborn as sns
import streamlit as st
import streamlit.components.v1 as stc
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tag import StanfordNERTagger
from nltk.tag import pos_tag
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from rouge import Rouge
from sumy.nlp.tokenizers import Tokenizer
from sumy.parsers.plaintext import PlaintextParser
from sumy.summarizers.lex_rank import LexRankSummarizer
from sumy.summarizers.text_rank import TextRankSummarizer
from textblob import TextBlob
from transformers import BartForConditionalGeneration, BartTokenizer
from transformers import T5ForConditionalGeneration, T5Tokenizer
from wordcloud import WordCloud

# Text cleaning packages: removing stopwords, special characters, URLs and
# HTML tags, normalizing text, correcting common spelling mistakes.
import neattext as nt
import neattext.functions as nfx

# from nltk import ne_chunk
# from spacy import displacy

# Fetch the NLTK corpora/models used below (tokenizers, taggers, stopwords,
# WordNet). Kept as in the original: this runs at import time.
nltk.download('all')

# Location of the Stanford NER jar.
# NOTE(review): NLTK documents StanfordNERTagger's arguments as file paths;
# confirm that these remote URLs are resolved to local files before use.
stanford_ner_jar = 'https://github.com/UjjwalBansal19/stanford_model/raw/main/stanford-ner.jar'
# Path to the pre-trained NER model file
stanford_ner_model = 'https://huggingface.co/spaces/UjjwalVIT/Text_analysis_and_metadata_app/raw/main/english.all.3class.distsim.crf.ser.gz'

# Timestamp captured once at import; kept for any code that reads it.
timestr = time.strftime("%Y%m%d-%H%M%S")

# NOTE(review): the template currently contains only the placeholder; any
# surrounding markup seems to have been lost -- confirm the intended wrapper.
HTML_WRAPPER = """
{}
"""


def evaluate_summary(summary, reference):
    """Score *summary* against *reference* with ROUGE.

    Returns a DataFrame built from the first score dict returned by
    ``Rouge.get_scores``.
    """
    scores = Rouge().get_scores(summary, reference)
    return pd.DataFrame(scores[0])


@functools.lru_cache(maxsize=1)
def _load_bart():
    """Load the BART CNN model and tokenizer once and reuse them."""
    model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')
    tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
    return model, tokenizer


@functools.lru_cache(maxsize=1)
def _load_t5():
    """Load the T5-base model and tokenizer once and reuse them."""
    model = T5ForConditionalGeneration.from_pretrained('t5-base')
    tokenizer = T5Tokenizer.from_pretrained('t5-base')
    return model, tokenizer


def bart_summary(docx):
    """Return an abstractive summary of *docx* generated by BART.

    The input is truncated to 1024 tokens; the summary is capped at 100
    tokens and decoded with beam search (6 beams).
    """
    model, tokenizer = _load_bart()
    inputs = tokenizer(
        [docx],
        truncation=True,
        padding='longest',
        max_length=1024,
        return_tensors='pt',
    )
    summary_ids = model.generate(
        inputs['input_ids'],
        attention_mask=inputs['attention_mask'],
        num_beams=6,
        max_length=100,
        early_stopping=True,
    )
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)


def T5_summary(docx):
    """Return an abstractive summary of *docx* generated by T5-base.

    The text is prefixed with ``"summarize: "``. The input is truncated to
    512 tokens so that long documents are bounded the same way the BART
    path bounds them; the summary is capped at 100 tokens (4 beams).
    """
    model, tokenizer = _load_t5()
    input_ids = tokenizer.encode(
        "summarize: " + docx,
        return_tensors='pt',
        truncation=True,
        max_length=512,
    )
    summary_ids = model.generate(
        input_ids,
        max_length=100,
        num_beams=4,
        early_stopping=True,
    )
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)


def _sumy_summary(summarizer, docx, num):
    """Run a sumy *summarizer* on *docx* and join *num* sentences with spaces."""
    parser = PlaintextParser.from_string(docx, Tokenizer("english"))
    sentences = summarizer(parser.document, sentences_count=num)
    return ' '.join(str(sentence) for sentence in sentences)


def sumy_summarizer(docx, num=5):
    """Return a LexRank extractive summary of *docx* with *num* sentences."""
    return _sumy_summary(LexRankSummarizer(), docx, num)


def sumy_text_summarizer(docx, num=5):
    """Return a TextRank extractive summary of *docx* with *num* sentences."""
    return _sumy_summary(TextRankSummarizer(), docx, num)


def nlp_analysis(text):
    """Build a per-token table for *text*.

    Columns: ``Token``, ``Shape`` (always ``None``), ``Position`` (the POS
    tag, e.g. NN, VBD, PRP, DT, CC), ``lemma`` (WordNet lemma using the
    lemmatizer's default part of speech), ``Contains_Alphabets``
    (``str.isalpha``) and ``Contains_Stop_words`` (lower-cased token is an
    English stop word such as "a", "an", "the", "is", "in").
    """
    # Tag each token as noun, verb, adjective, adverb, pronoun, etc.
    tagged_tokens = pos_tag(word_tokenize(text))
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    rows = [
        [
            token_text,
            None,  # shape is not computed
            token_pos,
            lemmatizer.lemmatize(token_text),
            token_text.isalpha(),
            token_text.lower() in stop_words,
        ]
        for token_text, token_pos in tagged_tokens
    ]
    columns = ['Token', 'Shape', 'Position', 'lemma',
               'Contains_Alphabets', 'Contains_Stop_words']
    return pd.DataFrame(rows, columns=columns)


def find_entities(text):
    """Return the named entities in *text*, formatted through HTML_WRAPPER.

    Tokens are tagged with the Stanford NER tagger; tokens tagged ``'O'``
    are dropped and the remaining ``(token, tag)`` pairs are rendered via
    ``str`` into the wrapper template.
    """
    tagger = StanfordNERTagger(stanford_ner_model, stanford_ner_jar)
    # Collapse blank lines before tokenizing.
    tokens = word_tokenize(text.replace("\n\n", "\n"))
    entities = [(token, tag) for token, tag in tagger.tag(tokens) if tag != 'O']
    return HTML_WRAPPER.format(entities)


def file_download(data):
    """Render a Streamlit link that downloads *data* as a CSV file.

    *data* must provide ``to_csv()``. The CSV is embedded in the page as a
    base64 ``data:`` URI, and the suggested filename carries the time of
    the call.
    """
    csv_file = data.to_csv()
    b64 = base64.b64encode(csv_file.encode()).decode()
    # Stamp the filename at call time so repeated downloads get fresh names.
    new_filename = "result_{}.csv".format(time.strftime("%Y%m%d-%H%M%S"))
    st.markdown('### 🗃️ Download csv file ')
    href = (
        f'<a href="data:text/csv;base64,{b64}" '
        f'download="{new_filename}"> Click Here! </a>'
    )
    st.markdown(href, unsafe_allow_html=True)


def get_most_common_tokens(text):
    """Return ``{token: count}`` for whitespace-split *text*, most frequent first."""
    # most_common() with no limit returns every token; the character count
    # previously passed as the limit could never be smaller than that.
    return dict(Counter(text.split()).most_common())


def get_semantics(text):
    """Return the TextBlob sentiment of *text*."""
    return TextBlob(text).sentiment


def plot_wordcloud(text):
    """Draw a word cloud of *text* in the Streamlit app.

    Word size reflects frequency. The figure is closed after rendering so
    figures do not pile up across reruns.
    """
    cloud = WordCloud().generate(text)
    fig, ax = plt.subplots()
    ax.imshow(cloud, interpolation='bilinear')
    ax.axis('off')
    st.pyplot(fig)
    plt.close(fig)


def pos_tags(text):
    """Return a DataFrame of TextBlob ``(token, tag)`` pairs for *text*."""
    return pd.DataFrame(TextBlob(text).tags, columns=['tokens', 'tags'])


# POS tag -> display colour used by tag_visualize.
# NOTE(review): 'darkwhite', 'darkyellow' and 'off' are not standard CSS
# colour names -- confirm how these should render.
TAGS = {
    'NN': 'green',
    'NNS': 'green',
    'NNP': 'green',
    'NNPS': 'green',
    'VB': 'blue',
    'VBD': 'blue',
    'VBG': 'blue',
    'VBN': 'blue',
    'VBP': 'blue',
    'VBZ': 'blue',
    'JJ': 'red',
    'JJR': 'red',
    'JJS': 'red',
    'RB': 'cyan',
    'RBR': 'cyan',
    'RBS': 'cyan',
    'IN': 'darkwhite',
    'POS': 'darkyellow',
    'PRP': 'magenta',   # was a duplicated 'PRP$' key, leaving 'PRP' unmapped
    'PRP$': 'magenta',
    'DET': 'black',
    'DT': 'black',      # Penn Treebank determiner tag
    'CC': 'black',
    'CD': 'black',
    'WDT': 'black',
    'WP': 'black',
    'WP$': 'black',
    'WRB': 'black',
    'EX': 'yellow',
    'FW': 'yellow',
    'LS': 'yellow',
    'MD': 'yellow',
    'PDT': 'yellow',
    'RP': 'yellow',
    'SYM': 'yellow',
    'TO': 'yellow',
    'None': 'off',
}


def tag_visualize(tagged_df):
    """Return HTML in which each tagged token is coloured by its POS tag.

    *tagged_df* may be a DataFrame whose first two columns are token and
    tag (as produced by ``pos_tags``) or any iterable of ``(token, tag)``
    sequences. Tokens whose tag is not in ``TAGS`` are skipped. Each kept
    token is HTML-escaped and wrapped in a coloured ``<span>``; the spans
    are joined with single spaces.
    """
    if isinstance(tagged_df, pd.DataFrame):
        # Iterating a DataFrame yields column names, so walk the rows.
        rows = tagged_df.itertuples(index=False, name=None)
    else:
        rows = tagged_df

    colored_text = [
        '<span style="color:{}">{}</span>'.format(
            TAGS[row[1]], html.escape(str(row[0]))
        )
        for row in rows
        if row[1] in TAGS
    ]
    return ' '.join(colored_text)