# Spaces: Runtime error
import nltk | |
nltk.download('punkt') | |
nltk.download('stopwords') | |
import math | |
from nltk import sent_tokenize, word_tokenize, PorterStemmer | |
from nltk.corpus import stopwords | |
def _create_frequency_table(text_string) -> dict:
    """
    Build a word-frequency table for a whole text.

    The text is split with ``word_tokenize``. Each token is lower-cased and
    checked against NLTK's English stop-word list *before* it is stemmed:
    the list holds surface forms (e.g. "this", "was"), so testing the
    Porter stem instead (e.g. "thi", "wa") lets those stop words through.
    Surviving tokens are reduced to their stem and counted.

    :param text_string: raw text to analyse
    :return: mapping of stemmed word -> number of occurrences
    :rtype: dict
    """
    stop_words = set(stopwords.words("english"))
    stemmer = PorterStemmer()

    freq_table = {}
    for token in word_tokenize(text_string):
        token = token.lower()
        if token in stop_words:
            continue
        stem = stemmer.stem(token)
        freq_table[stem] = freq_table.get(stem, 0) + 1
    return freq_table
def _create_frequency_matrix(sentences):
    """
    Build one word-frequency table per sentence.

    Each sentence is tokenized with ``word_tokenize``; tokens are
    lower-cased, dropped if they are English stop words, and otherwise
    stemmed and counted. The stop-word test is applied to the lower-cased
    token rather than to its stem, because the stop-word list contains
    surface forms whose stems (e.g. "this" -> "thi") would not match.

    :param sentences: iterable of sentence strings
    :return: mapping of sentence key -> {stemmed word: count}
    :rtype: dict
    """
    stop_words = set(stopwords.words("english"))
    stemmer = PorterStemmer()

    frequency_matrix = {}
    for sent in sentences:
        freq_table = {}
        for token in word_tokenize(sent):
            token = token.lower()
            if token in stop_words:
                continue
            stem = stemmer.stem(token)
            freq_table[stem] = freq_table.get(stem, 0) + 1

        # The key is the first 15 characters of the sentence, the same
        # prefix _generate_summary uses to look a score up again.
        # NOTE: sentences sharing their first 15 characters overwrite
        # each other here.
        frequency_matrix[sent[:15]] = freq_table
    return frequency_matrix
def _create_tf_matrix(freq_matrix):
    """
    Convert per-sentence word counts into term frequencies.

    Term frequency is the number of times a word occurs in a sentence
    divided by the total number of counted words in that sentence, so the
    values of each table sum to 1. The total is the sum of the counts, not
    the number of distinct words.

    :param freq_matrix: mapping of sentence key -> {word: count}
    :return: mapping of sentence key -> {word: term frequency}
    :rtype: dict
    """
    tf_matrix = {}
    for sent, f_table in freq_matrix.items():
        total_words = sum(f_table.values())
        # An empty table produces an empty result without dividing.
        tf_matrix[sent] = {
            word: count / total_words for word, count in f_table.items()
        }
    return tf_matrix
def _create_documents_per_words(freq_matrix):
    """
    Count, for every word, how many sentence tables contain it.

    :param freq_matrix: mapping of sentence key -> {word: count}
    :return: mapping of word -> number of sentences it appears in
    :rtype: dict
    """
    doc_counts = {}
    for table in freq_matrix.values():
        # Only presence matters here; the per-sentence count is ignored.
        for term in table:
            doc_counts[term] = doc_counts.get(term, 0) + 1
    return doc_counts
def _create_idf_matrix(freq_matrix, count_doc_per_words, total_documents):
    """
    Compute the inverse document frequency of every word in every sentence.

    IDF is ``log10(total_documents / documents containing the word)``.

    :param freq_matrix: mapping of sentence key -> {word: count}
    :param count_doc_per_words: mapping of word -> number of sentences containing it
    :param total_documents: total number of sentences
    :return: mapping of sentence key -> {word: idf}
    :rtype: dict
    """

    def _idf(term):
        return math.log10(total_documents / float(count_doc_per_words[term]))

    return {
        key: {term: _idf(term) for term in table}
        for key, table in freq_matrix.items()
    }
def _create_tf_idf_matrix(tf_matrix, idf_matrix):
    """
    Multiply term frequency by inverse document frequency, word by word.

    Values are paired by sentence key and by word, not by iteration
    position, so the result does not depend on the two matrices being
    ordered identically. A sentence or word that has no entry in
    ``idf_matrix`` is left out of the result.

    :param tf_matrix: mapping of sentence key -> {word: term frequency}
    :param idf_matrix: mapping of sentence key -> {word: idf}
    :return: mapping of sentence key -> {word: tf * idf}
    :rtype: dict
    """
    tf_idf_matrix = {}
    for sent, tf_table in tf_matrix.items():
        idf_table = idf_matrix.get(sent)
        if idf_table is None:
            continue
        tf_idf_matrix[sent] = {
            word: float(tf * idf_table[word])
            for word, tf in tf_table.items()
            if word in idf_table
        }
    return tf_idf_matrix
def _score_sentences(tf_idf_matrix) -> dict:
    """
    Score each sentence by the mean TF-IDF value of its words.

    The score is the sum of the TF-IDF values in the sentence's table
    divided by the number of entries in that table. A sentence with an
    empty table scores 0.0 instead of raising ``ZeroDivisionError``.

    :param tf_idf_matrix: mapping of sentence key -> {word: tf-idf}
    :return: mapping of sentence key -> score
    :rtype: dict
    """
    sentence_value = {}
    for sent, f_table in tf_idf_matrix.items():
        if not f_table:
            sentence_value[sent] = 0.0
            continue
        sentence_value[sent] = sum(f_table.values()) / len(f_table)
    return sentence_value
def _find_average_score(sentenceValue) -> float:
    """
    Find the average score from the sentence value dictionary.

    :param sentenceValue: mapping of sentence key -> score
    :return: arithmetic mean of the scores, or 0.0 when the mapping is
        empty (avoids ``ZeroDivisionError`` for empty input)
    :rtype: float
    """
    if not sentenceValue:
        return 0.0
    return sum(sentenceValue.values()) / len(sentenceValue)
def _generate_summary(sentences, sentenceValue, threshold):
    """
    Join the sentences whose score reaches the threshold.

    A sentence is looked up in ``sentenceValue`` by its first 15
    characters. Every selected sentence is prefixed with a single space,
    so a non-empty summary starts with a space.

    :param sentences: sentence strings in document order
    :param sentenceValue: mapping of sentence key -> score
    :param threshold: minimum score a sentence needs to be kept
    :return: the selected sentences concatenated in order
    :rtype: str
    """
    picked = []
    for text in sentences:
        key = text[:15]
        if key not in sentenceValue:
            continue
        if not (sentenceValue[key] >= threshold):
            continue
        picked.append(" " + text)
    return "".join(picked)
def run_summarization(text, threshold_factor=1.3):
    """
    Produce an extractive summary of ``text`` using TF-IDF sentence scores.

    Each sentence is treated as a document. Sentences whose score is at
    least ``threshold_factor`` times the average sentence score are kept,
    in their original order.

    :param text: plain text of a long article
    :param threshold_factor: multiplier applied to the average sentence
        score to obtain the selection threshold (default 1.3)
    :return: the summary text; an empty string when ``text`` is empty,
        whitespace-only, or yields no sentences
    :rtype: str
    """
    # Nothing to summarize; without this guard the averaging step would
    # divide by zero on an empty score table.
    if not text or not text.strip():
        return ''

    # 1 Sentence tokenize: sent_tokenize() gives the list of sentences.
    sentences = sent_tokenize(text)
    if not sentences:
        return ''
    total_documents = len(sentences)

    # 2 Create the frequency matrix of the words in each sentence.
    freq_matrix = _create_frequency_matrix(sentences)

    # 3 Term frequency (TF): how often a word appears in a document,
    #   divided by how many words there are in the document.
    tf_matrix = _create_tf_matrix(freq_matrix)

    # 4 Count in how many sentences each word appears.
    count_doc_per_words = _create_documents_per_words(freq_matrix)

    # 5 Inverse document frequency (IDF): how unique or rare a word is.
    idf_matrix = _create_idf_matrix(freq_matrix, count_doc_per_words, total_documents)

    # 6 Combine TF and IDF into a TF-IDF matrix.
    tf_idf_matrix = _create_tf_idf_matrix(tf_matrix, idf_matrix)

    # 7 Score the sentences.
    sentence_scores = _score_sentences(tf_idf_matrix)

    # 8 The threshold is based on the average sentence score.
    threshold = _find_average_score(sentence_scores)

    # 9 Keep the sentences scoring at or above the scaled threshold.
    return _generate_summary(sentences, sentence_scores, threshold_factor * threshold)
#usage = run_summarization(text_str) | |
# def text_summarize(ARTICLE, maxLength, minLength): | |
# output = summarizer(ARTICLE)[0]['summary_text'] | |
# ans = text_paraphrase(output) | |
# return ans | |