Commit 99b1da3
Author: sdhanabal1
Parent(s): 338f4fe

Use tokenizer to split sentences

Files changed:
- Summarizer.py +35 -4
- app.py +2 -1
- requirements.txt +2 -1
- test_summarizer.py +26 -0
Summarizer.py
CHANGED
@@ -1,4 +1,4 @@
-
+import string
 
 from sumy.parsers import DocumentParser
 from sumy.parsers.html import HtmlParser
@@ -7,12 +7,14 @@ from sumy.nlp.tokenizers import Tokenizer
 from sumy.nlp.stemmers import Stemmer
 from sumy.summarizers.lsa import LsaSummarizer
 from sumy.utils import get_stop_words
-from transformers import Pipeline
+from transformers import Pipeline, BertTokenizer
 
 
 class Summarizer:
     DEFAULT_LANGUAGE = "english"
     DEFAULT_EXTRACTED_ARTICLE_SENTENCES_LENGTH = 10
+    TOKENIZER = BertTokenizer.from_pretrained('bert-base-cased')
+    STOP_WORDS = list(get_stop_words(language=DEFAULT_LANGUAGE)) + list(string.punctuation)
 
     def __init__(self, pipeline: Pipeline):
         self.pipeline = pipeline
@@ -27,6 +29,30 @@ class Summarizer:
             summarized_list.append(sentence._text)
         return summarized_list
 
+    @staticmethod
+    def join_sentences(summary_sentences: list) -> str:
+        return " ".join([sentence for sentence in summary_sentences])
+
+    @staticmethod
+    def split_sentences_by_token_length(summary_sentences: list, max_token_length: int) -> list:
+        accumulated_lists = []
+        result_list = []
+        cumulative_token_length = 0
+        for sentence in summary_sentences:
+            result_list.append(sentence)
+            token_list = Summarizer.TOKENIZER.tokenize(sentence)
+            token_words = [token for token in token_list if token.lower() not in Summarizer.STOP_WORDS]
+            token_length = len(token_words)
+            if token_length + cumulative_token_length >= max_token_length:
+                accumulated_lists.append(Summarizer.join_sentences(result_list))
+                result_list = []
+                cumulative_token_length = 0
+            else:
+                cumulative_token_length += token_length
+        if result_list:
+            accumulated_lists.append(Summarizer.join_sentences(result_list))
+        return accumulated_lists
+
     def __extractive_summary(self, parser: DocumentParser, sentences_count) -> list:
         summarized_sentences = self.lsa_summarizer(parser.document, sentences_count)
         summarized_list = Summarizer.sentence_list(summarized_sentences)
@@ -41,8 +67,13 @@ class Summarizer:
         return self.__extractive_summary(parser, sentences_count)
 
     def abstractive_summary(self, extract_summary_sentences: list) -> list:
-
-
+        """
+        :param extract_summary_sentences: Extractive summary of sentences after Latent semantic analysis
+        :return: List of abstractive summary of sentences after calling distilbart-tos-summarizer-tosdr tokenizer
+        """
+        wrapped_sentences = Summarizer.split_sentences_by_token_length(extract_summary_sentences,
+                                                                       max_token_length=1000)
+        # The ml6team/distilbart-tos-summarizer-tosdr tokenizer supports a max of 1024 tokens per input
         abstractive_summary_list = []
         for result in self.pipeline(wrapped_sentences, min_length=5, max_length=512):
            abstractive_summary_list.append(result['summary_text'])
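To make the new chunking concrete, here is a minimal usage sketch (not part of the commit, and assuming Summarizer.py is importable from the working directory; the expected output comes from the test added below). Note that the 1000-token budget used by abstractive_summary deliberately sits below the model's 1024-token input limit, since stop words and punctuation are excluded from the running count but are still fed to the model.

# Sketch only: demonstrates the greedy grouping performed by the new
# split_sentences_by_token_length. Each sentence is appended first, then
# its non-stop-word BERT token count is added; once the running total
# reaches max_token_length, the current group is flushed as one string.
from Summarizer import Summarizer

sentences = [
    'Python is a programming language.',
    'Memory allocation.',
    'Free.'
]

print(Summarizer.split_sentences_by_token_length(sentences, max_token_length=3))
# ['Python is a programming language.', 'Memory allocation. Free.']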
app.py
CHANGED
@@ -87,7 +87,8 @@ def main() -> None:
 
     sentences_length = st.number_input(
         label='Number of sentences to be extracted:',
-        min_value=
+        min_value=5,
+        max_value=15,
         value=st.session_state.sentences_length
     )
     sample_choice = st.selectbox(
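Pieced together, the bounded widget now reads as follows (a sketch of the resulting call; the rest of app.py, including the session-state initialisation, is assumed and not shown in this diff). Clamping the input to 5..15 also bounds how many extractive sentences are handed on to the summarization pipeline.

import streamlit as st

# Sketch: the number input after this change, clamped to 5..15.
# st.session_state.sentences_length is assumed to be initialised
# earlier in app.py (not visible in this hunk).
sentences_length = st.number_input(
    label='Number of sentences to be extracted:',
    min_value=5,
    max_value=15,
    value=st.session_state.sentences_length
)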
requirements.txt
CHANGED
@@ -5,4 +5,5 @@ torchvision==0.10.1
 transformers==4.10.3
 sumy==0.9.0
 nltk==3.6.7
-validators==0.18.2
+validators==0.18.2
+pytest==6.2.5
test_summarizer.py
ADDED
@@ -0,0 +1,26 @@
+from Summarizer import Summarizer
+
+
+def test_split_sentences_by_token_length():
+    summary_sentences = [
+        'Python is a programming language.',
+        'Memory allocation.',
+        'Free.'
+    ]
+
+    split_sentences = Summarizer.split_sentences_by_token_length(summary_sentences, max_token_length=3)
+    assert split_sentences == [
+        'Python is a programming language.',
+        'Memory allocation. Free.'
+    ]
+
+    split_sentences = Summarizer.split_sentences_by_token_length(summary_sentences, max_token_length=5)
+    assert split_sentences == [
+        'Python is a programming language. Memory allocation.',
+        'Free.'
+    ]
+
+    split_sentences = Summarizer.split_sentences_by_token_length(summary_sentences, max_token_length=10)
+    assert split_sentences == [
+        'Python is a programming language. Memory allocation. Free.'
+    ]
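Why max_token_length=3 splits the fixture that way (an illustrative walk-through, not part of the commit): bert-base-cased tokenizes the first sentence into roughly ['Python', 'is', 'a', 'programming', 'language', '.']; dropping stop words and punctuation leaves three tokens, which already meets the budget, so that sentence is flushed as a group of its own. A sketch of the count the test relies on:

# Sketch: reproduce the token count behind the first assertion.
# Summarizer.TOKENIZER is bert-base-cased; Summarizer.STOP_WORDS is
# sumy's English stop words plus string.punctuation.
from Summarizer import Summarizer

tokens = Summarizer.TOKENIZER.tokenize('Python is a programming language.')
words = [t for t in tokens if t.lower() not in Summarizer.STOP_WORDS]
print(len(words))  # expected: 3 -> meets max_token_length=3, group flushed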