File size: 4,539 Bytes
abcaca9
9c2785c
abcaca9
 
 
9c2785c
abcaca9
8d4dd5e
abcaca9
ce42613
 
 
 
2ad6bb3
 
 
 
 
 
 
 
341d7db
 
ce42613
 
2ad6bb3
ce42613
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8d4dd5e
ce42613
9c2785c
ce42613
abcaca9
ce42613
 
 
 
 
 
9c2785c
ce42613
 
8d4dd5e
ce42613
 
8d4dd5e
 
ce42613
 
8d4dd5e
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
import nltk
import streamlit as st
import validators
from transformers import pipeline
from validators import ValidationFailure

from Summarizer import Summarizer


def main() -> None:
    """Render the Terms & Conditions summarization demo page.

    The page shows an introduction, a form with a sentence-count input and a
    text area, and - once the form is submitted - an extractive summary (the
    selected sentences highlighted inside the full text) followed by an
    abstractive summary generated from those sentences.

    The text area accepts either raw text or a URL; input that validates as a
    URL is handed to ``Summarizer.extractive_summary_from_url``, anything else
    to ``Summarizer.extractive_summary_from_text``.
    """
    # Local import: only needed to escape text before it is rendered as HTML.
    from html import escape

    nltk.download('punkt')

    st.markdown('# Terms & conditions summarization :pencil:')
    st.markdown("""
        Do you also take the time out of your day to thoroughly read every word of the Terms & Conditions before signing up for a new app? :thinking_face:
        No?
        Well have we got a demo for you!
        Just copy-paste the lengthy Terms & Conditions text or provide a URL to the text and let our fancy NLP algorithm do the rest!
        You will see both an extractive summary (the most important sentences will be highlighted) and an abstractive summary (an actual summary)
        """, unsafe_allow_html=True)
    st.markdown('Want to find out more?<br>'
             'For information about the extractive summarization :point_right: https://en.wikipedia.org/wiki/Latent_semantic_analysis<br>'
             'For information about the abstractive summarization :point_right: https://huggingface.co/ml6team/distilbart-tos-summarizer-tosdr', unsafe_allow_html=True)

    st.markdown("""
    How to use summarizer:
    - Specify an URL to extract contents OR copy terms & conditions content and hit 'Summarize'
    """)

    # `st.cache` is deprecated in recent Streamlit releases in favour of
    # `st.cache_resource` for unserialisable objects such as models. Prefer the
    # new API when it exists and fall back to the original call otherwise, so
    # the app keeps working on whichever Streamlit version is installed.
    if hasattr(st, 'cache_resource'):
        cache_model = st.cache_resource(show_spinner=False)
    else:
        cache_model = st.cache(allow_output_mutation=True,
                               suppress_st_warning=True,
                               show_spinner=False)

    @cache_model
    def create_pipeline():
        """Build the summarization pipeline, showing a spinner while it loads."""
        with st.spinner('Please wait for the model to load...'):
            terms_and_conditions_pipeline = pipeline(
                task='summarization',
                model='ml6team/distilbart-tos-summarizer-tosdr',
                tokenizer='ml6team/distilbart-tos-summarizer-tosdr'
            )
        return terms_and_conditions_pipeline

    def display_abstractive_summary(summary) -> None:
        """Show the abstractive summary under its own subheader."""
        st.subheader("Abstractive Summary")
        st.markdown('#####')
        st.markdown(summary)

    def display_extractive_summary(terms_and_conditions_sentences: list, summary_sentences: list) -> None:
        """Show the full text with every summary sentence highlighted.

        Both arguments are expected to be lists of strings (they are joined
        and used with ``str.replace``).
        """
        st.subheader("Extractive Summary")
        st.markdown('#####')
        # The text may come from an arbitrary web page and is rendered with
        # unsafe_allow_html=True, so escape it first; only the <span> markup
        # added below is meant to be interpreted as HTML.
        highlighted_text = escape(" ".join(terms_and_conditions_sentences))
        for sentence in summary_sentences:
            if not sentence:
                # str.replace('') would insert a span between every character.
                continue
            # Escape the sentence the same way so it still matches the text.
            escaped_sentence = escape(sentence)
            highlighted_text = highlighted_text.replace(
                escaped_sentence,
                f"<span style='background-color: #FFFF00'>{escaped_sentence}</span>")
        st.write(highlighted_text, unsafe_allow_html=True)

    def is_valid_url(url: str) -> bool:
        """Return True only when ``validators.url`` accepts ``url``."""
        # validators.url returns True on success and a (falsy) failure object
        # otherwise; comparing with True avoids depending on that object's
        # class name.
        return validators.url(url) is True

    summarizer: Summarizer = Summarizer(create_pipeline())

    if 'tc_text' not in st.session_state:
        st.session_state['tc_text'] = ''

    if 'sentences_length' not in st.session_state:
        st.session_state['sentences_length'] = Summarizer.DEFAULT_EXTRACTED_ARTICLE_SENTENCES_LENGTH

    st.write('<style>div.row-widget.stRadio > div{flex-direction:row;}</style>', unsafe_allow_html=True)
    st.header("Input")

    with st.form(key='terms-and-conditions'):
        sentences_length_input = st.number_input(
            label='Number of sentences to be extracted:',
            min_value=1,
            value=st.session_state.sentences_length
        )
        tc_text_input = st.text_area(
            value=st.session_state.tc_text,
            label='Terms & conditions content or specify an URL:',
            height=240
        )

        submit_button = st.form_submit_button(label='Summarize')

    if not submit_button:
        return

    # Surrounding whitespace (e.g. a trailing newline after a pasted URL)
    # would make URL validation fail and the URL be summarized as plain text.
    tc_input = tc_text_input.strip()
    if not tc_input:
        st.warning('Please provide the terms & conditions text or a URL before summarizing.')
        return

    if is_valid_url(tc_input):
        all_sentences, extract_summary_sentences = summarizer.extractive_summary_from_url(
            tc_input, sentences_length_input)
    else:
        all_sentences, extract_summary_sentences = summarizer.extractive_summary_from_text(
            tc_input, sentences_length_input)

    if not extract_summary_sentences:
        # Nothing was extracted, so there is nothing to feed the abstractive model.
        st.warning('No sentences could be extracted from the provided input.')
        return

    extract_summary = " ".join(extract_summary_sentences)
    abstract_summary = summarizer.abstractive_summary(extract_summary)

    display_extractive_summary(all_sentences, extract_summary_sentences)
    display_abstractive_summary(abstract_summary)


# Run the app only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()