File size: 4,726 Bytes
abcaca9
9c2785c
abcaca9
 
 
9c2785c
abcaca9
8d4dd5e
abcaca9
ce42613
 
 
 
7c65c8c
a29b26b
6b3f61e
7c65c8c
 
 
6b3f61e
c6ee980
 
ce42613
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8d4dd5e
ce42613
9c2785c
ce42613
abcaca9
ce42613
 
 
 
 
 
9c2785c
ce42613
 
8d4dd5e
ce42613
 
8d4dd5e
 
ce42613
 
8d4dd5e
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
import nltk
import streamlit as st
import validators
from transformers import pipeline
from validators import ValidationFailure

from Summarizer import Summarizer


def main() -> None:
    """Run the Streamlit app that summarizes Terms & Conditions documents.

    Renders the introduction, builds the cached summarization pipeline and
    shows an input form. Once the form is submitted with non-blank content,
    the input is treated either as a URL or as raw text, and both an
    extractive summary (highlighted sentences in the full text) and an
    abstractive summary (generated from the extracted sentences) are shown.
    """
    # Local import: only needed to escape text before it is rendered as HTML.
    import html

    nltk.download('punkt')

    st.markdown('# Terms & conditions summarization :pencil:')
    st.markdown('Do you also always take the time out of your day to thoroughly read every word of the Terms & Conditions before signing up to an app like the responsible citizen that you are?  :thinking_face:<br>'
                'No?<br>'
                "Well don't worry, neither do we! That's why we created a <b>Terms & Conditions Summarization</b> algorithm!", unsafe_allow_html=True)
    st.markdown('Just copy-paste that pesky Terms & Conditions text or provide a URL to the text and let our fancy NLP algorithm do the rest!<br>'
                'You will see both an extractive summary (the most important sentences will be highlighted) and an abstractive summary (an actual summary)<br>'
                'Now you can just take a quick glance at the summary and go about the rest of your day assured that no one is abusing your precious personal data  :books:', unsafe_allow_html=True)
    st.markdown('<b>Want to find out more?</b> :brain:<br>'
                'For details about the extractive part :point_right: https://en.wikipedia.org/wiki/Latent_semantic_analysis<br>'
                'For details about the abstractive part :point_right: https://huggingface.co/ml6team/distilbart-tos-summarizer-tosdr', unsafe_allow_html=True)

    @st.cache(allow_output_mutation=True,
              suppress_st_warning=True,
              show_spinner=False)
    def create_pipeline():
        """Build the Hugging Face summarization pipeline, showing a spinner meanwhile.

        Wrapped in ``st.cache`` so the pipeline object is reused instead of
        being rebuilt on every call.
        """
        with st.spinner('Please wait for the model to load...'):
            terms_and_conditions_pipeline = pipeline(
                task='summarization',
                model='ml6team/distilbart-tos-summarizer-tosdr',
                tokenizer='ml6team/distilbart-tos-summarizer-tosdr'
            )
        return terms_and_conditions_pipeline

    def display_abstractive_summary(summary) -> None:
        """Render the abstractive summary under its own subheader."""
        st.subheader("Abstractive Summary")
        # Empty heading; presumably used as vertical spacing.
        st.markdown('#####')
        st.markdown(summary)

    def display_extractive_summary(terms_and_conditions_sentences: list, summary_sentences: list) -> None:
        """Render the full text with the extracted summary sentences highlighted.

        Args:
            terms_and_conditions_sentences: All sentences of the document, as strings.
            summary_sentences: The sentences selected for the extractive summary.
        """
        st.subheader("Extractive Summary")
        # Empty heading; presumably used as vertical spacing.
        st.markdown('#####')
        # The text is user-supplied (or fetched from an arbitrary URL) and is
        # rendered with unsafe_allow_html=True, so escape it first; only the
        # highlight <span> tags added below should be interpreted as HTML.
        highlighted_text = html.escape(" ".join(terms_and_conditions_sentences), quote=False)
        for sentence in summary_sentences:
            if not sentence:
                # str.replace with an empty needle would insert the span between every character.
                continue
            # Escape the sentence the same way so it still matches the escaped text.
            escaped_sentence = html.escape(sentence, quote=False)
            highlighted_text = highlighted_text.replace(
                escaped_sentence,
                f"<span style='background-color: #FFFF00'>{escaped_sentence}</span>")
        st.write(highlighted_text, unsafe_allow_html=True)

    def is_valid_url(url: str) -> bool:
        """Return True when ``validators.url`` does not report a validation failure."""
        return not isinstance(validators.url(url), ValidationFailure)

    summarizer: Summarizer = Summarizer(create_pipeline())

    # Seed the session state with the defaults used to pre-fill the form widgets.
    if 'tc_text' not in st.session_state:
        st.session_state['tc_text'] = ''

    if 'sentences_length' not in st.session_state:
        st.session_state['sentences_length'] = Summarizer.DEFAULT_EXTRACTED_ARTICLE_SENTENCES_LENGTH

    # Injected CSS: lays out radio widgets horizontally.
    st.write('<style>div.row-widget.stRadio > div{flex-direction:row;}</style>', unsafe_allow_html=True)
    st.header("Input")

    with st.form(key='terms-and-conditions'):
        sentences_length_input = st.number_input(
            label='Number of sentences to be extracted:',
            min_value=1,
            value=st.session_state.sentences_length
        )
        tc_text_input = st.text_area(
            value=st.session_state.tc_text,
            label='Terms & conditions content or specify an URL:',
            height=240
        )

        submit_button = st.form_submit_button(label='Summarize')

    if not submit_button:
        return

    # Strip surrounding whitespace so a pasted URL with a trailing newline is
    # still recognised as a URL, and so blank input can be detected.
    tc_content = tc_text_input.strip()
    if not tc_content:
        st.warning('Please provide the Terms & conditions text or a URL before summarizing.')
        return

    if is_valid_url(tc_content):
        all_sentences, extract_summary_sentences = summarizer.extractive_summary_from_url(
            tc_content, sentences_length_input)
    else:
        all_sentences, extract_summary_sentences = summarizer.extractive_summary_from_text(
            tc_content, sentences_length_input)

    # The abstractive summary is generated from the extracted sentences only.
    extract_summary = " ".join(extract_summary_sentences)
    abstract_summary = summarizer.abstractive_summary(extract_summary)

    display_extractive_summary(all_sentences, extract_summary_sentences)
    display_abstractive_summary(abstract_summary)


# Script entry point: run the app only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()