File size: 5,728 Bytes
13cd0a3
9ab7b73
 
 
abcaca9
9c2785c
abcaca9
 
 
9c2785c
abcaca9
8d4dd5e
abcaca9
ce42613
 
 
cd2a4c0
7c65c8c
a29b26b
6b3f61e
7c65c8c
 
4372d93
6b3f61e
c6ee980
 
ce42613
 
 
dcdc714
ce42613
 
 
 
 
 
 
 
 
6977cda
ce42613
 
6977cda
 
ce42613
c4259fc
ce42613
 
13cd0a3
ce42613
13cd0a3
839b745
c4259fc
 
f340342
ce42613
 
 
 
 
 
 
9ab7b73
 
 
 
 
 
 
 
 
 
 
 
ce42613
 
 
 
 
 
 
 
9ab7b73
 
 
ce42613
 
c4259fc
133c41f
99b1da3
 
133c41f
 
9ab7b73
 
 
 
133c41f
 
 
 
 
 
 
 
dcdc714
 
 
 
 
 
 
 
 
 
 
133c41f
 
dcdc714
 
 
 
133c41f
dcdc714
 
 
8d4dd5e
6977cda
 
8d4dd5e
 
ce42613
 
8d4dd5e
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
import html
import os
from typing import AnyStr

import nltk
import streamlit as st
import validators
from transformers import pipeline
from validators import ValidationFailure

from Summarizer import Summarizer


def main() -> None:
    """Run the Streamlit Terms & Conditions summarizer app.

    Renders the intro copy, loads the summarization pipeline (cached),
    lets the user pick a sample T&C file or paste text/a URL, and shows
    both an extractive and an abstractive summary on demand.
    """
    # Sentence tokenizer models needed by the extractive summarizer.
    nltk.download('punkt')

    st.markdown('# Terms & Conditions Summarizer :pencil:')
    st.markdown('Do you also always take the time out of your day to thoroughly read every word of the Terms & Conditions before signing up to an app like the responsible citizen that you are?  :thinking_face:<br>'
                'No?<br>'
                "Well don't worry, neither do we! That's why we created a <b>Terms & Conditions Summarization</b> algorithm!", unsafe_allow_html=True)
    st.markdown('Just copy-paste that pesky Terms & Conditions text or provide a URL to the text and let our fancy NLP algorithm do the rest!<br>'
                'You will see both an extractive summary (the most important sentences will be highlighted) and an abstractive summary (an actual summary)<br>'
                'The abstractive summary will give you an idea of what the key message of the document likely is :bulb:', unsafe_allow_html=True)
    st.markdown('<b>Want to find out more?</b> :brain:<br>'
             'For details about the extractive part :point_right: https://en.wikipedia.org/wiki/Latent_semantic_analysis<br>'
             'For details about the abstractive part :point_right: https://huggingface.co/ml6team/distilbart-tos-summarizer-tosdr', unsafe_allow_html=True)

    @st.cache(allow_output_mutation=True,
              suppress_st_warning=True,
              show_spinner=False)
    def create_pipeline():
        """Build (once, via st.cache) the HF summarization pipeline."""
        with st.spinner('Please wait for the model to load...'):
            terms_and_conditions_pipeline = pipeline(
                task='summarization',
                model='ml6team/distilbart-tos-summarizer-tosdr',
                tokenizer='ml6team/distilbart-tos-summarizer-tosdr'
            )
        return terms_and_conditions_pipeline

    def display_abstractive_summary(summary_sentences: list) -> None:
        """Render the abstractive summary as a bullet list."""
        st.subheader("Abstractive Summary")
        st.markdown('#####')
        for sentence in summary_sentences:
            # Escape model-derived text: it originates from user input and is
            # rendered with unsafe_allow_html=True (the extractive path already
            # escapes; this keeps both paths injection-safe and consistent).
            st.markdown(f"- {html.escape(sentence)}", unsafe_allow_html=True)

    def display_extractive_summary(terms_and_conditions_text: str, summary_sentences: list) -> None:
        """Render the full input text with the extracted sentences <mark>ed."""
        st.subheader("Extractive Summary")
        st.markdown('#####')
        replaced_text = html.escape(terms_and_conditions_text)
        for sentence in summary_sentences:
            # Escape the sentence too so it matches the escaped full text.
            sentence = html.escape(sentence)
            replaced_text = replaced_text.replace(sentence, f"<mark>{sentence}</mark>")
        replaced_text = replaced_text.replace('\n', '<br/>')
        with st.container():
            st.markdown(replaced_text, unsafe_allow_html=True)

    def is_valid_url(url: str) -> bool:
        """Return True if *url* is a syntactically valid URL."""
        # validators.url returns a ValidationFailure instance (truthy-looking
        # object) on failure rather than raising, hence the isinstance check.
        return not isinstance(validators.url(url), ValidationFailure)

    def list_all_filenames() -> list:
        """Return the sample T&C basenames (without '.txt'), sorted.

        Sorting makes the selectbox order deterministic; os.listdir order
        is platform-dependent.
        """
        suffix = '.txt'
        return sorted(
            file[:-len(suffix)]  # strip only the suffix, not interior '.txt'
            for file in os.listdir('./sample-terms-and-conditions/')
            if file.endswith(suffix)
        )

    def fetch_file_contents(filename: str) -> AnyStr:
        """Read and return the contents of the given sample file."""
        # Explicit encoding: sample files are UTF-8; without it the default
        # is locale-dependent and can fail on non-ASCII characters.
        # NOTE(review): .lower() assumes all sample files have lowercase
        # names — verify against the directory contents.
        with open(f'./sample-terms-and-conditions/{filename.lower()}.txt', 'r', encoding='utf-8') as f:
            data = f.read()
        return data

    summarizer: Summarizer = Summarizer(create_pipeline())

    # Initialize session state on first run.
    if 'tc_text' not in st.session_state:
        st.session_state['tc_text'] = ''

    if 'sentences_length' not in st.session_state:
        st.session_state['sentences_length'] = Summarizer.DEFAULT_EXTRACTED_ARTICLE_SENTENCES_LENGTH

    if 'sample_choice' not in st.session_state:
        st.session_state['sample_choice'] = ''

    st.header("Input")

    sentences_length = st.number_input(
        label='Number of sentences to be extracted:',
        min_value=5,
        max_value=15,
        value=st.session_state.sentences_length
    )
    sample_choice = st.selectbox(
        'Choose a sample terms & conditions:',
        list_all_filenames())
    # Guard: selectbox yields None when no sample files exist; only load a
    # sample when one was actually selected.
    if sample_choice:
        st.session_state.tc_text = fetch_file_contents(sample_choice)
    tc_text_input = st.text_area(
        value=st.session_state.tc_text,
        label='Terms & conditions content or specify an URL:',
        height=240
    )

    summarize_button = st.button(label='Summarize')

    # Cache on the tuple of extracted sentences; torch/tokenizer objects are
    # excluded from hashing (they are unhashable / expensive to hash).
    @st.cache(suppress_st_warning=True,
              show_spinner=False,
              allow_output_mutation=True,
              hash_funcs={"torch.nn.parameter.Parameter": lambda _: None,
                          "tokenizers.Tokenizer": lambda _: None,
                          "tokenizers.AddedToken": lambda _: None,
                          })
    def abstractive_summary_from_cache(summary_sentences: tuple) -> tuple:
        """Return the abstractive summary, memoized by input sentences."""
        with st.spinner('Summarizing the text is in progress...'):
            return tuple(summarizer.abstractive_summary(list(summary_sentences)))

    if summarize_button:

        # Treat the input as a URL if it parses as one, else as raw text.
        if is_valid_url(tc_text_input):
            extract_summary_sentences = summarizer.extractive_summary_from_url(tc_text_input, sentences_length)
        else:
            extract_summary_sentences = summarizer.extractive_summary_from_text(tc_text_input, sentences_length)

        # Tuples are hashable, which st.cache needs for the memo key.
        extract_summary_sentences_tuple = tuple(extract_summary_sentences)
        abstract_summary_tuple = abstractive_summary_from_cache(extract_summary_sentences_tuple)
        abstract_summary_list = list(abstract_summary_tuple)

        display_abstractive_summary(abstract_summary_list)
        display_extractive_summary(tc_text_input, extract_summary_sentences)


# Entry point when executed directly (e.g. via `streamlit run`).
if __name__ == "__main__":
    main()