Spaces:
Build error
Build error
File size: 5,728 Bytes
13cd0a3 9ab7b73 abcaca9 9c2785c abcaca9 9c2785c abcaca9 8d4dd5e abcaca9 ce42613 cd2a4c0 7c65c8c a29b26b 6b3f61e 7c65c8c 4372d93 6b3f61e c6ee980 ce42613 dcdc714 ce42613 6977cda ce42613 6977cda ce42613 c4259fc ce42613 13cd0a3 ce42613 13cd0a3 839b745 c4259fc f340342 ce42613 9ab7b73 ce42613 9ab7b73 ce42613 c4259fc 133c41f 99b1da3 133c41f 9ab7b73 133c41f dcdc714 133c41f dcdc714 133c41f dcdc714 8d4dd5e 6977cda 8d4dd5e ce42613 8d4dd5e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 |
import html
import os
from typing import AnyStr
import nltk
import streamlit as st
import validators
from transformers import pipeline
from validators import ValidationFailure
from Summarizer import Summarizer
# Entry point of the Streamlit page: called from the `__main__` guard at the
# bottom of the file.
# NOTE(review): leading indentation is missing in this copy of the file, so the
# extent of main()'s body cannot be read off the layout - presumably everything
# down to the `__main__` guard belongs to it; confirm against the original.
def main() -> None:
# Download the NLTK 'punkt' resource before any other work is done.
nltk.download('punkt')
# Page title.
st.markdown('# Terms & Conditions Summarizer :pencil:')
# Introductory copy. unsafe_allow_html=True is passed because these static
# literals contain raw <br>/<b> tags.
st.markdown('Do you also always take the time out of your day to thoroughly read every word of the Terms & Conditions before signing up to an app like the responsible citizen that you are? :thinking_face:<br>'
'No?<br>'
"Well don't worry, neither do we! That's why we created a <b>Terms & Conditions Summarization</b> algorithm!", unsafe_allow_html=True)
# Usage instructions: paste text or give a URL; two kinds of summary are shown.
st.markdown('Just copy-paste that pesky Terms & Conditions text or provide a URL to the text and let our fancy NLP algorithm do the rest!<br>'
'You will see both an extractive summary (the most important sentences will be highlighted) and an abstractive summary (an actual summary)<br>'
'The abstractive summary will give you an idea of what the key message of the document likely is :bulb:', unsafe_allow_html=True)
# Pointers to background material for both summarization parts.
st.markdown('<b>Want to find out more?</b> :brain:<br>'
'For details about the extractive part :point_right: https://en.wikipedia.org/wiki/Latent_semantic_analysis<br>'
'For details about the abstractive part :point_right: https://huggingface.co/ml6team/distilbart-tos-summarizer-tosdr', unsafe_allow_html=True)
@st.cache(allow_output_mutation=True, suppress_st_warning=True, show_spinner=False)
def create_pipeline():
    """Create the summarization pipeline used by the app.

    A spinner with a "please wait" message is displayed while the pipeline is
    being constructed. The function is wrapped in ``st.cache`` with Streamlit's
    own spinner disabled, so only the custom spinner below is shown.

    Returns:
        The object returned by ``transformers.pipeline`` for the
        ``summarization`` task, using the same checkpoint for both the model
        and the tokenizer.
    """
    checkpoint = 'ml6team/distilbart-tos-summarizer-tosdr'
    with st.spinner('Please wait for the model to load...'):
        return pipeline(task='summarization', model=checkpoint, tokenizer=checkpoint)
def display_abstractive_summary(summary_sentences: list) -> None:
    """Render the abstractive summary as a bulleted list.

    Args:
        summary_sentences: Summary sentences to show, one bullet per sentence.

    The sentences are rendered with ``unsafe_allow_html=True``, so each one is
    HTML-escaped first: any markup that ends up in a sentence is displayed as
    text instead of being interpreted by the browser. This mirrors the
    escaping done for the extractive summary.
    """
    st.subheader("Abstractive Summary")
    st.markdown('#####')
    for sentence in summary_sentences:
        st.markdown(f"- {html.escape(sentence)}", unsafe_allow_html=True)
def display_extractive_summary(terms_and_conditions_text: str, summary_sentences: list) -> None:
    """Render the input text with the extracted sentences highlighted.

    Args:
        terms_and_conditions_text: The text to display.
        summary_sentences: Sentences to wrap in ``<mark>`` wherever they occur
            verbatim in the text.

    Both the text and the sentences are HTML-escaped before matching, so the
    only markup in the rendered output is the ``<mark>`` and ``<br/>`` tags
    inserted here.
    """
    st.subheader("Extractive Summary")
    st.markdown('#####')

    replaced_text = html.escape(terms_and_conditions_text)
    already_marked = set()
    for sentence in summary_sentences:
        sentence = html.escape(sentence)
        # An empty needle makes str.replace insert the replacement between
        # every character, and a whitespace-only one would mark every gap;
        # a repeated sentence would wrap the existing highlight a second time.
        if not sentence.strip() or sentence in already_marked:
            continue
        already_marked.add(sentence)
        replaced_text = replaced_text.replace(sentence, f"<mark>{sentence}</mark>")

    # Line breaks are converted last so the sentence matching above still sees
    # the original newlines.
    replaced_text = replaced_text.replace('\n', '<br/>')
    with st.container():
        st.markdown(replaced_text, unsafe_allow_html=True)
def is_valid_url(url: str) -> bool:
    """Report whether ``url`` is accepted by ``validators.url``.

    Args:
        url: The candidate string.

    Returns:
        ``False`` when ``validators.url`` hands back a ``ValidationFailure``
        instance, ``True`` for any other result.
    """
    return not isinstance(validators.url(url), ValidationFailure)
def list_all_filenames() -> list:
    """List the available sample documents.

    Returns:
        The names of all ``.txt`` files in ``./sample-terms-and-conditions/``
        with that extension removed, in sorted order.

    Only the trailing ``.txt`` is stripped; a ``.txt`` occurring earlier in a
    file name is kept. The result is sorted because ``os.listdir`` returns
    entries in arbitrary order, which would otherwise make the order of the
    choices (and the default one) differ between environments.
    """
    suffix = '.txt'
    return sorted(
        entry[:-len(suffix)]
        for entry in os.listdir('./sample-terms-and-conditions/')
        if entry.endswith(suffix)
    )
def fetch_file_contents(filename: str) -> str:
    """Read one sample document from ``./sample-terms-and-conditions/``.

    Args:
        filename: Name of the sample without the ``.txt`` extension.

    Returns:
        The full text of the file.

    Raises:
        FileNotFoundError: If neither ``<filename>.txt`` nor its lower-cased
            variant exists.

    The name is tried exactly as given first, so a file whose name contains
    upper-case letters can be opened on a case-sensitive filesystem; the
    lower-cased name is kept as a fallback for callers that relied on it.
    The file is decoded as UTF-8 rather than with the platform default.
    """
    directory = './sample-terms-and-conditions/'
    path = f'{directory}{filename}.txt'
    if not os.path.isfile(path):
        path = f'{directory}{filename.lower()}.txt'
    with open(path, 'r', encoding='utf-8') as f:
        return f.read()
# Wrap the (cached) pipeline in the project's Summarizer.
summarizer: Summarizer = Summarizer(create_pipeline())

# Seed the session state; keys that already hold a value are left untouched.
for state_key, initial_value in (
    ('tc_text', ''),
    ('sentences_length', Summarizer.DEFAULT_EXTRACTED_ARTICLE_SENTENCES_LENGTH),
    ('sample_choice', ''),
):
    if state_key not in st.session_state:
        st.session_state[state_key] = initial_value

# --- Input widgets, rendered in this order ---
st.header("Input")

# How many sentences the extractive step should pick (bounded to 5..15).
sentences_length = st.number_input(
    label='Number of sentences to be extracted:',
    value=st.session_state['sentences_length'],
    min_value=5,
    max_value=15,
)

# Picking a sample loads that file's text into the session state, which in
# turn becomes the text area's initial content below.
sample_choice = st.selectbox(
    label='Choose a sample terms & conditions:',
    options=list_all_filenames(),
)
st.session_state['tc_text'] = fetch_file_contents(sample_choice)

tc_text_input = st.text_area(
    label='Terms & conditions content or specify an URL:',
    value=st.session_state['tc_text'],
    height=240,
)

summarize_button = st.button(label='Summarize')
@st.cache(
    suppress_st_warning=True,
    show_spinner=False,
    allow_output_mutation=True,
    # Each of these type names is given a hash function that always returns
    # None, so st.cache never hashes their contents.
    # NOTE(review): presumably these are reached through the `summarizer`
    # object this function references - confirm.
    hash_funcs={
        "torch.nn.parameter.Parameter": lambda _: None,
        "tokenizers.Tokenizer": lambda _: None,
        "tokenizers.AddedToken": lambda _: None,
    },
)
def abstractive_summary_from_cache(summary_sentences: tuple) -> tuple:
    """Produce the abstractive summary for a tuple of extracted sentences.

    Args:
        summary_sentences: The extracted sentences, as a tuple.

    Returns:
        The result of ``summarizer.abstractive_summary`` converted to a tuple.

    A spinner is shown while the summarizer runs. The summarizer itself is
    handed a list copy of the input.
    """
    with st.spinner('Summarizing the text is in progress...'):
        sentence_list = list(summary_sentences)
        abstract_sentences = summarizer.abstractive_summary(sentence_list)
        return tuple(abstract_sentences)
# Runs only on the script pass triggered by pressing "Summarize".
if summarize_button:
    # A valid URL is handed to the URL-based extractor; anything else is
    # treated as the document text itself.
    extract = (
        summarizer.extractive_summary_from_url
        if is_valid_url(tc_text_input)
        else summarizer.extractive_summary_from_text
    )
    extract_summary_sentences = extract(tc_text_input, sentences_length)

    # The cached helper takes and returns tuples; convert on the way in and
    # back to a list on the way out.
    abstract_summary = abstractive_summary_from_cache(tuple(extract_summary_sentences))

    display_abstractive_summary(list(abstract_summary))
    display_extractive_summary(tc_text_input, extract_summary_sentences)
# Call main() only when this file is run as a script, not when it is imported.
if __name__ == "__main__":
    main()
|