import html

import nltk
import streamlit as st
import validators
from transformers import pipeline
from validators import ValidationFailure

from Summarizer import Summarizer


def main() -> None:
    """Render the Terms & Conditions summarization Streamlit demo.

    The page shows an introduction, an input form (number of sentences to
    extract plus a text area that accepts either raw text or a URL) and, once
    the form is submitted, an extractive summary followed by an abstractive
    summary produced from the extracted sentences.
    """
    # Fetches the 'punkt' resource via nltk on every script run.
    nltk.download('punkt')

    st.markdown('# Terms & conditions summarization :pencil:')
    st.write(
        'Do you also take the time out of your day to thoroughly read every word of the Terms & Conditions before signing up for a new app? :thinking_face: \nNo? '
        'Well have we got a demo for you! '
        'Just copy-paste the lengthy Terms & Conditions text or provide a URL to the text and let our fancy NLP algorithm do the rest! '
        'You will see both an extractive summary (the most important sentences will be highlighted) and an abstractive summary (an actual summary) '
        'The abstractive summarization is preceded by LSA (Latent Semantic Analysis) extractive summarization',
        unsafe_allow_html=True)
    st.write(
        'Want to find out more?\n'
        'For information about the extractive summarization :point_right: https://en.wikipedia.org/wiki/Latent_semantic_analysis\n'
        'For information about the abstractive summarization :point_right: https://huggingface.co/ml6team/distilbart-tos-summarizer-tosdr',
        unsafe_allow_html=True)

    st.markdown("""
    To use this:
    - Number of sentences to be extracted is configurable
    - Specify an URL to extract contents OR copy terms & conditions content and hit 'Summarize'
    """)

    @st.cache(allow_output_mutation=True, suppress_st_warning=True, show_spinner=False)
    def create_pipeline():
        """Build the Hugging Face summarization pipeline.

        Wrapped in ``st.cache`` and shows a spinner while the pipeline is
        being constructed.

        Returns:
            The ``transformers`` summarization pipeline for the
            ``ml6team/distilbart-tos-summarizer-tosdr`` model and tokenizer.
        """
        with st.spinner('Please wait for the model to load...'):
            terms_and_conditions_pipeline = pipeline(
                task='summarization',
                model='ml6team/distilbart-tos-summarizer-tosdr',
                tokenizer='ml6team/distilbart-tos-summarizer-tosdr'
            )
        return terms_and_conditions_pipeline

    def display_abstractive_summary(summary) -> None:
        """Render the abstractive summary under its own subheader.

        Args:
            summary: The summary to display; passed straight to
                ``st.markdown``.
        """
        st.subheader("Abstractive Summary")
        st.markdown('#####')
        st.markdown(summary)

    def display_extractive_summary(terms_and_conditions_sentences: list, summary_sentences: list) -> None:
        """Render the full text with the extracted sentences highlighted.

        Args:
            terms_and_conditions_sentences: All sentences of the document, as
                strings; they are joined with single spaces for display.
            summary_sentences: The sentences selected for the extractive
                summary, as strings; every occurrence of each one in the
                joined text is wrapped in ``<mark>``.
        """
        st.subheader("Extractive Summary")
        st.markdown('#####')

        # The text is rendered with unsafe_allow_html=True and may come from a
        # pasted document or a fetched URL, so escape it before adding markup.
        highlighted_text = html.escape(" ".join(terms_and_conditions_sentences), quote=False)
        for sentence in summary_sentences:
            if not sentence:
                # str.replace('') would insert markup between every character.
                continue
            # Escape the sentence the same way as the text so it still matches.
            escaped_sentence = html.escape(sentence, quote=False)
            highlighted_text = highlighted_text.replace(
                escaped_sentence, f"<mark>{escaped_sentence}</mark>")

        st.write(highlighted_text, unsafe_allow_html=True)

    def is_valid_url(url: str) -> bool:
        """Return True when ``validators.url`` accepts *url*.

        Args:
            url: The candidate string to validate.

        Returns:
            False when the validator returns a ``ValidationFailure`` instance,
            True otherwise.
        """
        return not isinstance(validators.url(url), ValidationFailure)

    summarizer: Summarizer = Summarizer(create_pipeline())

    # Seed the session state with the defaults used as initial widget values.
    if 'tc_text' not in st.session_state:
        st.session_state['tc_text'] = ''

    if 'sentences_length' not in st.session_state:
        st.session_state['sentences_length'] = Summarizer.DEFAULT_EXTRACTED_ARTICLE_SENTENCES_LENGTH

    # NOTE(review): this writes an empty string; it looks like a placeholder
    # for custom HTML/CSS -- confirm whether content is missing here.
    st.write('', unsafe_allow_html=True)
    st.header("Input")

    with st.form(key='terms-and-conditions'):
        sentences_length_input = st.number_input(
            label='Number of sentences to be extracted:',
            min_value=1,
            value=st.session_state.sentences_length
        )
        tc_text_input = st.text_area(
            value=st.session_state.tc_text,
            label='Terms & conditions content \nor specify an URL:',
            height=240
        )
        submit_button = st.form_submit_button(label='Summarize')

    if submit_button:
        # Strip surrounding whitespace so a pasted URL followed by a newline
        # is still recognised as a URL, and so blank input can be rejected.
        user_input = tc_text_input.strip()
        if not user_input:
            st.warning('Please provide the Terms & Conditions text or a URL before summarizing.')
            return

        # The summarizer result is unpacked as (all sentences, summary sentences).
        if is_valid_url(user_input):
            all_sentences, extract_summary_sentences = summarizer.extractive_summary_from_url(
                user_input, sentences_length_input)
        else:
            all_sentences, extract_summary_sentences = summarizer.extractive_summary_from_text(
                user_input, sentences_length_input)

        # The abstractive summary is generated from the extracted sentences
        # only, not from the full document.
        extract_summary = " ".join(extract_summary_sentences)
        abstract_summary = summarizer.abstractive_summary(extract_summary)

        display_extractive_summary(all_sentences, extract_summary_sentences)
        display_abstractive_summary(abstract_summary)


if __name__ == "__main__":
    main()