from textwrap import wrap

from transformers import pipeline
import streamlit as st
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.nlp.stemmers import Stemmer
from sumy.summarizers.lsa import LsaSummarizer
from sumy.utils import get_stop_words
import nltk

# The sumy tokenizer relies on NLTK's sentence tokenizer models.
nltk.download('punkt')

DEFAULT_LANGUAGE = "english"
DEFAULT_EXTRACTED_ARTICLE_SENTENCES_LENGTH = 10

# Set up the LSA extractive summarizer once at module load.
stemmer = Stemmer(DEFAULT_LANGUAGE)
lsa_summarizer = LsaSummarizer(stemmer)
lsa_summarizer.stop_words = get_stop_words(language=DEFAULT_LANGUAGE)

st.markdown('# Terms & conditions abstractive summarization model :pencil:')
st.write('This app provides an abstractive summary of the provided terms & conditions. '
         'The abstractive summarization is preceded by an LSA (Latent Semantic Analysis) extractive summarization step.')
st.write('Information about the model :point_right: https://huggingface.co/ml6team/distilbart-tos-summarizer-tosdr')
st.markdown("""
To use this:
- Number of sentences to be extracted is configurable
- Copy terms & conditions and hit 'Summarize'
""")


@st.cache(allow_output_mutation=True, suppress_st_warning=True, show_spinner=False)
def load_model():
    """Load the summarization pipeline once and cache it across reruns."""
    with st.spinner('Please wait for the model to load...'):
        terms_and_conditions_pipeline = pipeline(
            task='summarization',
            model='ml6team/distilbart-tos-summarizer-tosdr',
            tokenizer='ml6team/distilbart-tos-summarizer-tosdr'
        )
    return terms_and_conditions_pipeline


tc_pipeline = load_model()

# Initialize session state so the form keeps its values between reruns.
if 'tc_text' not in st.session_state:
    st.session_state['tc_text'] = ""
if 'sentences_length' not in st.session_state:
    st.session_state['sentences_length'] = DEFAULT_EXTRACTED_ARTICLE_SENTENCES_LENGTH

st.header("Input")
with st.form(key='terms-and-conditions'):
    sentences_length_input = st.number_input(
        label='Number of sentences to be extracted:',
        min_value=1,
        value=st.session_state.sentences_length
    )
    tc_text_input = st.text_area(
        value=st.session_state.tc_text,
        label='Terms & conditions text:',
        height=240
    )
    submit_button = st.form_submit_button(label='Summarize')

st.header("Output")


def generate_abstractive_summary(summary) -> str:
    """Run the abstractive pipeline over the text, split into chunks of at most 2048 characters."""
    summary_text = " ".join([result['summary_text'] for result in tc_pipeline(wrap(summary, 2048))])
    return summary_text


def generate_extractive_summary(text, sentences_count: int) -> str:
    """Extract the `sentences_count` most relevant sentences with the LSA summarizer."""
    parser = PlaintextParser.from_string(text, Tokenizer(DEFAULT_LANGUAGE))
    summarized_sentences = lsa_summarizer(parser.document, sentences_count)
    summarized_text = " ".join([sentence._text for sentence in summarized_sentences])
    return summarized_text


def display_abstractive_summary(summary) -> None:
    st.subheader("Abstractive Summary")
    st.markdown('#####')
    st.text_area(
        value=summary,
        label='',
        height=240
    )


def display_extractive_summary(summary) -> None:
    st.subheader("Extractive Summary")
    st.markdown('#####')
    st.text_area(
        value=summary,
        label='',
        height=240
    )


if submit_button:
    # First shorten the input extractively, then feed the extract to the abstractive model.
    tc_text = tc_text_input
    sentences_length = sentences_length_input
    extract_summary = generate_extractive_summary(tc_text, sentences_length)
    abstract_summary = generate_abstractive_summary(extract_summary)
    display_extractive_summary(extract_summary)
    display_abstractive_summary(abstract_summary)