Spaces:

ml6team
/

distilbart-tos-summarizer-tosdr

Build error

File size: 3,487 Bytes

90f2ef6
9c2785c
 
 
8d4dd5e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
90f2ef6
8d4dd5e
 
90f2ef6
9c2785c
 
 
8d4dd5e
 
 
9c2785c
 
 
 
 
 
 
 
90f2ef6
9c2785c
 
 
 
 
 
 
 
8d4dd5e
 
 
 
 
9c2785c
90f2ef6
8d4dd5e
 
 
 
 
 
 
 
 
 
 
 
9c2785c
90f2ef6
9c2785c
8d4dd5e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
90f2ef6
 
8d4dd5e
 
90f2ef6
 
9c2785c
8d4dd5e

from textwrap import wrap
from transformers import pipeline
import streamlit as st

from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.nlp.stemmers import Stemmer
from sumy.summarizers.lsa import LsaSummarizer
from sumy.utils import get_stop_words

import nltk
# Fetch the NLTK 'punkt' resource every time the script starts.
# NOTE(review): presumably required by sumy's Tokenizer for sentence
# splitting — confirm against the sumy/NLTK versions in use.
nltk.download('punkt')

# Language shared by the stemmer, the stop-word list and the tokenizer.
DEFAULT_LANGUAGE = "english"
# Initial value for the "number of sentences to be extracted" input.
DEFAULT_EXTRACTED_ARTICLE_SENTENCES_LENGTH = 10
# Module-level LSA summarizer, built once and reused by
# generate_extractive_summary.
stemmer = Stemmer(DEFAULT_LANGUAGE)
lsa_summarizer = LsaSummarizer(stemmer)
lsa_summarizer.stop_words = get_stop_words(language=DEFAULT_LANGUAGE)

# Page header: title, a short description of the two-stage summarization
# (extractive LSA first, abstractive model second) and a link to the model.
st.markdown('# Terms & conditions abstractive summarization model :pencil:')
st.write(
    'This app provides the abstract summary of the provided terms & conditions. '
    'The abstractive summarization is preceded by LSA (Latent Semantic Analysis) '
    'extractive summarization'
)
st.write(
    'Information about the model :point_right: '
    'https://huggingface.co/ml6team/distilbart-tos-summarizer-tosdr'
)

# Short usage instructions rendered as a markdown bullet list.
st.markdown(
    "\n"
    "To use this:\n"
    "- Number of sentences to be extracted is configurable\n"
    "- Copy terms & conditions and hit 'Summarize'\n"
)


@st.cache(show_spinner=False,
          suppress_st_warning=True,
          allow_output_mutation=True)
def load_model():
    """Build the summarization pipeline used for the abstractive step.

    The result is wrapped in ``st.cache`` so the pipeline object is reused
    across script reruns instead of being rebuilt each time. Streamlit's
    built-in cache spinner is disabled in favour of the custom message shown
    below while the model loads.

    Returns:
        The ``transformers`` summarization pipeline for the
        ``ml6team/distilbart-tos-summarizer-tosdr`` checkpoint.
    """
    # The same checkpoint provides both the model weights and the tokenizer.
    checkpoint = 'ml6team/distilbart-tos-summarizer-tosdr'
    with st.spinner('Please wait for the model to load...'):
        return pipeline(
            task='summarization',
            model=checkpoint,
            tokenizer=checkpoint,
        )


# Obtain the (cached) summarization pipeline.
tc_pipeline = load_model()

# Seed the session state on the first run so the form below always has
# initial values to read.
for _state_key, _initial_value in (
    ('tc_text', ""),
    ('sentences_length', DEFAULT_EXTRACTED_ARTICLE_SENTENCES_LENGTH),
):
    if _state_key not in st.session_state:
        st.session_state[_state_key] = _initial_value

st.header("Input")
# Both inputs live in one form, so they are only submitted together when the
# 'Summarize' button is pressed.
with st.form(key='terms-and-conditions'):
    sentences_length_input = st.number_input(
        label='Number of sentences to be extracted:',
        value=st.session_state['sentences_length'],
        min_value=1,
    )
    tc_text_input = st.text_area(
        label='Terms & conditions text:',
        value=st.session_state['tc_text'],
        height=240,
    )
    submit_button = st.form_submit_button(label='Summarize')

st.header("Output")


def generate_abstractive_summary(summary, chunk_width: int = 2048) -> str:
    """Condense *summary* with the abstractive model and return one string.

    The text is split with ``textwrap.wrap`` into pieces of at most
    ``chunk_width`` characters, the pieces are passed to ``tc_pipeline`` as a
    single batch, and the ``summary_text`` of every result is joined with a
    single space.

    Args:
        summary: Text to summarize (in this app, the extractive summary).
        chunk_width: Maximum number of characters per piece handed to the
            model. Defaults to 2048, the value previously hard-coded here.

    Returns:
        The space-joined summaries of all pieces, or an empty string when
        there is nothing to summarize.
    """
    chunks = wrap(summary, chunk_width)
    if not chunks:
        # wrap() produces no pieces for empty or whitespace-only text; return
        # early rather than invoking the model with an empty batch.
        return ""
    return " ".join(result['summary_text'] for result in tc_pipeline(chunks))


def generate_extractive_summary(text, sentences_count: int) -> str:
    """Pick the most relevant sentences of *text* using the LSA summarizer.

    Args:
        text: Plain text to summarize.
        sentences_count: Number of sentences requested from the summarizer.

    Returns:
        The selected sentences joined with single spaces, or an empty string
        when *text* is empty or contains only whitespace.
    """
    if not text or not text.strip():
        # Nothing to parse: skip tokenization and summarization entirely.
        return ""
    parser = PlaintextParser.from_string(text, Tokenizer(DEFAULT_LANGUAGE))
    selected_sentences = lsa_summarizer(parser.document, sentences_count)
    # str() is the public way to obtain a sentence's text; this avoids
    # depending on the private ``_text`` attribute.
    return " ".join(str(sentence) for sentence in selected_sentences)


def display_abstractive_summary(summary) -> None:
    """Show the abstractive summary under its own sub-heading.

    Args:
        summary: Text placed in the output text area.
    """
    st.subheader("Abstractive Summary")
    # Empty level-5 heading; presumably used as vertical spacing.
    st.markdown('#####')
    # The area is rendered without a visible label (empty string).
    area_options = {'label': '', 'value': summary, 'height': 240}
    st.text_area(**area_options)


def display_extractive_summary(summary) -> None:
    """Show the extractive summary under its own sub-heading.

    Args:
        summary: Text placed in the output text area.
    """
    st.subheader("Extractive Summary")
    # Empty level-5 heading; presumably used as vertical spacing.
    st.markdown('#####')
    # The area is rendered without a visible label (empty string).
    area_options = {'label': '', 'value': summary, 'height': 240}
    st.text_area(**area_options)


# Runs only on the script pass triggered by pressing 'Summarize'.
if submit_button:
    tc_text = tc_text_input
    sentences_length = sentences_length_input

    if not tc_text.strip():
        # Blank input would produce two empty summaries and therefore two
        # output text areas with identical arguments, which Streamlit rejects
        # as duplicate widgets. Ask for input instead of running the pipeline.
        st.warning("Please provide the terms & conditions text before hitting 'Summarize'.")
    else:
        # Stage 1: extractive LSA selection; stage 2: abstractive rewrite of
        # the extracted sentences.
        extract_summary = generate_extractive_summary(tc_text, sentences_length)
        abstract_summary = generate_abstractive_summary(extract_summary)

        display_extractive_summary(extract_summary)
        display_abstractive_summary(abstract_summary)