import logging import re from pathlib import Path import gradio as gr import nltk from cleantext import clean from summarize import load_model_and_tokenizer, summarize_via_tokenbatches _here = Path(__file__).parent nltk.download("stopwords") # TODO=find where this requirement originates from import transformers transformers.logging.set_verbosity_error() logging.basicConfig() def truncate_word_count(text, max_words=512): """ truncate_word_count - a helper function for the gradio module Parameters ---------- text : str, required, the text to be processed max_words : int, optional, the maximum number of words, default=512 Returns ------- dict, the text and whether it was truncated """ # split on whitespace with regex words = re.split(r"\s+", text) processed = {} if len(words) > max_words: processed["was_truncated"] = True processed["truncated_text"] = " ".join(words[:max_words]) else: processed["was_truncated"] = False processed["truncated_text"] = text return processed def proc_submission( input_text: str, num_beams, token_batch_length, length_penalty, repetition_penalty, no_repeat_ngram_size, max_input_length: int = 512, ): """ proc_submission - a helper function for the gradio module Parameters ---------- input_text : str, required, the text to be processed max_input_length : int, optional, the maximum length of the input text, default=512 Returns ------- str of HTML, the interactive HTML form for the model """ settings = { "length_penalty": length_penalty, "repetition_penalty": repetition_penalty, "no_repeat_ngram_size": no_repeat_ngram_size, "encoder_no_repeat_ngram_size": 4, "num_beams": num_beams, "early_stopping": True, "min_length": 10, "do_sample": False, } history = {} clean_text = clean(input_text, lower=False) processed = truncate_word_count(clean_text, max_input_length) if processed["was_truncated"]: tr_in = processed["truncated_text"] msg = f"Input text was truncated to {max_input_length} words (based on whitespace)" logging.warning(msg) history["WARNING"] = msg else: tr_in = input_text history["was_truncated"] = False _summaries = summarize_via_tokenbatches( tr_in, model, tokenizer, batch_length=token_batch_length, **settings, ) sum_text = [s["summary"][0] for s in _summaries] sum_scores = [f"\n - Section {i}: {round(s['summary_score'],4)}" for i, s in enumerate(_summaries)] history["Summary Text"] = "\n\t".join(sum_text) history["Summary Scores"] = "\n".join(sum_scores) history["Input"] = tr_in html = "" for name, item in history.items(): html += ( f"