Spaces:
Sleeping
Sleeping
File size: 8,030 Bytes
fe0e9af 53cfd2d 01d78f2 fe0e9af 01d78f2 fe0e9af 01d78f2 fe0e9af 66e7228 fe0e9af 98a3ea7 fe0e9af 9b3e02d fe0e9af 7a2e137 fe0e9af aa3c57c fe0e9af aa3c57c b9e8529 3247bd6 fe0e9af 53cfd2d fe0e9af 98a3ea7 fe0e9af 01d78f2 fe0e9af f4f4797 8dbbc84 fe0e9af f4f4797 01d78f2 fe0e9af f4f4797 98a3ea7 66e7228 ecba037 3b66adc 01d78f2 3b66adc fe0e9af 01d78f2 b1e0e58 53cfd2d 01d78f2 53cfd2d 01d78f2 fe0e9af 01d78f2 fe0e9af 66e7228 fe0e9af 66e7228 98a3ea7 01d78f2 dd63c17 01d78f2 b2df366 01d78f2 afa6ede ecba037 afa6ede ecba037 01d78f2 fe0e9af 01d78f2 fe0e9af 01d78f2 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 |
import logging
import time
from pathlib import Path
import gradio as gr
import nltk
from cleantext import clean
from summarize import load_model_and_tokenizer, summarize_via_tokenbatches
from utils import load_example_filenames, truncate_word_count
# Directory containing this script; the bundled "examples" folder is resolved
# relative to it in the __main__ block.
_here = Path(__file__).parent
# Fetch the NLTK "stopwords" resource at import time.
nltk.download("stopwords") # TODO=find where this requirement originates from
# Root logger: INFO level, each record prefixed with a timestamp and level name.
logging.basicConfig(
    level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
)
def proc_submission(
    input_text: str,
    model_size: str,
    num_beams,
    token_batch_length,
    length_penalty,
    repetition_penalty,
    no_repeat_ngram_size,
    max_input_length: int = 768,
):
    """
    proc_submission - a helper function for the gradio module, summarizes the submitted text

    Parameters
    ----------
    input_text : str, required, the text to be processed
    model_size : str, required, "base" selects the module-level model_sm/tokenizer_sm pair,
        any other value selects the module-level model/tokenizer pair
    num_beams : number, required, forwarded to generation as an int
    token_batch_length : number, required, passed as batch_length to the summarizer; each
        batch summary is capped at token_batch_length // 4 tokens (max_length)
    length_penalty : number, required, forwarded to generation as a float
    repetition_penalty : number, required, forwarded to generation as a float
    no_repeat_ngram_size : number, required, forwarded to generation as an int
    max_input_length : int, optional, the maximum number of words kept from the input,
        default=768 (replaced by 1024 when model_size == "base")

    Returns
    -------
    tuple of (str, str, str):
        HTML status fragment (runtime and any warning),
        the section summaries joined by newlines,
        the per-section summary scores joined by newlines
    """
    # Nothing to summarize: report it instead of running the model on empty text.
    if not input_text or not input_text.strip():
        msg = "No input text was provided, there is nothing to summarize"
        logging.warning(msg)
        return f"<h2>WARNING:</h2><hr><b>{msg}</b><br><br>", "", ""

    settings = {
        "length_penalty": float(length_penalty),
        "repetition_penalty": float(repetition_penalty),
        "no_repeat_ngram_size": int(no_repeat_ngram_size),
        "encoder_no_repeat_ngram_size": 4,
        "num_beams": int(num_beams),
        "min_length": 4,
        "max_length": int(token_batch_length // 4),
        "early_stopping": True,
        "do_sample": False,
    }
    st = time.perf_counter()

    use_base = model_size == "base"
    clean_text = clean(input_text, lower=False)
    max_input_length = 1024 if use_base else max_input_length
    processed = truncate_word_count(clean_text, max_input_length)

    if processed["was_truncated"]:
        tr_in = processed["truncated_text"]
        msg = f"Input text was truncated to {max_input_length} words (based on whitespace)"
        logging.warning(msg)
    else:
        # Summarize the cleaned text here as well, so both branches feed the
        # model the same kind of input (previously the raw text was used).
        tr_in = clean_text
        msg = None

    _summaries = summarize_via_tokenbatches(
        tr_in,
        model_sm if use_base else model,
        tokenizer_sm if use_base else tokenizer,
        batch_length=token_batch_length,
        **settings,
    )

    sum_text = [f"Section {i}: " + s["summary"][0] for i, s in enumerate(_summaries)]
    sum_scores = [
        f" - Section {i}: {round(s['summary_score'],4)}"
        for i, s in enumerate(_summaries)
    ]
    sum_text_out = "\n".join(sum_text)
    scores_out = "\n".join(sum_scores)

    # Runtime is reported in minutes.
    rt = round((time.perf_counter() - st) / 60, 2)
    print(f"Runtime: {rt} minutes")

    html = f"<p>Runtime: {rt} minutes on CPU</p>"
    if msg is not None:
        html += f"<h2>WARNING:</h2><hr><b>{msg}</b><br><br>"

    return html, sum_text_out, scores_out
def load_single_example_text(
    example_path: str,
):
    """
    load_single_example_text - a helper function for the gradio module to load one example

    Parameters
    ----------
    example_path : str, required, the example name; despite the parameter name it is
        used as a lookup key into the module-level name_to_path mapping, not opened directly

    Returns
    -------
    str, the cleaned text of the selected example
    """
    # name_to_path is only read here, so no `global` declaration is needed.
    full_ex_path = Path(name_to_path[example_path])
    # Undecodable bytes are dropped rather than raising, so a partly malformed
    # example file still loads.
    raw_text = full_ex_path.read_text(encoding="utf-8", errors="ignore")
    return clean(raw_text, lower=False)
# Script entry point: load both models, build the Gradio Blocks UI, wire the
# two buttons to their handlers, and launch the app.
# NOTE(review): the nesting of the layout below was reconstructed from
# un-indented text; confirm the column boundaries against the deployed app.
if __name__ == "__main__":
    # Both checkpoints are loaded up front. proc_submission reads these
    # module-level names: model_sm/tokenizer_sm when model_size == "base",
    # model/tokenizer otherwise.
    model, tokenizer = load_model_and_tokenizer("pszemraj/led-large-book-summary")
    model_sm, tokenizer_sm = load_model_and_tokenizer("pszemraj/led-base-book-summary")
    # Lookup from example name to file path; read by load_single_example_text
    # and used to populate the "Load Example" dropdown.
    name_to_path = load_example_filenames(_here / "examples")
    logging.info(f"Loaded {len(name_to_path)} examples")
    demo = gr.Blocks()
    with demo:
        gr.Markdown("# Long-Form Summarization: LED & BookSum")
        gr.Markdown(
            "A simple demo using a fine-tuned LED model to summarize long-form text. See [model card](https://huggingface.co/pszemraj/led-large-book-summary) for a notebook with GPU inference (much faster) on Colab."
        )
        # Section 1: input text, example loader, and generation parameters.
        with gr.Column():
            gr.Markdown("## Load Inputs & Select Parameters")
            gr.Markdown(
                "Enter your text below or choose an example, and select the model size and parameters. Press the button to load examples."
            )
            model_size = gr.inputs.Radio(
                choices=["base", "large"], label="model size", default="large"
            )
            num_beams = gr.inputs.Slider(
                minimum=2, maximum=4, label="num_beams", default=2, step=1
            )
            # With step=256 the selectable values are 512, 768 and 1024.
            token_batch_length = gr.inputs.Slider(
                minimum=512,
                maximum=1024,
                label="token_batch_length",
                default=512,
                step=256,
            )
            length_penalty = gr.inputs.Slider(
                minimum=0.5, maximum=1.0, label="length penalty", default=0.7, step=0.05
            )
            repetition_penalty = gr.inputs.Slider(
                minimum=1.0,
                maximum=5.0,
                label="repetition penalty",
                default=3.5,
                step=0.1,
            )
            no_repeat_ngram_size = gr.inputs.Slider(
                minimum=2, maximum=4, label="no repeat ngram size", default=3, step=1
            )
            # Dropdown choices are the keys of name_to_path; the selected key is
            # what load_single_example_text receives.
            example_name = gr.Dropdown(
                list(name_to_path.keys()),
                label="Load Example",
            )
            load_examples_button = gr.Button(
                "Load Example",
            )
            input_text = gr.Textbox(
                lines=6,
                label="input text",
                placeholder="Enter text to summarize, the text will be cleaned and truncated on Spaces. Narrative, academic (both papers and lecture transcription), and article text work well. May take a bit to generate depending on the input text :)",
            )
        # Section 2: trigger button and the three output widgets.
        with gr.Column():
            gr.Markdown("## Generate Summary")
            gr.Markdown("Summary generation should take approximately 1-2 minutes for most settings.")
            summarize_button = gr.Button("Summarize!")
            # Receives the HTML status fragment (runtime / warning) from proc_submission.
            output_text = gr.HTML("<p><em>Output will appear below:</em></p>")
            gr.Markdown("### Summary Output")
            summary_text = gr.Textbox(
                label="Summary", placeholder="The generated summary will appear here"
            )
            gr.Markdown(
                "The summary scores can be thought of as representing the quality of the summary. less-negative numbers (closer to 0) are better:"
            )
            summary_scores = gr.Textbox(
                label="Summary Scores", placeholder="Summary scores will appear here"
            )
        # Section 3: static information about the model.
        with gr.Column():
            gr.Markdown("## About the Model")
            gr.Markdown(
                "- [This model](https://huggingface.co/pszemraj/led-large-book-summary) is a fine-tuned checkpoint of [allenai/led-large-16384](https://huggingface.co/allenai/led-large-16384) on the [BookSum dataset](https://arxiv.org/abs/2105.08209).The goal was to create a model that can generalize well and is useful in summarizing lots of text in academic and daily usage."
            )
            gr.Markdown(
                "- The model can be used with tag [pszemraj/led-large-book-summary](https://huggingface.co/pszemraj/led-large-book-summary). See the model card for details on usage & a notebook for a tutorial."
            )
        # Event wiring. "Load Example" copies the chosen example's cleaned text
        # into the input textbox.
        load_examples_button.click(
            fn=load_single_example_text, inputs=[example_name], outputs=[input_text]
        )
        # The inputs list is positional: its order must match proc_submission's
        # first seven parameters, and the outputs match its returned 3-tuple.
        summarize_button.click(
            fn=proc_submission,
            inputs=[
                input_text,
                model_size,
                num_beams,
                token_batch_length,
                length_penalty,
                repetition_penalty,
                no_repeat_ngram_size,
            ],
            outputs=[output_text, summary_text, summary_scores],
        )
    # NOTE(review): prevent_thread_lock=True asks launch() not to block the main
    # thread; confirm the process stays alive when this file is run as a script
    # with the installed gradio version.
    demo.launch(enable_queue=True, prevent_thread_lock=True)
|