Spaces:

pszemraj
/

summarize-long-text

Running on CPU Upgrade

App Files Files Community

Peter committed on Jun 17, 2022

Commit

01d78f2

1 Parent(s): e1cbb91

:sparkles: update to blocks api

Browse files

Files changed (4) hide show

app.py +125 -57
requirements.txt +1 -0
summarize.py +4 -2
utils.py +15 -2

app.py CHANGED Viewed

@@ -1,22 +1,21 @@
 import logging
-import re
-from pathlib import Path
 import time
 import gradio as gr
 import nltk
 from cleantext import clean
 from summarize import load_model_and_tokenizer, summarize_via_tokenbatches
-from utils import load_examples, truncate_word_count
 _here = Path(__file__).parent
 nltk.download("stopwords")  # TODO=find where this requirement originates from
-import transformers
-transformers.logging.set_verbosity_error()
-logging.basicConfig()
 def proc_submission(
@@ -56,6 +55,7 @@ def proc_submission(
     clean_text = clean(input_text, lower=False)
     max_input_length = 1024 if model_size == "base" else max_input_length
     processed = truncate_word_count(clean_text, max_input_length)
     if processed["was_truncated"]:
         tr_in = processed["truncated_text"]
         msg = f"Input text was truncated to {max_input_length} words (based on whitespace)"
@@ -63,6 +63,7 @@ def proc_submission(
         history["WARNING"] = msg
     else:
         tr_in = input_text
     _summaries = summarize_via_tokenbatches(
         tr_in,
@@ -73,79 +74,146 @@ def proc_submission(
     )
     sum_text = [f"Section {i}: " + s["summary"][0] for i, s in enumerate(_summaries)]
     sum_scores = [
-        f"\n - Section {i}: {round(s['summary_score'],4)}"
         for i, s in enumerate(_summaries)
     ]
-    history["Summary Text"] = "<br>".join(sum_text)
-    history[
-        "Summary Scores"
-    ] = "The summary scores can be thought of as representing the quality of the summary. less-negative numbers (closer to 0) are better.<br><br>"
-    history["Summary Scores"] += "\n".join(sum_scores)
-    html = ""
     rt = round((time.perf_counter() - st) / 60, 2)
     print(f"Runtime: {rt} minutes")
     html += f"<p>Runtime: {rt} minutes on CPU</p>"
-    for name, item in history.items():
-        html += (
-            f"<h2>{name}:</h2><hr><b>{item}</b><br><br>"
-            if "summary" not in name.lower()
-            else f"<h2>{name}:</h2><hr>{item}<br><br>"
-        )
     html += ""
-    return html
 if __name__ == "__main__":
     model, tokenizer = load_model_and_tokenizer("pszemraj/led-large-book-summary")
     model_sm, tokenizer_sm = load_model_and_tokenizer("pszemraj/led-base-book-summary")
-    title = "Long-Form Summarization: LED & BookSum"
-    description = "A simple demo using a fine-tuned LED model to summarize long-form text. [This model](https://huggingface.co/pszemraj/led-large-book-summary) is a fine-tuned checkpoint of [allenai/led-large-16384](https://huggingface.co/allenai/led-large-16384) on the [BookSum dataset](https://arxiv.org/abs/2105.08209).The goal was to create a model that can generalize well and is useful in summarizing lots of text in academic and daily usage. See [model card](https://huggingface.co/pszemraj/led-large-book-summary) for a notebook with GPU inference (much faster) on Colab."
-    gr.Interface(
-        proc_submission,
-        inputs=[
-            gr.inputs.Textbox(
-                lines=10,
-                label="input text",
-                placeholder="Enter text to summarize, the text will be cleaned and truncated on Spaces. Narrative, academic (both papers and lecture transcription), and article text work well. May take a bit to generate depending on the input text :)",
-            ),
-            gr.inputs.Radio(
                 choices=["base", "large"], label="model size", default="large"
-            ),
-            gr.inputs.Slider(
                 minimum=2, maximum=4, label="num_beams", default=2, step=1
-            ),
-            gr.inputs.Slider(
                 minimum=512,
                 maximum=1024,
                 label="token_batch_length",
                 default=512,
                 step=256,
-            ),
-            gr.inputs.Slider(
-                minimum=0.5, maximum=1.1, label="length_penalty", default=0.7, step=0.05
-            ),
-            gr.inputs.Slider(
                 minimum=1.0,
                 maximum=5.0,
-                label="repetition_penalty",
                 default=3.5,
                 step=0.1,
-            ),
-            gr.inputs.Slider(
-                minimum=2, maximum=4, label="no_repeat_ngram_size", default=3, step=1
-            ),
-        ],
-        outputs="html",
-        examples_per_page=2,
-        title=title,
-        description=description,
-        article="The model can be used with tag [pszemraj/led-large-book-summary](https://huggingface.co/pszemraj/led-large-book-summary). See the model card for details on usage & a notebook for a tutorial.",
-        examples=load_examples(_here / "examples"),
-        cache_examples=True,
-    ).launch()

 import logging
 import time
+from pathlib import Path
 import gradio as gr
 import nltk
 from cleantext import clean
 from summarize import load_model_and_tokenizer, summarize_via_tokenbatches
+from utils import load_example_filenames, truncate_word_count
 _here = Path(__file__).parent
 nltk.download("stopwords")  # TODO=find where this requirement originates from
+logging.basicConfig(
+    level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
+)
 def proc_submission(
     clean_text = clean(input_text, lower=False)
     max_input_length = 1024 if model_size == "base" else max_input_length
     processed = truncate_word_count(clean_text, max_input_length)
     if processed["was_truncated"]:
         tr_in = processed["truncated_text"]
         msg = f"Input text was truncated to {max_input_length} words (based on whitespace)"
         history["WARNING"] = msg
     else:
         tr_in = input_text
+        msg = None
     _summaries = summarize_via_tokenbatches(
         tr_in,
     )
     sum_text = [f"Section {i}: " + s["summary"][0] for i, s in enumerate(_summaries)]
     sum_scores = [
+        f" - Section {i}: {round(s['summary_score'],4)}"
         for i, s in enumerate(_summaries)
     ]
+    sum_text_out = "\n".join(sum_text)
+    history["Summary Scores"] = "<br><br>"
+    scores_out = "\n".join(sum_scores)
     rt = round((time.perf_counter() - st) / 60, 2)
     print(f"Runtime: {rt} minutes")
+    html = ""
     html += f"<p>Runtime: {rt} minutes on CPU</p>"
+    if msg is not None:
+        html += f"<h2>WARNING:</h2><hr><b>{msg}</b><br><br>"
     html += ""
+    return html, sum_text_out, scores_out
+def load_single_example_text(
+    example_path: str or Path,
+):
+    """
+    load_single_example_text - a helper function for the gradio module to load the text of one example
+    Returns:
+        str, the cleaned text of the selected example
+    """
+    global name_to_path
+    full_ex_path = name_to_path[example_path]
+    full_ex_path = Path(full_ex_path)
+    # read the selected example file and clean its text
+    with open(full_ex_path, "r", encoding="utf-8", errors="ignore") as f:
+        raw_text = f.read()
+        text = clean(raw_text, lower=False)
+    return text
 if __name__ == "__main__":
     model, tokenizer = load_model_and_tokenizer("pszemraj/led-large-book-summary")
     model_sm, tokenizer_sm = load_model_and_tokenizer("pszemraj/led-base-book-summary")
+    name_to_path = load_example_filenames(_here / "examples")
+    logging.info(f"Loaded {len(name_to_path)} examples")
+    demo = gr.Blocks()
+    with demo:
+        gr.Markdown("# Long-Form Summarization: LED & BookSum")
+        gr.Markdown(
+            "A simple demo using a fine-tuned LED model to summarize long-form text. See [model card](https://huggingface.co/pszemraj/led-large-book-summary) for a notebook with GPU inference (much faster) on Colab."
+        )
+        with gr.Column():
+            gr.Markdown("## Load Inputs & Select Parameters")
+            gr.Markdown(
+                "Enter your text below or choose an example, and select the model size and parameters. Press the button to load examples."
+            )
+            model_size = gr.inputs.Radio(
                 choices=["base", "large"], label="model size", default="large"
+            )
+            num_beams = gr.inputs.Slider(
                 minimum=2, maximum=4, label="num_beams", default=2, step=1
+            )
+            token_batch_length = gr.inputs.Slider(
                 minimum=512,
                 maximum=1024,
                 label="token_batch_length",
                 default=512,
                 step=256,
+            )
+            length_penalty = gr.inputs.Slider(
+                minimum=0.5, maximum=1.0, label="length penalty", default=0.7, step=0.05
+            )
+            repetition_penalty = gr.inputs.Slider(
                 minimum=1.0,
                 maximum=5.0,
+                label="repetition penalty",
                 default=3.5,
                 step=0.1,
+            )
+            no_repeat_ngram_size = gr.inputs.Slider(
+                minimum=2, maximum=4, label="no repeat ngram size", default=3, step=1
+            )
+            example_name = gr.Dropdown(
+                list(name_to_path.keys()),
+                label="Load Example",
+            )
+            load_examples_button = gr.Button(
+                "Load Example",
+            )
+            input_text = gr.Textbox(
+                lines=6,
+                label="input text",
+                placeholder="Enter text to summarize, the text will be cleaned and truncated on Spaces. Narrative, academic (both papers and lecture transcription), and article text work well. May take a bit to generate depending on the input text :)",
+            )
+        with gr.Column():
+            gr.Markdown("## Generate Summary")
+            gr.Markdown("Summary generation should take approximately 1-2 minutes for most settings.")
+            summarize_button = gr.Button("Summarize!")
+            output_text = gr.HTML("<p><em>Output will appear below:</em></p>")
+            gr.Markdown("### Summary Output")
+            summary_text = gr.Textbox(
+                label="Summary", placeholder="The generated summary will appear here"
+            )
+            gr.Markdown(
+                "The summary scores can be thought of as representing the quality of the summary. less-negative numbers (closer to 0) are better:"
+            )
+            summary_scores = gr.Textbox(
+                label="Summary Scores", placeholder="Summary scores will appear here"
+            )
+        with gr.Column():
+            gr.Markdown("## About the Model")
+            gr.Markdown(
+                "- [This model](https://huggingface.co/pszemraj/led-large-book-summary) is a fine-tuned checkpoint of [allenai/led-large-16384](https://huggingface.co/allenai/led-large-16384) on the [BookSum dataset](https://arxiv.org/abs/2105.08209).The goal was to create a model that can generalize well and is useful in summarizing lots of text in academic and daily usage."
+            )
+            gr.Markdown(
+                "- The model can be used with tag [pszemraj/led-large-book-summary](https://huggingface.co/pszemraj/led-large-book-summary). See the model card for details on usage & a notebook for a tutorial."
+            )
+        load_examples_button.click(
+            fn=load_single_example_text, inputs=[example_name], outputs=[input_text]
+        )
+        summarize_button.click(
+            fn=proc_submission,
+            inputs=[
+                input_text,
+                model_size,
+                num_beams,
+                token_batch_length,
+                length_penalty,
+                repetition_penalty,
+                no_repeat_ngram_size,
+            ],
+            outputs=[output_text, summary_text, summary_scores],
+        )
+    demo.launch(enable_queue=True, prevent_thread_lock=True)

requirements.txt CHANGED Viewed

@@ -5,3 +5,4 @@ nltk
 torch
 tqdm
 transformers

 torch
 tqdm
 transformers
+accelerate

summarize.py CHANGED Viewed

@@ -18,11 +18,13 @@ def load_model_and_tokenizer(model_name):
     model = AutoModelForSeq2SeqLM.from_pretrained(
         model_name,
-        low_cpu_mem_usage=True,
-        use_cache=False,
     )
     tokenizer = AutoTokenizer.from_pretrained(model_name)
     model = model.to("cuda") if torch.cuda.is_available() else model
     return model, tokenizer

     model = AutoModelForSeq2SeqLM.from_pretrained(
         model_name,
+        # low_cpu_mem_usage=True,
+        # use_cache=False,
     )
     tokenizer = AutoTokenizer.from_pretrained(model_name)
     model = model.to("cuda") if torch.cuda.is_available() else model
+    logging.info(f"Loaded model {model_name}")
     return model, tokenizer

utils.py CHANGED Viewed

@@ -2,9 +2,10 @@
     utils.py - Utility functions for the project.
 """
-from natsort import natsorted
-from pathlib import Path
 import re
 def truncate_word_count(text, max_words=512):
@@ -48,3 +49,15 @@ def load_examples(src):
             text_examples.append([text, "large", 2, 512, 0.7, 3.5, 3])
     return text_examples

     utils.py - Utility functions for the project.
 """
 import re
+from pathlib import Path
+from natsort import natsorted
 def truncate_word_count(text, max_words=512):
             text_examples.append([text, "large", 2, 512, 0.7, 3.5, 3])
     return text_examples
+def load_example_filenames(example_path: str or Path):
+    """
+    load_example_filenames - a helper function for the gradio module to load examples
+    Returns:
+        dict, the examples (filename:full path)
+    """
+    example_path = Path(example_path)
+    # map each .txt file name in the directory to its full path
+    examples = {f.name: f for f in example_path.glob("*.txt")}
+    return examples