summarize-long-text

Sleeping

App Files Files Community

pszemraj committed on Oct 5, 2022

Commit

9bc2923

•

1 Parent(s): 93b2cca

💄 reorg UI

Browse files

Signed-off-by: peter szemraj <peterszemraj@gmail.com>

Files changed (1) hide show

app.py +61 -52

app.py CHANGED Viewed

@@ -1,4 +1,6 @@
 import logging
 import time
 from pathlib import Path
@@ -64,7 +66,14 @@ def proc_submission(
     if processed["was_truncated"]:
         tr_in = processed["truncated_text"]
-        msg = f"Input text was truncated to {max_input_length} words (based on whitespace)"
         logging.warning(msg)
         history["WARNING"] = msg
     else:
@@ -92,7 +101,7 @@ def proc_submission(
     html = ""
     html += f"<p>Runtime: {rt} minutes on CPU</p>"
     if msg is not None:
-        html += f"<h2>WARNING:</h2><hr><b>{msg}</b><br><br>"
     html += ""
@@ -152,7 +161,7 @@ if __name__ == "__main__":
     name_to_path = load_example_filenames(_here / "examples")
     logging.info(f"Loaded {len(name_to_path)} examples")
     demo = gr.Blocks()
     with demo:
         gr.Markdown("# Long-Form Summarization: LED & BookSum")
@@ -167,66 +176,37 @@ if __name__ == "__main__":
             )
             with gr.Row():
                 model_size = gr.Radio(
-                    choices=["base", "large"], label="Model Variant", value="large"
                 )
                 num_beams = gr.Radio(
                     choices=[2, 3, 4],
                     label="Beam Search: # of Beams",
                     value=2,
                 )
-            gr.Markdown(
-                "_The base model is less performant than the large model, but is faster and will accept up to 2048 words per input (Large model accepts up to 768)._"
-            )
-            with gr.Row():
-                length_penalty = gr.inputs.Slider(
-                    minimum=0.5,
-                    maximum=1.0,
-                    label="length penalty",
-                    default=0.7,
-                    step=0.05,
-                )
-                token_batch_length = gr.Radio(
-                    choices=[512, 768, 1024],
-                    label="token batch length",
-                    value=512,
-                )
-            with gr.Row():
-                repetition_penalty = gr.inputs.Slider(
-                    minimum=1.0,
-                    maximum=5.0,
-                    label="repetition penalty",
-                    default=3.5,
-                    step=0.1,
-                )
-                no_repeat_ngram_size = gr.Radio(
-                    choices=[2, 3, 4],
-                    label="no repeat ngram size",
-                    value=3,
-                )
             with gr.Row():
                 example_name = gr.Dropdown(
-                    list(name_to_path.keys()),
-                    label="Choose an Example",
                 )
-                load_examples_button = gr.Button(
-                    "Load Example",
-                )
-            input_text = gr.Textbox(
-                lines=6,
-                label="Input Text (for summarization)",
-                placeholder="Enter text to summarize, the text will be cleaned and truncated on Spaces. Narrative, academic (both papers and lecture transcription), and article text work well. May take a bit to generate depending on the input text :)",
-            )
-            gr.Markdown("Upload your own file:")
-            with gr.Row():
                 uploaded_file = gr.File(
-                    label="Upload a text file",
                     file_count="single",
                     type="file",
                 )
-                load_file_button = gr.Button("Load Uploaded File")
-            gr.Markdown("---")
         with gr.Column():
             gr.Markdown("## Generate Summary")
@@ -250,10 +230,39 @@ if __name__ == "__main__":
                 label="Summary Scores", placeholder="Summary scores will appear here"
             )
-            gr.Markdown("---")
         with gr.Column():
-            gr.Markdown("## About the Model")
             gr.Markdown(
                 "- [This model](https://huggingface.co/pszemraj/led-large-book-summary) is a fine-tuned checkpoint of [allenai/led-large-16384](https://huggingface.co/allenai/led-large-16384) on the [BookSum dataset](https://arxiv.org/abs/2105.08209).The goal was to create a model that can generalize well and is useful in summarizing lots of text in academic and daily usage."
             )

 import logging
+import random
+import re
 import time
 from pathlib import Path
     if processed["was_truncated"]:
         tr_in = processed["truncated_text"]
+        # create elaborate HTML warning
+        input_wc = re.split(r"\s+", input_text)
+        msg = f"""
+        <div style="background-color: #FFA500; color: white; padding: 20px;">
+        <h3>Warning</h3>
+        <p>Input text was truncated to {max_input_length} words. That's about {100*max_input_length/len(input_wc):.2f}% of the submission.</p>
+        </div>
+        """
         logging.warning(msg)
         history["WARNING"] = msg
     else:
     html = ""
     html += f"<p>Runtime: {rt} minutes on CPU</p>"
     if msg is not None:
+        html += msg
     html += ""
     name_to_path = load_example_filenames(_here / "examples")
     logging.info(f"Loaded {len(name_to_path)} examples")
     demo = gr.Blocks()
+    _examples = list(name_to_path.keys())
     with demo:
         gr.Markdown("# Long-Form Summarization: LED & BookSum")
             )
             with gr.Row():
                 model_size = gr.Radio(
+                    choices=["base", "large"], label="Model Variant", value="base"
                 )
                 num_beams = gr.Radio(
                     choices=[2, 3, 4],
                     label="Beam Search: # of Beams",
                     value=2,
                 )
+            gr.Markdown("Select an example, or upload a `.txt` file")
             with gr.Row():
                 example_name = gr.Dropdown(
+                    _examples,
+                    label="Examples",
+                    value=random.choice(_examples),
                 )
                 uploaded_file = gr.File(
+                    label="File Upload",
                     file_count="single",
                     type="file",
                 )
+            with gr.Row():
+                input_text = gr.Textbox(
+                    lines=4,
+                    label="Input Text (for summarization)",
+                    placeholder="Enter text to summarize, the text will be cleaned and truncated on Spaces. Narrative, academic (both papers and lecture transcription), and article text work well. May take a bit to generate depending on the input text :)",
+                )
+                with gr.Column(min_width=100, scale=0.5):
+                    load_examples_button = gr.Button(
+                        "Load Example",
+                    )
+                    load_file_button = gr.Button("Upload File")
+        gr.Markdown("---")
         with gr.Column():
             gr.Markdown("## Generate Summary")
                 label="Summary Scores", placeholder="Summary scores will appear here"
             )
+        gr.Markdown("---")
         with gr.Column():
+            gr.Markdown("### Advanced Settings")
+            with gr.Row():
+                length_penalty = gr.inputs.Slider(
+                    minimum=0.5,
+                    maximum=1.0,
+                    label="length penalty",
+                    default=0.7,
+                    step=0.05,
+                )
+                token_batch_length = gr.Radio(
+                    choices=[512, 768, 1024, 1536],
+                    label="token batch length",
+                    value=1024,
+                )
+            with gr.Row():
+                repetition_penalty = gr.inputs.Slider(
+                    minimum=1.0,
+                    maximum=5.0,
+                    label="repetition penalty",
+                    default=3.5,
+                    step=0.1,
+                )
+                no_repeat_ngram_size = gr.Radio(
+                    choices=[2, 3, 4],
+                    label="no repeat ngram size",
+                    value=3,
+                )
+        with gr.Column():
+            gr.Markdown("### About the Model")
             gr.Markdown(
                 "- [This model](https://huggingface.co/pszemraj/led-large-book-summary) is a fine-tuned checkpoint of [allenai/led-large-16384](https://huggingface.co/allenai/led-large-16384) on the [BookSum dataset](https://arxiv.org/abs/2105.08209).The goal was to create a model that can generalize well and is useful in summarizing lots of text in academic and daily usage."
             )