Commit 01d78f2
Peter committed
1 Parent(s): e1cbb91

:sparkles: update to blocks api

Files changed (4):
  1. app.py +125 -57
  2. requirements.txt +1 -0
  3. summarize.py +4 -2
  4. utils.py +15 -2
app.py CHANGED
@@ -1,22 +1,21 @@
 import logging
-import re
-from pathlib import Path
 import time
+from pathlib import Path
+
 import gradio as gr
 import nltk
 from cleantext import clean
 
 from summarize import load_model_and_tokenizer, summarize_via_tokenbatches
-from utils import load_examples, truncate_word_count
+from utils import load_example_filenames, truncate_word_count
 
 _here = Path(__file__).parent
 
 nltk.download("stopwords") # TODO=find where this requirement originates from
 
-import transformers
-
-transformers.logging.set_verbosity_error()
-logging.basicConfig()
+logging.basicConfig(
+    level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
+)
 
 
 def proc_submission(
@@ -56,6 +55,7 @@ def proc_submission(
     clean_text = clean(input_text, lower=False)
     max_input_length = 1024 if model_size == "base" else max_input_length
     processed = truncate_word_count(clean_text, max_input_length)
+
     if processed["was_truncated"]:
         tr_in = processed["truncated_text"]
         msg = f"Input text was truncated to {max_input_length} words (based on whitespace)"
@@ -63,6 +63,7 @@
         history["WARNING"] = msg
     else:
         tr_in = input_text
+        msg = None
 
     _summaries = summarize_via_tokenbatches(
         tr_in,
@@ -73,79 +74,146 @@ def proc_submission(
     )
     sum_text = [f"Section {i}: " + s["summary"][0] for i, s in enumerate(_summaries)]
     sum_scores = [
-        f"\n - Section {i}: {round(s['summary_score'],4)}"
+        f" - Section {i}: {round(s['summary_score'],4)}"
         for i, s in enumerate(_summaries)
     ]
 
-    history["Summary Text"] = "<br>".join(sum_text)
-    history[
-        "Summary Scores"
-    ] = "The summary scores can be thought of as representing the quality of the summary. less-negative numbers (closer to 0) are better.<br><br>"
-    history["Summary Scores"] += "\n".join(sum_scores)
-    html = ""
+    sum_text_out = "\n".join(sum_text)
+    history["Summary Scores"] = "<br><br>"
+    scores_out = "\n".join(sum_scores)
     rt = round((time.perf_counter() - st) / 60, 2)
     print(f"Runtime: {rt} minutes")
+    html = ""
     html += f"<p>Runtime: {rt} minutes on CPU</p>"
-    for name, item in history.items():
-        html += (
-            f"<h2>{name}:</h2><hr><b>{item}</b><br><br>"
-            if "summary" not in name.lower()
-            else f"<h2>{name}:</h2><hr>{item}<br><br>"
-        )
+    if msg is not None:
+        html += f"<h2>WARNING:</h2><hr><b>{msg}</b><br><br>"
 
     html += ""
 
-    return html
+    return html, sum_text_out, scores_out
+
+
+def load_single_example_text(
+    example_path: str or Path,
+):
+    """
+    load_single_example - a helper function for the gradio module to load examples
+    Returns:
+        list of str, the examples
+    """
+    global name_to_path
+    full_ex_path = name_to_path[example_path]
+    full_ex_path = Path(full_ex_path)
+    # load the examples into a list
+    with open(full_ex_path, "r", encoding="utf-8", errors="ignore") as f:
+        raw_text = f.read()
+    text = clean(raw_text, lower=False)
+    return text
 
 
 if __name__ == "__main__":
 
     model, tokenizer = load_model_and_tokenizer("pszemraj/led-large-book-summary")
     model_sm, tokenizer_sm = load_model_and_tokenizer("pszemraj/led-base-book-summary")
-    title = "Long-Form Summarization: LED & BookSum"
-
-    description = "A simple demo using a fine-tuned LED model to summarize long-form text. [This model](https://huggingface.co/pszemraj/led-large-book-summary) is a fine-tuned checkpoint of [allenai/led-large-16384](https://huggingface.co/allenai/led-large-16384) on the [BookSum dataset](https://arxiv.org/abs/2105.08209).The goal was to create a model that can generalize well and is useful in summarizing lots of text in academic and daily usage. See [model card](https://huggingface.co/pszemraj/led-large-book-summary) for a notebook with GPU inference (much faster) on Colab."
-
-    gr.Interface(
-        proc_submission,
-        inputs=[
-            gr.inputs.Textbox(
-                lines=10,
-                label="input text",
-                placeholder="Enter text to summarize, the text will be cleaned and truncated on Spaces. Narrative, academic (both papers and lecture transcription), and article text work well. May take a bit to generate depending on the input text :)",
-            ),
-            gr.inputs.Radio(
+
+    name_to_path = load_example_filenames(_here / "examples")
+    logging.info(f"Loaded {len(name_to_path)} examples")
+    demo = gr.Blocks()
+
+    with demo:
+
+        gr.Markdown("# Long-Form Summarization: LED & BookSum")
+        gr.Markdown(
+            "A simple demo using a fine-tuned LED model to summarize long-form text. See [model card](https://huggingface.co/pszemraj/led-large-book-summary) for a notebook with GPU inference (much faster) on Colab."
+        )
+        with gr.Column():
+
+            gr.Markdown("## Load Inputs & Select Parameters")
+            gr.Markdown(
+                "Enter your text below or choose an example, and select the model size and parameters. Press the button to load examples."
+            )
+
+            model_size = gr.inputs.Radio(
                 choices=["base", "large"], label="model size", default="large"
-            ),
-            gr.inputs.Slider(
+            )
+            num_beams = gr.inputs.Slider(
                 minimum=2, maximum=4, label="num_beams", default=2, step=1
-            ),
-            gr.inputs.Slider(
+            )
+            token_batch_length = gr.inputs.Slider(
                 minimum=512,
                 maximum=1024,
                 label="token_batch_length",
                 default=512,
                 step=256,
-            ),
-            gr.inputs.Slider(
-                minimum=0.5, maximum=1.1, label="length_penalty", default=0.7, step=0.05
-            ),
-            gr.inputs.Slider(
+            )
+            length_penalty = gr.inputs.Slider(
+                minimum=0.5, maximum=1.0, label="length penalty", default=0.7, step=0.05
+            )
+            repetition_penalty = gr.inputs.Slider(
                 minimum=1.0,
                 maximum=5.0,
-                label="repetition_penalty",
+                label="repetition penalty",
                 default=3.5,
                 step=0.1,
-            ),
-            gr.inputs.Slider(
-                minimum=2, maximum=4, label="no_repeat_ngram_size", default=3, step=1
-            ),
-        ],
-        outputs="html",
-        examples_per_page=2,
-        title=title,
-        description=description,
-        article="The model can be used with tag [pszemraj/led-large-book-summary](https://huggingface.co/pszemraj/led-large-book-summary). See the model card for details on usage & a notebook for a tutorial.",
-        examples=load_examples(_here / "examples"),
-        cache_examples=True,
-    ).launch()
+            )
+            no_repeat_ngram_size = gr.inputs.Slider(
+                minimum=2, maximum=4, label="no repeat ngram size", default=3, step=1
+            )
+            example_name = gr.Dropdown(
+                list(name_to_path.keys()),
+                label="Load Example",
+            )
+            load_examples_button = gr.Button(
+                "Load Example",
+            )
+            input_text = gr.Textbox(
+                lines=6,
+                label="input text",
+                placeholder="Enter text to summarize, the text will be cleaned and truncated on Spaces. Narrative, academic (both papers and lecture transcription), and article text work well. May take a bit to generate depending on the input text :)",
+            )
+
+        with gr.Column():
+            gr.Markdown("## Generate Summary")
+            gr.Markdown("Summary generation should take approximately 1-2 minutes for most settings.")
+            summarize_button = gr.Button("Summarize!")
+
+            output_text = gr.HTML("<p><em>Output will appear below:</em></p>")
+            gr.Markdown("### Summary Output")
+            summary_text = gr.Textbox(
+                label="Summary", placeholder="The generated summary will appear here"
+            )
+            gr.Markdown(
+                "The summary scores can be thought of as representing the quality of the summary. less-negative numbers (closer to 0) are better:"
+            )
+            summary_scores = gr.Textbox(
+                label="Summary Scores", placeholder="Summary scores will appear here"
+            )
+
+        with gr.Column():
+            gr.Markdown("## About the Model")
+            gr.Markdown(
+                "- [This model](https://huggingface.co/pszemraj/led-large-book-summary) is a fine-tuned checkpoint of [allenai/led-large-16384](https://huggingface.co/allenai/led-large-16384) on the [BookSum dataset](https://arxiv.org/abs/2105.08209).The goal was to create a model that can generalize well and is useful in summarizing lots of text in academic and daily usage."
+            )
+            gr.Markdown(
+                "- The model can be used with tag [pszemraj/led-large-book-summary](https://huggingface.co/pszemraj/led-large-book-summary). See the model card for details on usage & a notebook for a tutorial."
+            )
+
+        load_examples_button.click(
+            fn=load_single_example_text, inputs=[example_name], outputs=[input_text]
+        )
+
+        summarize_button.click(
+            fn=proc_submission,
+            inputs=[
+                input_text,
+                model_size,
+                num_beams,
+                token_batch_length,
+                length_penalty,
+                repetition_penalty,
+                no_repeat_ngram_size,
+            ],
+            outputs=[output_text, summary_text, summary_scores],
+        )
+
+    demo.launch(enable_queue=True, prevent_thread_lock=True)
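
Note: the substance of this commit is the move from a single gr.Interface(...) call to the gr.Blocks API, where components are declared inside a layout context and wired to callbacks explicitly. That explicit wiring is what allows the separate "Load Example" button and the three distinct outputs returned by proc_submission. A minimal sketch of the pattern, using placeholder names rather than this app's components:

    import gradio as gr

    def run(text):
        # placeholder callback; proc_submission plays this role in app.py
        return text.upper()

    demo = gr.Blocks()
    with demo:
        inp = gr.Textbox(label="input")
        btn = gr.Button("Run")
        out = gr.Textbox(label="output")
        # explicit wiring replaces Interface's implicit fn/inputs/outputs mapping
        btn.click(fn=run, inputs=[inp], outputs=[out])

    demo.launch()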
requirements.txt CHANGED
@@ -5,3 +5,4 @@ nltk
 torch
 tqdm
 transformers
+accelerate
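
Note: the accelerate addition likely pairs with the low_cpu_mem_usage flag commented out in summarize.py below; transformers' low-memory and device-placement loading paths lean on accelerate, so installing it is presumably groundwork for re-enabling that flag. A small optional startup check, not part of this commit (find_spec only probes whether the package is importable):

    import importlib.util

    # probe for the optional accelerate dependency before enabling
    # low_cpu_mem_usage=True in from_pretrained
    if importlib.util.find_spec("accelerate") is None:
        print("accelerate not installed; keep low_cpu_mem_usage disabled")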
summarize.py CHANGED
@@ -18,11 +18,13 @@ def load_model_and_tokenizer(model_name):
 
     model = AutoModelForSeq2SeqLM.from_pretrained(
         model_name,
-        low_cpu_mem_usage=True,
-        use_cache=False,
+        # low_cpu_mem_usage=True,
+        # use_cache=False,
     )
     tokenizer = AutoTokenizer.from_pretrained(model_name)
     model = model.to("cuda") if torch.cuda.is_available() else model
+
+    logging.info(f"Loaded model {model_name}")
    return model, tokenizer
 
 
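Note: the new logging.info(...) call relies on an import logging at the top of summarize.py, which sits outside this hunk. A self-contained sketch of the loader as it stands after the commit, assuming only the imports shown:

    import logging

    import torch
    from transformers import AutoModelForSeq2SeqLM, AutoTokenizer


    def load_model_and_tokenizer(model_name):
        # memory-saving flags are disabled in this commit; restore them once
        # the accelerate-backed loading path is wanted
        model = AutoModelForSeq2SeqLM.from_pretrained(
            model_name,
            # low_cpu_mem_usage=True,
            # use_cache=False,
        )
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        # move to GPU only when one is available (CPU on Spaces otherwise)
        model = model.to("cuda") if torch.cuda.is_available() else model
        logging.info(f"Loaded model {model_name}")
        return model, tokenizer
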
utils.py CHANGED
@@ -2,9 +2,10 @@
 utils.py - Utility functions for the project.
 """
 
-from natsort import natsorted
-from pathlib import Path
 import re
+from pathlib import Path
+
+from natsort import natsorted
 
 
 def truncate_word_count(text, max_words=512):
@@ -48,3 +49,15 @@ def load_examples(src):
     text_examples.append([text, "large", 2, 512, 0.7, 3.5, 3])
 
     return text_examples
+
+
+def load_example_filenames(example_path: str or Path):
+    """
+    load_example_filenames - a helper function for the gradio module to load examples
+    Returns:
+        dict, the examples (filename:full path)
+    """
+    example_path = Path(example_path)
+    # load the examples into a list
+    examples = {f.name: f for f in example_path.glob("*.txt")}
+    return examples
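
Note: a usage sketch of the new helper, assuming an examples/ directory of .txt files next to the script (the filenames shown are hypothetical). app.py feeds the dict's keys to a gr.Dropdown, and load_single_example_text resolves a selected key back to its full path:

    from pathlib import Path

    from utils import load_example_filenames

    _here = Path(__file__).parent

    # filename -> full path mapping for the example picker
    name_to_path = load_example_filenames(_here / "examples")
    print(list(name_to_path.keys()))  # e.g. ['essay.txt', 'lecture.txt'] (hypothetical)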