Matthijs Hollemans committed
Commit 44f5cb7
1 parent: 1aa521f

16-bit floats, draw one word at a time, optimize making video

Files changed (1):
  app.py +76 -22
app.py CHANGED
@@ -2,13 +2,14 @@ import gradio as gr
 import librosa
 import numpy as np
 import moviepy.editor as mpy
+import torch

 from PIL import Image, ImageDraw, ImageFont
 from transformers import pipeline


-fps = 25
 max_duration = 60 # seconds
+fps = 25
 video_width = 640
 video_height = 480
 margin_left = 20
@@ -21,28 +22,46 @@ font = ImageFont.truetype("Lato-Regular.ttf", 40)
 text_color = (255, 200, 200)
 highlight_color = (255, 255, 255)

-checkpoint = "openai/whisper-tiny"
+# checkpoint = "openai/whisper-tiny"
 # checkpoint = "openai/whisper-base"
-# checkpoint = "openai/whisper-small"
-pipe = pipeline(model=checkpoint)
+checkpoint = "openai/whisper-small"
+
+if torch.cuda.is_available() and torch.cuda.device_count() > 0:
+    from transformers import (
+        AutomaticSpeechRecognitionPipeline,
+        WhisperForConditionalGeneration,
+        WhisperProcessor,
+    )
+    model = WhisperForConditionalGeneration.from_pretrained(checkpoint).to("cuda").half()
+    processor = WhisperProcessor.from_pretrained(checkpoint)
+    pipe = AutomaticSpeechRecognitionPipeline(
+        model=model,
+        tokenizer=processor.tokenizer,
+        feature_extractor=processor.feature_extractor,
+        batch_size=8,
+        torch_dtype=torch.float16,
+        device="cuda:0"
+    )
+else:
+    pipe = pipeline(model=checkpoint)

 # TODO: no longer need to set these manually once the models have been updated on the Hub
 # whisper-tiny
-pipe.model.generation_config.alignment_heads = [[2, 2], [3, 0], [3, 2], [3, 3], [3, 4], [3, 5]]
+# pipe.model.generation_config.alignment_heads = [[2, 2], [3, 0], [3, 2], [3, 3], [3, 4], [3, 5]]
 # whisper-base
 # pipe.model.generation_config.alignment_heads = [[3, 1], [4, 2], [4, 3], [4, 7], [5, 1], [5, 2], [5, 4], [5, 6]]
 # whisper-small
-# pipe.model.generation_config.alignment_heads = [[5, 3], [5, 9], [8, 0], [8, 4], [8, 7], [8, 8], [9, 0], [9, 7], [9, 9], [10, 5]]
+pipe.model.generation_config.alignment_heads = [[5, 3], [5, 9], [8, 0], [8, 4], [8, 7], [8, 8], [9, 0], [9, 7], [9, 9], [10, 5]]

 chunks = []

+start_chunk = 0
+last_draws = []
+last_image = None

-def make_frame(t):
-    global chunks

-    # TODO speed optimization: could cache the last image returned and if the
-    # active chunk and active word didn't change, use that last image instead
-    # of drawing the exact same thing again
+def make_frame(t):
+    global chunks, start_chunk, last_draws, last_image

     # TODO in the Henry V example, the word "desires" has an ending timestamp
     # that's too far into the future, and so the word stays highlighted.
@@ -55,29 +74,60 @@ def make_frame(t):
     # for debugging: draw frame time
     #draw.text((20, 20), str(t), fill=text_color, font=font)

+    space_length = draw.textlength(" ", font)
     x = margin_left
     y = margin_top

-    for chunk in chunks:
+    # Create a list of drawing commands
+    draws = []
+    for i in range(start_chunk, len(chunks)):
+        chunk = chunks[i]
         chunk_start = chunk["timestamp"][0]
         chunk_end = chunk["timestamp"][1]
+        if chunk_start > t: break
         if chunk_end is None: chunk_end = max_duration

-        if chunk_start <= t <= chunk_end:
-            word = chunk["text"]
-            word_length = draw.textlength(word, font)
+        word = chunk["text"]
+        word_length = draw.textlength(word + " ", font) - space_length
+
+        if x + word_length >= video_width - margin_right:
+            x = margin_left
+            y += line_height

-            x = (video_width - word_length) / 2
-            y = video_height / 2 - 20
+            # restart page when end is reached
+            if y >= margin_top + line_height * 7:
+                start_chunk = i
+                break

-            draw.text((x, y), word, fill=highlight_color, font=font)
-            break
+        highlight = (chunk_start <= t < chunk_end)
+        draws.append([x, y, word, word_length, highlight])

-    return np.array(image)
+        x += word_length + space_length
+
+    # If the drawing commands didn't change, then reuse the last image,
+    # otherwise draw a new image
+    if draws != last_draws:
+        for x, y, word, word_length, highlight in draws:
+            if highlight:
+                color = highlight_color
+                draw.rectangle([x, y + line_height, x + word_length, y + line_height + 4], fill=color)
+            else:
+                color = text_color
+
+            draw.text((x, y), word, fill=color, font=font)
+
+        last_image = np.array(image)
+        last_draws = draws
+
+    return last_image


 def predict(audio_path):
-    global chunks
+    global chunks, start_chunk, last_draws, last_image
+
+    start_chunk = 0
+    last_draws = []
+    last_image = None

     audio_data, sr = librosa.load(audio_path, mono=True)
     duration = librosa.get_duration(y=audio_data, sr=sr)
@@ -103,7 +153,11 @@ title = "Word-level timestamps with Whisper"
 description = """
 This demo shows Whisper <b>word-level timestamps</b> in action using Hugging Face Transformers. It creates a video showing subtitled audio with the current word highlighted. It can even do music lyrics!

-This demo uses the <b>openai/whisper-small</b> checkpoint. Since it's only a demo, the output is limited to the first 60 seconds of audio.
+This demo uses the <b>openai/whisper-small</b> checkpoint.
+
+Since it's only a demo, the output is limited to the first 60 seconds of audio.
+To use this on longer audio, <a href="https://huggingface.co/spaces/Matthijs/whisper_word_timestamps/settings?duplicate=true">duplicate the space</a>
+and in <b>app.py</b> change the value of `max_duration`.
 """

 article = """
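
Note: the hunks above only show the top of predict(); the code that actually fills `chunks` is outside the diff context. As a rough sketch (not part of this commit), word-level timestamps come from calling the pipeline with return_timestamps="word". The chunk_length_s value and the dict input format are assumptions about how app.py calls it; `pipe`, `max_duration`, and `librosa` are reused from the code above.

    # Sketch only: how `chunks` is typically produced (arguments here are assumptions).
    audio_data, sr = librosa.load(audio_path, mono=True)
    audio_data = audio_data[:int(max_duration * sr)]  # demo output is limited to max_duration seconds

    output = pipe(
        {"sampling_rate": sr, "raw": audio_data},
        chunk_length_s=30,
        return_timestamps="word",
    )
    chunks = output["chunks"]
    # Each chunk looks like {"text": " word", "timestamp": (start, end)},
    # which is the format make_frame() reads via chunk["text"] and chunk["timestamp"].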
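The commit message also mentions optimizing video creation; that code is likewise not visible in the hunks. The usual moviepy pattern for rendering make_frame() into a subtitled clip looks roughly like this; the output filename, codecs, and audio handling are assumptions, not taken from this diff.

    # Sketch only: render the frames and mux in the original audio (details assumed).
    clip = mpy.VideoClip(make_frame, duration=duration)  # calls make_frame(t) once per frame
    clip = clip.set_audio(mpy.AudioFileClip(audio_path).set_duration(duration))
    clip.write_videofile("output.mp4", fps=fps, codec="libx264", audio_codec="aac")

Because VideoClip evaluates make_frame at `fps` frames per second, the draws/last_image cache added in this commit avoids redrawing identical frames, which is where the "optimize making video" speedup comes from.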