Matthijs Hollemans
add a demo song
f539e6d
raw
history blame
4.97 kB
import gradio as gr
import librosa
import numpy as np
import moviepy.editor as mpy
from PIL import Image, ImageDraw, ImageFont
from transformers import pipeline
# Frame rate passed to write_videofile() when the output video is encoded.
fps = 25
# Input audio is truncated to this length. It is also used as the end time of
# a chunk whose end timestamp is None.
max_duration = 60 # seconds

# Layout constants used by make_frame() to position and wrap the text.
# NOTE(review): presumably pixels matching the size of background.png -- confirm.
# video_height is not referenced in the code visible here.
video_width = 640
video_height = 480
margin_left = 20
margin_right = 20
margin_top = 20
line_height = 44

# Loaded once at import time; every frame is drawn on a fresh copy of this image.
background_image = Image.open("background.png")
font = ImageFont.truetype("Lato-Regular.ttf", 40)
# Fill colors: regular words vs. the word that is currently active.
text_color = (255, 200, 200)
highlight_color = (255, 255, 255)

# Checkpoint used for transcription; the alternatives are kept for quick switching.
# checkpoint = "openai/whisper-tiny"
# checkpoint = "openai/whisper-base"
checkpoint = "openai/whisper-small"
pipe = pipeline(model=checkpoint)

# TODO: no longer need to set these manually once the models have been updated on the Hub
# NOTE(review): presumably [layer, head] pairs of the attention heads used to
# derive word-level timestamps, and specific to each checkpoint -- confirm
# against the Transformers Whisper documentation before changing `checkpoint`.
# whisper-base
# pipe.model.config.alignment_heads = [[3, 1], [4, 2], [4, 3], [4, 7], [5, 1], [5, 2], [5, 4], [5, 6]]
# whisper-small
pipe.model.config.alignment_heads = [[5, 3], [5, 9], [8, 0], [8, 4], [8, 7], [8, 8], [9, 0], [9, 7], [9, 9], [10, 5]]

# Transcription chunks of the most recent predict() call; predict() assigns
# this and make_frame() reads it while the video is rendered.
chunks = []
def make_frame(t):
    """Render the subtitle frame for time ``t``.

    Looks up the first entry of the module-level ``chunks`` list whose time
    span contains ``t``, draws its words onto a copy of ``background_image``
    (wrapping to a new line when a word would cross the right margin) and
    marks the active word with ``highlight_color`` plus a thin bar beneath it.
    If no chunk contains ``t`` the plain background is returned.

    Each chunk is expected to be a dict with a ``"timestamp"`` pair and a
    ``"words"`` list whose items carry ``"text"`` and ``"timestamp"`` keys.

    Args:
        t: Time of the requested frame. It is compared directly against the
            timestamps stored in ``chunks`` (presumably seconds, since a
            missing chunk end falls back to ``max_duration``).

    Returns:
        The rendered frame as a NumPy array built from the PIL image.
    """
    # Reading the module-level `chunks` needs no `global` declaration.

    # TODO speed optimization: could cache the last image returned and if the
    # active chunk and active word didn't change, use that last image instead
    # of drawing the exact same thing again

    image = background_image.copy()
    draw = ImageDraw.Draw(image)

    # for debugging: draw frame time
    #draw.text((20, 20), str(t), fill=text_color, font=font)

    space_length = draw.textlength(" ", font)
    x = margin_left
    y = margin_top

    for chunk in chunks:
        chunk_start = chunk["timestamp"][0]
        chunk_end = chunk["timestamp"][1]
        if chunk_end is None:
            chunk_end = max_duration

        if not chunk_start <= t <= chunk_end:
            continue

        words = chunk["words"]

        # Highlight only the LATEST word whose time span contains t. A word
        # whose ending timestamp lies too far in the future (seen with
        # "desires" in the Henry V example) would otherwise stay highlighted
        # together with the words that follow it.
        active_index = None
        for index, entry in enumerate(words):
            word_start = entry["timestamp"][0]
            word_end = entry["timestamp"][1]
            if word_end is None:
                # Same fallback idea as for chunks: an open-ended word lasts
                # until the end of its chunk instead of raising a TypeError.
                word_end = chunk_end
            if word_start <= t <= word_end:
                active_index = index

        for index, entry in enumerate(words):
            word = entry["text"]

            # Measure the word without the trailing space so the bar drawn
            # beneath an active word does not extend under the gap.
            word_length = draw.textlength(word + " ", font) - space_length
            if x + word_length >= video_width - margin_right:
                x = margin_left
                y += line_height

            if index == active_index:
                color = highlight_color
                draw.rectangle([x, y + line_height, x + word_length, y + line_height + 4], fill=color)
            else:
                color = text_color

            draw.text((x, y), word, fill=color, font=font)
            x += word_length + space_length

        # Only the first chunk containing t is drawn.
        break

    return np.array(image)
def predict(audio_path):
    """Transcribe an audio file and render a video with highlighted subtitles.

    The audio is limited to its first ``max_duration`` seconds, transcribed
    with the Whisper pipeline using word-level timestamps, and combined with
    frames produced by ``make_frame`` into an MP4 file.

    Side effects: replaces the module-level ``chunks`` list (read by
    ``make_frame``) and writes a new video file to the temporary directory.
    NOTE(review): ``chunks`` is shared module state, so overlapping calls
    could render each other's subtitles -- confirm how requests are scheduled.

    Args:
        audio_path: Path of the audio file to process.

    Returns:
        Path of the generated MP4 file.

    Raises:
        gr.Error: If no audio file was given or it contains no audio.
    """
    global chunks

    import tempfile

    if not audio_path:
        raise gr.Error("Please upload an audio file first.")

    # Decode directly at the model's sampling rate and stop reading after
    # max_duration seconds: this avoids a lossy intermediate resample to
    # librosa's default rate and avoids decoding audio that is thrown away.
    target_sr = pipe.feature_extractor.sampling_rate
    audio_inputs, _ = librosa.load(audio_path, sr=target_sr, mono=True, duration=max_duration)

    duration = librosa.get_duration(y=audio_inputs, sr=target_sr)
    duration = min(max_duration, duration)
    if duration <= 0:
        raise gr.Error("The audio file does not contain any audio.")

    # Run Whisper to get word-level timestamps.
    output = pipe(audio_inputs, chunk_length_s=30, stride_length_s=[4, 2], return_timestamps="word")
    chunks = output["chunks"]
    #print(chunks)

    # Use a unique output file per call so that overlapping requests cannot
    # overwrite each other's video (a fixed name would be shared by all calls).
    # The file is left on disk on purpose: the caller still has to read it.
    with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as tmp:
        video_path = tmp.name

    # Create the video.
    clip = mpy.VideoClip(make_frame, duration=duration)
    audio_clip = mpy.AudioFileClip(audio_path).set_duration(duration)
    try:
        clip = clip.set_audio(audio_clip)
        clip.write_videofile(video_path, fps=fps, codec="libx264", audio_codec="aac")
    finally:
        # Release the audio reader even if encoding fails.
        audio_clip.close()
        clip.close()

    return video_path
# Texts shown on the demo page: heading, introduction and footer.
title = "Word-level timestamps with Whisper"

# NOTE: the checkpoint name and the 60-second limit below are written out by
# hand; keep them in sync with `checkpoint` and `max_duration`.
description = """
This demo shows Whisper <b>word-level timestamps</b> in action using Hugging Face Transformers. It creates a video showing subtitled audio with the current word highlighted. It can even do music lyrics!
This demo uses the <b>openai/whisper-small</b> checkpoint. Since it's only a demo, the output is limited to the first 60 seconds of audio.
"""

# Credits footer. The markup is kept well-formed: the paragraph is closed
# with </p> and every <li> has a matching </li>.
article = """
<div style='margin:20px auto;'>
<p>Credits:</p>
<ul>
<li>Shakespeare's "Henry V" speech from <a href="https://freesound.org/people/acclivity/sounds/24096/">acclivity</a> (CC BY-NC 4.0 license)</li>
<li>"Here's to the Crazy Ones" speech by Steve Jobs</li>
<li>"Stupid People" comedy routine by Bill Engvall</li>
<li>"BeOS, It's The OS" song by The Cotton Squares</li>
<li>Lato font by Łukasz Dziedzic (licensed under Open Font License)</li>
<li>Whisper model by OpenAI</li>
</ul>
</div>
"""
# Sample recordings offered as ready-made inputs on the demo page.
examples = [
    "examples/steve_jobs_crazy_ones.mp3",
    "examples/henry5.wav",
    "examples/stupid_people.mp3",
    "examples/beos_song.mp3",
]

# The UI has one audio upload (handed to predict() as a file path) and one
# video player for the result.
audio_input = gr.Audio(label="Upload Audio", source="upload", type="filepath")
video_output = gr.Video(label="Output Video")

demo = gr.Interface(
    fn=predict,
    inputs=[audio_input],
    outputs=[video_output],
    title=title,
    description=description,
    article=article,
    examples=examples,
)

# Start serving the demo.
demo.launch()