import gradio as gr
import librosa
import numpy as np
import moviepy.editor as mpy
from PIL import Image, ImageDraw, ImageFont
from transformers import pipeline
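
# Version note (an assumption about the environment, not stated in the
# original file): this script uses the moviepy 1.x API (moviepy.editor,
# set_audio/set_duration) and the Gradio 3.x Audio API (the `source=`
# keyword); both changed in later major releases.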
fps = 25
max_duration = 60  # seconds
video_width = 640
video_height = 480
margin_left = 20
margin_right = 20
margin_top = 20
line_height = 44

background_image = Image.open("background.png")
font = ImageFont.truetype("Lato-Regular.ttf", 40)
text_color = (255, 200, 200)
highlight_color = (255, 255, 255)

# checkpoint = "openai/whisper-tiny"
# checkpoint = "openai/whisper-base"
checkpoint = "openai/whisper-small"

pipe = pipeline(model=checkpoint)

# TODO: no longer need to set these manually once the models have been updated on the Hub
# whisper-base
# pipe.model.config.alignment_heads = [[3, 1], [4, 2], [4, 3], [4, 7], [5, 1], [5, 2], [5, 4], [5, 6]]
# whisper-small
pipe.model.config.alignment_heads = [[5, 3], [5, 9], [8, 0], [8, 4], [8, 7], [8, 8], [9, 0], [9, 7], [9, 9], [10, 5]]
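
# The ASR pipeline with return_timestamps="word" returns a flat list of
# word entries ({"text": ..., "timestamp": (start, end)}), while make_frame
# below expects caption chunks that each carry a "words" list. This helper
# is a minimal sketch of that grouping step; the helper name and the fixed
# group size are assumptions, not part of the original demo.
def group_words_into_chunks(word_chunks, words_per_chunk=12):
    grouped = []
    for i in range(0, len(word_chunks), words_per_chunk):
        words = word_chunks[i:i + words_per_chunk]
        grouped.append({
            "timestamp": (words[0]["timestamp"][0], words[-1]["timestamp"][1]),
            "words": words,
        })
    return grouped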
chunks = []

def make_frame(t):
    global chunks

    # TODO speed optimization: could cache the last image returned and if the
    # active chunk and active word didn't change, use that last image instead
    # of drawing the exact same thing again

    # TODO in the Henry V example, the word "desires" has an ending timestamp
    # that's too far into the future, and so the word stays highlighted.
    # Could fix this by finding the latest word that is active in the chunk
    # and only highlight that one.

    image = background_image.copy()
    draw = ImageDraw.Draw(image)

    # for debugging: draw frame time
    # draw.text((20, 20), str(t), fill=text_color, font=font)

    space_length = draw.textlength(" ", font)
    x = margin_left
    y = margin_top

    for chunk in chunks:
        chunk_start = chunk["timestamp"][0]
        chunk_end = chunk["timestamp"][1]
        if chunk_end is None:
            chunk_end = max_duration

        if chunk_start <= t <= chunk_end:
            words = [word["text"] for word in chunk["words"]]
            word_times = [word["timestamp"] for word in chunk["words"]]

            for word, times in zip(words, word_times):
                word_length = draw.textlength(word + " ", font) - space_length

                # wrap to a new line when the word would overflow the frame
                if x + word_length >= video_width - margin_right:
                    x = margin_left
                    y += line_height

                # Whisper occasionally emits a None end timestamp for the
                # final word; fall back to the chunk's end time so the
                # comparison below doesn't fail.
                word_end = times[1] if times[1] is not None else chunk_end
                if times[0] <= t <= word_end:
                    color = highlight_color
                    # underline the currently spoken word
                    draw.rectangle([x, y + line_height, x + word_length, y + line_height + 4], fill=color)
                else:
                    color = text_color

                draw.text((x, y), word, fill=color, font=font)
                x += word_length + space_length

            break

    return np.array(image)
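
# moviepy calls make_frame once per output frame, passing the frame time t
# in seconds and expecting an RGB NumPy array back.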
def predict(audio_path):
    global chunks

    audio_data, sr = librosa.load(audio_path, mono=True)
    duration = librosa.get_duration(y=audio_data, sr=sr)
    duration = min(max_duration, duration)
    audio_data = audio_data[:int(duration * sr)]

    # Run Whisper to get word-level timestamps.
    audio_inputs = librosa.resample(audio_data, orig_sr=sr, target_sr=pipe.feature_extractor.sampling_rate)
    output = pipe(audio_inputs, chunk_length_s=30, stride_length_s=[4, 2], return_timestamps="word")
    # Group the flat word list into caption chunks for make_frame (see the
    # hedged helper above).
    chunks = group_words_into_chunks(output["chunks"])
    # print(chunks)

    # Create the video.
    clip = mpy.VideoClip(make_frame, duration=duration)
    audio_clip = mpy.AudioFileClip(audio_path).set_duration(duration)
    clip = clip.set_audio(audio_clip)
    clip.write_videofile("my_video.mp4", fps=fps, codec="libx264", audio_codec="aac")
    return "my_video.mp4"
title = "Word-level timestamps with Whisper"

description = """
This demo shows Whisper <b>word-level timestamps</b> in action using Hugging Face Transformers. It creates a video showing subtitled audio with the current word highlighted. It can even do music lyrics!

This demo uses the <b>openai/whisper-small</b> checkpoint. Since it's only a demo, the output is limited to the first 60 seconds of audio.
"""

article = """
<div style='margin:20px auto;'>
<p>Credits:</p>
<ul>
<li>Shakespeare's "Henry V" speech from <a href="https://freesound.org/people/acclivity/sounds/24096/">acclivity</a> (CC BY-NC 4.0 license)</li>
<li>"Here's to the Crazy Ones" speech by Steve Jobs</li>
<li>"Stupid People" comedy routine by Bill Engvall</li>
<li>"BeOS, It's The OS" song by The Cotton Squares</li>
<li>Lato font by Łukasz Dziedzic (licensed under the Open Font License)</li>
<li>Whisper model by OpenAI</li>
</ul>
</div>
"""

examples = [
    "examples/steve_jobs_crazy_ones.mp3",
    "examples/henry5.wav",
    "examples/stupid_people.mp3",
    "examples/beos_song.mp3",
]
gr.Interface(
    fn=predict,
    inputs=[
        gr.Audio(label="Upload Audio", source="upload", type="filepath"),
    ],
    outputs=[
        gr.Video(label="Output Video"),
    ],
    title=title,
    description=description,
    article=article,
    examples=examples,
).launch()