Spaces:

Matthijs
/

whisper_word_timestamps

Runtime error

File size: 9,424 Bytes

import gradio as gr
import librosa
import numpy as np
import moviepy.editor as mpy
import torch

from PIL import Image, ImageDraw, ImageFont
from transformers import pipeline


# Whisper checkpoint to load. To switch models, uncomment one of the
# alternatives here and the matching `alignment_heads` list below.
# checkpoint = "openai/whisper-tiny"
# checkpoint = "openai/whisper-base"
checkpoint = "openai/whisper-small"

# We need to set alignment_heads on the model's generation_config (at least
# until the models have been updated on the hub).
# If you're going to use a different version of whisper, see the following
# for which values to use for alignment_heads:
# https://gist.github.com/hollance/42e32852f24243b748ae6bc1f985b13a

# whisper-tiny
# alignment_heads = [[2, 2], [3, 0], [3, 2], [3, 3], [3, 4], [3, 5]]
# whisper-base
# alignment_heads = [[3, 1], [4, 2], [4, 3], [4, 7], [5, 1], [5, 2], [5, 4], [5, 6]]
# whisper-small
alignment_heads = [[5, 3], [5, 9], [8, 0], [8, 4], [8, 7], [8, 8], [9, 0], [9, 7], [9, 9], [10, 5]]

# Audio longer than this is truncated before transcription and rendering.
max_duration = 60  # seconds
# Frame rate passed to the video writer.
fps = 25
# Text is wrapped so that it stays left of `video_width - margin_right`.
video_width = 640
# NOTE(review): not referenced by the visible code; rendered frames take their
# size from the background image -- confirm the image matches 640x480.
video_height = 480
# Layout of the subtitle text on the frame.
margin_left = 20
margin_right = 20
margin_top = 20
# Vertical advance per text line; the highlight underline is drawn this far
# below the top of the word.
line_height = 44

# Assets loaded once at import time, relative to the working directory.
background_image = Image.open("background.png")
font = ImageFont.truetype("Lato-Regular.ttf", 40)
# RGB fill for words that are not highlighted.
text_color = (255, 200, 200)
# RGB fill for the highlighted word and its underline.
highlight_color = (255, 255, 255)


# Language code -> lowercase language name. The names (plus the aliases added
# in TO_LANGUAGE_CODE) are what the "Language" dropdown offers and what is
# handed to the tokenizer when a language is forced.
LANGUAGES = {
    "en": "english",
    "zh": "chinese",
    "de": "german",
    "es": "spanish",
    "ru": "russian",
    "ko": "korean",
    "fr": "french",
    "ja": "japanese",
    "pt": "portuguese",
    "tr": "turkish",
    "pl": "polish",
    "ca": "catalan",
    "nl": "dutch",
    "ar": "arabic",
    "sv": "swedish",
    "it": "italian",
    "id": "indonesian",
    "hi": "hindi",
    "fi": "finnish",
    "vi": "vietnamese",
    "he": "hebrew",
    "uk": "ukrainian",
    "el": "greek",
    "ms": "malay",
    "cs": "czech",
    "ro": "romanian",
    "da": "danish",
    "hu": "hungarian",
    "ta": "tamil",
    "no": "norwegian",
    "th": "thai",
    "ur": "urdu",
    "hr": "croatian",
    "bg": "bulgarian",
    "lt": "lithuanian",
    "la": "latin",
    "mi": "maori",
    "ml": "malayalam",
    "cy": "welsh",
    "sk": "slovak",
    "te": "telugu",
    "fa": "persian",
    "lv": "latvian",
    "bn": "bengali",
    "sr": "serbian",
    "az": "azerbaijani",
    "sl": "slovenian",
    "kn": "kannada",
    "et": "estonian",
    "mk": "macedonian",
    "br": "breton",
    "eu": "basque",
    "is": "icelandic",
    "hy": "armenian",
    "ne": "nepali",
    "mn": "mongolian",
    "bs": "bosnian",
    "kk": "kazakh",
    "sq": "albanian",
    "sw": "swahili",
    "gl": "galician",
    "mr": "marathi",
    "pa": "punjabi",
    "si": "sinhala",
    "km": "khmer",
    "sn": "shona",
    "yo": "yoruba",
    "so": "somali",
    "af": "afrikaans",
    "oc": "occitan",
    "ka": "georgian",
    "be": "belarusian",
    "tg": "tajik",
    "sd": "sindhi",
    "gu": "gujarati",
    "am": "amharic",
    "yi": "yiddish",
    "lo": "lao",
    "uz": "uzbek",
    "fo": "faroese",
    "ht": "haitian creole",
    "ps": "pashto",
    "tk": "turkmen",
    "nn": "nynorsk",
    "mt": "maltese",
    "sa": "sanskrit",
    "lb": "luxembourgish",
    "my": "myanmar",
    "bo": "tibetan",
    "tl": "tagalog",
    "mg": "malagasy",
    "as": "assamese",
    "tt": "tatar",
    "haw": "hawaiian",
    "ln": "lingala",
    "ha": "hausa",
    "ba": "bashkir",
    "jw": "javanese",
    "su": "sundanese",
}

# Reverse lookup: language name -> language code. Starts from the inverted
# LANGUAGES table and then adds alternative names that resolve to the same
# code as their canonical language.
TO_LANGUAGE_CODE = {name: code for code, name in LANGUAGES.items()}
TO_LANGUAGE_CODE.update(
    burmese="my",
    valencian="ca",
    flemish="nl",
    haitian="ht",
    letzeburgesch="lb",
    pushto="ps",
    panjabi="pa",
    moldavian="ro",
    moldovan="ro",
    sinhalese="si",
    castilian="es",
)


# Build the speech-recognition pipeline once at import time. With a CUDA
# device available the model is loaded explicitly, moved to the GPU and run in
# half precision; otherwise the stock `pipeline` factory is used.
use_gpu = torch.cuda.is_available() and torch.cuda.device_count() > 0

if not use_gpu:
    pipe = pipeline(model=checkpoint)
else:
    from transformers import (
        AutomaticSpeechRecognitionPipeline,
        WhisperForConditionalGeneration,
        WhisperProcessor,
    )

    model = WhisperForConditionalGeneration.from_pretrained(checkpoint)
    model = model.to("cuda").half()
    processor = WhisperProcessor.from_pretrained(checkpoint)
    pipe = AutomaticSpeechRecognitionPipeline(
        model=model,
        tokenizer=processor.tokenizer,
        feature_extractor=processor.feature_extractor,
        batch_size=8,
        torch_dtype=torch.float16,
        device="cuda:0",
    )

# Word-level timestamps need the alignment heads on the generation config.
pipe.model.generation_config.alignment_heads = alignment_heads

# Renderer state shared between `predict` and `make_frame`.
# `chunks` holds the word-level chunks of the current transcription.
chunks = []

# Index of the first chunk on the page currently being displayed.
start_chunk = 0

# Layout and rendered frame from the previous `make_frame` call, kept so an
# unchanged frame can be reused instead of redrawn.
last_draws = last_image = None


def make_frame(t):
    """Render the subtitle frame for time `t`.

    Lays out every word of the module-level `chunks` list that has started by
    time `t` (beginning at `start_chunk`, the first word of the current page)
    on top of a copy of the background image, and highlights the word that is
    currently being spoken.

    Only the latest word whose time span contains `t` is highlighted. Some
    words come back with an end timestamp that lies too far in the future;
    without this rule such a word would stay highlighted together with the
    words that follow it.

    The function keeps state in module globals: `start_chunk` only ever moves
    forward here, so frames are expected to be requested in increasing time
    order until `predict` resets the state. When the layout is identical to
    the previous call, the previously rendered frame is returned as-is.

    Args:
        t: Frame time, compared against the chunk timestamps (seconds).

    Returns:
        The frame as a NumPy array created from the rendered PIL image.
    """
    global chunks, start_chunk, last_draws, last_image

    image = background_image.copy()
    draw = ImageDraw.Draw(image)

    # for debugging: draw frame time
    #draw.text((20, 20), str(t), fill=text_color, font=font)

    space_length = draw.textlength(" ", font)
    x = margin_left
    y = margin_top

    # Create a list of drawing commands: [x, y, word, word_length, highlight].
    draws = []
    active_index = None  # position in `draws` of the latest active word
    for i in range(start_chunk, len(chunks)):
        chunk = chunks[i]
        chunk_start = chunk["timestamp"][0]
        chunk_end = chunk["timestamp"][1]

        # Words that have not started yet are not shown.
        if chunk_start > t:
            break

        # A missing end timestamp is treated as "until the end of the video".
        if chunk_end is None:
            chunk_end = max_duration

        word = chunk["text"]
        word_length = draw.textlength(word + " ", font) - space_length

        # Wrap to the next line when the word would cross the right margin.
        if x + word_length >= video_width - margin_right:
            x = margin_left
            y += line_height

            # restart page when end is reached; the next frame starts
            # laying out from this word on a fresh page
            if y >= margin_top + line_height * 7:
                start_chunk = i
                break

        # `chunk_start <= t` already holds here, so the word is active
        # whenever `t` lies before its end. Later words overwrite earlier
        # ones, leaving the index of the latest active word.
        if t < chunk_end:
            active_index = len(draws)

        draws.append([x, y, word, word_length, False])

        x += word_length + space_length

    if active_index is not None:
        draws[active_index][4] = True

    # If the drawing commands didn't change, then reuse the last image,
    # otherwise draw a new image
    if draws != last_draws:
        for word_x, word_y, word, word_length, highlight in draws:
            if highlight:
                color = highlight_color
                # Underline the highlighted word.
                draw.rectangle(
                    [
                        word_x,
                        word_y + line_height,
                        word_x + word_length,
                        word_y + line_height + 4,
                    ],
                    fill=color,
                )
            else:
                color = text_color

            draw.text((word_x, word_y), word, fill=color, font=font)

        last_image = np.array(image)
        last_draws = draws

    return last_image


def predict(audio_path, language=None):
    """Transcribe an audio file and render a word-highlighting subtitle video.

    The audio is loaded as mono, truncated to `max_duration` seconds,
    transcribed with word-level timestamps, and the resulting words are
    rendered by `make_frame` into a video that carries the original audio.

    Args:
        audio_path: Path of the audio file to process.
        language: Language name used to force the decoder prompt. When it is
            `None` or empty, the model configuration is left as it is.

    Returns:
        The path of the written video file, "my_video.mp4".
    """
    global chunks, start_chunk, last_draws, last_image

    # Reset the renderer state left over from a previous request.
    start_chunk = 0
    last_draws = None
    last_image = None

    audio_data, sr = librosa.load(audio_path, mono=True)
    duration = librosa.get_duration(y=audio_data, sr=sr)
    duration = min(max_duration, duration)
    audio_data = audio_data[:int(duration * sr)]

    audio_inputs = librosa.resample(
        audio_data,
        orig_sr=sr,
        target_sr=pipe.feature_extractor.sampling_rate,
    )

    # The forced decoder prompt lives on the shared model config. Remember the
    # previous value and put it back afterwards, so that a language chosen for
    # this request does not silently apply to later requests without one.
    model_config = pipe.model.config
    force_language = bool(language)
    if force_language:
        previous_ids = getattr(model_config, "forced_decoder_ids", None)
        model_config.forced_decoder_ids = (
            pipe.tokenizer.get_decoder_prompt_ids(
                language=language,
                task="transcribe"
            )
        )

    # Run Whisper to get word-level timestamps.
    try:
        output = pipe(
            audio_inputs,
            chunk_length_s=30,
            stride_length_s=[4, 2],
            return_timestamps="word",
        )
    finally:
        if force_language:
            model_config.forced_decoder_ids = previous_ids
    chunks = output["chunks"]
    #print(chunks)

    # Create the video.
    clip = mpy.VideoClip(make_frame, duration=duration)
    audio_clip = mpy.AudioFileClip(audio_path).set_duration(duration)
    try:
        clip = clip.set_audio(audio_clip)
        clip.write_videofile("my_video.mp4", fps=fps, codec="libx264", audio_codec="aac")
    finally:
        # Release the audio reader even when encoding fails.
        audio_clip.close()
    return "my_video.mp4"


# Heading shown at the top of the Gradio interface.
title = "Word-level timestamps with Whisper"

# Introductory text shown under the title.
description = """
This demo shows Whisper <b>word-level timestamps</b> in action using Hugging Face Transformers. It creates a video showing subtitled audio with the current word highlighted. It can even do music lyrics!

This demo uses the <b>openai/whisper-small</b> checkpoint.

Since it's only a demo, the output is limited to the first 60 seconds of audio.
To use this on longer audio, <a href="https://huggingface.co/spaces/Matthijs/whisper_word_timestamps/settings?duplicate=true">duplicate the space</a>
and in <b>app.py</b> change the value of `max_duration`.
"""

# Footer with credits for the example material, font and model.
article = """
<div style='margin:20px auto;'>

<p>Credits:</p>

<ul>
<li>Shakespeare's "Henry V" speech from <a href="https://freesound.org/people/acclivity/sounds/24096/">acclivity</a> (CC BY-NC 4.0 license)</li>
<li>"Here's to the Crazy Ones" speech by Steve Jobs</li>
<li>"Stupid People" comedy routine by Bill Engvall</li>
<li>"BeOS, It's The OS" song by The Cotton Squares</li>
<li>Lato font by Łukasz Dziedzic (licensed under Open Font License)</li>
<li>Whisper model by OpenAI</li>
</ul>

</div>
"""

# Example inputs offered in the interface: [audio file path, language name],
# in the same order as the interface inputs.
examples = [
    ["examples/steve_jobs_crazy_ones.mp3", "english"],
    ["examples/henry5.wav", "english"],
    ["examples/stupid_people.mp3", "english"],
    ["examples/beos_song.mp3", "english"],
    ["examples/johan_cruijff.mp3", "dutch"],
]

# Assemble the web UI: an uploaded audio file and a language name go in, the
# rendered video comes out. The dropdown lists every known language name and
# alias in alphabetical order.
language_choices = sorted(TO_LANGUAGE_CODE)

audio_input = gr.Audio(label="Upload Audio", source="upload", type="filepath")
language_input = gr.Dropdown(label="Language", choices=language_choices)
video_output = gr.Video(label="Output Video")

demo = gr.Interface(
    fn=predict,
    inputs=[audio_input, language_input],
    outputs=[video_output],
    title=title,
    description=description,
    article=article,
    examples=examples,
)
demo.launch()