import gradio as gr
import librosa
import numpy as np
import moviepy.editor as mpy
import torch

from PIL import Image, ImageDraw, ImageFont
from transformers import pipeline
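
# Note: this code assumes moviepy 1.x; the moviepy.editor module was removed
# in moviepy 2.0.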

# checkpoint = "openai/whisper-tiny"
# checkpoint = "openai/whisper-base"
checkpoint = "openai/whisper-small"

# We need to set alignment_heads on the model's generation_config (at least
# until the models have been updated on the hub).
# If you're going to use a different version of Whisper, see the following
# for which values to use for alignment_heads:
# https://gist.github.com/hollance/42e32852f24243b748ae6bc1f985b13a

# whisper-tiny
# alignment_heads = [[2, 2], [3, 0], [3, 2], [3, 3], [3, 4], [3, 5]]

# whisper-base
# alignment_heads = [[3, 1], [4, 2], [4, 3], [4, 7], [5, 1], [5, 2], [5, 4], [5, 6]]

# whisper-small
alignment_heads = [[5, 3], [5, 9], [8, 0], [8, 4], [8, 7], [8, 8], [9, 0], [9, 7], [9, 9], [10, 5]]
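
# These are the [layer, head] indices of the cross-attention heads that
# Whisper uses to compute word-level timestamps (by applying dynamic time
# warping to their attention weights); the right values differ per checkpoint.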

max_duration = 60  # seconds
fps = 25

video_width = 640
video_height = 480
margin_left = 20
margin_right = 20
margin_top = 20
line_height = 44

background_image = Image.open("background.png")
font = ImageFont.truetype("Lato-Regular.ttf", 40)
text_color = (255, 200, 200)
highlight_color = (255, 255, 255)
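
# With this layout, up to 7 lines of text fit on screen before make_frame()
# starts a new "page" (see the line_height * 7 check below).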

LANGUAGES = {
    "en": "english",
    "zh": "chinese",
    "de": "german",
    "es": "spanish",
    "ru": "russian",
    "ko": "korean",
    "fr": "french",
    "ja": "japanese",
    "pt": "portuguese",
    "tr": "turkish",
    "pl": "polish",
    "ca": "catalan",
    "nl": "dutch",
    "ar": "arabic",
    "sv": "swedish",
    "it": "italian",
    "id": "indonesian",
    "hi": "hindi",
    "fi": "finnish",
    "vi": "vietnamese",
    "he": "hebrew",
    "uk": "ukrainian",
    "el": "greek",
    "ms": "malay",
    "cs": "czech",
    "ro": "romanian",
    "da": "danish",
    "hu": "hungarian",
    "ta": "tamil",
    "no": "norwegian",
    "th": "thai",
    "ur": "urdu",
    "hr": "croatian",
    "bg": "bulgarian",
    "lt": "lithuanian",
    "la": "latin",
    "mi": "maori",
    "ml": "malayalam",
    "cy": "welsh",
    "sk": "slovak",
    "te": "telugu",
    "fa": "persian",
    "lv": "latvian",
    "bn": "bengali",
    "sr": "serbian",
    "az": "azerbaijani",
    "sl": "slovenian",
    "kn": "kannada",
    "et": "estonian",
    "mk": "macedonian",
    "br": "breton",
    "eu": "basque",
    "is": "icelandic",
    "hy": "armenian",
    "ne": "nepali",
    "mn": "mongolian",
    "bs": "bosnian",
    "kk": "kazakh",
    "sq": "albanian",
    "sw": "swahili",
    "gl": "galician",
    "mr": "marathi",
    "pa": "punjabi",
    "si": "sinhala",
    "km": "khmer",
    "sn": "shona",
    "yo": "yoruba",
    "so": "somali",
    "af": "afrikaans",
    "oc": "occitan",
    "ka": "georgian",
    "be": "belarusian",
    "tg": "tajik",
    "sd": "sindhi",
    "gu": "gujarati",
    "am": "amharic",
    "yi": "yiddish",
    "lo": "lao",
    "uz": "uzbek",
    "fo": "faroese",
    "ht": "haitian creole",
    "ps": "pashto",
    "tk": "turkmen",
    "nn": "nynorsk",
    "mt": "maltese",
    "sa": "sanskrit",
    "lb": "luxembourgish",
    "my": "myanmar",
    "bo": "tibetan",
    "tl": "tagalog",
    "mg": "malagasy",
    "as": "assamese",
    "tt": "tatar",
    "haw": "hawaiian",
    "ln": "lingala",
    "ha": "hausa",
    "ba": "bashkir",
    "jw": "javanese",
    "su": "sundanese",
}

# Language code lookup by name, with a few language aliases.
TO_LANGUAGE_CODE = {
    **{language: code for code, language in LANGUAGES.items()},
    "burmese": "my",
    "valencian": "ca",
    "flemish": "nl",
    "haitian": "ht",
    "letzeburgesch": "lb",
    "pushto": "ps",
    "panjabi": "pa",
    "moldavian": "ro",
    "moldovan": "ro",
    "sinhalese": "si",
    "castilian": "es",
}
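
# The Gradio dropdown passes the selected language *name* (e.g. "english") to
# predict(); the Whisper tokenizer accepts either full names or ISO codes.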

if torch.cuda.is_available() and torch.cuda.device_count() > 0:
    from transformers import (
        AutomaticSpeechRecognitionPipeline,
        WhisperForConditionalGeneration,
        WhisperProcessor,
    )
    model = WhisperForConditionalGeneration.from_pretrained(checkpoint).to("cuda").half()
    processor = WhisperProcessor.from_pretrained(checkpoint)
    pipe = AutomaticSpeechRecognitionPipeline(
        model=model,
        tokenizer=processor.tokenizer,
        feature_extractor=processor.feature_extractor,
        batch_size=8,
        torch_dtype=torch.float16,
        device="cuda:0",
    )
else:
    pipe = pipeline(model=checkpoint)

pipe.model.generation_config.alignment_heads = alignment_heads
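
# Setting alignment_heads on the generation config is what allows the pipeline
# to return word-level timestamps (return_timestamps="word" in predict()).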

chunks = []
start_chunk = 0
last_draws = None
last_image = None
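
# The variables above are mutable state shared between predict() and
# make_frame(); predict() resets them on every call, so this app is not safe
# for concurrent requests.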

def make_frame(t):
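    """Render the video frame at time t (in seconds) as an RGB numpy array.

    MoviePy calls this once per output frame; see mpy.VideoClip in predict().
    """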
    global chunks, start_chunk, last_draws, last_image

    # TODO: in the Henry V example, the word "desires" has an ending timestamp
    # that's too far into the future, and so the word stays highlighted.
    # Could fix this by finding the latest word that is active in the chunk
    # and only highlighting that one.

    image = background_image.copy()
    draw = ImageDraw.Draw(image)

    # For debugging: draw the frame time.
    # draw.text((20, 20), str(t), fill=text_color, font=font)

    space_length = draw.textlength(" ", font)
    x = margin_left
    y = margin_top

    # Create a list of drawing commands.
    draws = []
    for i in range(start_chunk, len(chunks)):
        chunk = chunks[i]
        chunk_start = chunk["timestamp"][0]
        chunk_end = chunk["timestamp"][1]
        if chunk_start > t: break
        if chunk_end is None: chunk_end = max_duration

        word = chunk["text"]
        word_length = draw.textlength(word + " ", font) - space_length

        if x + word_length >= video_width - margin_right:
            x = margin_left
            y += line_height

            # Restart the page when the end is reached.
            if y >= margin_top + line_height * 7:
                start_chunk = i
                break

        highlight = (chunk_start <= t < chunk_end)
        draws.append([x, y, word, word_length, highlight])

        x += word_length + space_length

    # If the drawing commands didn't change, reuse the last image;
    # otherwise draw a new image.
    if draws != last_draws:
        for x, y, word, word_length, highlight in draws:
            if highlight:
                color = highlight_color
                draw.rectangle([x, y + line_height, x + word_length, y + line_height + 4], fill=color)
            else:
                color = text_color

            draw.text((x, y), word, fill=color, font=font)

        last_image = np.array(image)
        last_draws = draws

    return last_image

def predict(audio_path, language=None):
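    """Transcribe the audio with word-level timestamps and render a video of
    the transcript with the currently spoken word highlighted.

    Returns the path of the generated video file, which Gradio displays.
    """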
    global chunks, start_chunk, last_draws, last_image

    start_chunk = 0
    last_draws = None
    last_image = None

    audio_data, sr = librosa.load(audio_path, mono=True)
    duration = librosa.get_duration(y=audio_data, sr=sr)
    duration = min(max_duration, duration)
    audio_data = audio_data[:int(duration * sr)]

    if language is not None:
        pipe.model.config.forced_decoder_ids = (
            pipe.tokenizer.get_decoder_prompt_ids(
                language=language,
                task="transcribe"
            )
        )

    # Run Whisper to get word-level timestamps.
    audio_inputs = librosa.resample(audio_data, orig_sr=sr, target_sr=pipe.feature_extractor.sampling_rate)
    output = pipe(audio_inputs, chunk_length_s=30, stride_length_s=[4, 2], return_timestamps="word")
    chunks = output["chunks"]
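    # Each chunk is a dict like {"text": ..., "timestamp": (start, end)} with
    # times in seconds; the end timestamp may be None, which make_frame()
    # replaces with max_duration.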
    # print(chunks)

    # Create the video.
    clip = mpy.VideoClip(make_frame, duration=duration)
    audio_clip = mpy.AudioFileClip(audio_path).set_duration(duration)
    clip = clip.set_audio(audio_clip)
    clip.write_videofile("my_video.mp4", fps=fps, codec="libx264", audio_codec="aac")
    return "my_video.mp4"

title = "Word-level timestamps with Whisper"

description = """
This demo shows Whisper <b>word-level timestamps</b> in action using Hugging Face Transformers. It creates a video showing subtitled audio with the current word highlighted. It can even do music lyrics!

This demo uses the <b>openai/whisper-small</b> checkpoint.

Since it's only a demo, the output is limited to the first 60 seconds of audio.
To use this on longer audio, <a href="https://huggingface.co/spaces/Matthijs/whisper_word_timestamps/settings?duplicate=true">duplicate the space</a>
and in <b>app.py</b> change the value of `max_duration`.
"""

article = """
<div style='margin:20px auto;'>
<p>Credits:</p>
<ul>
<li>Shakespeare's "Henry V" speech from <a href="https://freesound.org/people/acclivity/sounds/24096/">acclivity</a> (CC BY-NC 4.0 license)</li>
<li>"Here's to the Crazy Ones" speech by Steve Jobs</li>
<li>"Stupid People" comedy routine by Bill Engvall</li>
<li>"BeOS, It's The OS" song by The Cotton Squares</li>
<li>Lato font by Łukasz Dziedzic (licensed under the Open Font License)</li>
<li>Whisper model by OpenAI</li>
</ul>
</div>
"""

examples = [
    ["examples/steve_jobs_crazy_ones.mp3", "english"],
    ["examples/henry5.wav", "english"],
    ["examples/stupid_people.mp3", "english"],
    ["examples/beos_song.mp3", "english"],
    ["examples/johan_cruijff.mp3", "dutch"],
]

gr.Interface(
    fn=predict,
    inputs=[
        gr.Audio(label="Upload Audio", source="upload", type="filepath"),
        gr.Dropdown(label="Language", choices=sorted(list(TO_LANGUAGE_CODE.keys()))),
    ],
    outputs=[
        gr.Video(label="Output Video"),
    ],
    title=title,
    description=description,
    article=article,
    examples=examples,
).launch()