import gradio as gr
import librosa
import numpy as np
import moviepy.editor as mpy
import torch

from PIL import Image, ImageDraw, ImageFont
from transformers import pipeline

# checkpoint = "openai/whisper-tiny"
# checkpoint = "openai/whisper-base"
checkpoint = "openai/whisper-small"

# We need to set alignment_heads on the model's generation_config (at least
# until the models have been updated on the hub).
# If you're going to use a different version of whisper, see the following
# for which values to use for alignment_heads:
# https://gist.github.com/hollance/42e32852f24243b748ae6bc1f985b13a

# whisper-tiny
# alignment_heads = [[2, 2], [3, 0], [3, 2], [3, 3], [3, 4], [3, 5]]

# whisper-base
# alignment_heads = [[3, 1], [4, 2], [4, 3], [4, 7], [5, 1], [5, 2], [5, 4], [5, 6]]

# whisper-small
alignment_heads = [[5, 3], [5, 9], [8, 0], [8, 4], [8, 7], [8, 8], [9, 0], [9, 7], [9, 9], [10, 5]]

max_duration = 60  # seconds
fps = 25

video_width = 640
video_height = 480
margin_left = 20
margin_right = 20
margin_top = 20
line_height = 44

background_image = Image.open("background.png")
font = ImageFont.truetype("Lato-Regular.ttf", 40)

text_color = (255, 200, 200)
highlight_color = (255, 255, 255)

LANGUAGES = {
    "en": "english", "zh": "chinese", "de": "german", "es": "spanish", "ru": "russian",
    "ko": "korean", "fr": "french", "ja": "japanese", "pt": "portuguese", "tr": "turkish",
    "pl": "polish", "ca": "catalan", "nl": "dutch", "ar": "arabic", "sv": "swedish",
    "it": "italian", "id": "indonesian", "hi": "hindi", "fi": "finnish", "vi": "vietnamese",
    "he": "hebrew", "uk": "ukrainian", "el": "greek", "ms": "malay", "cs": "czech",
    "ro": "romanian", "da": "danish", "hu": "hungarian", "ta": "tamil", "no": "norwegian",
    "th": "thai", "ur": "urdu", "hr": "croatian", "bg": "bulgarian", "lt": "lithuanian",
    "la": "latin", "mi": "maori", "ml": "malayalam", "cy": "welsh", "sk": "slovak",
    "te": "telugu", "fa": "persian", "lv": "latvian", "bn": "bengali", "sr": "serbian",
    "az": "azerbaijani", "sl": "slovenian", "kn": "kannada", "et": "estonian", "mk": "macedonian",
    "br": "breton", "eu": "basque", "is": "icelandic", "hy": "armenian", "ne": "nepali",
    "mn": "mongolian", "bs": "bosnian", "kk": "kazakh", "sq": "albanian", "sw": "swahili",
    "gl": "galician", "mr": "marathi", "pa": "punjabi", "si": "sinhala", "km": "khmer",
    "sn": "shona", "yo": "yoruba", "so": "somali", "af": "afrikaans", "oc": "occitan",
    "ka": "georgian", "be": "belarusian", "tg": "tajik", "sd": "sindhi", "gu": "gujarati",
    "am": "amharic", "yi": "yiddish", "lo": "lao", "uz": "uzbek", "fo": "faroese",
    "ht": "haitian creole", "ps": "pashto", "tk": "turkmen", "nn": "nynorsk", "mt": "maltese",
    "sa": "sanskrit", "lb": "luxembourgish", "my": "myanmar", "bo": "tibetan", "tl": "tagalog",
    "mg": "malagasy", "as": "assamese", "tt": "tatar", "haw": "hawaiian", "ln": "lingala",
    "ha": "hausa", "ba": "bashkir", "jw": "javanese", "su": "sundanese",
}

# language code lookup by name, with a few language aliases
TO_LANGUAGE_CODE = {
    **{language: code for code, language in LANGUAGES.items()},
    "burmese": "my",
    "valencian": "ca",
    "flemish": "nl",
    "haitian": "ht",
    "letzeburgesch": "lb",
    "pushto": "ps",
    "panjabi": "pa",
    "moldavian": "ro",
    "moldovan": "ro",
    "sinhalese": "si",
    "castilian": "es",
}

if torch.cuda.is_available() and torch.cuda.device_count() > 0:
    from transformers import (
        AutomaticSpeechRecognitionPipeline,
        WhisperForConditionalGeneration,
        WhisperProcessor,
    )
    model = WhisperForConditionalGeneration.from_pretrained(checkpoint).to("cuda").half()
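    # Descriptive note: the processor bundles the tokenizer and feature extractor
    # used below; the pipeline is constructed by hand (rather than via pipeline(...))
    # so it reuses the float16 model loaded above, and batch_size=8 lets it batch
    # several audio chunks per forward pass on the GPU.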
    processor = WhisperProcessor.from_pretrained(checkpoint)
    pipe = AutomaticSpeechRecognitionPipeline(
        model=model,
        tokenizer=processor.tokenizer,
        feature_extractor=processor.feature_extractor,
        batch_size=8,
        torch_dtype=torch.float16,
        device="cuda:0",
    )
else:
    pipe = pipeline(model=checkpoint)

pipe.model.generation_config.alignment_heads = alignment_heads

chunks = []

start_chunk = 0
last_draws = None
last_image = None


def make_frame(t):
    global chunks, start_chunk, last_draws, last_image

    # TODO in the Henry V example, the word "desires" has an ending timestamp
    # that's too far into the future, and so the word stays highlighted.
    # Could fix this by finding the latest word that is active in the chunk
    # and only highlight that one.

    image = background_image.copy()
    draw = ImageDraw.Draw(image)

    # for debugging: draw frame time
    #draw.text((20, 20), str(t), fill=text_color, font=font)

    space_length = draw.textlength(" ", font)
    x = margin_left
    y = margin_top

    # Create a list of drawing commands
    draws = []
    for i in range(start_chunk, len(chunks)):
        chunk = chunks[i]
        chunk_start = chunk["timestamp"][0]
        chunk_end = chunk["timestamp"][1]
        if chunk_start > t: break
        if chunk_end is None: chunk_end = max_duration

        word = chunk["text"]
        word_length = draw.textlength(word + " ", font) - space_length

        if x + word_length >= video_width - margin_right:
            x = margin_left
            y += line_height

            # restart page when end is reached
            if y >= margin_top + line_height * 7:
                start_chunk = i
                break

        highlight = (chunk_start <= t < chunk_end)
        draws.append([x, y, word, word_length, highlight])

        x += word_length + space_length

    # If the drawing commands didn't change, then reuse the last image,
    # otherwise draw a new image
    if draws != last_draws:
        for x, y, word, word_length, highlight in draws:
            if highlight:
                color = highlight_color
                draw.rectangle([x, y + line_height, x + word_length, y + line_height + 4], fill=color)
            else:
                color = text_color

            draw.text((x, y), word, fill=color, font=font)

        last_image = np.array(image)
        last_draws = draws

    return last_image


def predict(audio_path, language=None):
    global chunks, start_chunk, last_draws, last_image
    start_chunk = 0
    last_draws = None
    last_image = None

    audio_data, sr = librosa.load(audio_path, mono=True)
    duration = librosa.get_duration(y=audio_data, sr=sr)
    duration = min(max_duration, duration)
    audio_data = audio_data[:int(duration * sr)]

    if language is not None:
        pipe.model.config.forced_decoder_ids = (
            pipe.tokenizer.get_decoder_prompt_ids(language=language, task="transcribe")
        )

    # Run Whisper to get word-level timestamps.
    audio_inputs = librosa.resample(audio_data, orig_sr=sr, target_sr=pipe.feature_extractor.sampling_rate)
    output = pipe(audio_inputs, chunk_length_s=30, stride_length_s=[4, 2], return_timestamps="word")
    chunks = output["chunks"]
    #print(chunks)

    # Create the video.
    clip = mpy.VideoClip(make_frame, duration=duration)
    audio_clip = mpy.AudioFileClip(audio_path).set_duration(duration)
    clip = clip.set_audio(audio_clip)
    clip.write_videofile("my_video.mp4", fps=fps, codec="libx264", audio_codec="aac")
    return "my_video.mp4"


title = "Word-level timestamps with Whisper"

description = """
This demo shows Whisper word-level timestamps in action using Hugging Face Transformers.
It creates a video showing subtitled audio with the current word highlighted. It can even do music lyrics!

This demo uses the openai/whisper-small checkpoint.

Since it's only a demo, the output is limited to the first 60 seconds of audio.
To use this on longer audio, duplicate the space and in app.py change the value of `max_duration`.
""" article = """
Credits: