import gradio as gr
import numpy as np
import torch
from transformers import pipeline

device = "cuda:0" if torch.cuda.is_available() else "cpu"

# load speech translation checkpoint
asr_pipe = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-base",
    device=device,
    chunk_length_s=30,
    use_fast=True,
)

# load text translation checkpoint
translation_pipe = pipeline(
    "translation",
    model="facebook/nllb-200-distilled-600M",
    use_fast=True,
    device=device,
)

# load text-to-speech checkpoint and speaker embeddings
tts_pipe = pipeline(
    "text-to-speech",
    model="JackismyShephard/speecht5_tts-finetuned-nst-da",
    use_fast=True,
    device=device,
    revision="5af228df418092b681cf31c31e413bdd2b5f9c8c",
)

speaker_embedding_path = "female_23_vestjylland.npy"
speaker_embedding = np.load(speaker_embedding_path)
# SpeechT5 expects a batched speaker embedding on the same device as the model
speaker_embedding_tensor = torch.tensor(speaker_embedding).unsqueeze(0).to(device)

# scale factor for converting float audio in [-1, 1] to 16-bit PCM
target_dtype = np.int16
max_range = np.iinfo(target_dtype).max

# characters the Danish SpeechT5 checkpoint cannot handle, mapped to
# pronounceable replacements
replacements = [
    ("&", "og"),
    ("\r", " "),
    ("´", ""),
    ("\\", ""),
    ("¨", " "),
    ("Å", "AA"),
    ("Æ", "AE"),
    ("É", "E"),
    ("Ö", "OE"),
    ("Ø", "OE"),
    ("á", "a"),
    ("ä", "ae"),
    ("å", "aa"),
    ("è", "e"),
    ("î", "i"),
    ("ô", "oe"),
    ("ö", "oe"),
    ("ø", "oe"),
    ("ü", "y"),
]


def replace_danish_letters(text):
    for src, dst in replacements:
        text = text.replace(src, dst)
    return text


def translate(audio):
    # Whisper's built-in "translate" task maps speech in any source language
    # to English text
    outputs = asr_pipe(
        audio,
        batch_size=8,
        generate_kwargs={"task": "translate"},
    )
    # NLLB then translates the English transcription into Danish
    translated_text = translation_pipe(
        outputs["text"],
        src_lang="eng_Latn",
        tgt_lang="dan_Latn",
    )[0]["translation_text"]
    return translated_text


def synthesise(text):
    # return empty 16 kHz audio rather than crashing on empty input
    if len(text.strip()) == 0:
        return 16000, np.zeros(0)
    text = replace_danish_letters(text)
    forward_params = {"speaker_embeddings": speaker_embedding_tensor}
    speech = tts_pipe(text, forward_params=forward_params)
    sr, audio = speech["sampling_rate"], speech["audio"]
    # convert the float waveform to 16-bit PCM for Gradio's numpy audio output
    audio = (audio * max_range).astype(target_dtype)
    return sr, audio


def speech_to_speech_translation(audio):
    translated_text = translate(audio)
    return synthesise(translated_text)


title = "Speech to Danish Speech Translation"

description = """
Demo for cascaded speech-to-speech translation (STST), mapping from source
speech in any language to target speech in Danish. The demo uses:

1. OpenAI's [Whisper Base](https://huggingface.co/openai/whisper-base) model for speech to English text translation
2. Facebook's [NLLB](https://huggingface.co/facebook/nllb-200-distilled-600M) model for English to Danish text translation
3. JackismyShephard's [speecht5_tts-finetuned-nst-da](https://huggingface.co/JackismyShephard/speecht5_tts-finetuned-nst-da) model for Danish speech synthesis

![Cascaded STST](https://huggingface.co/datasets/huggingface-course/audio-course-images/resolve/main/s2st_cascaded.png "Diagram of cascaded speech to speech translation")
"""

demo = gr.Interface(
    fn=speech_to_speech_translation,
    inputs=gr.Audio(label="Input Speech", type="filepath"),
    outputs=gr.Audio(label="Translated Speech", type="numpy"),
    title=title,
    description=description,
    examples=[["./example.wav"]],
    cache_examples=True,
    allow_flagging="never",
)

demo.launch()
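
# -----------------------------------------------------------------------------
# Optional smoke test: run this instead of `demo.launch()` above (launch()
# blocks), using the bundled "./example.wav":
#
#     sr, audio = speech_to_speech_translation("./example.wav")
#     print(f"Synthesised {len(audio) / sr:.2f}s of Danish speech at {sr} Hz")
#
# The speaker embedding loaded above ("female_23_vestjylland.npy") is a
# 512-dimensional vector, the shape SpeechT5 expects; it was presumably
# extracted with an x-vector encoder. A minimal sketch of how such a file
# could be produced with SpeechBrain's spkrec-xvect-voxceleb model, assuming
# a 16 kHz mono recording at the hypothetical path "my_voice.wav" (`torch`
# and `np` are already imported at the top of this file):
#
#     import torchaudio
#     from speechbrain.pretrained import EncoderClassifier
#
#     encoder = EncoderClassifier.from_hparams(
#         source="speechbrain/spkrec-xvect-voxceleb",
#         savedir="pretrained_models/spkrec-xvect-voxceleb",
#     )
#     signal, _ = torchaudio.load("my_voice.wav")
#     embedding = encoder.encode_batch(signal)  # shape (1, 1, 512)
#     embedding = torch.nn.functional.normalize(embedding, dim=2)
#     np.save("my_voice.npy", embedding.squeeze().cpu().numpy())
# -----------------------------------------------------------------------------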