Spaces:
Build error
Build error
import gradio as gr | |
import numpy as np | |
import torch | |
from transformers import pipeline | |
# Use the first CUDA device when torch reports one, otherwise the CPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"

# Stage 1: speech recognition with Whisper, processed in 30-second chunks.
asr_pipe = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-base",
    chunk_length_s=30,
    use_fast=True,
    device=device,
)

# Stage 2: text-to-text translation with NLLB-200.
translation_pipe = pipeline(
    "translation",
    model="facebook/nllb-200-distilled-600M",
    device=device,
    use_fast=True,
)

# Stage 3: text-to-speech, pinned to a fixed model revision.
tts_pipe = pipeline(
    "text-to-speech",
    model="JackismyShephard/speecht5_tts-finetuned-nst-da",
    revision="5af228df418092b681cf31c31e413bdd2b5f9c8c",
    device=device,
    use_fast=True,
)

# Speaker embedding read from disk and given a leading axis of size one.
speaker_embedding_path = "female_23_vestjylland.npy"
speaker_embedding = np.load(speaker_embedding_path)
speaker_embedding_tensor = torch.tensor(speaker_embedding).unsqueeze(0)

# Integer sample type for synthesised audio and its largest representable value.
target_dtype = np.int16
max_range = np.iinfo(target_dtype).max
def translate(audio):
    """Convert input speech into Danish text.

    The audio is first passed to ``asr_pipe`` with the ``"translate"`` task;
    the text it produces is then passed to ``translation_pipe`` with
    ``eng_Latn`` as source language and ``dan_Latn`` as target language.

    Args:
        audio: Audio input, forwarded unchanged to ``asr_pipe``.

    Returns:
        The ``"translation_text"`` field of the first translation result.
    """
    asr_options = {"task": "translate"}
    recognised = asr_pipe(audio, batch_size=8, generate_kwargs=asr_options)
    source_text = recognised["text"]

    candidates = translation_pipe(
        source_text,
        src_lang="eng_Latn",
        tgt_lang="dan_Latn",
    )
    return candidates[0]["translation_text"]
def synthesise(text):
    """Synthesise speech for ``text`` and return it as 16-bit integer samples.

    Args:
        text: Text to speak. It is passed through ``replace_danish_letters``
            before being handed to ``tts_pipe``.

    Returns:
        A ``(sampling_rate, samples)`` tuple in which ``samples`` is an
        ``int16`` NumPy array. Blank or whitespace-only text yields
        ``(16000, <empty int16 array>)`` without running the TTS model.
    """
    if not text.strip():
        # Return the same dtype as the normal path. An empty float array
        # cannot be peak-normalised (max of an empty array raises), which
        # would break any consumer that rescales float audio.
        return 16000, np.zeros(0, dtype=np.int16)

    text = replace_danish_letters(text)
    forward_params = {"speaker_embeddings": speaker_embedding_tensor}
    speech = tts_pipe(text, forward_params=forward_params)
    sr, audio = speech["sampling_rate"], speech["audio"]
    # Assumes the pipeline returns float samples nominally in [-1, 1].
    # Clamp before scaling so an out-of-range sample saturates instead of
    # wrapping around in the int16 cast.
    audio = (np.clip(audio, -1.0, 1.0) * max_range).astype(np.int16)
    return sr, audio
def speech_to_speech_translation(audio):
    """Run the full cascade: translate the input speech, then speak the result.

    Args:
        audio: Audio input, forwarded unchanged to ``translate``.

    Returns:
        Whatever ``synthesise`` returns for the translated text.
    """
    return synthesise(translate(audio))
def replace_danish_letters(text, table=None):
    """Apply a sequence of substring substitutions to ``text``.

    The pairs are applied one after another in order, so a later pair sees
    the output of the earlier ones.

    Args:
        text: String to rewrite.
        table: Optional iterable of ``(src, dst)`` pairs. When omitted, the
            module-level ``replacements`` list is used.

    Returns:
        ``text`` with every occurrence of each ``src`` replaced by its ``dst``.
    """
    # Compare against None so that an explicitly empty table means
    # "no substitutions" rather than falling back to the default.
    pairs = replacements if table is None else table
    for src, dst in pairs:
        text = text.replace(src, dst)
    return text
# Symbols and stray marks that are spelled out, blanked or dropped.
_symbol_replacements = [
    ("&", "og"),
    ("\r", " "),
    ("´", ""),
    ("\\", ""),
    ("¨", " "),
]

# Upper-case letters rewritten as plain ASCII.
_uppercase_replacements = [
    ("Å", "AA"),
    ("Æ", "AE"),
    ("É", "E"),
    ("Ö", "OE"),
    ("Ø", "OE"),
]

# Lower-case letters rewritten as plain ASCII.
_lowercase_replacements = [
    ("á", "a"),
    ("ä", "ae"),
    ("å", "aa"),
    ("è", "e"),
    ("î", "i"),
    ("ô", "oe"),
    ("ö", "oe"),
    ("ø", "oe"),
    ("ü", "y"),
]

# Ordered (source, destination) pairs; the order of the groups is preserved.
replacements = (
    _symbol_replacements + _uppercase_replacements + _lowercase_replacements
)
# Heading and Markdown description passed to the Gradio interface.
title = "Speech to Danish Speech Translation"
# The link label for the translation model reads "NLLB", matching the
# facebook/nllb-200-distilled-600M checkpoint it links to.
description = """
Demo for cascaded speech-to-speech translation (STST), mapping from source speech in any language to target speech in Danish. Demo uses:
1. OpenAI's [Whisper Base](https://huggingface.co/openai/whisper-base) model for speech to english text translation
2. Facebook's [NLLB](https://huggingface.co/facebook/nllb-200-distilled-600M) model for english to danish text translation
3. JackismyShephard's [speecht5_tts-finetuned-nst-da](https://huggingface.co/JackismyShephard/speecht5_tts-finetuned-nst-da) model for danish speech synthesis
![Cascaded STST](https://huggingface.co/datasets/huggingface-course/audio-course-images/resolve/main/s2st_cascaded.png "Diagram of cascaded speech to speech translation")
"""
# Input is delivered to the handler as a file path; output is returned by the
# handler as a NumPy-typed audio value.
speech_input = gr.Audio(label="Input Speech", type="filepath")
speech_output = gr.Audio(label="Translated Speech", type="numpy")

# Wire the cascade into a single-function interface with one cached example
# and flagging turned off, then start the app.
demo = gr.Interface(
    fn=speech_to_speech_translation,
    inputs=speech_input,
    outputs=speech_output,
    title=title,
    description=description,
    examples=[["./example.wav"]],
    cache_examples=True,
    allow_flagging="never",
)
demo.launch()