import gradio as gr
import numpy as np
import torch
from transformers import pipeline
# Run all three pipelines on the first GPU when available, else CPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
# load speech translation checkpoint
# Whisper with task="translate" (set per-call) maps any-language speech to
# English text; 30 s chunking lets it handle inputs longer than its window.
asr_pipe = pipeline(
"automatic-speech-recognition",
model="openai/whisper-base",
device=device,
chunk_length_s=30,
use_fast=True,
)
# load text translation checkpoint
# NLLB-200 handles the English -> Danish text step (langs set per-call).
translation_pipe = pipeline(
"translation",
model="facebook/nllb-200-distilled-600M",
use_fast=True,
device=device,
)
# load text-to-speech checkpoint and speaker embeddings
# SpeechT5 fine-tuned for Danish; revision pinned so the demo is reproducible.
tts_pipe = pipeline(
"text-to-speech",
model="JackismyShephard/speecht5_tts-finetuned-nst-da",
use_fast=True,
device=device,
revision="5af228df418092b681cf31c31e413bdd2b5f9c8c",
)
# Fixed speaker identity for synthesis; the .npy file must ship with the app.
# SpeechT5 expects a batched embedding, hence the unsqueeze to (1, dim).
speaker_embedding_path = "female_23_vestjylland.npy"
speaker_embedding = np.load(speaker_embedding_path)
speaker_embedding_tensor = torch.tensor(speaker_embedding).unsqueeze(0)
# Gradio numpy audio output plays int16 PCM; max_range scales the model's
# float waveform (assumed in [-1, 1]) to the full int16 range.
target_dtype = np.int16
max_range = np.iinfo(target_dtype).max
def translate(audio):
    """Translate source speech (any language) to Danish text.

    Two stages: Whisper speech-to-English-text, then NLLB
    English-to-Danish text translation.
    """
    # Whisper's "translate" task emits English text regardless of
    # the spoken input language.
    transcription = asr_pipe(
        audio,
        batch_size=8,
        generate_kwargs={"task": "translate"},
    )
    # NLLB language codes: eng_Latn -> dan_Latn.
    result = translation_pipe(
        transcription["text"],
        src_lang="eng_Latn",
        tgt_lang="dan_Latn",
    )
    return result[0]["translation_text"]
def synthesise(text):
    """Synthesise Danish speech for `text`.

    Returns a `(sampling_rate, audio)` tuple where `audio` is an int16
    numpy array suitable for Gradio's numpy audio output.
    """
    if len(text.strip()) == 0:
        # Empty/whitespace input: return a silent clip at SpeechT5's 16 kHz.
        # Fix: emit int16 zeros so both return paths share one dtype
        # (previously this path returned float64 zeros).
        return (16000, np.zeros(0, dtype=np.int16))
    # Normalise characters the Danish SpeechT5 tokenizer can't handle.
    text = replace_danish_letters(text)
    forward_params = {"speaker_embeddings": speaker_embedding_tensor}
    speech = tts_pipe(text, forward_params=forward_params)
    sr, audio = speech["sampling_rate"], speech["audio"]
    # Scale the float waveform (assumed in [-1, 1]) to full int16 range.
    audio = (audio * max_range).astype(np.int16)
    return sr, audio
def speech_to_speech_translation(audio):
    """Full cascade: source speech -> Danish text -> Danish speech."""
    return synthesise(translate(audio))
def replace_danish_letters(text, mapping=None):
    """Apply character substitutions to `text` and return the result.

    By default uses the module-level `replacements` table, which rewrites
    characters the Danish TTS tokenizer can't handle into safe equivalents.
    `mapping` (an iterable of `(src, dst)` pairs) may be passed to override
    the table; omitting it preserves the original behavior.
    """
    pairs = replacements if mapping is None else mapping
    # Substitutions are applied in order, each over the whole string.
    for src, dst in pairs:
        text = text.replace(src, dst)
    return text
# Ordered (src, dst) substitutions used by replace_danish_letters to strip
# or rewrite characters before Danish TTS: punctuation/control chars are
# dropped or spaced, and accented/non-Danish letters are mapped to
# ASCII-ish equivalents (e.g. Ø -> OE, å -> aa).
replacements = [
("&", "og"),
("\r", " "),
("´", ""),
("\\", ""),
("¨", " "),
("Å", "AA"),
("Æ", "AE"),
("É", "E"),
("Ö", "OE"),
("Ø", "OE"),
("á", "a"),
("ä", "ae"),
("å", "aa"),
("è", "e"),
("î", "i"),
("ô", "oe"),
("ö", "oe"),
("ø", "oe"),
("ü", "y"),
]
# UI copy shown by the Gradio interface; description is rendered as markdown.
title = "Speech to Danish Speech Translation"
description = """
Demo for cascaded speech-to-speech translation (STST), mapping from source speech in any language to target speech in Danish. Demo uses:
1. OpenAI's [Whisper Base](https://huggingface.co/openai/whisper-base) model for speech to english text translation
2. Facebook's [NLLB](https://huggingface.co/facebook/nllb-200-distilled-600M) model for english to danish text translation
3. JackismyShephard's [speecht5_tts-finetuned-nst-da](https://huggingface.co/JackismyShephard/speecht5_tts-finetuned-nst-da) model for danish speech synthesis
![Cascaded STST](https://huggingface.co/datasets/huggingface-course/audio-course-images/resolve/main/s2st_cascaded.png "Diagram of cascaded speech to speech translation")
"""
# Wire the cascade into a Gradio app: microphone/file audio in, audio out.
demo = gr.Interface(
fn=speech_to_speech_translation,
# type="filepath" hands the pipeline a path string, which asr_pipe accepts.
inputs=gr.Audio(label="Input Speech", type="filepath"),
# type="numpy" expects the (sampling_rate, int16 array) tuple synthesise returns.
outputs=gr.Audio(label="Translated Speech", type="numpy"),
title=title,
description=description,
# cache_examples=True runs the full pipeline on example.wav at startup.
examples=[["./example.wav"]],
cache_examples=True,
# NOTE(review): allow_flagging was renamed to flagging_mode in newer Gradio
# releases — confirm against the pinned Gradio version before upgrading.
allow_flagging="never",
)
demo.launch()