File size: 3,458 Bytes
90c5b6d 831b161 90c5b6d 831b161 cd96dae 5ed78b8 ec39417 831b161 ec39417 325312c 831b161 325312c 90c5b6d 325312c 90c5b6d 1d1e03e 5ed78b8 90c5b6d 54811b2 90c5b6d 831b161 90c5b6d 5ed78b8 1d1e03e 90c5b6d be8cb80 90c5b6d 54811b2 90c5b6d 7cbdcbc 90c5b6d 7cbdcbc 90c5b6d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 |
import gradio as gr
import numpy as np
import torch
from transformers import pipeline
# Model checkpoint to load and the exact revision of it to use.
checkpoint_finetuned = "JackismyShephard/speecht5_tts-finetuned-nst-da"
revision = "5af228df418092b681cf31c31e413bdd2b5f9c8c"

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Text-to-speech pipeline, created once at import time and reused by every
# call to predict().
pipe = pipeline(
    task="text-to-speech",
    model=checkpoint_finetuned,
    revision=revision,
    device=device,
    use_fast=True,
)
# Directory that holds one speaker-embedding file (.npy) per voice.
embeddings_dir = "embeddings/nst-da-metricgan-plus/"

# Maps a three-character speaker id to the path of that speaker's embedding
# file. The ids match the prefix of the speaker labels offered in the UI.
speaker_embeddings = {
    speaker_id: embeddings_dir + file_name
    for speaker_id, file_name in (
        ("F23", "female_23_vestjylland.npy"),
        ("F24", "female_24_storkoebenhavn.npy"),
        ("F49", "female_49_nordjylland.npy"),
        ("M51", "male_51_vest_sydsjaelland.npy"),
        ("M18", "male_18_vest_sydsjaelland.npy"),
        ("M31", "male_31_fyn.npy"),
    )
}

# Integer sample format of the generated audio and the largest value it can
# hold; predict() multiplies the synthesized waveform by max_range.
target_dtype = np.int16
max_range = np.iinfo(target_dtype).max
def predict(text, speaker):
    """Synthesize speech for ``text`` in the voice selected by ``speaker``.

    Args:
        text: Text to synthesize. ``None``, empty, or whitespace-only input
            yields empty audio instead of running the model.
        speaker: Speaker label; its first three characters are the key looked
            up in ``speaker_embeddings`` (e.g. ``"F23 (Female, 23, ...)"``).

    Returns:
        A ``(sampling_rate, audio)`` tuple where ``audio`` is a NumPy array of
        16-bit integer samples.

    Raises:
        KeyError: If the three-character speaker prefix is not a key of
            ``speaker_embeddings``.
    """
    if not text or not text.strip():
        # Nothing to say: return empty audio in the same int16 format as the
        # synthesized path so the output type never depends on the input.
        # NOTE(review): 16000 is hard-coded here; presumably it matches the
        # model's output sampling rate -- confirm.
        return 16000, np.zeros(0, dtype=np.int16)

    text = replace_danish_letters(text)

    # Only the leading id of the label selects the embedding file.
    speaker_id = speaker[:3]
    speaker_embedding = np.load(speaker_embeddings[speaker_id])
    # Add a leading axis before handing the embedding to the pipeline.
    speaker_embedding = torch.tensor(speaker_embedding).unsqueeze(0)

    speech = pipe(text, forward_params={"speaker_embeddings": speaker_embedding})
    sr, audio = speech["sampling_rate"], speech["audio"]

    # Clamp before scaling: a sample outside [-1, 1] would exceed the int16
    # range after multiplication and wrap around when cast.
    # NOTE(review): assumes the pipeline returns a float waveform nominally in
    # [-1, 1], as the scaling by max_range implies -- confirm.
    audio = np.clip(audio, -1.0, 1.0)
    audio = (audio * max_range).astype(np.int16)
    return sr, audio
def replace_danish_letters(text):
    """Return ``text`` with every substitution in ``replacements`` applied.

    The ``(old, new)`` pairs are applied one after another, in list order, and
    each one replaces all occurrences of ``old`` in the text.
    """
    normalized = text
    for pair in replacements:
        old, new = pair
        normalized = normalized.replace(old, new)
    return normalized
# Substitution table used by replace_danish_letters(): each entry is an
# (old, new) pair. The table is written in three groups and concatenated in
# this order.

# Symbols and control characters.
_symbol_replacements = [
    ("&", "og"),
    ("\r", " "),
    ("´", ""),
    ("\\", ""),
    ("¨", " "),
]

# Upper-case letters.
_uppercase_replacements = [
    ("Å", "AA"),
    ("Æ", "AE"),
    ("É", "E"),
    ("Ö", "OE"),
    ("Ø", "OE"),
]

# Lower-case letters.
_lowercase_replacements = [
    ("á", "a"),
    ("ä", "ae"),
    ("å", "aa"),
    ("è", "e"),
    ("î", "i"),
    ("ô", "oe"),
    ("ö", "oe"),
    ("ø", "oe"),
    ("ü", "y"),
]

replacements = [
    *_symbol_replacements,
    *_uppercase_replacements,
    *_lowercase_replacements,
]
# Heading shown at the top of the demo page.
title = "Danish Speech Synthesis"

# Markdown link to the checkpoint's page on the Hugging Face Hub.
_checkpoint_link = (
    f"[{checkpoint_finetuned}](https://huggingface.co/{checkpoint_finetuned})"
)

# Introductory text shown under the title.
description = (
    "Synthesize long-form danish speech from text with the click of a button! "
    f"Demo uses the checkpoint {_checkpoint_link} and 🤗 Transformers to synthesize speech."
)
# Example input offered in the UI: one row of (input text, speaker label),
# in the same order as the interface inputs.
_example_text = "I sin oprindelige før-kristne form blev alferne sandsynligvis opfattet som en personificering af det land og den natur, der omgav menneskene, dvs. den opdyrkede jord, gården og de naturressourcer, som hørte dertil. De var guddommelige eller delvis guddommelige væsener, der besad magiske kræfter, som de brugte både til fordel og ulempe for menneskene."
_example_speaker = "F23 (Female, 23, Vestjylland)"

examples = [[_example_text, _example_speaker]]
# Gradio UI: a text box and a speaker selector feed predict(), whose
# (sampling_rate, samples) result is played back by the audio component.
demo = gr.Interface(
    fn=predict,
    inputs=[
        gr.Textbox(label="Input Text"),
        gr.Radio(
            label="Speaker",
            # predict() only reads the first three characters of the chosen
            # label, so the descriptive part is purely for display. All labels
            # follow the same "ID (Gender, age, region)" format.
            choices=[
                "F23 (Female, 23, Vestjylland)",
                "F24 (Female, 24, Storkøbenhavn)",
                "F49 (Female, 49, Nordjylland)",
                "M51 (Male, 51, Vest-sydsjælland)",
                "M18 (Male, 18, Vest-sydsjælland)",
                "M31 (Male, 31, Fyn)",
            ],
            value="F23 (Female, 23, Vestjylland)",
        ),
    ],
    outputs=[
        gr.Audio(label="Generated Speech", type="numpy"),
    ],
    title=title,
    description=description,
    examples=examples,
    cache_examples=True,
    allow_flagging="never",
)

# Start the web server for the demo.
demo.launch()
|