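# NOTE: the lines below appear to be page residue from a web file viewer,
# not program text; they are kept as comments so the module parses.
# Author avatar caption: JackismyShephard's picture
# Last commit message: add new embeddings and change speech enhancement settings
# Commit: 325312c
# Viewer links: raw / history / blame
# File size: 3.93 kB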
import gradio as gr
import numpy as np
import torch
from transformers import pipeline
from resemble_enhance.enhancer.inference import denoise, enhance
# Hub checkpoint used for synthesis, pinned to one specific revision.
checkpoint_finetuned = "JackismyShephard/speecht5_tts-finetuned-nst-da"
revision = "5af228df418092b681cf31c31e413bdd2b5f9c8c"

# Run on CUDA device 0 when CUDA is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = 0
else:
    device = "cpu"

# Text-to-speech pipeline, built once when the module is loaded.
pipe = pipeline(
    "text-to-speech",
    model=checkpoint_finetuned,
    revision=revision,
    device=device,
    use_fast=True,
)
# Directory (relative path, trailing slash included) holding the stored
# speaker embeddings.
embeddings_dir = "embeddings/nst-da-metricgan-plus/"

# Maps a three-character speaker id to the path of that speaker's ``.npy``
# embedding file inside ``embeddings_dir``.
speaker_embeddings = {
    speaker_id: embeddings_dir + file_name
    for speaker_id, file_name in (
        ("F23", "female_23_vestjylland.npy"),
        ("F24", "female_24_storkoebenhavn.npy"),
        ("F49", "female_49_nordjylland.npy"),
        ("M51", "male_51_vest_sydsjaelland.npy"),
        ("M18", "male_18_vest_sydsjaelland.npy"),
        ("M31", "male_31_fyn.npy"),
    )
}
def predict(text, speaker, post_process):
    """Synthesize speech for ``text`` using the selected speaker's embedding.

    Args:
        text: Text to synthesize. It is passed through
            ``replace_danish_letters`` before reaching the pipeline.
        speaker: Speaker label; its first three characters are used as the
            key into ``speaker_embeddings``.
        post_process: When truthy, the generated audio is additionally run
            through ``enhance_audio``.

    Returns:
        A ``(sampling_rate, audio)`` tuple. Blank or whitespace-only text
        yields ``(16000, <empty array>)`` without invoking the model.

    Raises:
        KeyError: If the first three characters of ``speaker`` are not a
            known speaker id.
    """
    # Nothing to say: return an empty clip instead of calling the model.
    if not text.strip():
        return (16000, np.zeros(0))

    normalized_text = replace_danish_letters(text)

    # Load the stored embedding and add a leading dimension of size one.
    embedding_path = speaker_embeddings[speaker[:3]]
    embedding = torch.tensor(np.load(embedding_path)).unsqueeze(0)

    speech = pipe(
        normalized_text,
        forward_params={"speaker_embeddings": embedding},
    )
    audio = speech["audio"]
    sampling_rate = speech["sampling_rate"]

    if not post_process:
        return (sampling_rate, audio)
    return enhance_audio(audio, sampling_rate, device)
# Ordered (source, replacement) pairs applied by ``replace_danish_letters``.
# NOTE(review): presumably mirrors the text normalization used when the model
# was trained — confirm before adding or removing entries.
replacements = [
    # Symbols, control characters and stray diacritic marks.
    ("&", "og"),
    ("\r", " "),
    ("´", ""),
    ("\\", ""),
    ("¨", " "),
    # Upper-case letters.
    ("Å", "AA"),
    ("Æ", "AE"),
    ("É", "E"),
    ("Ö", "OE"),
    ("Ø", "OE"),
    # Lower-case letters.
    ("á", "a"),
    ("ä", "ae"),
    ("å", "aa"),
    ("è", "e"),
    ("î", "i"),
    ("ô", "oe"),
    ("ö", "oe"),
    ("ø", "oe"),
    ("ü", "y"),
]


def replace_danish_letters(text):
    """Return ``text`` with every pair in ``replacements`` applied in order.

    Each source string is replaced everywhere it occurs; characters that are
    not listed in ``replacements`` are left untouched.
    """
    normalized = text
    for pair in replacements:
        normalized = normalized.replace(*pair)
    return normalized
def enhance_audio(waveform, sr, device="cuda"):
    """Denoise and then enhance ``waveform``, returning the result as NumPy.

    Args:
        waveform: Audio samples; converted to a float32 tensor before
            processing.
        sr: Sample rate of ``waveform``.
        device: Device handed to both ``denoise`` and ``enhance``.

    Returns:
        A ``(sample_rate, audio)`` tuple, where ``sample_rate`` is the rate
        reported by ``enhance`` and ``audio`` is a NumPy array on the CPU.
    """
    samples = torch.tensor(waveform).float()

    # Stage 1: denoise; the sample rate it reports feeds the next stage.
    clean, clean_sr = denoise(samples, sr, device)

    # Stage 2: enhance with fixed settings.
    # NOTE(review): nfe/solver/lambd/tau are resemble-enhance options; see
    # that library's documentation for their meaning before tuning them.
    enhanced, out_sr = enhance(
        clean,
        clean_sr,
        device,
        nfe=64,
        solver="midpoint",
        lambd=0.1,
        tau=0.5,
    )

    return out_sr, enhanced.cpu().numpy()
# Page heading.
title = "Danish Speech Synthesis"

# Blurb under the heading; links to the checkpoint's page on the Hub.
description = (
    "Synthesize long-form danish speech from text with the click of a button! "
    f"Demo uses the checkpoint [{checkpoint_finetuned}]"
    f"(https://huggingface.co/{checkpoint_finetuned}) "
    "and 🤗 Transformers to synthesize speech."
)

# Example rows; each row is [input text, speaker label, enhance-audio flag].
examples = [
    [
        (
            "I sin oprindelige før-kristne form blev alferne sandsynligvis opfattet "
            "som en personificering af det land og den natur, der omgav menneskene, "
            "dvs. den opdyrkede jord, gården og de naturressourcer, som hørte dertil. "
            "De var guddommelige eller delvis guddommelige væsener, der besad magiske "
            "kræfter, som de brugte både til fordel og ulempe for menneskene."
        ),
        "F23 (Female, 23, Vestjylland)",
        True,
    ],
]
# Labels offered in the speaker selector. `predict` looks up the embedding by
# the first three characters of the chosen label, so every label must start
# with a key of `speaker_embeddings`; the rest is display text only.
# All labels follow the same "<id> (<gender>, <age>, <region>)" pattern.
speaker_choices = [
    "F23 (Female, 23, Vestjylland)",
    "F24 (Female, 24, Storkøbenhavn)",
    "F49 (Female, 49, Nordjylland)",
    "M51 (Male, 51, Vest-sydsjælland)",
    "M18 (Male, 18, Vest-sydsjælland)",
    "M31 (Male, 31, Fyn)",
]

# Gradio UI: text box + speaker selector + enhancement toggle -> audio player.
demo = gr.Interface(
    fn=predict,
    inputs=[
        gr.Textbox(label="Input Text"),
        gr.Radio(
            label="Speaker",
            choices=speaker_choices,
            # Default to the first speaker instead of repeating its label.
            value=speaker_choices[0],
        ),
        gr.Checkbox(label="Enhance audio (takes substantially longer)"),
    ],
    outputs=[
        gr.Audio(label="Generated Speech", type="numpy"),
    ],
    title=title,
    description=description,
    examples=examples,
    cache_examples=True,
    allow_flagging="never",
)

# Start the web app when the script is executed.
demo.launch()