Spaces:
Running
on
Zero
Running
on
Zero
from pathlib import Path | |
import torchaudio | |
import gradio as gr | |
import numpy as np | |
import torch | |
from hifigan.config import v1 | |
from hifigan.denoiser import Denoiser | |
from hifigan.env import AttrDict | |
from hifigan.models import Generator as HiFiGAN | |
#from BigVGAN.models import BigVGAN | |
#from BigVGAN.env import AttrDict as BigVGANAttrDict | |
from pflow.models.pflow_tts import pflowTTS | |
from pflow.text import text_to_sequence, sequence_to_text | |
from pflow.utils.utils import intersperse | |
from pflow.data.text_mel_datamodule import mel_spectrogram | |
from pflow.utils.model import normalize | |
# Settings wrapped in BigVGANAttrDict by load_bigvgan(); that function reads
# at least `seed` directly, the rest is handed to the BigVGAN constructor.
# The audio values (n_fft=1024, num_mels=80, sampling_rate=22050,
# hop_size=256, win_size=1024, fmin=0, fmax=8000) equal the numbers passed to
# mel_spectrogram() for the prompt elsewhere in this file.
# NOTE(review): many keys look like training-only options (optimizer,
# discriminator, distributed) — confirm which ones the generator really uses.
BIGVGAN_CONFIG = {
    "resblock": "1",
    "num_gpus": 0,
    "batch_size": 32,
    "learning_rate": 0.0001,
    "adam_b1": 0.8,
    "adam_b2": 0.99,
    "lr_decay": 0.999,
    "seed": 1234,

    "upsample_rates": [4, 4, 2, 2, 2, 2],
    "upsample_kernel_sizes": [8, 8, 4, 4, 4, 4],
    "upsample_initial_channel": 1536,
    "resblock_kernel_sizes": [3, 7, 11],
    "resblock_dilation_sizes": [[1, 3, 5], [1, 3, 5], [1, 3, 5]],

    "activation": "snakebeta",
    "snake_logscale": True,

    "resolutions": [[1024, 120, 600], [2048, 240, 1200], [512, 50, 240]],
    "mpd_reshapes": [2, 3, 5, 7, 11],
    "use_spectral_norm": False,
    "discriminator_channel_mult": 1,

    "segment_size": 8192,
    "num_mels": 80,
    "num_freq": 1025,
    "n_fft": 1024,
    "hop_size": 256,
    "win_size": 1024,

    "sampling_rate": 22050,

    "fmin": 0,
    "fmax": 8000,
    "fmax_for_loss": None,

    "num_workers": 4,

    "dist_config": {
        "dist_backend": "nccl",
        "dist_url": "tcp://localhost:54321",
        "world_size": 1,
    },
}
# Checkpoint file names; given as relative paths, so they are resolved
# against the process working directory.
PFLOW_MODEL_PATH = 'checkpoint_epoch=499.ckpt'
VOCODER_MODEL_PATH = 'g_00120000'
VOCODER_BIGVGAN_MODEL_PATH = 'g_05000000'

# Reference audio used as the prompt for every synthesis request. It is
# loaded once at import time.
# NOTE(review): `sr` is never compared with the 22050 passed below — confirm
# that prompt.wav is stored at that sample rate.
wav, sr = torchaudio.load('prompt.wav')

# Positional arguments presumably are n_fft, num_mels, sampling_rate,
# hop_size, win_size, fmin, fmax (the values match BIGVGAN_CONFIG) — verify
# against mel_spectrogram's signature.
prompt = mel_spectrogram(wav, 1024, 80, 22050, 256, 1024, 0, 8000, center=False)

# Keep only the first 264 entries along the last axis (presumably time frames).
prompt = prompt[:, :, :264]
def process_text(text: str, device: torch.device):
    """Convert raw text into the tensors and metadata used for synthesis.

    Args:
        text: Text to synthesise; cleaned with the "ukr_cleaners" pipeline.
        device: Device on which the returned tensors are created.

    Returns:
        A dict with:
            "x_orig":    the text exactly as passed in,
            "x":         long tensor of shape (1, T) holding the symbol ids
                         with a 0 interspersed via ``intersperse``,
            "x_lengths": long tensor ``[T]``,
            "x_phones":  ``sequence_to_text`` applied to the ids in "x".
    """
    symbol_ids = intersperse(text_to_sequence(text, ["ukr_cleaners"]), 0)

    # Add a leading batch axis of size one.
    x = torch.tensor(symbol_ids, dtype=torch.long, device=device).unsqueeze(0)
    x_lengths = torch.tensor([x.shape[-1]], dtype=torch.long, device=device)

    # Decode the ids back to text so the caller can show what was synthesised.
    x_phones = sequence_to_text(x[0].tolist())

    return {
        "x_orig": text,
        "x": x,
        "x_lengths": x_lengths,
        "x_phones": x_phones,
    }
def load_hifigan(checkpoint_path, device):
    """Build a HiFi-GAN generator from the ``v1`` config and load its weights.

    Args:
        checkpoint_path: Path to a checkpoint readable by ``torch.load`` that
            contains a "generator" state dict.
        device: Device the model is moved to and the checkpoint is mapped to.

    Returns:
        The generator in eval mode with weight normalisation removed.
    """
    generator = HiFiGAN(AttrDict(v1)).to(device)

    checkpoint = torch.load(checkpoint_path, map_location=device)
    generator.load_state_dict(checkpoint["generator"])

    generator.eval()
    generator.remove_weight_norm()
    return generator
def load_bigvgan(checkpoint_path, device):
    """Build a BigVGAN generator from ``BIGVGAN_CONFIG`` and load its weights.

    Args:
        checkpoint_path: Path to a checkpoint readable by ``torch.load`` that
            contains a "generator" state dict.
        device: Device the model is moved to and the checkpoint is mapped to.

    Returns:
        The generator in eval mode with weight normalisation removed.

    Raises:
        ImportError: If the optional ``BigVGAN`` package is not importable.
    """
    # The module-level BigVGAN imports are commented out, so without these
    # function-scope imports any call would fail with NameError. Importing
    # here keeps BigVGAN an optional dependency of the app.
    from BigVGAN.env import AttrDict as BigVGANAttrDict
    from BigVGAN.models import BigVGAN

    print(f"Loading '{checkpoint_path}'")
    checkpoint_dict = torch.load(checkpoint_path, map_location=device)

    h = BigVGANAttrDict(BIGVGAN_CONFIG)
    # Seeds the global torch RNG before the model is constructed.
    torch.manual_seed(h.seed)

    generator = BigVGAN(h).to(device)
    generator.load_state_dict(checkpoint_dict['generator'])
    generator.eval()
    generator.remove_weight_norm()
    return generator
def to_waveform(mel, vocoder, denoiser=None):
    """Turn a mel-spectrogram into a waveform tensor on the CPU.

    Args:
        mel: Input passed unchanged to ``vocoder``.
        vocoder: Callable mapping ``mel`` to an audio tensor.
        denoiser: Optional callable ``denoiser(audio, strength=...)`` applied
            to the squeezed vocoder output.

    Returns:
        CPU tensor with all size-1 dimensions removed, values produced by the
        vocoder clamped to [-1, 1] (then denoised when a denoiser is given).
        The tensor never requires grad.
    """
    # Inference only: without no_grad the vocoder output is attached to the
    # autograd graph, which wastes memory and makes a later `.numpy()` call
    # on the result fail for tensors that require grad.
    with torch.no_grad():
        audio = vocoder(mel).clamp(-1, 1)
        if denoiser is not None:
            audio = denoiser(audio.squeeze(), strength=0.00025)
    return audio.cpu().squeeze()
def get_device():
    """Pick the torch device for inference and report the choice on stdout.

    Returns:
        ``torch.device("cuda")`` when CUDA is available, otherwise
        ``torch.device("cpu")``.
    """
    if not torch.cuda.is_available():
        print("[-] GPU not available or forced CPU run! Using CPU")
        return torch.device("cpu")

    print("[+] GPU Available! Using GPU")
    return torch.device("cuda")
# Everything below is created once at import time and shared by all
# synthesis requests.
device = get_device()

# Acoustic model, restored from the checkpoint and switched to eval mode.
model = pflowTTS.load_from_checkpoint(PFLOW_MODEL_PATH, map_location=device)
_ = model.eval()

# HiFi-GAN is the active vocoder. The BigVGAN alternative is kept for
# reference:
# vocoder = load_bigvgan(VOCODER_BIGVGAN_MODEL_PATH, device)
vocoder = load_hifigan(VOCODER_MODEL_PATH, device)

# Denoiser built around the vocoder; applied to its output in to_waveform().
denoiser = Denoiser(vocoder, mode="zeros")
def synthesise(text, temperature, speed):
    """Synthesise speech for ``text`` using the module-level model and vocoder.

    Args:
        text: Text to speak. Surrounding whitespace is ignored; the stripped
            text must be non-empty and at most 1000 characters long.
        temperature: Forwarded to ``model.synthesise`` as ``temperature``.
        speed: Speaking-rate factor; ``1 / speed`` is passed as
            ``length_scale``.

    Returns:
        A tuple ``(phones, (22050, samples))``: every second character of the
        text decoded from the model input (the odd positions, i.e. skipping
        the entries interspersed by ``process_text``), and the sample rate
        with the waveform as a NumPy array.

    Raises:
        gr.Error: If the text is empty or longer than 1000 characters.
    """
    # Validate the text that is actually synthesised: strip first so that
    # padding whitespace neither counts toward the limit nor lets a blank
    # request reach the model.
    text = (text or "").strip()
    if not text:
        raise gr.Error("Введіть текст для синтезу.")
    if len(text) > 1000:
        raise gr.Error("Текст повинен бути коротшим за 1000 символів.")

    text_processed = process_text(text, device)

    # `prompt` is created on the CPU at import time while the text tensors
    # are created on `device`; move it so every model input shares one
    # device (a no-op when running on the CPU).
    prompt_mel = normalize(prompt.to(device), model.mel_mean, model.mel_std)

    output = model.synthesise(
        text_processed["x"],
        text_processed["x_lengths"],
        n_timesteps=40,
        temperature=temperature,
        length_scale=1 / speed,
        prompt=prompt_mel,
    )
    waveform = to_waveform(output["mel"], vocoder, denoiser)
    return text_processed['x_phones'][1::2], (22050, waveform.numpy())
# Markdown shown under the interface title; names the checkpoints in use.
description = f'''
# Експериментальна апка для генерації аудіо з тексту.
pflow checkpoint {PFLOW_MODEL_PATH}
vocoder: HIFIGAN(трейнутий на датасеті, з нуля) - {VOCODER_MODEL_PATH}
'''
if __name__ == "__main__":
    # Input widgets, in the order of synthesise()'s parameters.
    text_input = gr.Text(label='Текст для синтезу:', lines=5, max_lines=10)
    temperature_input = gr.Slider(
        minimum=0.0, maximum=1.0, label="Температура", value=0.2
    )
    speed_input = gr.Slider(
        minimum=0.6, maximum=2.0, label="Швидкість", value=1.0
    )

    # Output widgets, in the order of synthesise()'s return tuple.
    phones_output = gr.Text(label='Фонемізований текст:', lines=5)
    audio_output = gr.Audio(
        label="Згенероване аудіо:",
        autoplay=False,
        streaming=False,
        type="numpy",
    )

    # NOTE(review): "погоне" in the flagging label looks like a typo for
    # "погане"; the string is left unchanged here.
    # NOTE(review): cache_examples=True is passed although no examples are
    # configured — confirm it is still wanted.
    demo = gr.Interface(
        fn=synthesise,
        description=description,
        inputs=[text_input, temperature_input, speed_input],
        outputs=[phones_output, audio_output],
        allow_flagging='manual',
        flagging_options=[("Якщо дуже погоне аудіо, тисни цю кнопку.", "negative")],
        cache_examples=True,
        title='',
    )

    # Queue at most 20 pending requests, with a default concurrency limit of 4.
    demo.queue(max_size=20, default_concurrency_limit=4)
    # Bind to all interfaces; no public share link.
    demo.launch(share=False, server_name="0.0.0.0")