# Copyright (c) 2023 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
# This code is modified from https://github.com/svc-develop-team/so-vits-svc/blob/4.1-Stable/models.py
import copy | |
import torch | |
from torch import nn | |
from torch.nn import functional as F | |
from utils.util import * | |
from modules.transformer.attentions import Encoder | |
from models.tts.vits.vits import ResidualCouplingBlock, PosteriorEncoder | |
from models.vocoders.gan.generator.bigvgan import BigVGAN | |
from models.vocoders.gan.generator.hifigan import HiFiGAN | |
from models.vocoders.gan.generator.nsfhifigan import NSFHiFiGAN | |
from models.vocoders.gan.generator.melgan import MelGAN | |
from models.vocoders.gan.generator.apnet import APNet | |
from modules.encoder.condition_encoder import ConditionEncoder | |
def slice_pitch_segments(x, ids_str, segment_size=4):
    """Slice a fixed-length pitch window out of each batch item.

    Args:
        x: pitch tensor of shape [B, T] (extra trailing dims are allowed).
        ids_str: per-item start frame indices, shape [B].
        segment_size: window length in frames.

    Returns:
        Tensor of shape [B, segment_size, ...] where row i is
        x[i, ids_str[i] : ids_str[i] + segment_size].
    """
    windows = [
        x[item, start : start + segment_size]
        for item, start in enumerate(ids_str)
    ]
    # stack preserves x's dtype and device, matching the zeros_like-based original
    return torch.stack(windows, dim=0)
def rand_slice_segments_with_pitch(x, pitch, x_lengths=None, segment_size=4):
    """Draw one random training window per batch item from x and its pitch.

    Args:
        x: feature tensor of shape [B, D, T].
        pitch: pitch track of shape [B, T], time-aligned with x.
        x_lengths: valid length per item; defaults to the full T for all items.
        segment_size: window length in frames.

    Returns:
        (x_windows, pitch_windows, start_indices): the sliced features, the
        matching pitch slices, and the sampled start frame per item.
    """
    batch, _, total_frames = x.size()
    lengths = total_frames if x_lengths is None else x_lengths
    max_start = lengths - segment_size + 1
    # Sample on CPU and then move, so the RNG stream matches the original code.
    starts = (torch.rand([batch]).to(device=x.device) * max_start).to(dtype=torch.long)
    x_windows = slice_segments(x, starts, segment_size)
    pitch_windows = slice_pitch_segments(pitch, starts, segment_size)
    return x_windows, pitch_windows, starts
class ContentEncoder(nn.Module):
    """Prior encoder over content features.

    Runs a transformer Encoder on the masked input, projects the hidden states
    to the mean/log-std of a diagonal Gaussian over `out_channels` channels,
    and returns a sample from it alongside the distribution parameters.
    """

    def __init__(
        self,
        out_channels,
        hidden_channels,
        kernel_size,
        n_layers,
        gin_channels=0,
        filter_channels=None,
        n_heads=None,
        p_dropout=None,
    ):
        super().__init__()
        self.out_channels = out_channels
        self.hidden_channels = hidden_channels
        self.kernel_size = kernel_size
        self.n_layers = n_layers
        self.gin_channels = gin_channels

        # Pitch-bin embedding table; unused by forward() in the
        # condition-encoder pipeline but kept for checkpoint compatibility.
        self.f0_emb = nn.Embedding(256, hidden_channels)

        self.enc_ = Encoder(
            hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout
        )
        # 1x1 conv projecting hidden states to concatenated [mean, log-std].
        self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)

    # condition_encoder ver.
    def forward(self, x, x_mask, noice_scale=1):
        """Encode `x` and sample a latent z.

        Args:
            x: [B, hidden_channels, T] content features.
            x_mask: [B, 1, T] validity mask.
            noice_scale: multiplier on the sampling noise ("noise"; the (sic)
                spelling is kept because callers pass it by keyword).

        Returns:
            (z, mean, log_std, x_mask).
        """
        hidden = self.enc_(x * x_mask, x_mask)
        stats = self.proj(hidden) * x_mask
        mean, log_std = torch.split(stats, self.out_channels, dim=1)
        noise = torch.randn_like(mean) * torch.exp(log_std)
        z = (mean + noise * noice_scale) * x_mask
        return z, mean, log_std, x_mask
class SynthesizerTrn(nn.Module):
    """
    Synthesizer for Training
    """

    def __init__(self, spec_channels, segment_size, cfg):
        # spec_channels: linear-spectrogram feature size fed to the posterior encoder.
        # segment_size: frame count of the random slice used for decoder training.
        # cfg: full experiment config; VITS hyper-parameters live under cfg.model.vits.
        super().__init__()
        self.spec_channels = spec_channels
        self.segment_size = segment_size
        self.cfg = cfg
        self.inter_channels = cfg.model.vits.inter_channels
        self.hidden_channels = cfg.model.vits.hidden_channels
        self.filter_channels = cfg.model.vits.filter_channels
        self.n_heads = cfg.model.vits.n_heads
        self.n_layers = cfg.model.vits.n_layers
        self.kernel_size = cfg.model.vits.kernel_size
        self.p_dropout = cfg.model.vits.p_dropout
        self.n_flow_layer = cfg.model.vits.n_flow_layer
        self.gin_channels = cfg.model.vits.gin_channels
        self.n_speakers = cfg.model.vits.n_speakers

        # f0
        self.n_bins = cfg.preprocess.pitch_bin
        self.f0_min = cfg.preprocess.f0_min
        self.f0_max = cfg.preprocess.f0_max

        # TODO: sort out the config
        # Mirror the preprocessing pitch range into the condition-encoder config
        # so the two stay consistent.
        self.cfg.model.condition_encoder.f0_min = self.cfg.preprocess.f0_min
        self.cfg.model.condition_encoder.f0_max = self.cfg.preprocess.f0_max
        self.condition_encoder = ConditionEncoder(self.cfg.model.condition_encoder)

        # Speaker-id lookup producing the global conditioning vector g.
        self.emb_g = nn.Embedding(self.n_speakers, self.gin_channels)

        # Prior (content) encoder.
        self.enc_p = ContentEncoder(
            self.inter_channels,
            self.hidden_channels,
            filter_channels=self.filter_channels,
            n_heads=self.n_heads,
            n_layers=self.n_layers,
            kernel_size=self.kernel_size,
            p_dropout=self.p_dropout,
        )

        assert cfg.model.generator in [
            "bigvgan",
            "hifigan",
            "melgan",
            "nsfhifigan",
            "apnet",
        ]
        self.dec_name = cfg.model.generator
        # The vocoder consumes `inter_channels` latent channels instead of mel
        # bins, so patch a deep copy of the config rather than the shared one.
        temp_cfg = copy.deepcopy(cfg)
        temp_cfg.preprocess.n_mel = self.inter_channels
        if cfg.model.generator == "bigvgan":
            temp_cfg.model.bigvgan = cfg.model.generator_config.bigvgan
            self.dec = BigVGAN(temp_cfg)
        elif cfg.model.generator == "hifigan":
            temp_cfg.model.hifigan = cfg.model.generator_config.hifigan
            self.dec = HiFiGAN(temp_cfg)
        elif cfg.model.generator == "melgan":
            temp_cfg.model.melgan = cfg.model.generator_config.melgan
            self.dec = MelGAN(temp_cfg)
        elif cfg.model.generator == "nsfhifigan":
            temp_cfg.model.nsfhifigan = cfg.model.generator_config.nsfhifigan
            self.dec = NSFHiFiGAN(temp_cfg)  # TODO: nsf need f0
        elif cfg.model.generator == "apnet":
            temp_cfg.model.apnet = cfg.model.generator_config.apnet
            self.dec = APNet(temp_cfg)

        # Posterior encoder over linear spectrograms.
        self.enc_q = PosteriorEncoder(
            self.spec_channels,
            self.inter_channels,
            self.hidden_channels,
            5,
            1,
            16,
            gin_channels=self.gin_channels,
        )

        # Normalizing flow bridging posterior latents and the prior.
        self.flow = ResidualCouplingBlock(
            self.inter_channels,
            self.hidden_channels,
            5,
            1,
            self.n_flow_layer,
            gin_channels=self.gin_channels,
        )

    def forward(self, data):
        """VitsSVC forward function.

        Args:
            data (dict): condition data & audio data, including:
                B: batch size, T: target length
                {
                    "spk_id": [B, singer_table_size]
                    "target_len": [B]
                    "mask": [B, T, 1]
                    "mel": [B, T, n_mel]
                    "linear": [B, T, n_fft // 2 + 1]
                    "frame_pitch": [B, T]
                    "frame_uv": [B, T]
                    "audio": [B, audio_len]
                    "audio_len": [B]
                    "contentvec_feat": [B, T, contentvec_dim]
                    "whisper_feat": [B, T, whisper_dim]
                    ...
                }

        Returns:
            dict: sliced waveform prediction ("y_hat"), slice start indices,
            masks, and the prior/posterior Gaussian statistics used by the
            KL loss.
        """
        # TODO: elegantly handle the dimensions
        spec = data["linear"].transpose(1, 2)

        g = data["spk_id"]
        g = self.emb_g(g).transpose(1, 2)

        c_lengths = data["target_len"]
        spec_lengths = data["target_len"]
        f0 = data["frame_pitch"]

        # condition_encoder ver.
        x = self.condition_encoder(data).transpose(1, 2)
        x_mask = torch.unsqueeze(sequence_mask(c_lengths, f0.size(1)), 1).to(x.dtype)

        # prior encoder
        # NOTE(review): the sampled prior latent z_ptemp is discarded here; only
        # the stats (m_p, logs_p) are kept for the loss against z_p.
        z_ptemp, m_p, logs_p, _ = self.enc_p(x, x_mask)
        # posterior encoder
        z, m_q, logs_q, spec_mask = self.enc_q(spec, spec_lengths, g=g)

        # flow
        z_p = self.flow(z, spec_mask, g=g)
        z_slice, pitch_slice, ids_slice = rand_slice_segments_with_pitch(
            z, f0, spec_lengths, self.segment_size
        )

        if self.dec_name == "nsfhifigan":
            # NOTE(review): full-length f0 is passed although z is sliced to
            # `segment_size` frames, and `pitch_slice` is never used; upstream
            # so-vits-svc feeds the sliced pitch to the NSF decoder — confirm
            # whether `pitch_slice` was intended here.
            o = self.dec(z_slice, f0=f0.float())
        elif self.dec_name == "apnet":
            # APNet returns multiple spectra; the final element is the waveform.
            _, _, _, _, o = self.dec(z_slice)
        else:
            o = self.dec(z_slice)

        outputs = {
            "y_hat": o,
            "ids_slice": ids_slice,
            "x_mask": x_mask,
            "z_mask": data["mask"].transpose(1, 2),
            "z": z,
            "z_p": z_p,
            "m_p": m_p,
            "logs_p": logs_p,
            "m_q": m_q,
            "logs_q": logs_q,
        }
        return outputs

    def infer(self, data, noise_scale=0.35, seed=52468):
        """Inference: sample the prior, invert the flow, and vocode.

        Args:
            data (dict): same condition features as in `forward` (no audio required).
            noise_scale: std multiplier when sampling from the prior.
            seed: RNG seed for reproducible sampling.

        Returns:
            (waveform, f0) tuple.
        """
        # c, f0, uv, g
        f0 = data["frame_pitch"]
        g = data["spk_id"]

        # NOTE(review): `f0.device == torch.device("cuda")` is False for a tensor
        # on "cuda:0" (indexed device != index-less device), so the CUDA branch
        # may never run; `f0.is_cuda` would be the reliable check — confirm.
        if f0.device == torch.device("cuda"):
            torch.cuda.manual_seed_all(seed)
        else:
            torch.manual_seed(seed)

        c_lengths = (torch.ones(f0.size(0)) * f0.size(-1)).to(f0.device)

        # Allow an unbatched speaker id.
        if g.dim() == 1:
            g = g.unsqueeze(0)
        g = self.emb_g(g).transpose(1, 2)

        # condition_encoder ver.
        x = self.condition_encoder(data).transpose(1, 2)
        x_mask = torch.unsqueeze(sequence_mask(c_lengths, f0.size(1)), 1).to(x.dtype)

        # `noice_scale` (sic) is ContentEncoder.forward's keyword name.
        z_p, m_p, logs_p, c_mask = self.enc_p(x, x_mask, noice_scale=noise_scale)
        z = self.flow(z_p, c_mask, g=g, reverse=True)

        if self.dec_name == "nsfhifigan":
            o = self.dec(z * c_mask, f0=f0.float())
        elif self.dec_name == "apnet":
            _, _, _, _, o = self.dec(z * c_mask)
        else:
            o = self.dec(z * c_mask)
        return o, f0