|
import torch |
|
from transformers import PreTrainedModel |
|
|
|
from config import DinoHuVitsConfig |
|
from src import CAMPPlus, Flow, HiFiGAN, PosteriorHubert |
|
|
|
|
|
class DinoHuVits(PreTrainedModel): |
|
config_class = DinoHuVitsConfig |
|
|
|
def __init__(self, config: DinoHuVitsConfig): |
|
super().__init__(config) |
|
|
|
self.enc_r = CAMPPlus(embed_dim=config.gin_channels, pooling_func="TSTP") |
|
self.enc_q = PosteriorHubert( |
|
out_channels=config.inter_channels, |
|
feature_channels=config.hubert_feature_channels, |
|
downsample_channels=config.hubert_downsample_channels, |
|
output_layer=config.hubert_output_layer, |
|
) |
|
self.flow = Flow( |
|
channels=config.inter_channels, |
|
hidden_channels=config.hidden_channels, |
|
kernel_size=5, |
|
dilation_rate=1, |
|
n_layers=4, |
|
gin_channels=config.gin_channels, |
|
) |
|
self.dec = HiFiGAN( |
|
initial_channel=config.inter_channels, |
|
resblock=config.resblock, |
|
resblock_kernel_sizes=config.resblock_kernel_sizes, |
|
resblock_dilation_sizes=config.resblock_dilation_sizes, |
|
upsample_rates=config.upsample_rates, |
|
upsample_initial_channel=config.upsample_initial_channel, |
|
upsample_kernel_sizes=config.upsample_kernel_sizes, |
|
gin_channels=config.gin_channels, |
|
) |
|
|
|
def forward( |
|
self, content: torch.Tensor, lengths: torch.Tensor, reference: torch.Tensor |
|
): |
|
g_src = self.__get_style_embedding(content) |
|
g_tgt = self.__get_style_embedding(reference) |
|
z, _, _, y_mask = self.enc_q(content, lengths, g=g_src) |
|
z_p = self.flow(z, y_mask, g=g_src) |
|
z_hat = self.flow(z_p, y_mask, g=g_tgt, reverse=True) |
|
o_hat = self.dec(z_hat * y_mask, g=g_tgt) |
|
return o_hat, y_mask |
|
|
|
def __get_style_embedding(self, wavefrom: torch.Tensor): |
|
g = self.enc_r(wavefrom) |
|
g = torch.nn.functional.normalize(g, dim=1) |
|
return g.unsqueeze(-1) |
|
|