geneing
/

Kokoro

Text-to-Speech

English

Model card Files Files and versions Community

geneing committed on 29 days ago

Commit

5eda599

1 Parent(s): b8db573

Merged from upstream.

Browse files

Files changed (2) hide show

kokoro.py +2 -2
models.py +2 -220

kokoro.py CHANGED Viewed

@@ -135,8 +135,8 @@ def forward(model, tokens, ref_s, speed):
     asr = t_en @ pred_aln_trg.unsqueeze(0).to(device)
     return model.decoder(asr, F0_pred, N_pred, ref_s[:, :128]).squeeze().cpu().numpy()
-def generate(model, text, voicepack, lang='a', speed=1):
-    ps = phonemize(text, lang)
     tokens = tokenize(ps)
     if not tokens:
         return None

     asr = t_en @ pred_aln_trg.unsqueeze(0).to(device)
     return model.decoder(asr, F0_pred, N_pred, ref_s[:, :128]).squeeze().cpu().numpy()
+def generate(model, text, voicepack, lang='a', speed=1, ps=None):
+    ps = ps or phonemize(text, lang)
     tokens = tokenize(ps)
     if not tokens:
         return None

models.py CHANGED Viewed

@@ -1,6 +1,5 @@
 # https://github.com/yl4579/StyleTTS2/blob/main/models.py
-from ast import Tuple
-from istftnet import Decoder
 from munch import Munch
 from pathlib import Path
 from plbert import load_plbert
@@ -13,118 +12,6 @@ import torch
 import torch.nn as nn
 import torch.nn.functional as F
-class LearnedDownSample(nn.Module):
-    def __init__(self, layer_type, dim_in):
-        super().__init__()
-        self.layer_type = layer_type
-        if self.layer_type == 'none':
-            self.conv = nn.Identity()
-        elif self.layer_type == 'timepreserve':
-            self.conv = spectral_norm(nn.Conv2d(dim_in, dim_in, kernel_size=(3, 1), stride=(2, 1), groups=dim_in, padding=(1, 0)))
-        elif self.layer_type == 'half':
-            self.conv = spectral_norm(nn.Conv2d(dim_in, dim_in, kernel_size=(3, 3), stride=(2, 2), groups=dim_in, padding=1))
-        else:
-            raise RuntimeError('Got unexpected donwsampletype %s, expected is [none, timepreserve, half]' % self.layer_type)
-    def forward(self, x):
-        return self.conv(x)
-class LearnedUpSample(nn.Module):
-    def __init__(self, layer_type, dim_in):
-        super().__init__()
-        self.layer_type = layer_type
-        if self.layer_type == 'none':
-            self.conv = nn.Identity()
-        elif self.layer_type == 'timepreserve':
-            self.conv = nn.ConvTranspose2d(dim_in, dim_in, kernel_size=(3, 1), stride=(2, 1), groups=dim_in, output_padding=(1, 0), padding=(1, 0))
-        elif self.layer_type == 'half':
-            self.conv = nn.ConvTranspose2d(dim_in, dim_in, kernel_size=(3, 3), stride=(2, 2), groups=dim_in, output_padding=1, padding=1)
-        else:
-            raise RuntimeError('Got unexpected upsampletype %s, expected is [none, timepreserve, half]' % self.layer_type)
-    def forward(self, x):
-        return self.conv(x)
-class DownSample(nn.Module):
-    def __init__(self, layer_type):
-        super().__init__()
-        self.layer_type = layer_type
-    def forward(self, x):
-        if self.layer_type == 'none':
-            return x
-        elif self.layer_type == 'timepreserve':
-            return F.avg_pool2d(x, (2, 1))
-        elif self.layer_type == 'half':
-            if x.shape[-1] % 2 != 0:
-                x = torch.cat([x, x[..., -1].unsqueeze(-1)], dim=-1)
-            return F.avg_pool2d(x, 2)
-        else:
-            raise RuntimeError('Got unexpected donwsampletype %s, expected is [none, timepreserve, half]' % self.layer_type)
-class UpSample(nn.Module):
-    def __init__(self, layer_type):
-        super().__init__()
-        self.layer_type = layer_type
-    def forward(self, x):
-        if self.layer_type == 'none':
-            return x
-        elif self.layer_type == 'timepreserve':
-            return F.interpolate(x, scale_factor=(2, 1), mode='nearest')
-        elif self.layer_type == 'half':
-            return F.interpolate(x, scale_factor=2, mode='nearest')
-        else:
-            raise RuntimeError('Got unexpected upsampletype %s, expected is [none, timepreserve, half]' % self.layer_type)
-class ResBlk(nn.Module):
-    def __init__(self, dim_in, dim_out, actv=nn.LeakyReLU(0.2),
-                 normalize=False, downsample='none'):
-        super().__init__()
-        self.actv = actv
-        self.normalize = normalize
-        self.downsample = DownSample(downsample)
-        self.downsample_res = LearnedDownSample(downsample, dim_in)
-        self.learned_sc = dim_in != dim_out
-        self._build_weights(dim_in, dim_out)
-    def _build_weights(self, dim_in, dim_out):
-        self.conv1 = spectral_norm(nn.Conv2d(dim_in, dim_in, 3, 1, 1))
-        self.conv2 = spectral_norm(nn.Conv2d(dim_in, dim_out, 3, 1, 1))
-        if self.normalize:
-            self.norm1 = nn.InstanceNorm2d(dim_in, affine=True)
-            self.norm2 = nn.InstanceNorm2d(dim_in, affine=True)
-        if self.learned_sc:
-            self.conv1x1 = spectral_norm(nn.Conv2d(dim_in, dim_out, 1, 1, 0, bias=False))
-    def _shortcut(self, x):
-        if self.learned_sc:
-            x = self.conv1x1(x)
-        if self.downsample:
-            x = self.downsample(x)
-        return x
-    def _residual(self, x):
-        if self.normalize:
-            x = self.norm1(x)
-        x = self.actv(x)
-        x = self.conv1(x)
-        x = self.downsample_res(x)
-        if self.normalize:
-            x = self.norm2(x)
-        x = self.actv(x)
-        x = self.conv2(x)
-        return x
-    def forward(self, x):
-        x = self._shortcut(x) + self._residual(x)
-        return x / np.sqrt(2)  # unit variance
 class LinearNorm(torch.nn.Module):
     def __init__(self, in_dim, out_dim, bias=True, w_init_gain='linear'):
         super(LinearNorm, self).__init__()
@@ -137,98 +24,6 @@ class LinearNorm(torch.nn.Module):
     def forward(self, x):
         return self.linear_layer(x)
-class Discriminator2d(nn.Module):
-    def __init__(self, dim_in=48, num_domains=1, max_conv_dim=384, repeat_num=4):
-        super().__init__()
-        blocks = []
-        blocks += [spectral_norm(nn.Conv2d(1, dim_in, 3, 1, 1))]
-        for lid in range(repeat_num):
-            dim_out = min(dim_in*2, max_conv_dim)
-            blocks += [ResBlk(dim_in, dim_out, downsample='half')]
-            dim_in = dim_out
-        blocks += [nn.LeakyReLU(0.2)]
-        blocks += [spectral_norm(nn.Conv2d(dim_out, dim_out, 5, 1, 0))]
-        blocks += [nn.LeakyReLU(0.2)]
-        blocks += [nn.AdaptiveAvgPool2d(1)]
-        blocks += [spectral_norm(nn.Conv2d(dim_out, num_domains, 1, 1, 0))]
-        self.main = nn.Sequential(*blocks)
-    def get_feature(self, x):
-        features = []
-        for l in self.main:
-            x = l(x)
-            features.append(x)
-        out = features[-1]
-        out = out.view(out.size(0), -1)  # (batch, num_domains)
-        return out, features
-    def forward(self, x):
-        out, features = self.get_feature(x)
-        out = out.squeeze()  # (batch)
-        return out, features
-class ResBlk1d(nn.Module):
-    def __init__(self, dim_in, dim_out, actv=nn.LeakyReLU(0.2),
-                 normalize=False, downsample='none', dropout_p=0.2):
-        super().__init__()
-        self.actv = actv
-        self.normalize = normalize
-        self.downsample_type = downsample
-        self.learned_sc = dim_in != dim_out
-        self._build_weights(dim_in, dim_out)
-        self.dropout_p = dropout_p
-        if self.downsample_type == 'none':
-            self.pool = nn.Identity()
-        else:
-            self.pool = weight_norm(nn.Conv1d(dim_in, dim_in, kernel_size=3, stride=2, groups=dim_in, padding=1))
-    def _build_weights(self, dim_in, dim_out):
-        self.conv1 = weight_norm(nn.Conv1d(dim_in, dim_in, 3, 1, 1))
-        self.conv2 = weight_norm(nn.Conv1d(dim_in, dim_out, 3, 1, 1))
-        if self.normalize:
-            self.norm1 = nn.InstanceNorm1d(dim_in, affine=True)
-            self.norm2 = nn.InstanceNorm1d(dim_in, affine=True)
-        if self.learned_sc:
-            self.conv1x1 = weight_norm(nn.Conv1d(dim_in, dim_out, 1, 1, 0, bias=False))
-    def downsample(self, x):
-        if self.downsample_type == 'none':
-            return x
-        else:
-            if x.shape[-1] % 2 != 0:
-                x = torch.cat([x, x[..., -1].unsqueeze(-1)], dim=-1)
-            return F.avg_pool1d(x, 2)
-    def _shortcut(self, x):
-        if self.learned_sc:
-            x = self.conv1x1(x)
-        x = self.downsample(x)
-        return x
-    def _residual(self, x):
-        if self.normalize:
-            x = self.norm1(x)
-        x = self.actv(x)
-        x = F.dropout(x, p=self.dropout_p, training=self.training)
-        x = self.conv1(x)
-        x = self.pool(x)
-        if self.normalize:
-            x = self.norm2(x)
-        x = self.actv(x)
-        x = F.dropout(x, p=self.dropout_p, training=self.training)
-        x = self.conv2(x)
-        return x
-    def forward(self, x):
-        x = self._shortcut(x) + self._residual(x)
-        return x / np.sqrt(2)  # unit variance
 class LayerNorm(nn.Module):
     def __init__(self, channels, eps=1e-5):
         super().__init__()
@@ -313,19 +108,6 @@ class TextEncoder(nn.Module):
         return mask
-class AdaIN1d(nn.Module):
-    def __init__(self, style_dim, num_features):
-        super().__init__()
-        self.norm = nn.InstanceNorm1d(num_features, affine=False)
-        self.fc = nn.Linear(style_dim, num_features*2)
-    def forward(self, x, s):
-        h = self.fc(s)
-        h = h.view(h.size(0), h.size(1), 1)
-        gamma, beta = torch.chunk(h, chunks=2, dim=1)
-        return (1 + gamma) * self.norm(x) + beta
 class UpSample1d(nn.Module):
     def __init__(self, layer_type):
         super().__init__()
@@ -484,7 +266,7 @@ class ProsodyPredictor(nn.Module):
         mask = torch.arange(lengths.max()).unsqueeze(0).expand(lengths.shape[0], -1).type_as(lengths)
         mask = torch.gt(mask+1, lengths.unsqueeze(1))
         return mask
 class DurationEncoder(nn.Module):
     def __init__(self, sty_dim, d_model, nlayers, dropout=0.1):

 # https://github.com/yl4579/StyleTTS2/blob/main/models.py
+from istftnet import AdaIN1d, Decoder
 from munch import Munch
 from pathlib import Path
 from plbert import load_plbert
 import torch.nn as nn
 import torch.nn.functional as F
 class LinearNorm(torch.nn.Module):
     def __init__(self, in_dim, out_dim, bias=True, w_init_gain='linear'):
         super(LinearNorm, self).__init__()
     def forward(self, x):
         return self.linear_layer(x)
 class LayerNorm(nn.Module):
     def __init__(self, channels, eps=1e-5):
         super().__init__()
         return mask
 class UpSample1d(nn.Module):
     def __init__(self, layer_type):
         super().__init__()
         mask = torch.arange(lengths.max()).unsqueeze(0).expand(lengths.shape[0], -1).type_as(lengths)
         mask = torch.gt(mask+1, lengths.unsqueeze(1))
         return mask
 class DurationEncoder(nn.Module):
     def __init__(self, sty_dim, d_model, nlayers, dropout=0.1):