Text-to-Speech
English
geneing committed on
Commit
5eda599
·
1 Parent(s): b8db573

Merged from upstream.

Browse files
Files changed (2) hide show
  1. kokoro.py +2 -2
  2. models.py +2 -220
kokoro.py CHANGED
@@ -135,8 +135,8 @@ def forward(model, tokens, ref_s, speed):
135
  asr = t_en @ pred_aln_trg.unsqueeze(0).to(device)
136
  return model.decoder(asr, F0_pred, N_pred, ref_s[:, :128]).squeeze().cpu().numpy()
137
 
138
- def generate(model, text, voicepack, lang='a', speed=1):
139
- ps = phonemize(text, lang)
140
  tokens = tokenize(ps)
141
  if not tokens:
142
  return None
 
135
  asr = t_en @ pred_aln_trg.unsqueeze(0).to(device)
136
  return model.decoder(asr, F0_pred, N_pred, ref_s[:, :128]).squeeze().cpu().numpy()
137
 
138
+ def generate(model, text, voicepack, lang='a', speed=1, ps=None):
139
+ ps = ps or phonemize(text, lang)
140
  tokens = tokenize(ps)
141
  if not tokens:
142
  return None
models.py CHANGED
@@ -1,6 +1,5 @@
1
  # https://github.com/yl4579/StyleTTS2/blob/main/models.py
2
- from ast import Tuple
3
- from istftnet import Decoder
4
  from munch import Munch
5
  from pathlib import Path
6
  from plbert import load_plbert
@@ -13,118 +12,6 @@ import torch
13
  import torch.nn as nn
14
  import torch.nn.functional as F
15
 
16
- class LearnedDownSample(nn.Module):
17
- def __init__(self, layer_type, dim_in):
18
- super().__init__()
19
- self.layer_type = layer_type
20
-
21
- if self.layer_type == 'none':
22
- self.conv = nn.Identity()
23
- elif self.layer_type == 'timepreserve':
24
- self.conv = spectral_norm(nn.Conv2d(dim_in, dim_in, kernel_size=(3, 1), stride=(2, 1), groups=dim_in, padding=(1, 0)))
25
- elif self.layer_type == 'half':
26
- self.conv = spectral_norm(nn.Conv2d(dim_in, dim_in, kernel_size=(3, 3), stride=(2, 2), groups=dim_in, padding=1))
27
- else:
28
- raise RuntimeError('Got unexpected donwsampletype %s, expected is [none, timepreserve, half]' % self.layer_type)
29
-
30
- def forward(self, x):
31
- return self.conv(x)
32
-
33
- class LearnedUpSample(nn.Module):
34
- def __init__(self, layer_type, dim_in):
35
- super().__init__()
36
- self.layer_type = layer_type
37
-
38
- if self.layer_type == 'none':
39
- self.conv = nn.Identity()
40
- elif self.layer_type == 'timepreserve':
41
- self.conv = nn.ConvTranspose2d(dim_in, dim_in, kernel_size=(3, 1), stride=(2, 1), groups=dim_in, output_padding=(1, 0), padding=(1, 0))
42
- elif self.layer_type == 'half':
43
- self.conv = nn.ConvTranspose2d(dim_in, dim_in, kernel_size=(3, 3), stride=(2, 2), groups=dim_in, output_padding=1, padding=1)
44
- else:
45
- raise RuntimeError('Got unexpected upsampletype %s, expected is [none, timepreserve, half]' % self.layer_type)
46
-
47
-
48
- def forward(self, x):
49
- return self.conv(x)
50
-
51
- class DownSample(nn.Module):
52
- def __init__(self, layer_type):
53
- super().__init__()
54
- self.layer_type = layer_type
55
-
56
- def forward(self, x):
57
- if self.layer_type == 'none':
58
- return x
59
- elif self.layer_type == 'timepreserve':
60
- return F.avg_pool2d(x, (2, 1))
61
- elif self.layer_type == 'half':
62
- if x.shape[-1] % 2 != 0:
63
- x = torch.cat([x, x[..., -1].unsqueeze(-1)], dim=-1)
64
- return F.avg_pool2d(x, 2)
65
- else:
66
- raise RuntimeError('Got unexpected donwsampletype %s, expected is [none, timepreserve, half]' % self.layer_type)
67
-
68
-
69
- class UpSample(nn.Module):
70
- def __init__(self, layer_type):
71
- super().__init__()
72
- self.layer_type = layer_type
73
-
74
- def forward(self, x):
75
- if self.layer_type == 'none':
76
- return x
77
- elif self.layer_type == 'timepreserve':
78
- return F.interpolate(x, scale_factor=(2, 1), mode='nearest')
79
- elif self.layer_type == 'half':
80
- return F.interpolate(x, scale_factor=2, mode='nearest')
81
- else:
82
- raise RuntimeError('Got unexpected upsampletype %s, expected is [none, timepreserve, half]' % self.layer_type)
83
-
84
-
85
- class ResBlk(nn.Module):
86
- def __init__(self, dim_in, dim_out, actv=nn.LeakyReLU(0.2),
87
- normalize=False, downsample='none'):
88
- super().__init__()
89
- self.actv = actv
90
- self.normalize = normalize
91
- self.downsample = DownSample(downsample)
92
- self.downsample_res = LearnedDownSample(downsample, dim_in)
93
- self.learned_sc = dim_in != dim_out
94
- self._build_weights(dim_in, dim_out)
95
-
96
- def _build_weights(self, dim_in, dim_out):
97
- self.conv1 = spectral_norm(nn.Conv2d(dim_in, dim_in, 3, 1, 1))
98
- self.conv2 = spectral_norm(nn.Conv2d(dim_in, dim_out, 3, 1, 1))
99
- if self.normalize:
100
- self.norm1 = nn.InstanceNorm2d(dim_in, affine=True)
101
- self.norm2 = nn.InstanceNorm2d(dim_in, affine=True)
102
- if self.learned_sc:
103
- self.conv1x1 = spectral_norm(nn.Conv2d(dim_in, dim_out, 1, 1, 0, bias=False))
104
-
105
- def _shortcut(self, x):
106
- if self.learned_sc:
107
- x = self.conv1x1(x)
108
- if self.downsample:
109
- x = self.downsample(x)
110
- return x
111
-
112
- def _residual(self, x):
113
- if self.normalize:
114
- x = self.norm1(x)
115
- x = self.actv(x)
116
- x = self.conv1(x)
117
- x = self.downsample_res(x)
118
- if self.normalize:
119
- x = self.norm2(x)
120
- x = self.actv(x)
121
- x = self.conv2(x)
122
- return x
123
-
124
- def forward(self, x):
125
- x = self._shortcut(x) + self._residual(x)
126
- return x / np.sqrt(2) # unit variance
127
-
128
  class LinearNorm(torch.nn.Module):
129
  def __init__(self, in_dim, out_dim, bias=True, w_init_gain='linear'):
130
  super(LinearNorm, self).__init__()
@@ -137,98 +24,6 @@ class LinearNorm(torch.nn.Module):
137
  def forward(self, x):
138
  return self.linear_layer(x)
139
 
140
- class Discriminator2d(nn.Module):
141
- def __init__(self, dim_in=48, num_domains=1, max_conv_dim=384, repeat_num=4):
142
- super().__init__()
143
- blocks = []
144
- blocks += [spectral_norm(nn.Conv2d(1, dim_in, 3, 1, 1))]
145
-
146
- for lid in range(repeat_num):
147
- dim_out = min(dim_in*2, max_conv_dim)
148
- blocks += [ResBlk(dim_in, dim_out, downsample='half')]
149
- dim_in = dim_out
150
-
151
- blocks += [nn.LeakyReLU(0.2)]
152
- blocks += [spectral_norm(nn.Conv2d(dim_out, dim_out, 5, 1, 0))]
153
- blocks += [nn.LeakyReLU(0.2)]
154
- blocks += [nn.AdaptiveAvgPool2d(1)]
155
- blocks += [spectral_norm(nn.Conv2d(dim_out, num_domains, 1, 1, 0))]
156
- self.main = nn.Sequential(*blocks)
157
-
158
- def get_feature(self, x):
159
- features = []
160
- for l in self.main:
161
- x = l(x)
162
- features.append(x)
163
- out = features[-1]
164
- out = out.view(out.size(0), -1) # (batch, num_domains)
165
- return out, features
166
-
167
- def forward(self, x):
168
- out, features = self.get_feature(x)
169
- out = out.squeeze() # (batch)
170
- return out, features
171
-
172
- class ResBlk1d(nn.Module):
173
- def __init__(self, dim_in, dim_out, actv=nn.LeakyReLU(0.2),
174
- normalize=False, downsample='none', dropout_p=0.2):
175
- super().__init__()
176
- self.actv = actv
177
- self.normalize = normalize
178
- self.downsample_type = downsample
179
- self.learned_sc = dim_in != dim_out
180
- self._build_weights(dim_in, dim_out)
181
- self.dropout_p = dropout_p
182
-
183
- if self.downsample_type == 'none':
184
- self.pool = nn.Identity()
185
- else:
186
- self.pool = weight_norm(nn.Conv1d(dim_in, dim_in, kernel_size=3, stride=2, groups=dim_in, padding=1))
187
-
188
- def _build_weights(self, dim_in, dim_out):
189
- self.conv1 = weight_norm(nn.Conv1d(dim_in, dim_in, 3, 1, 1))
190
- self.conv2 = weight_norm(nn.Conv1d(dim_in, dim_out, 3, 1, 1))
191
- if self.normalize:
192
- self.norm1 = nn.InstanceNorm1d(dim_in, affine=True)
193
- self.norm2 = nn.InstanceNorm1d(dim_in, affine=True)
194
- if self.learned_sc:
195
- self.conv1x1 = weight_norm(nn.Conv1d(dim_in, dim_out, 1, 1, 0, bias=False))
196
-
197
- def downsample(self, x):
198
- if self.downsample_type == 'none':
199
- return x
200
- else:
201
- if x.shape[-1] % 2 != 0:
202
- x = torch.cat([x, x[..., -1].unsqueeze(-1)], dim=-1)
203
- return F.avg_pool1d(x, 2)
204
-
205
- def _shortcut(self, x):
206
- if self.learned_sc:
207
- x = self.conv1x1(x)
208
- x = self.downsample(x)
209
- return x
210
-
211
- def _residual(self, x):
212
- if self.normalize:
213
- x = self.norm1(x)
214
- x = self.actv(x)
215
- x = F.dropout(x, p=self.dropout_p, training=self.training)
216
-
217
- x = self.conv1(x)
218
- x = self.pool(x)
219
- if self.normalize:
220
- x = self.norm2(x)
221
-
222
- x = self.actv(x)
223
- x = F.dropout(x, p=self.dropout_p, training=self.training)
224
-
225
- x = self.conv2(x)
226
- return x
227
-
228
- def forward(self, x):
229
- x = self._shortcut(x) + self._residual(x)
230
- return x / np.sqrt(2) # unit variance
231
-
232
  class LayerNorm(nn.Module):
233
  def __init__(self, channels, eps=1e-5):
234
  super().__init__()
@@ -313,19 +108,6 @@ class TextEncoder(nn.Module):
313
  return mask
314
 
315
 
316
-
317
- class AdaIN1d(nn.Module):
318
- def __init__(self, style_dim, num_features):
319
- super().__init__()
320
- self.norm = nn.InstanceNorm1d(num_features, affine=False)
321
- self.fc = nn.Linear(style_dim, num_features*2)
322
-
323
- def forward(self, x, s):
324
- h = self.fc(s)
325
- h = h.view(h.size(0), h.size(1), 1)
326
- gamma, beta = torch.chunk(h, chunks=2, dim=1)
327
- return (1 + gamma) * self.norm(x) + beta
328
-
329
  class UpSample1d(nn.Module):
330
  def __init__(self, layer_type):
331
  super().__init__()
@@ -484,7 +266,7 @@ class ProsodyPredictor(nn.Module):
484
  mask = torch.arange(lengths.max()).unsqueeze(0).expand(lengths.shape[0], -1).type_as(lengths)
485
  mask = torch.gt(mask+1, lengths.unsqueeze(1))
486
  return mask
487
-
488
  class DurationEncoder(nn.Module):
489
 
490
  def __init__(self, sty_dim, d_model, nlayers, dropout=0.1):
 
1
  # https://github.com/yl4579/StyleTTS2/blob/main/models.py
2
+ from istftnet import AdaIN1d, Decoder
 
3
  from munch import Munch
4
  from pathlib import Path
5
  from plbert import load_plbert
 
12
  import torch.nn as nn
13
  import torch.nn.functional as F
14
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
  class LinearNorm(torch.nn.Module):
16
  def __init__(self, in_dim, out_dim, bias=True, w_init_gain='linear'):
17
  super(LinearNorm, self).__init__()
 
24
  def forward(self, x):
25
  return self.linear_layer(x)
26
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27
  class LayerNorm(nn.Module):
28
  def __init__(self, channels, eps=1e-5):
29
  super().__init__()
 
108
  return mask
109
 
110
 
 
 
 
 
 
 
 
 
 
 
 
 
 
111
  class UpSample1d(nn.Module):
112
  def __init__(self, layer_type):
113
  super().__init__()
 
266
  mask = torch.arange(lengths.max()).unsqueeze(0).expand(lengths.shape[0], -1).type_as(lengths)
267
  mask = torch.gt(mask+1, lengths.unsqueeze(1))
268
  return mask
269
+
270
  class DurationEncoder(nn.Module):
271
 
272
  def __init__(self, sty_dim, d_model, nlayers, dropout=0.1):