maxmax20160403 committed
Commit c24b656 • Parent: c2e61ce

final ver
Changed files:
- app.py +12 -12
- configs/base.yaml +16 -16
- vits/data_utils.py +5 -3
- vits/models.py +14 -5
- vits/modules.py +0 -151
- vits/modules_grl.py +62 -0
- vits/utils.py +1 -20
- vits_decoder/__init__.py +1 -0
- vits_decoder/alias/act.py +101 -0
- vits_decoder/alias/activations.py +0 -0
- vits_decoder/bigv.py +3 -70
- vits_decoder/discriminator.py +14 -7
- vits_decoder/generator.py +27 -28
- vits_decoder/med.py +65 -0
- vits_decoder/msd.py +29 -0
- vits_decoder/nsf.py +10 -25
- vits_pretrain/{sovits5.0-48k-debug.pth → sovits5.0_bigvgan.pth} +2 -2
app.py CHANGED
@@ -60,7 +60,7 @@ def compute_f0_nn(filename, device):
     periodicity = np.repeat(periodicity, 2, -1)  # 320 -> 160 * 2
     # CREPE was not trained on silent audio. some error on silent need filter.
     periodicity = torchcrepe.filter.median(periodicity, 9)
-    pitch = torchcrepe.filter.mean(pitch,
+    pitch = torchcrepe.filter.mean(pitch, 3)
     pitch[periodicity < 0.1] = 0
     pitch = pitch.squeeze(0)
     return pitch
@@ -72,7 +72,7 @@ model = SynthesizerInfer(
     hp.data.filter_length // 2 + 1,
     hp.data.segment_size // hp.data.hop_length,
     hp)
-load_svc_model("vits_pretrain/sovits5.0-48k-debug.pth", model)
+load_svc_model("vits_pretrain/sovits5.0_bigvgan.pth", model)
 model.eval()
 model.to(device)
@@ -116,17 +116,17 @@ def svc_change(argswave, argsspk):
         has_audio = True
         if (out_index == 0):  # start frame
             cut_s = out_index
-
+            cut_s_out = 0
         else:
             cut_s = out_index - hop_frame
-
+            cut_s_out = hop_frame * hop_size

         if (out_index + out_chunk + hop_frame > all_frame):  # end frame
             cut_e = out_index + out_chunk
-
+            cut_e_out = 0
         else:
             cut_e = out_index + out_chunk + hop_frame
-
+            cut_e_out = -1 * hop_frame * hop_size

         sub_ppg = ppg[cut_s:cut_e, :].unsqueeze(0).to(device)
         sub_pit = pit[cut_s:cut_e].unsqueeze(0).to(device)
@@ -136,17 +136,17 @@ def svc_change(argswave, argsspk):
         sub_out = model.inference(sub_ppg, sub_pit, spk, sub_len, sub_har)
         sub_out = sub_out[0, 0].data.cpu().detach().numpy()

-        sub_out = sub_out[
+        sub_out = sub_out[cut_s_out:cut_e_out]
         out_audio.extend(sub_out)
         out_index = out_index + out_chunk

     if (out_index < all_frame):
         if (has_audio):
             cut_s = out_index - hop_frame
-
+            cut_s_out = hop_frame * hop_size
         else:
             cut_s = 0
-
+            cut_s_out = 0
         sub_ppg = ppg[cut_s:, :].unsqueeze(0).to(device)
         sub_pit = pit[cut_s:].unsqueeze(0).to(device)
         sub_len = torch.LongTensor([all_frame - cut_s]).to(device)
@@ -154,7 +154,7 @@ def svc_change(argswave, argsspk):
         sub_out = model.inference(sub_ppg, sub_pit, spk, sub_len, sub_har)
         sub_out = sub_out[0, 0].data.cpu().detach().numpy()

-        sub_out = sub_out[
+        sub_out = sub_out[cut_s_out:]
         out_audio.extend(sub_out)
     out_audio = np.asarray(out_audio)
@@ -175,7 +175,7 @@ def svc_main(sid, input_audio):
     wav_path = "temp.wav"
     soundfile.write(wav_path, audio, 16000, format="wav")
     out_audio = svc_change(wav_path, f"configs/singers/singer00{sid}.npy")
-    return "Success", (
+    return "Success", (32000, out_audio)


 app = gr.Blocks()
@@ -183,7 +183,7 @@ with app:
     with gr.Tabs():
         with gr.TabItem("sovits 5.0"):
             gr.Markdown(value="""
-
+                Final version, based on open-source data: Multi-Singer

                 https://github.com/Multi-Singer/Multi-Singer.github.io
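The new cut_s_out / cut_e_out variables trim the hop_frame frames of warm-up context that each chunk is synthesized with, so the concatenated chunks cover the audio exactly once. A minimal sketch of that arithmetic with made-up frame counts (hop_size is samples per frame; none of these numbers come from the app):

# Illustrative values only, not the ones the app uses.
hop_frame, hop_size, out_chunk = 16, 320, 300

# An interior chunk is synthesized with hop_frame frames of context on each side.
n_frames = hop_frame + out_chunk + hop_frame
sub_out_len = n_frames * hop_size        # samples returned by model.inference
cut_s_out = hop_frame * hop_size         # drop the left context
cut_e_out = -1 * hop_frame * hop_size    # drop the right context

kept = sub_out_len - cut_s_out + cut_e_out   # len(sub_out[cut_s_out:cut_e_out])
assert kept == out_chunk * hop_size          # exactly one chunk of clean audio remains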
configs/base.yaml CHANGED
@@ -7,24 +7,24 @@ train:
   lr_decay: 0.999875
   eps: 1e-9
   batch_size: 8
-  c_stft:
-  c_mel:
-  c_kl:
+  c_stft: 9
+  c_mel: 1.
+  c_kl: 0.2
   port: 8001
   pretrain: ""
 #############################
 data:
   training_files: "files/train.txt"
   validation_files: "files/valid.txt"
-  segment_size:
+  segment_size: 8000  # WARNING: base on hop_length
   max_wav_value: 32768.0
-  sampling_rate:
-  filter_length:
-  hop_length:
-  win_length:
-  mel_channels:
-  mel_fmin:
-  mel_fmax:
+  sampling_rate: 32000
+  filter_length: 1024
+  hop_length: 320
+  win_length: 1024
+  mel_channels: 100
+  mel_fmin: 50.0
+  mel_fmax: 16000.0
 #############################
 vits:
   ppg_dim: 1024
@@ -36,9 +36,9 @@ vits:
 #############################
 gen:
   upsample_input: 192
-  upsample_rates: [
-  upsample_kernel_sizes: [
-  upsample_initial_channel:
+  upsample_rates: [5,4,4,2,2]
+  upsample_kernel_sizes: [15,8,8,4,4]
+  upsample_initial_channel: 320
   resblock_kernel_sizes: [3,7,11]
   resblock_dilation_sizes: [[1,3,5], [1,3,5], [1,3,5]]
 #############################
@@ -50,13 +50,13 @@ mpd:
   lReLU_slope: 0.2
 #############################
 mrd:
-  resolutions: "[(1024, 120, 600), (2048, 240, 1200), (512, 50, 240)]" # (filter_length, hop_length, win_length)
+  resolutions: "[(1024, 120, 600), (2048, 240, 1200), (4096, 480, 2400), (512, 50, 240)]" # (filter_length, hop_length, win_length)
   use_spectral_norm: False
   lReLU_slope: 0.2
 #############################
 log:
   info_interval: 100
-  eval_interval:
+  eval_interval: 1
   save_interval: 5
   num_audio: 6
   pth_dir: 'chkpt'
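The new values are mutually constrained: the generator's upsample_rates must multiply out to hop_length so the decoder emits exactly one hop of audio per spectrogram frame, and segment_size must be a whole number of hops (hence the WARNING comment). A quick sanity check, not part of the repo:

import numpy as np

upsample_rates = [5, 4, 4, 2, 2]
hop_length = 320
segment_size = 8000
sampling_rate = 32000

assert np.prod(upsample_rates) == hop_length   # 5*4*4*2*2 = 320
assert segment_size % hop_length == 0          # 8000 / 320 = 25 frames
print(segment_size / sampling_rate)            # 0.25 s of audio per training segment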
vits/data_utils.py CHANGED
@@ -27,8 +27,8 @@ class TextAudioSpeakerSet(torch.utils.data.Dataset):
     def _filter(self):
         lengths = []
         items_new = []
-        items_min = int(self.segment_size / self.hop_length *
-        items_max = int(self.segment_size / self.hop_length *
+        items_min = int(self.segment_size / self.hop_length * 4)  # 1 S
+        items_max = int(self.segment_size / self.hop_length * 16)  # 4 S
         for wavpath, spec, pitch, ppg, spk in self.items:
             if not os.path.isfile(wavpath):
                 continue
@@ -87,7 +87,7 @@ class TextAudioSpeakerSet(torch.utils.data.Dataset):
         spk = torch.FloatTensor(spk)

         len_pit = pit.size()[0]
-        len_ppg = ppg.size()[0]
+        len_ppg = ppg.size()[0] - 2  # for safe
         len_min = min(len_pit, len_ppg)
         len_wav = len_min * self.hop_length
@@ -255,6 +255,8 @@ class DistributedBucketSampler(torch.utils.data.distributed.DistributedSampler):
         for i in range(len(self.buckets)):
             bucket = self.buckets[i]
             len_bucket = len(bucket)
+            if (len_bucket == 0):
+                continue
             ids_bucket = indices[i]
             num_samples_bucket = self.num_samples_per_bucket[i]
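With the base.yaml values above, the new _filter bounds correspond to clips between one and four seconds, which is what the "1 S" / "4 S" comments record. A worked check (illustrative, not repo code):

segment_size, hop_length, sampling_rate = 8000, 320, 32000

items_min = int(segment_size / hop_length * 4)    # 100 frames
items_max = int(segment_size / hop_length * 16)   # 400 frames

assert items_min * hop_length / sampling_rate == 1.0   # 1 s minimum
assert items_max * hop_length / sampling_rate == 4.0   # 4 s maximum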
vits/models.py CHANGED
@@ -8,6 +8,7 @@ from vits import commons
 from vits import modules
 from vits.utils import f0_to_coarse
 from vits_decoder.generator import Generator
+from vits.modules_grl import SpeakerClassifier


 class TextEncoder(nn.Module):
@@ -44,7 +45,7 @@ class TextEncoder(nn.Module):
         stats = self.proj(x) * x_mask
         m, logs = torch.split(stats, self.out_channels, dim=1)
         z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask
-        return z, m, logs, x_mask
+        return z, m, logs, x_mask, x


 class ResidualCouplingBlock(nn.Module):
@@ -151,6 +152,10 @@ class SynthesizerTrn(nn.Module):
             3,
             0.1,
         )
+        self.speaker_classifier = SpeakerClassifier(
+            hp.vits.hidden_channels,
+            hp.vits.spk_dim,
+        )
         self.enc_q = PosteriorEncoder(
             spec_channels,
             hp.vits.inter_channels,
@@ -171,8 +176,9 @@ class SynthesizerTrn(nn.Module):
         self.dec = Generator(hp=hp)

     def forward(self, ppg, pit, spec, spk, ppg_l, spec_l):
+        ppg = ppg + torch.randn_like(ppg)  # Perturbation
         g = self.emb_g(F.normalize(spk)).unsqueeze(-1)
-        z_p, m_p, logs_p, ppg_mask = self.enc_p(
+        z_p, m_p, logs_p, ppg_mask, x = self.enc_p(
             ppg, ppg_l, f0=f0_to_coarse(pit))
         z_q, m_q, logs_q, spec_mask = self.enc_q(spec, spec_l, g=g)

@@ -183,10 +189,13 @@ class SynthesizerTrn(nn.Module):
         # SNAC to flow
         z_f, logdet_f = self.flow(z_q, spec_mask, g=spk)
         z_r, logdet_r = self.flow(z_p, spec_mask, g=spk, reverse=True)
-
+        # speaker
+        spk_preds = self.speaker_classifier(x)
+        return audio, ids_slice, spec_mask, (z_f, z_r, z_p, m_p, logs_p, z_q, m_q, logs_q, logdet_f, logdet_r), spk_preds

     def infer(self, ppg, pit, spk, ppg_l):
-
+        ppg = ppg + torch.randn_like(ppg) * 0.0001  # Perturbation
+        z_p, m_p, logs_p, ppg_mask, x = self.enc_p(
             ppg, ppg_l, f0=f0_to_coarse(pit))
         z, _ = self.flow(z_p, ppg_mask, g=spk, reverse=True)
         o = self.dec(spk, z * ppg_mask, f0=pit)
@@ -233,7 +242,7 @@ class SynthesizerInfer(nn.Module):
         return self.dec.source2wav(source)

     def inference(self, ppg, pit, spk, ppg_l, source):
-        z_p, m_p, logs_p, ppg_mask = self.enc_p(
+        z_p, m_p, logs_p, ppg_mask, x = self.enc_p(
             ppg, ppg_l, f0=f0_to_coarse(pit))
         z, _ = self.flow(z_p, ppg_mask, g=spk, reverse=True)
         o = self.dec.inference(spk, z * ppg_mask, source)
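SynthesizerTrn.forward now also returns spk_preds from the new SpeakerClassifier, so the trainer can add an adversarial speaker loss; the trainer change itself is not part of this commit, but a plausible sketch of such a term (the cosine loss is an assumption, and the tensors below are stand-ins) looks like:

import torch
import torch.nn.functional as F

# Stand-in tensors for the real forward() outputs; spk_dim=256 assumed.
spk = torch.randn(4, 256)        # ground-truth speaker embeddings
spk_preds = torch.randn(4, 256)  # last value returned by SynthesizerTrn.forward

# Because spk_preds passed through the gradient reversal layer, minimizing
# this trains the classifier while pushing the text encoder to discard
# speaker identity from its hidden features.
loss_spk = 1.0 - F.cosine_similarity(spk_preds, spk).mean()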
vits/modules.py CHANGED
@@ -1,16 +1,7 @@
-import copy
-import math
-import numpy as np
-import scipy
 import torch
 from torch import nn
 from torch.nn import functional as F
-
-from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
-from torch.nn.utils import weight_norm, remove_weight_norm
-
 from vits import commons
-from vits.commons import init_weights, get_padding


 LRELU_SLOPE = 0.1
@@ -220,148 +211,6 @@ class WN(torch.nn.Module):
             torch.nn.utils.remove_weight_norm(l)


-class ResBlock1(torch.nn.Module):
-    def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)):
-        super(ResBlock1, self).__init__()
-        self.convs1 = nn.ModuleList(
-            [
-                weight_norm(
-                    Conv1d(
-                        channels,
-                        channels,
-                        kernel_size,
-                        1,
-                        dilation=dilation[0],
-                        padding=get_padding(kernel_size, dilation[0]),
-                    )
-                ),
-                weight_norm(
-                    Conv1d(
-                        channels,
-                        channels,
-                        kernel_size,
-                        1,
-                        dilation=dilation[1],
-                        padding=get_padding(kernel_size, dilation[1]),
-                    )
-                ),
-                weight_norm(
-                    Conv1d(
-                        channels,
-                        channels,
-                        kernel_size,
-                        1,
-                        dilation=dilation[2],
-                        padding=get_padding(kernel_size, dilation[2]),
-                    )
-                ),
-            ]
-        )
-        self.convs1.apply(init_weights)
-
-        self.convs2 = nn.ModuleList(
-            [
-                weight_norm(
-                    Conv1d(
-                        channels,
-                        channels,
-                        kernel_size,
-                        1,
-                        dilation=1,
-                        padding=get_padding(kernel_size, 1),
-                    )
-                ),
-                weight_norm(
-                    Conv1d(
-                        channels,
-                        channels,
-                        kernel_size,
-                        1,
-                        dilation=1,
-                        padding=get_padding(kernel_size, 1),
-                    )
-                ),
-                weight_norm(
-                    Conv1d(
-                        channels,
-                        channels,
-                        kernel_size,
-                        1,
-                        dilation=1,
-                        padding=get_padding(kernel_size, 1),
-                    )
-                ),
-            ]
-        )
-        self.convs2.apply(init_weights)
-
-    def forward(self, x, x_mask=None):
-        for c1, c2 in zip(self.convs1, self.convs2):
-            xt = F.leaky_relu(x, LRELU_SLOPE)
-            if x_mask is not None:
-                xt = xt * x_mask
-            xt = c1(xt)
-            xt = F.leaky_relu(xt, LRELU_SLOPE)
-            if x_mask is not None:
-                xt = xt * x_mask
-            xt = c2(xt)
-            x = xt + x
-        if x_mask is not None:
-            x = x * x_mask
-        return x
-
-    def remove_weight_norm(self):
-        for l in self.convs1:
-            remove_weight_norm(l)
-        for l in self.convs2:
-            remove_weight_norm(l)
-
-
-class ResBlock2(torch.nn.Module):
-    def __init__(self, channels, kernel_size=3, dilation=(1, 3)):
-        super(ResBlock2, self).__init__()
-        self.convs = nn.ModuleList(
-            [
-                weight_norm(
-                    Conv1d(
-                        channels,
-                        channels,
-                        kernel_size,
-                        1,
-                        dilation=dilation[0],
-                        padding=get_padding(kernel_size, dilation[0]),
-                    )
-                ),
-                weight_norm(
-                    Conv1d(
-                        channels,
-                        channels,
-                        kernel_size,
-                        1,
-                        dilation=dilation[1],
-                        padding=get_padding(kernel_size, dilation[1]),
-                    )
-                ),
-            ]
-        )
-        self.convs.apply(init_weights)
-
-    def forward(self, x, x_mask=None):
-        for c in self.convs:
-            xt = F.leaky_relu(x, LRELU_SLOPE)
-            if x_mask is not None:
-                xt = xt * x_mask
-            xt = c(xt)
-            x = xt + x
-        if x_mask is not None:
-            x = x * x_mask
-        return x
-
-    def remove_weight_norm(self):
-        for l in self.convs:
-            remove_weight_norm(l)
-
-
 class Log(nn.Module):
     def forward(self, x, x_mask, reverse=False, **kwargs):
         if not reverse:
vits/modules_grl.py ADDED
@@ -0,0 +1,62 @@
+# Adapted from https://github.com/ubisoft/ubisoft-laforge-daft-exprt Apache License Version 2.0
+# Unsupervised Domain Adaptation by Backpropagation
+
+import torch
+import torch.nn as nn
+
+from torch.autograd import Function
+from torch.nn.utils import weight_norm
+
+
+class GradientReversalFunction(Function):
+    @staticmethod
+    def forward(ctx, x, lambda_):
+        ctx.lambda_ = lambda_
+        return x.clone()
+
+    @staticmethod
+    def backward(ctx, grads):
+        lambda_ = ctx.lambda_
+        lambda_ = grads.new_tensor(lambda_)
+        dx = -lambda_ * grads
+        return dx, None
+
+
+class GradientReversal(torch.nn.Module):
+    ''' Gradient Reversal Layer
+            Y. Ganin, V. Lempitsky,
+            "Unsupervised Domain Adaptation by Backpropagation",
+            in ICML, 2015.
+        The forward pass is the identity function.
+        In the backward pass, upstream gradients are multiplied by -lambda (i.e. gradients are reversed).
+    '''
+
+    def __init__(self, lambda_reversal=1):
+        super(GradientReversal, self).__init__()
+        self.lambda_ = lambda_reversal
+
+    def forward(self, x):
+        return GradientReversalFunction.apply(x, self.lambda_)
+
+
+class SpeakerClassifier(nn.Module):
+
+    def __init__(self, embed_dim, spk_dim):
+        super(SpeakerClassifier, self).__init__()
+        self.classifier = nn.Sequential(
+            GradientReversal(lambda_reversal=1),
+            weight_norm(nn.Conv1d(embed_dim, embed_dim, kernel_size=5, padding=2)),
+            nn.ReLU(),
+            weight_norm(nn.Conv1d(embed_dim, embed_dim, kernel_size=5, padding=2)),
+            nn.ReLU(),
+            weight_norm(nn.Conv1d(embed_dim, spk_dim, kernel_size=5, padding=2))
+        )
+
+    def forward(self, x):
+        ''' Forward pass of the speaker classifier:
+                x = (B, embed_dim, len)
+        '''
+        # pass through classifier, then average over time
+        outputs = self.classifier(x)  # (B, spk_dim, len)
+        outputs = torch.mean(outputs, dim=-1)
+        return outputs
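The layer is easy to verify in isolation: the forward pass is the identity, and the backward pass flips the sign of the incoming gradient. A small self-contained check:

import torch

from vits.modules_grl import GradientReversal

grl = GradientReversal(lambda_reversal=1)
x = torch.randn(3, requires_grad=True)

grl(x).sum().backward()

# Without the GRL, d(sum)/dx would be +1 everywhere; the GRL negates it.
assert torch.allclose(x.grad, -torch.ones(3))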
vits/utils.py CHANGED
@@ -1,10 +1,6 @@
-import os
-import argparse
-import numpy as np
 import torch
-
+import numpy as np
 from scipy.io.wavfile import read
-from omegaconf import OmegaConf

 MATPLOTLIB_FLAG = False

@@ -35,18 +31,3 @@ def f0_to_coarse(f0):
     assert f0_coarse.max() <= 255 and f0_coarse.min(
     ) >= 1, (f0_coarse.max(), f0_coarse.min())
     return f0_coarse
-
-
-def get_hparams(init=True):
-    parser = argparse.ArgumentParser()
-    parser.add_argument('-c', '--config', type=str, default="./configs/base.yaml",
-                        help='YAML file for configuration')
-    args = parser.parse_args()
-    hparams = OmegaConf.load(args.config)
-    model_dir = os.path.join("./logs", hparams.train.model)
-    if not os.path.exists(model_dir):
-        os.makedirs(model_dir)
-    config_save_path = os.path.join(model_dir, "config.json")
-    os.system(f"cp {args.config} {config_save_path}")
-    hparams.model_dir = model_dir
-    return hparams
vits_decoder/__init__.py CHANGED
@@ -0,0 +1 @@
+from .alias.act import SnakeAlias
vits_decoder/alias/act.py CHANGED
@@ -1,7 +1,12 @@
 # Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0
 # LICENSE is in incl_licenses directory.

+import torch
 import torch.nn as nn
+import torch.nn.functional as F
+
+from torch import sin, pow
+from torch.nn import Parameter
 from .resample import UpSample1d, DownSample1d


@@ -19,6 +24,102 @@ class Activation1d(nn.Module):
         self.upsample = UpSample1d(up_ratio, up_kernel_size)
         self.downsample = DownSample1d(down_ratio, down_kernel_size)

+    # x: [B,C,T]
+    def forward(self, x):
+        x = self.upsample(x)
+        x = self.act(x)
+        x = self.downsample(x)
+
+        return x
+
+
+class SnakeBeta(nn.Module):
+    '''
+    A modified Snake function which uses separate parameters for the magnitude of the periodic components
+    Shape:
+        - Input: (B, C, T)
+        - Output: (B, C, T), same shape as the input
+    Parameters:
+        - alpha - trainable parameter that controls frequency
+        - beta - trainable parameter that controls magnitude
+    References:
+        - This activation function is a modified version based on this paper by Liu Ziyin, Tilman Hartwig, Masahito Ueda:
+          https://arxiv.org/abs/2006.08195
+    Examples:
+        >>> a1 = SnakeBeta(256)
+        >>> x = torch.randn(256)
+        >>> x = a1(x)
+    '''
+
+    def __init__(self, in_features, alpha=1.0, alpha_trainable=True, alpha_logscale=False):
+        '''
+        Initialization.
+        INPUT:
+            - in_features: shape of the input
+            - alpha - trainable parameter that controls frequency
+            - beta - trainable parameter that controls magnitude
+            alpha is initialized to 1 by default, higher values = higher-frequency.
+            beta is initialized to 1 by default, higher values = higher-magnitude.
+            alpha will be trained along with the rest of your model.
+        '''
+        super(SnakeBeta, self).__init__()
+        self.in_features = in_features
+        # initialize alpha
+        self.alpha_logscale = alpha_logscale
+        if self.alpha_logscale:  # log scale alphas initialized to zeros
+            self.alpha = Parameter(torch.zeros(in_features) * alpha)
+            self.beta = Parameter(torch.zeros(in_features) * alpha)
+        else:  # linear scale alphas initialized to ones
+            self.alpha = Parameter(torch.ones(in_features) * alpha)
+            self.beta = Parameter(torch.ones(in_features) * alpha)
+        self.alpha.requires_grad = alpha_trainable
+        self.beta.requires_grad = alpha_trainable
+        self.no_div_by_zero = 0.000000001
+
+    def forward(self, x):
+        '''
+        Forward pass of the function.
+        Applies the function to the input elementwise.
+        SnakeBeta := x + 1/b * sin^2(x * a)
+        '''
+        alpha = self.alpha.unsqueeze(
+            0).unsqueeze(-1)  # line up with x to [B, C, T]
+        beta = self.beta.unsqueeze(0).unsqueeze(-1)
+        if self.alpha_logscale:
+            alpha = torch.exp(alpha)
+            beta = torch.exp(beta)
+        x = x + (1.0 / (beta + self.no_div_by_zero)) * pow(sin(x * alpha), 2)
+        return x
+
+
+class Mish(nn.Module):
+    """
+    Mish activation function, proposed in "Mish: A Self
+    Regularized Non-Monotonic Neural Activation Function",
+    https://arxiv.org/abs/1908.08681.
+    """
+
+    def __init__(self):
+        super().__init__()
+
+    def forward(self, x):
+        return x * torch.tanh(F.softplus(x))
+
+
+class SnakeAlias(nn.Module):
+    def __init__(self,
+                 channels,
+                 up_ratio: int = 2,
+                 down_ratio: int = 2,
+                 up_kernel_size: int = 12,
+                 down_kernel_size: int = 12):
+        super().__init__()
+        self.up_ratio = up_ratio
+        self.down_ratio = down_ratio
+        self.act = SnakeBeta(channels, alpha_logscale=True)
+        self.upsample = UpSample1d(up_ratio, up_kernel_size)
+        self.downsample = DownSample1d(down_ratio, down_kernel_size)
+
     # x: [B,C,T]
     def forward(self, x):
         x = self.upsample(x)
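With alpha_logscale=True both parameters start at exp(0) = 1, so a freshly constructed SnakeBeta computes x + sin(x)^2 (up to the tiny no_div_by_zero guard), and SnakeAlias wraps it between 2x up- and downsampling to suppress the aliasing the periodic nonlinearity would otherwise introduce. A quick numeric check of the fresh-init behaviour:

import torch

from vits_decoder.alias.act import SnakeBeta

act = SnakeBeta(4, alpha_logscale=True)   # alpha = beta = exp(0) = 1 at init
x = torch.randn(2, 4, 8)                  # (B, C, T)

expected = x + torch.sin(x) ** 2 / (1.0 + act.no_div_by_zero)
assert torch.allclose(act(x), expected)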
vits_decoder/alias/activations.py DELETED
File without changes
vits_decoder/bigv.py CHANGED
@@ -1,14 +1,9 @@
 import torch
-import torch.nn.functional as F
 import torch.nn as nn

-from torch import nn, sin, pow
-from torch.nn import Parameter
 from torch.nn import Conv1d
 from torch.nn.utils import weight_norm, remove_weight_norm
-
-
-from .alias import *
+from .alias.act import SnakeAlias


 def init_weights(m, mean=0.0, std=0.01):
@@ -21,69 +16,9 @@ def get_padding(kernel_size, dilation=1):
     return int((kernel_size*dilation - dilation)/2)


-class SnakeBeta(nn.Module):
-    '''
-    A modified Snake function which uses separate parameters for the magnitude of the periodic components
-    Shape:
-        - Input: (B, C, T)
-        - Output: (B, C, T), same shape as the input
-    Parameters:
-        - alpha - trainable parameter that controls frequency
-        - beta - trainable parameter that controls magnitude
-    References:
-        - This activation function is a modified version based on this paper by Liu Ziyin, Tilman Hartwig, Masahito Ueda:
-          https://arxiv.org/abs/2006.08195
-    Examples:
-        >>> a1 = SnakeBeta(256)
-        >>> x = torch.randn(256)
-        >>> x = a1(x)
-    '''
-
-    def __init__(self, in_features, alpha=1.0, alpha_trainable=True, alpha_logscale=False):
-        '''
-        Initialization.
-        INPUT:
-            - in_features: shape of the input
-            - alpha - trainable parameter that controls frequency
-            - beta - trainable parameter that controls magnitude
-            alpha is initialized to 1 by default, higher values = higher-frequency.
-            beta is initialized to 1 by default, higher values = higher-magnitude.
-            alpha will be trained along with the rest of your model.
-        '''
-        super(SnakeBeta, self).__init__()
-        self.in_features = in_features
-        # initialize alpha
-        self.alpha_logscale = alpha_logscale
-        if self.alpha_logscale:  # log scale alphas initialized to zeros
-            self.alpha = Parameter(torch.zeros(in_features) * alpha)
-            self.beta = Parameter(torch.zeros(in_features) * alpha)
-        else:  # linear scale alphas initialized to ones
-            self.alpha = Parameter(torch.ones(in_features) * alpha)
-            self.beta = Parameter(torch.ones(in_features) * alpha)
-        self.alpha.requires_grad = alpha_trainable
-        self.beta.requires_grad = alpha_trainable
-        self.no_div_by_zero = 0.000000001
-
-    def forward(self, x):
-        '''
-        Forward pass of the function.
-        Applies the function to the input elementwise.
-        SnakeBeta := x + 1/b * sin^2 (xa)
-        '''
-        alpha = self.alpha.unsqueeze(
-            0).unsqueeze(-1)  # line up with x to [B, C, T]
-        beta = self.beta.unsqueeze(0).unsqueeze(-1)
-        if self.alpha_logscale:
-            alpha = torch.exp(alpha)
-            beta = torch.exp(beta)
-        x = x + (1.0 / (beta + self.no_div_by_zero)) * pow(sin(x * alpha), 2)
-        return x
-
-
 class AMPBlock(torch.nn.Module):
-    def __init__(self,
+    def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)):
         super(AMPBlock, self).__init__()
-        self.h = h
         self.convs1 = nn.ModuleList([
             weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0],
                                padding=get_padding(kernel_size, dilation[0]))),
@@ -109,9 +44,7 @@ class AMPBlock(torch.nn.Module):

         # periodic nonlinearity with snakebeta function and anti-aliasing
         self.activations = nn.ModuleList([
-
-            activation=SnakeBeta(channels, alpha_logscale=True))
-            for _ in range(self.num_layers)
+            SnakeAlias(channels) for _ in range(self.num_layers)
         ])

     def forward(self, x):
vits_decoder/discriminator.py CHANGED
@@ -1,32 +1,39 @@
 import torch
 import torch.nn as nn

+from omegaconf import OmegaConf
+from .msd import ScaleDiscriminator
 from .mpd import MultiPeriodDiscriminator
 from .mrd import MultiResolutionDiscriminator


 class Discriminator(nn.Module):
     def __init__(self, hp):
         super(Discriminator, self).__init__()
         self.MRD = MultiResolutionDiscriminator(hp)
         self.MPD = MultiPeriodDiscriminator(hp)
+        self.MSD = ScaleDiscriminator()

     def forward(self, x):
-
+        r = self.MRD(x)
+        p = self.MPD(x)
+        s = self.MSD(x)
+        return r + p + s


 if __name__ == '__main__':
-    hp = OmegaConf.load('../config/
+    hp = OmegaConf.load('../config/base.yaml')
     model = Discriminator(hp)

     x = torch.randn(3, 1, 16384)
     print(x.shape)

-
-    for features, score in
+    output = model(x)
+    for features, score in output:
         for feat in features:
             print(feat.shape)
         print(score.shape)

-    pytorch_total_params = sum(p.numel()
+    pytorch_total_params = sum(p.numel()
+                               for p in model.parameters() if p.requires_grad)
     print(pytorch_total_params)
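Each sub-discriminator returns a list of (feature_maps, score) pairs, so the r + p + s in forward is list concatenation, not addition: the combined discriminator simply hands every pair to the GAN loss. A sketch with dummy tensors (the shapes and the LSGAN-style loss are illustrative assumptions):

import torch

r = [([torch.randn(1, 32, 100)], torch.randn(1, 50))]    # e.g. from MRD
p = [([torch.randn(1, 32, 20)], torch.randn(1, 10))]     # e.g. one MPD period
s = [([torch.randn(1, 16, 4096)], torch.randn(1, 128))]  # from the new MSD

outputs = r + p + s   # three (features, score) pairs in one flat list
loss_d = sum(torch.mean((1 - score) ** 2) for _, score in outputs)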
vits_decoder/generator.py CHANGED
@@ -1,5 +1,6 @@
 import torch
 import torch.nn as nn
+import torch.nn.functional as F
 import numpy as np

 from torch.nn import Conv1d
@@ -8,8 +9,7 @@ from torch.nn.utils import weight_norm
 from torch.nn.utils import remove_weight_norm

 from .nsf import SourceModuleHnNSF
-from .bigv import init_weights,
-from .alias import Activation1d
+from .bigv import init_weights, AMPBlock, SnakeAlias


 class SpeakerAdapter(nn.Module):
@@ -57,24 +57,28 @@ class Generator(torch.nn.Module):
         # speaker adaper, 256 should change by what speaker encoder you use
         self.adapter = SpeakerAdapter(hp.vits.spk_dim, hp.gen.upsample_input)
         # pre conv
-        self.conv_pre =
-
+        self.conv_pre = Conv1d(hp.gen.upsample_input,
+                               hp.gen.upsample_initial_channel, 7, 1, padding=3)
         # nsf
         self.f0_upsamp = torch.nn.Upsample(
             scale_factor=np.prod(hp.gen.upsample_rates))
-        self.m_source = SourceModuleHnNSF()
+        self.m_source = SourceModuleHnNSF(sampling_rate=hp.data.sampling_rate)
         self.noise_convs = nn.ModuleList()
         # transposed conv-based upsamplers. does not apply anti-aliasing
         self.ups = nn.ModuleList()
         for i, (u, k) in enumerate(zip(hp.gen.upsample_rates, hp.gen.upsample_kernel_sizes)):
             # print(f'ups: {i} {k}, {u}, {(k - u) // 2}')
             # base
-            self.ups.append(
-                weight_norm(
-
-
-
+            self.ups.append(
+                weight_norm(
+                    ConvTranspose1d(
+                        hp.gen.upsample_initial_channel // (2 ** i),
+                        hp.gen.upsample_initial_channel // (2 ** (i + 1)),
+                        k,
+                        u,
+                        padding=(k - u) // 2)
+                )
+            )
             # nsf
             if i + 1 < len(hp.gen.upsample_rates):
                 stride_f0 = np.prod(hp.gen.upsample_rates[i + 1:])
@@ -99,32 +103,30 @@ class Generator(torch.nn.Module):
         for i in range(len(self.ups)):
             ch = hp.gen.upsample_initial_channel // (2 ** (i + 1))
             for k, d in zip(hp.gen.resblock_kernel_sizes, hp.gen.resblock_dilation_sizes):
-                self.resblocks.append(AMPBlock(
+                self.resblocks.append(AMPBlock(ch, k, d))

         # post conv
-        activation_post =
-        self.
-        self.conv_post = weight_norm(Conv1d(ch, 1, 7, 1, padding=3))
-
+        self.activation_post = SnakeAlias(ch)
+        self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False)
         # weight initialization
-
-        self.ups[i].apply(init_weights)
-        self.conv_post.apply(init_weights)
+        self.ups.apply(init_weights)

     def forward(self, spk, x, f0):
+        # Perturbation
+        x = x + torch.randn_like(x)
         # adapter
         x = self.adapter(x, spk)
+        x = self.conv_pre(x)
+        x = x * torch.tanh(F.softplus(x))
         # nsf
         f0 = f0[:, None]
         f0 = self.f0_upsamp(f0).transpose(1, 2)
         har_source = self.m_source(f0)
         har_source = har_source.transpose(1, 2)
-        x = self.conv_pre(x)

         for i in range(self.num_upsamples):
             # upsampling
-
-            x = self.ups[i][i_up](x)
+            x = self.ups[i](x)
             # nsf
             x_source = self.noise_convs[i](har_source)
             x = x + x_source
@@ -145,12 +147,9 @@ class Generator(torch.nn.Module):

     def remove_weight_norm(self):
         for l in self.ups:
-
-            remove_weight_norm(l_i)
+            remove_weight_norm(l)
         for l in self.resblocks:
             l.remove_weight_norm()
-        remove_weight_norm(self.conv_pre)
-        remove_weight_norm(self.conv_post)

     def eval(self, inference=False):
         super(Generator, self).eval()
@@ -177,11 +176,11 @@ class Generator(torch.nn.Module):
         # adapter
         x = self.adapter(x, spk)
         x = self.conv_pre(x)
+        x = x * torch.tanh(F.softplus(x))

         for i in range(self.num_upsamples):
             # upsampling
-
-            x = self.ups[i][i_up](x)
+            x = self.ups[i](x)
             # nsf
             x_source = self.noise_convs[i](har_source)
             x = x + x_source
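The new x * torch.tanh(F.softplus(x)) after conv_pre is exactly the Mish activation added in act.py, written inline. A one-line check:

import torch
import torch.nn.functional as F

from vits_decoder.alias.act import Mish

x = torch.randn(2, 320, 50)   # (B, upsample_initial_channel, frames)
assert torch.allclose(x * torch.tanh(F.softplus(x)), Mish()(x))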
vits_decoder/med.py ADDED
@@ -0,0 +1,65 @@
+import torch
+import torchaudio
+import typing as T
+
+
+class MelspecDiscriminator(torch.nn.Module):
+    """mel spectrogram (frequency domain) discriminator"""
+
+    def __init__(self) -> None:
+        super().__init__()
+        self.SAMPLE_RATE = 48000
+        # mel filterbank transform
+        self._melspec = torchaudio.transforms.MelSpectrogram(
+            sample_rate=self.SAMPLE_RATE,
+            n_fft=2048,
+            win_length=int(0.025 * self.SAMPLE_RATE),
+            hop_length=int(0.010 * self.SAMPLE_RATE),
+            n_mels=128,
+            power=1,
+        )
+
+        # time-frequency 2D convolutions
+        kernel_sizes = [(7, 7), (4, 4), (4, 4), (4, 4)]
+        strides = [(1, 2), (1, 2), (1, 2), (1, 2)]
+        self._convs = torch.nn.ModuleList(
+            [
+                torch.nn.Sequential(
+                    torch.nn.Conv2d(
+                        in_channels=1 if i == 0 else 32,
+                        out_channels=64,
+                        kernel_size=k,
+                        stride=s,
+                        padding=(1, 2),
+                        bias=False,
+                    ),
+                    torch.nn.BatchNorm2d(num_features=64),
+                    torch.nn.GLU(dim=1),
+                )
+                for i, (k, s) in enumerate(zip(kernel_sizes, strides))
+            ]
+        )
+
+        # output adversarial projection
+        self._postnet = torch.nn.Conv2d(
+            in_channels=32,
+            out_channels=1,
+            kernel_size=(15, 3),
+            stride=(1, 2),
+        )
+
+    def forward(self, x: torch.Tensor) -> T.List[T.Tuple[T.List[torch.Tensor], torch.Tensor]]:
+        # apply the log-scale mel spectrogram transform
+        x = torch.log(self._melspec(x) + 1e-5)
+
+        # compute hidden layers and feature maps
+        f = []
+        for c in self._convs:
+            x = c(x)
+            f.append(x)
+
+        # apply the output projection and global average pooling
+        x = self._postnet(x)
+        x = x.mean(dim=[-2, -1])
+
+        return [(f, x)]
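The mel-spectrogram discriminator keeps the same [(feature_maps, score)] contract as the other discriminators; note its hard-coded 48 kHz sample rate, and that this commit does not wire it into Discriminator. A shape-check sketch:

import torch

from vits_decoder.med import MelspecDiscriminator

med = MelspecDiscriminator()
wav = torch.randn(1, 1, 48000)   # one second at the discriminator's 48 kHz

[(fmaps, score)] = med(wav)
print(len(fmaps), score.shape)   # 4 feature maps and a pooled (1, 1) score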
vits_decoder/msd.py ADDED
@@ -0,0 +1,29 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch.nn.utils import weight_norm
+
+
+class ScaleDiscriminator(torch.nn.Module):
+    def __init__(self):
+        super(ScaleDiscriminator, self).__init__()
+        self.convs = nn.ModuleList([
+            weight_norm(nn.Conv1d(1, 16, 15, 1, padding=7)),
+            weight_norm(nn.Conv1d(16, 64, 41, 4, groups=4, padding=20)),
+            weight_norm(nn.Conv1d(64, 256, 41, 4, groups=16, padding=20)),
+            weight_norm(nn.Conv1d(256, 1024, 41, 4, groups=64, padding=20)),
+            weight_norm(nn.Conv1d(1024, 1024, 41, 4, groups=256, padding=20)),
+            weight_norm(nn.Conv1d(1024, 1024, 5, 1, padding=2)),
+        ])
+        self.conv_post = weight_norm(nn.Conv1d(1024, 1, 3, 1, padding=1))
+
+    def forward(self, x):
+        fmap = []
+        for l in self.convs:
+            x = l(x)
+            x = F.leaky_relu(x, 0.1)
+            fmap.append(x)
+        x = self.conv_post(x)
+        fmap.append(x)
+        x = torch.flatten(x, 1, -1)
+        return [(fmap, x)]
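ScaleDiscriminator follows the same [(feature_maps, score)] convention, which is what lets Discriminator.forward concatenate its output with MRD and MPD. A quick smoke test:

import torch

from vits_decoder.msd import ScaleDiscriminator

msd = ScaleDiscriminator()
wav = torch.randn(1, 1, 16384)   # (B, 1, samples)

[(fmap, score)] = msd(wav)
print(len(fmap), score.shape)    # 7 feature maps, score flattened to (1, 64)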
vits_decoder/nsf.py CHANGED
@@ -356,34 +356,15 @@ class SourceModuleCycNoise_v1(torch.nn.Module):


 class SourceModuleHnNSF(torch.nn.Module):
-    """SourceModule for hn-nsf
-    SourceModule(sampling_rate, harmonic_num=0, sine_amp=0.1,
-                 add_noise_std=0.003, voiced_threshod=0)
-    sampling_rate: sampling_rate in Hz
-    harmonic_num: number of harmonics above F0 (default: 0)
-    sine_amp: amplitude of sine source signal (default: 0.1)
-    add_noise_std: std of additive Gaussian noise (default: 0.003)
-        note that amplitude of noise in unvoiced is decided
-        by sine_amp
-    voiced_threshold: threshold to set U/V given F0 (default: 0)
-
-    Sine_source, noise_source = SourceModuleHnNSF(F0_sampled)
-    F0_sampled (batchsize, length, 1)
-    Sine_source (batchsize, length, 1)
-    noise_source (batchsize, length, 1)
-    uv (batchsize, length, 1)
-    """
-
     def __init__(
         self,
-        sampling_rate=
-        harmonic_num=10,
+        sampling_rate=32000,
         sine_amp=0.1,
         add_noise_std=0.003,
         voiced_threshod=0,
     ):
         super(SourceModuleHnNSF, self).__init__()
-
+        harmonic_num = 10
         self.sine_amp = sine_amp
         self.noise_std = add_noise_std

@@ -393,17 +374,21 @@ class SourceModuleHnNSF(torch.nn.Module):
         )

         # to merge source harmonics into a single excitation
-        self.l_linear = torch.nn.Linear(harmonic_num + 1, 1)
         self.l_tanh = torch.nn.Tanh()
+        self.register_buffer('merge_w', torch.FloatTensor([[
+            0.2942, -0.2243, 0.0033, -0.0056, -0.0020, -0.0046,
+            0.0221, -0.0083, -0.0241, -0.0036, -0.0581]]))
+        self.register_buffer('merge_b', torch.FloatTensor([0.0008]))

     def forward(self, x):
         """
-        Sine_source
+        Sine_source = SourceModuleHnNSF(F0_sampled)
         F0_sampled (batchsize, length, 1)
         Sine_source (batchsize, length, 1)
-        noise_source (batchsize, length 1)
         """
         # source for harmonic branch
         sine_wavs = self.l_sin_gen(x)
-
+        sine_wavs = torch_nn_func.linear(
+            sine_wavs, self.merge_w) + self.merge_b
+        sine_merge = self.l_tanh(sine_wavs)
         return sine_merge
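Replacing the trainable nn.Linear(harmonic_num + 1, 1) with register_buffer weights freezes the harmonic merge at the listed values: buffers move with .to(device) and are saved in the state dict but receive no gradient. F.linear with those buffers reproduces what the old layer computed:

import torch
import torch.nn.functional as F

merge_w = torch.FloatTensor([[0.2942, -0.2243, 0.0033, -0.0056, -0.0020, -0.0046,
                              0.0221, -0.0083, -0.0241, -0.0036, -0.0581]])
merge_b = torch.FloatTensor([0.0008])

sine_wavs = torch.randn(2, 100, 11)   # (B, length, harmonic_num + 1)
sine_merge = torch.tanh(F.linear(sine_wavs, merge_w) + merge_b)
print(sine_merge.shape)               # torch.Size([2, 100, 1])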
vits_pretrain/{sovits5.0-48k-debug.pth → sovits5.0_bigvgan.pth} RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:ffed3845044b8bef076d72272da19791e1344ad3b750a02d6e4980acf6cb0a0b
+size 74825605