Little-W commited on
Commit
26a9232
·
1 Parent(s): 0cbc49b
app.py ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from inference.infer_tool_grad import VitsSvc
2
+ import gradio as gr
3
+ import os
4
+
5
class VitsGradio:
    """Gradio front end around ``VitsSvc``.

    The UI has two tabs: "SelectModel" loads a checkpoint on a chosen device,
    and "VoiceConversion" (hidden until a model has been loaded) runs the
    conversion on an uploaded audio clip.
    """

    def __init__(self):
        self.so = VitsSvc()
        self.lspk = []
        # Only the immediate sub-directories of "checkpoints" are model
        # folders: the loader resolves a model as "checkpoints/<name>/...",
        # so names of deeper directories could never be loaded.
        self.modelPaths = []
        if os.path.isdir("checkpoints"):
            self.modelPaths = sorted(
                entry for entry in os.listdir("checkpoints")
                if os.path.isdir(os.path.join("checkpoints", entry))
            )
        # Avoid an IndexError when no checkpoint directory is present.
        default_model = self.modelPaths[0] if self.modelPaths else None
        with gr.Blocks() as self.Vits:
            with gr.Tab("VoiceConversion"):
                # Hidden until loadModel() makes it visible.
                with gr.Row(visible=False) as self.VoiceConversion:
                    with gr.Column():
                        with gr.Row():
                            with gr.Column():
                                self.srcaudio = gr.Audio(label = "输入音频")
                                self.btnVC = gr.Button("说话人转换")
                            with gr.Column():
                                self.dsid = gr.Dropdown(label = "目标角色", choices = self.lspk)
                                self.tran = gr.Slider(label = "升降调", maximum = 60, minimum = -60, step = 1, value = 0)
                                self.th = gr.Slider(label = "切片阈值", maximum = 32767, minimum = -32768, step = 0.1, value = -40)
                        with gr.Row():
                            self.VCOutputs = gr.Audio()
                self.btnVC.click(self.so.inference, inputs=[self.srcaudio,self.dsid,self.tran,self.th], outputs=[self.VCOutputs])
            with gr.Tab("SelectModel"):
                with gr.Column():
                    modelstrs = gr.Dropdown(label = "模型", choices = self.modelPaths, value = default_model, type = "value")
                    devicestrs = gr.Dropdown(label = "设备", choices = ["cpu","cuda"], value = "cpu", type = "value")
                    btnMod = gr.Button("载入模型")
                    btnMod.click(self.loadModel, inputs=[modelstrs,devicestrs], outputs = [self.dsid,self.VoiceConversion])

    def loadModel(self, path, device):
        """Load checkpoint ``path`` on ``device`` and refresh the speaker list.

        Returns two Gradio updates, in the order expected by the click
        handler's outputs: the new speaker dropdown choices, then the update
        that makes the conversion row visible.
        """
        self.so.set_device(device)
        self.so.loadCheckpoint(path)
        self.lspk = [spk for spk, _sid in self.so.hps.spk.items()]
        VChange = gr.update(visible = True)
        # A config without speakers yields an empty dropdown instead of an IndexError.
        first_speaker = self.lspk[0] if self.lspk else None
        SDChange = gr.update(choices = self.lspk, value = first_speaker)
        return [SDChange,VChange]
44
+
45
# The UI object is still built at import time so ``grVits`` remains a module
# attribute; the web server is only started when this file is run as a script.
grVits = VitsGradio()

if __name__ == "__main__":
    grVits.Vits.launch()
checkpoints/nozomi/config.json ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "train": {
3
+ "log_interval": 200,
4
+ "eval_interval": 1000,
5
+ "seed": 1234,
6
+ "epochs": 10000,
7
+ "learning_rate": 0.0001,
8
+ "betas": [
9
+ 0.8,
10
+ 0.99
11
+ ],
12
+ "eps": 1e-09,
13
+ "batch_size": 12,
14
+ "fp16_run": false,
15
+ "lr_decay": 0.999875,
16
+ "segment_size": 17920,
17
+ "init_lr_ratio": 1,
18
+ "warmup_epochs": 0,
19
+ "c_mel": 45,
20
+ "c_kl": 1.0,
21
+ "use_sr": true,
22
+ "max_speclen": 384,
23
+ "port": "8001"
24
+ },
25
+ "data": {
26
+ "training_files": "filelists/train.txt",
27
+ "validation_files": "filelists/val.txt",
28
+ "max_wav_value": 32768.0,
29
+ "sampling_rate": 32000,
30
+ "filter_length": 1280,
31
+ "hop_length": 320,
32
+ "win_length": 1280,
33
+ "n_mel_channels": 80,
34
+ "mel_fmin": 0.0,
35
+ "mel_fmax": null
36
+ },
37
+ "model": {
38
+ "inter_channels": 192,
39
+ "hidden_channels": 192,
40
+ "filter_channels": 768,
41
+ "n_heads": 2,
42
+ "n_layers": 6,
43
+ "kernel_size": 3,
44
+ "p_dropout": 0.1,
45
+ "resblock": "1",
46
+ "resblock_kernel_sizes": [
47
+ 3,
48
+ 7,
49
+ 11
50
+ ],
51
+ "resblock_dilation_sizes": [
52
+ [
53
+ 1,
54
+ 3,
55
+ 5
56
+ ],
57
+ [
58
+ 1,
59
+ 3,
60
+ 5
61
+ ],
62
+ [
63
+ 1,
64
+ 3,
65
+ 5
66
+ ]
67
+ ],
68
+ "upsample_rates": [
69
+ 10,
70
+ 8,
71
+ 2,
72
+ 2
73
+ ],
74
+ "upsample_initial_channel": 512,
75
+ "upsample_kernel_sizes": [
76
+ 16,
77
+ 16,
78
+ 4,
79
+ 4
80
+ ],
81
+ "n_layers_q": 3,
82
+ "use_spectral_norm": false,
83
+ "gin_channels": 256,
84
+ "ssl_dim": 256,
85
+ "n_speakers": 2
86
+ },
87
+ "spk": {
88
+ "speaker0": 0
89
+ }
90
+ }
checkpoints/nozomi/model.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:eaed87ecd86eb78a1eba23df903950f5a1e7a3568ddfb7ecea5cf4796edec91a
3
+ size 699497375
inference/__init__.py ADDED
File without changes
inference/__pycache__/__init__.cpython-38.pyc ADDED
Binary file (133 Bytes). View file
 
inference/__pycache__/infer_tool_grad.cpython-38.pyc ADDED
Binary file (5.94 kB). View file
 
inference/__pycache__/slicer.cpython-38.pyc ADDED
Binary file (3.83 kB). View file
 
inference/infer_tool.py ADDED
@@ -0,0 +1,327 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import hashlib
2
+ import json
3
+ import logging
4
+ import os
5
+ import time
6
+ from pathlib import Path
7
+
8
+ import librosa
9
+ import maad
10
+ import numpy as np
11
+ # import onnxruntime
12
+ import parselmouth
13
+ import soundfile
14
+ import torch
15
+ import torchaudio
16
+
17
+ from hubert import hubert_model
18
+ import utils
19
+ from models import SynthesizerTrn
20
+
21
+ logging.getLogger('matplotlib').setLevel(logging.WARNING)
22
+
23
+
24
def read_temp(file_name):
    """Load the JSON cache stored in ``file_name``.

    If the file does not exist it is created containing the marker
    ``{"info": "temp_dict"}`` and an empty dict is returned. If the file
    cannot be read or parsed, the error is printed and the marker dict is
    returned so the caller can rebuild the cache.

    When the file is larger than 50 MiB, entries whose ``"time"`` value is
    more than 14 days old are dropped from the returned dict (the file itself
    is not rewritten here).
    """
    if not os.path.exists(file_name):
        with open(file_name, "w") as f:
            f.write(json.dumps({"info": "temp_dict"}))
        return {}
    try:
        with open(file_name, "r") as f:
            data = f.read()
        data_dict = json.loads(data)
        if os.path.getsize(file_name) > 50 * 1024 * 1024:
            f_name = file_name.replace("\\", "/").split("/")[-1]
            print(f"clean {f_name}")
            now = int(time.time())
            for wav_hash in list(data_dict.keys()):
                entry = data_dict[wav_hash]
                # Entries such as the "info" marker are not timestamped
                # records; indexing them with ["time"] would raise and make
                # the handler below throw the whole cache away.
                if not isinstance(entry, dict) or "time" not in entry:
                    continue
                if now - int(entry["time"]) > 14 * 24 * 3600:
                    del data_dict[wav_hash]
    except Exception as e:
        # Deliberate best effort: any failure falls back to a fresh cache.
        print(e)
        print(f"{file_name} error,auto rebuild file")
        data_dict = {"info": "temp_dict"}
    return data_dict
45
+
46
+
47
def write_temp(file_name, data):
    """Overwrite ``file_name`` with the JSON serialization of ``data``."""
    with open(file_name, "w") as handle:
        payload = json.dumps(data)
        handle.write(payload)
50
+
51
+
52
def timeit(func):
    """Decorator that prints how long each call to ``func`` took.

    The wrapped function's return value is passed through unchanged. The
    wrapper keeps the name and docstring of ``func`` so that decorated
    functions remain identifiable.
    """
    import functools

    @functools.wraps(func)
    def run(*args, **kwargs):
        t = time.time()
        res = func(*args, **kwargs)
        print('executing \'%s\' costed %.3fs' % (func.__name__, time.time() - t))
        return res

    return run
60
+
61
+
62
def format_wav(audio_path):
    """Write a ``.wav`` copy of ``audio_path`` next to it unless it already is one.

    Files whose suffix is exactly ``.wav`` are left alone. Anything else is
    loaded as mono at its native sample rate and written to the same path
    with the suffix replaced by ``.wav``.
    """
    source = Path(audio_path)
    if source.suffix != '.wav':
        raw_audio, raw_sample_rate = librosa.load(audio_path, mono=True, sr=None)
        soundfile.write(source.with_suffix(".wav"), raw_audio, raw_sample_rate)
67
+
68
+
69
def get_end_file(dir_path, end):
    """Recursively collect files under ``dir_path`` whose name ends with ``end``.

    Files and directories whose name starts with a dot are skipped (hidden
    directories are not descended into). Returned paths use forward slashes.
    """
    matches = []
    for root, dirs, files in os.walk(dir_path):
        # Prune in place so os.walk does not descend into hidden directories.
        dirs[:] = [d for d in dirs if not d.startswith('.')]
        for name in files:
            if name.startswith('.'):
                continue
            if name.endswith(end):
                matches.append(os.path.join(root, name).replace("\\", "/"))
    return matches
78
+
79
+
80
def get_md5(content):
    """Return the hexadecimal MD5 digest of the bytes-like ``content``."""
    digest = hashlib.new("md5")
    digest.update(content)
    return digest.hexdigest()
82
+
83
+
84
def resize2d_f0(x, target_len):
    """Linearly resample the sequence ``x`` to ``target_len`` points.

    Values below 0.001 are replaced by NaN before interpolation, and any NaN
    in the interpolated result is turned back into 0.
    """
    contour = np.array(x)
    contour[contour < 0.001] = np.nan
    src_len = len(contour)
    # Query positions expressed in source-index units.
    query = np.arange(0, src_len * target_len, src_len) / target_len
    resampled = np.interp(query, np.arange(0, src_len), contour)
    return np.nan_to_num(resampled)
91
+
92
def get_f0(x, p_len, f0_up_key=0):
    """Extract a pitch track from ``x`` and fit it to ``p_len`` frames.

    ``x`` is handed to ``parselmouth.Sound`` with a sampling frequency of
    16000, and pitch is estimated with a 10 ms step between 50 Hz and
    1100 Hz. The track is truncated or zero-padded (split across both ends)
    to ``p_len`` frames, then shifted by ``f0_up_key`` semitones.

    Returns ``(f0_coarse, f0)``: the pitch quantized on a mel scale to
    integers in 1..255 (1 for frames with no pitch), and the pitch in Hz.
    """
    time_step = 160 / 16000 * 1000
    f0_min = 50
    f0_max = 1100
    f0_mel_min = 1127 * np.log(1 + f0_min / 700)
    f0_mel_max = 1127 * np.log(1 + f0_max / 700)

    f0 = parselmouth.Sound(x, 16000).to_pitch_ac(
        time_step=time_step / 1000, voicing_threshold=0.6,
        pitch_floor=f0_min, pitch_ceiling=f0_max).selected_array['frequency']
    if len(f0) > p_len:
        f0 = f0[:p_len]
    pad_size = (p_len - len(f0) + 1) // 2
    if pad_size > 0 or p_len - len(f0) - pad_size > 0:
        f0 = np.pad(f0, [[pad_size, p_len - len(f0) - pad_size]], mode='constant')

    f0 *= pow(2, f0_up_key / 12)
    f0_mel = 1127 * np.log(1 + f0 / 700)
    f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / (f0_mel_max - f0_mel_min) + 1
    f0_mel[f0_mel <= 1] = 1
    f0_mel[f0_mel > 255] = 255
    # The np.int alias was removed in NumPy 1.24; the builtin int is the same dtype.
    f0_coarse = np.rint(f0_mel).astype(int)
    return f0_coarse, f0
116
+
117
def clean_pitch(input_pitch):
    """Flatten ``input_pitch`` to all ones when over 90% of it already equals 1.

    The array is modified in place and also returned.
    """
    placeholder_count = np.sum(input_pitch == 1)
    mostly_placeholder = placeholder_count / len(input_pitch) > 0.9
    if mostly_placeholder:
        input_pitch[input_pitch != 1] = 1
    return input_pitch
122
+
123
+
124
def plt_pitch(input_pitch):
    """Return a float copy of ``input_pitch`` with every value equal to 1 set to NaN."""
    as_float = input_pitch.astype(float)
    return np.where(as_float == 1, np.nan, as_float)
128
+
129
+
130
def f0_to_pitch(ff):
    """Map the frequency ``ff`` to ``69 + 12 * log2(ff / 440)``."""
    semitones_from_440 = 12 * np.log2(ff / 440)
    return 69 + semitones_from_440
133
+
134
+
135
def fill_a_to_b(a, b):
    """Grow ``a`` in place to the length of ``b`` by repeating ``a[0]``.

    Nothing happens when ``a`` is already at least as long as ``b``.
    """
    shortfall = len(b) - len(a)
    if shortfall > 0:
        a.extend([a[0]] * shortfall)
139
+
140
+
141
def mkdir(paths: list):
    """Create each directory in ``paths`` that does not exist yet.

    Directories are created one level at a time with ``os.mkdir``, in the
    order given.
    """
    for target in paths:
        if os.path.exists(target):
            continue
        os.mkdir(target)
145
+
146
+
147
class Svc(object):
    """Voice-conversion pipeline: HuBERT units plus pitch fed to a ``SynthesizerTrn``.

    The synthesizer checkpoint and its hyper-parameters are loaded in the
    constructor; CUDA is used when available.
    """

    def __init__(self, net_g_path, config_path, hubert_path="hubert/hubert-soft-0d54a1f4.pt",
                 onnx=False):
        self.onnx = onnx
        self.net_g_path = net_g_path
        self.hubert_path = hubert_path
        self.dev = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.net_g_ms = None
        self.hps_ms = utils.get_hparams_from_file(config_path)
        self.target_sample = self.hps_ms.data.sampling_rate
        self.hop_size = self.hps_ms.data.hop_length
        # Reverse lookup: speaker id -> speaker name.
        self.speakers = {sid: spk for spk, sid in self.hps_ms.spk.items()}
        self.spk2id = self.hps_ms.spk
        # Load HuBERT.
        self.hubert_soft = hubert_model.hubert_soft(hubert_path)
        if torch.cuda.is_available():
            self.hubert_soft = self.hubert_soft.cuda()
        self.load_model()

    def load_model(self):
        """Build the synthesizer from the loaded config and restore its weights."""
        if self.onnx:
            # The ONNX path is not implemented.
            raise NotImplementedError
        spec_channels = self.hps_ms.data.filter_length // 2 + 1
        segment_frames = self.hps_ms.train.segment_size // self.hps_ms.data.hop_length
        self.net_g_ms = SynthesizerTrn(spec_channels, segment_frames, **self.hps_ms.model)
        _ = utils.load_checkpoint(self.net_g_path, self.net_g_ms, None)
        # Checkpoints with "half" in their path run in fp16 when CUDA is available.
        if "half" in self.net_g_path and torch.cuda.is_available():
            _ = self.net_g_ms.half().eval().to(self.dev)
        else:
            _ = self.net_g_ms.eval().to(self.dev)

    def get_units(self, source, sr):
        """Run HuBERT on ``source`` and return its units; ``sr`` is unused."""
        batch = source.unsqueeze(0).to(self.dev)
        with torch.inference_mode():
            started = time.time()
            units = self.hubert_soft.units(batch)
            elapsed = time.time() - started
            print("hubert use time:{}".format(elapsed))
            return units

    def get_unit_pitch(self, in_path, tran):
        """Load audio from ``in_path`` and return ``(units, f0)``.

        The audio is resampled to 16 kHz; ``tran`` is the pitch shift in
        semitones passed on to ``get_f0``.
        """
        source, sr = torchaudio.load(in_path)
        source = torchaudio.functional.resample(source, sr, 16000)
        # NOTE(review): this tests shape[1] before averaging over dim 0 —
        # confirm whether the channel axis (shape[0]) was intended.
        if len(source.shape) == 2 and source.shape[1] >= 2:
            source = torch.mean(source, dim=0).unsqueeze(0)
        soft = self.get_units(source, sr).squeeze(0).cpu().numpy()
        # The pitch track gets two frames per unit frame.
        _, f0 = get_f0(source.cpu().numpy()[0], soft.shape[0]*2, tran)
        return soft, f0

    def infer(self, speaker_id, tran, raw_path):
        """Convert the audio at ``raw_path`` to the voice of ``speaker_id``.

        ``speaker_id`` may be a speaker name (looked up in ``spk2id``) or a
        numeric id. Returns ``(audio, length)`` where ``audio`` is a float
        tensor and ``length`` the size of its last dimension.
        """
        if type(speaker_id) is str:
            speaker_id = self.spk2id[speaker_id]
        sid = torch.LongTensor([int(speaker_id)]).to(self.dev).unsqueeze(0)
        soft, pitch = self.get_unit_pitch(raw_path, tran)
        f0 = torch.FloatTensor(clean_pitch(pitch)).unsqueeze(0).to(self.dev)
        use_half = "half" in self.net_g_path and torch.cuda.is_available()
        stn_tst = torch.HalfTensor(soft) if use_half else torch.FloatTensor(soft)
        with torch.no_grad():
            x_tst = stn_tst.unsqueeze(0).to(self.dev)
            started = time.time()
            # Repeat every unit frame twice to match the pitch track length.
            x_tst = torch.repeat_interleave(x_tst, repeats=2, dim=1).transpose(1, 2)
            audio = self.net_g_ms.infer(x_tst, f0=f0, g=sid)[0,0].data.float()
            elapsed = time.time() - started
            print("vits use time:{}".format(elapsed))
        return audio, audio.shape[-1]
228
+
229
+
230
+ # class SvcONNXInferModel(object):
231
+ # def __init__(self, hubert_onnx, vits_onnx, config_path):
232
+ # self.config_path = config_path
233
+ # self.vits_onnx = vits_onnx
234
+ # self.hubert_onnx = hubert_onnx
235
+ # self.hubert_onnx_session = onnxruntime.InferenceSession(hubert_onnx, providers=['CUDAExecutionProvider', ])
236
+ # self.inspect_onnx(self.hubert_onnx_session)
237
+ # self.vits_onnx_session = onnxruntime.InferenceSession(vits_onnx, providers=['CUDAExecutionProvider', ])
238
+ # self.inspect_onnx(self.vits_onnx_session)
239
+ # self.hps_ms = utils.get_hparams_from_file(self.config_path)
240
+ # self.target_sample = self.hps_ms.data.sampling_rate
241
+ # self.feature_input = FeatureInput(self.hps_ms.data.sampling_rate, self.hps_ms.data.hop_length)
242
+ #
243
+ # @staticmethod
244
+ # def inspect_onnx(session):
245
+ # for i in session.get_inputs():
246
+ # print("name:{}\tshape:{}\tdtype:{}".format(i.name, i.shape, i.type))
247
+ # for i in session.get_outputs():
248
+ # print("name:{}\tshape:{}\tdtype:{}".format(i.name, i.shape, i.type))
249
+ #
250
+ # def infer(self, speaker_id, tran, raw_path):
251
+ # sid = np.array([int(speaker_id)], dtype=np.int64)
252
+ # soft, pitch = self.get_unit_pitch(raw_path, tran)
253
+ # pitch = np.expand_dims(pitch, axis=0).astype(np.int64)
254
+ # stn_tst = soft
255
+ # x_tst = np.expand_dims(stn_tst, axis=0)
256
+ # x_tst_lengths = np.array([stn_tst.shape[0]], dtype=np.int64)
257
+ # # 使用ONNX Runtime进行推理
258
+ # start = time.time()
259
+ # audio = self.vits_onnx_session.run(output_names=["audio"],
260
+ # input_feed={
261
+ # "hidden_unit": x_tst,
262
+ # "lengths": x_tst_lengths,
263
+ # "pitch": pitch,
264
+ # "sid": sid,
265
+ # })[0][0, 0]
266
+ # use_time = time.time() - start
267
+ # print("vits_onnx_session.run time:{}".format(use_time))
268
+ # audio = torch.from_numpy(audio)
269
+ # return audio, audio.shape[-1]
270
+ #
271
+ # def get_units(self, source, sr):
272
+ # source = torchaudio.functional.resample(source, sr, 16000)
273
+ # if len(source.shape) == 2 and source.shape[1] >= 2:
274
+ # source = torch.mean(source, dim=0).unsqueeze(0)
275
+ # source = source.unsqueeze(0)
276
+ # # 使用ONNX Runtime进行推理
277
+ # start = time.time()
278
+ # units = self.hubert_onnx_session.run(output_names=["embed"],
279
+ # input_feed={"source": source.numpy()})[0]
280
+ # use_time = time.time() - start
281
+ # print("hubert_onnx_session.run time:{}".format(use_time))
282
+ # return units
283
+ #
284
+ # def transcribe(self, source, sr, length, transform):
285
+ # feature_pit = self.feature_input.compute_f0(source, sr)
286
+ # feature_pit = feature_pit * 2 ** (transform / 12)
287
+ # feature_pit = resize2d_f0(feature_pit, length)
288
+ # coarse_pit = self.feature_input.coarse_f0(feature_pit)
289
+ # return coarse_pit
290
+ #
291
+ # def get_unit_pitch(self, in_path, tran):
292
+ # source, sr = torchaudio.load(in_path)
293
+ # soft = self.get_units(source, sr).squeeze(0)
294
+ # input_pitch = self.transcribe(source.numpy()[0], sr, soft.shape[0], tran)
295
+ # return soft, input_pitch
296
+
297
+
298
class RealTimeVC:
    """Chunk-by-chunk voice conversion with cross-fading between chunks."""

    def __init__(self):
        self.last_chunk = None  # tail of the previous output, prepended to the next input
        self.last_o = None  # previous full output, used for the cross-fade
        self.chunk_len = 16000  # chunk length
        self.pre_len = 3840  # cross-fade length, a multiple of 640

    def process(self, svc_model, speaker_id, f_pitch_change, input_wav_path):
        """Convert one chunk of audio.

        Input and output are both 1-D numpy audio waveform arrays.
        ``input_wav_path`` is passed to ``torchaudio.load`` and must support
        ``seek`` (it is rewound before the first inference).
        """
        # Imported here because the module's import list does not bring in
        # ``io``; without it the BytesIO call below raises NameError.
        import io

        audio, sr = torchaudio.load(input_wav_path)
        audio = audio.cpu().numpy()[0]
        if self.last_chunk is None:
            # First chunk: convert the input as is.
            input_wav_path.seek(0)
            audio, sr = svc_model.infer(speaker_id, f_pitch_change, input_wav_path)
            audio = audio.cpu().numpy()
            self.last_chunk = audio[-self.pre_len:]
            self.last_o = audio
            return audio[-self.chunk_len:]

        # Later chunks: prepend the saved tail, convert, then cross-fade with
        # the previous output.
        temp_wav = io.BytesIO()
        audio = np.concatenate([self.last_chunk, audio])
        soundfile.write(temp_wav, audio, sr, format="wav")
        temp_wav.seek(0)
        audio, sr = svc_model.infer(speaker_id, f_pitch_change, temp_wav)
        audio = audio.cpu().numpy()
        ret = maad.util.crossfade(self.last_o, audio, self.pre_len)
        self.last_chunk = audio[-self.pre_len:]
        self.last_o = audio
        return ret[self.chunk_len:2 * self.chunk_len]
inference/infer_tool_grad.py ADDED
@@ -0,0 +1,160 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import hashlib
2
+ import json
3
+ import logging
4
+ import os
5
+ import time
6
+ from pathlib import Path
7
+ import io
8
+ import librosa
9
+ import maad
10
+ import numpy as np
11
+ from inference import slicer
12
+ import parselmouth
13
+ import soundfile
14
+ import torch
15
+ import torchaudio
16
+
17
+ from hubert import hubert_model
18
+ import utils
19
+ from models import SynthesizerTrn
20
+ logging.getLogger('numba').setLevel(logging.WARNING)
21
+ logging.getLogger('matplotlib').setLevel(logging.WARNING)
22
+
23
def resize2d_f0(x, target_len):
    """Linearly resample the sequence ``x`` to ``target_len`` points.

    Values below 0.001 are replaced by NaN before interpolation, and any NaN
    in the interpolated result is turned back into 0.
    """
    values = np.array(x)
    values[values < 0.001] = np.nan
    n_src = len(values)
    # Query positions expressed in source-index units.
    positions = np.arange(0, n_src * target_len, n_src) / target_len
    stretched = np.interp(positions, np.arange(0, n_src), values)
    return np.nan_to_num(stretched)
30
+
31
def get_f0(x, p_len, f0_up_key=0):
    """Extract a pitch track from ``x`` and fit it to ``p_len`` frames.

    ``x`` is handed to ``parselmouth.Sound`` with a sampling frequency of
    16000, and pitch is estimated with a 10 ms step between 50 Hz and
    1100 Hz. The track is truncated or zero-padded (split across both ends)
    to ``p_len`` frames, then shifted by ``f0_up_key`` semitones.

    Returns ``(f0_coarse, f0)``: the pitch quantized on a mel scale to
    integers in 1..255 (1 for frames with no pitch), and the pitch in Hz.
    """
    time_step = 160 / 16000 * 1000
    f0_min = 50
    f0_max = 1100
    f0_mel_min = 1127 * np.log(1 + f0_min / 700)
    f0_mel_max = 1127 * np.log(1 + f0_max / 700)

    f0 = parselmouth.Sound(x, 16000).to_pitch_ac(
        time_step=time_step / 1000, voicing_threshold=0.6,
        pitch_floor=f0_min, pitch_ceiling=f0_max).selected_array['frequency']

    # A track longer than p_len would give np.pad a negative width; cut it first.
    if len(f0) > p_len:
        f0 = f0[:p_len]
    pad_size = (p_len - len(f0) + 1) // 2
    if pad_size > 0 or p_len - len(f0) - pad_size > 0:
        f0 = np.pad(f0, [[pad_size, p_len - len(f0) - pad_size]], mode='constant')

    f0 *= pow(2, f0_up_key / 12)
    f0_mel = 1127 * np.log(1 + f0 / 700)
    f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / (f0_mel_max - f0_mel_min) + 1
    f0_mel[f0_mel <= 1] = 1
    f0_mel[f0_mel > 255] = 255
    # The np.int alias was removed in NumPy 1.24; the builtin int is the same dtype.
    f0_coarse = np.rint(f0_mel).astype(int)
    return f0_coarse, f0
54
+
55
def clean_pitch(input_pitch):
    """Flatten ``input_pitch`` to all ones when over 90% of it already equals 1.

    The array is modified in place and also returned.
    """
    ones_seen = np.sum(input_pitch == 1)
    if not ones_seen / len(input_pitch) > 0.9:
        return input_pitch
    input_pitch[input_pitch != 1] = 1
    return input_pitch
60
+
61
+
62
def plt_pitch(input_pitch):
    """Return a float copy of ``input_pitch`` with every value equal to 1 set to NaN."""
    float_pitch = input_pitch.astype(float)
    return np.where(float_pitch == 1, np.nan, float_pitch)
66
+
67
+
68
def f0_to_pitch(ff):
    """Map the frequency ``ff`` to ``69 + 12 * log2(ff / 440)``."""
    offset = 12 * np.log2(ff / 440)
    return 69 + offset
71
+
72
+
73
def fill_a_to_b(a, b):
    """Grow ``a`` in place to the length of ``b`` by repeating ``a[0]``.

    Nothing happens when ``a`` is already at least as long as ``b``.
    """
    missing = len(b) - len(a)
    if missing <= 0:
        return
    a.extend([a[0]] * missing)
77
+
78
+
79
def mkdir(paths: list):
    """Create each directory in ``paths`` that does not exist yet.

    Directories are created one level at a time with ``os.mkdir``, in the
    order given.
    """
    for directory in paths:
        already_there = os.path.exists(directory)
        if not already_there:
            os.mkdir(directory)
83
+
84
+
85
class VitsSvc(object):
    """Voice-conversion backend used by the Gradio UI.

    HuBERT is loaded in the constructor; the synthesizer and its
    hyper-parameters are loaded later through ``loadCheckpoint``.
    """

    def __init__(self):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.SVCVITS = None  # synthesizer, set by loadCheckpoint()
        self.hps = None  # hyper-parameters, set by loadCheckpoint()
        self.speakers = None  # speaker name -> id mapping from the config
        self.hubert_soft = hubert_model.hubert_soft("hubert/model.pt")

    def set_device(self, device):
        """Move HuBERT and, if already loaded, the synthesizer to ``device``."""
        self.device = torch.device(device)
        self.hubert_soft.to(self.device)
        if self.SVCVITS is not None:
            self.SVCVITS.to(self.device)

    def loadCheckpoint(self, path):
        """Load ``checkpoints/<path>/config.json`` and ``model.pth``."""
        self.hps = utils.get_hparams_from_file(f"checkpoints/{path}/config.json")
        self.SVCVITS = SynthesizerTrn(
            self.hps.data.filter_length // 2 + 1,
            self.hps.train.segment_size // self.hps.data.hop_length,
            **self.hps.model)
        _ = utils.load_checkpoint(f"checkpoints/{path}/model.pth", self.SVCVITS, None)
        _ = self.SVCVITS.eval().to(self.device)
        self.speakers = self.hps.spk

    def get_units(self, source, sr):
        """Run HuBERT on ``source`` and return its units; ``sr`` is unused."""
        source = source.unsqueeze(0).to(self.device)
        with torch.inference_mode():
            units = self.hubert_soft.units(source)
            return units

    def get_unit_pitch(self, in_path, tran):
        """Load audio from ``in_path`` and return ``(units, f0)``.

        The audio is resampled to 16 kHz; ``tran`` is the pitch shift in
        semitones passed on to ``get_f0``.
        """
        source, sr = torchaudio.load(in_path)
        source = torchaudio.functional.resample(source, sr, 16000)
        # NOTE(review): this tests shape[1] before averaging over dim 0 —
        # confirm whether the channel axis (shape[0]) was intended.
        if len(source.shape) == 2 and source.shape[1] >= 2:
            source = torch.mean(source, dim=0).unsqueeze(0)
        soft = self.get_units(source, sr).squeeze(0).cpu().numpy()
        # The pitch track gets two frames per unit frame.
        _, f0 = get_f0(source.cpu().numpy()[0], soft.shape[0]*2, tran)
        return soft, f0

    def infer(self, speaker_id, tran, raw_path):
        """Convert the audio at ``raw_path`` to the speaker named ``speaker_id``.

        Returns ``(audio, length)`` where ``audio`` is a float tensor and
        ``length`` the size of its last dimension.
        """
        speaker_id = self.speakers[speaker_id]
        sid = torch.LongTensor([int(speaker_id)]).to(self.device).unsqueeze(0)
        soft, pitch = self.get_unit_pitch(raw_path, tran)
        f0 = torch.FloatTensor(clean_pitch(pitch)).unsqueeze(0).to(self.device)
        stn_tst = torch.FloatTensor(soft)
        with torch.no_grad():
            x_tst = stn_tst.unsqueeze(0).to(self.device)
            # Repeat every unit frame twice to match the pitch track length.
            x_tst = torch.repeat_interleave(x_tst, repeats=2, dim=1).transpose(1, 2)
            audio = self.SVCVITS.infer(x_tst, f0=f0, g=sid)[0,0].data.float()
        return audio, audio.shape[-1]

    def inference(self,srcaudio,chara,tran,slice_db):
        """Convert a ``(sampling_rate, samples)`` clip to the speaker ``chara``.

        The clip is converted to mono 16 kHz, split on silence with
        ``slice_db`` as threshold, and each non-silent slice is converted
        with ``tran`` semitones of pitch shift; silent slices become zeros.
        Returns ``(sampling_rate, int16 samples)`` at the model's rate.
        """
        import tempfile

        sampling_rate, audio = srcaudio
        # np.iinfo only accepts integer dtypes; float input is used as is.
        if np.issubdtype(audio.dtype, np.integer):
            audio = (audio / np.iinfo(audio.dtype).max).astype(np.float32)
        else:
            audio = audio.astype(np.float32)
        if len(audio.shape) > 1:
            audio = librosa.to_mono(audio.transpose(1, 0))
        if sampling_rate != 16000:
            audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=16000)

        # A per-call temporary file keeps simultaneous requests from
        # overwriting each other's audio, and is removed afterwards.
        tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
        tmp_path = tmp.name
        tmp.close()
        try:
            soundfile.write(tmp_path, audio, 16000, format="wav")
            chunks = slicer.cut(tmp_path, db_thresh=slice_db)
            audio_data, audio_sr = slicer.chunks2audio(tmp_path, chunks)
        finally:
            os.remove(tmp_path)

        pieces = []
        for (slice_tag, data) in audio_data:
            if slice_tag:
                # Silent slice: emit zeros of the same duration at the output rate.
                length = int(np.ceil(len(data) / audio_sr * self.hps.data.sampling_rate))
                _audio = np.zeros(length)
            else:
                raw_path = io.BytesIO()
                soundfile.write(raw_path, data, audio_sr, format="wav")
                raw_path.seek(0)
                out_audio, out_sr = self.infer(chara, tran, raw_path)
                _audio = out_audio.cpu().numpy()
            pieces.append(np.asarray(_audio, dtype=np.float64))

        joined = np.concatenate(pieces) if pieces else np.zeros(0)
        # Clip before the cast so full-scale samples do not wrap around in int16.
        audio = np.clip(joined * 32768.0, -32768, 32767).astype('int16')
        return (self.hps.data.sampling_rate,audio)
inference/slicer.py ADDED
@@ -0,0 +1,142 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import librosa
2
+ import torch
3
+ import torchaudio
4
+
5
+
6
class Slicer:
    """Split a waveform into voiced and silent segments based on frame RMS.

    Constructor times are in milliseconds and ``threshold`` is in dB; they
    are converted to frame counts and a linear amplitude respectively.
    """

    def __init__(self,
                 sr: int,
                 threshold: float = -40.,
                 min_length: int = 5000,
                 min_interval: int = 300,
                 hop_size: int = 20,
                 max_sil_kept: int = 5000):
        if not min_length >= min_interval >= hop_size:
            raise ValueError('The following condition must be satisfied: min_length >= min_interval >= hop_size')
        if not max_sil_kept >= hop_size:
            raise ValueError('The following condition must be satisfied: max_sil_kept >= hop_size')
        interval_samples = sr * min_interval / 1000
        self.threshold = 10 ** (threshold / 20.)
        self.hop_size = round(sr * hop_size / 1000)
        self.win_size = min(round(interval_samples), 4 * self.hop_size)
        self.min_length = round(sr * min_length / 1000 / self.hop_size)
        self.min_interval = round(interval_samples / self.hop_size)
        self.max_sil_kept = round(sr * max_sil_kept / 1000 / self.hop_size)

    def _apply_slice(self, waveform, begin, end):
        """Return the samples between frames ``begin`` and ``end`` of ``waveform``."""
        start = begin * self.hop_size
        if len(waveform.shape) > 1:
            return waveform[:, start: min(waveform.shape[1], end * self.hop_size)]
        return waveform[start: min(waveform.shape[0], end * self.hop_size)]

    # @timeit
    def slice(self, waveform):
        """Label ``waveform`` as alternating voiced and silent segments.

        Returns a dict keyed by "0", "1", ... whose values are
        ``{"slice": bool, "split_time": "start,end"}``; ``slice`` is True for
        silent segments and the bounds are sample offsets.
        """
        if len(waveform.shape) > 1:
            samples = librosa.to_mono(waveform)
        else:
            samples = waveform
        if samples.shape[0] <= self.min_length:
            return {"0": {"slice": False, "split_time": f"0,{len(waveform)}"}}
        rms_list = librosa.feature.rms(y=samples, frame_length=self.win_size, hop_length=self.hop_size).squeeze(0)
        sil_tags = []
        silence_start = None
        clip_start = 0
        for i, rms in enumerate(rms_list):
            # Keep looping while frame is silent.
            if rms < self.threshold:
                # Record start of silent frames.
                if silence_start is None:
                    silence_start = i
                continue
            # Keep looping while frame is not silent and silence start has not been recorded.
            if silence_start is None:
                continue
            # Clear recorded silence start if interval is not enough or clip is too short
            is_leading_silence = silence_start == 0 and i > self.max_sil_kept
            need_slice_middle = i - silence_start >= self.min_interval and i - clip_start >= self.min_length
            if not is_leading_silence and not need_slice_middle:
                silence_start = None
                continue
            # Need slicing. Record the range of silent frames to be removed.
            if i - silence_start <= self.max_sil_kept:
                pos = rms_list[silence_start: i + 1].argmin() + silence_start
                if silence_start == 0:
                    sil_tags.append((0, pos))
                else:
                    sil_tags.append((pos, pos))
                clip_start = pos
            elif i - silence_start <= self.max_sil_kept * 2:
                pos = rms_list[i - self.max_sil_kept: silence_start + self.max_sil_kept + 1].argmin()
                pos += i - self.max_sil_kept
                pos_l = rms_list[silence_start: silence_start + self.max_sil_kept + 1].argmin() + silence_start
                pos_r = rms_list[i - self.max_sil_kept: i + 1].argmin() + i - self.max_sil_kept
                if silence_start == 0:
                    sil_tags.append((0, pos_r))
                    clip_start = pos_r
                else:
                    sil_tags.append((min(pos_l, pos), max(pos_r, pos)))
                    clip_start = max(pos_r, pos)
            else:
                pos_l = rms_list[silence_start: silence_start + self.max_sil_kept + 1].argmin() + silence_start
                pos_r = rms_list[i - self.max_sil_kept: i + 1].argmin() + i - self.max_sil_kept
                if silence_start == 0:
                    sil_tags.append((0, pos_r))
                else:
                    sil_tags.append((pos_l, pos_r))
                clip_start = pos_r
            silence_start = None
        # Deal with trailing silence.
        total_frames = rms_list.shape[0]
        if silence_start is not None and total_frames - silence_start >= self.min_interval:
            silence_end = min(total_frames, silence_start + self.max_sil_kept)
            pos = rms_list[silence_start: silence_end + 1].argmin() + silence_start
            sil_tags.append((pos, total_frames + 1))
        # Apply and return slices.
        if len(sil_tags) == 0:
            return {"0": {"slice": False, "split_time": f"0,{len(waveform)}"}}
        chunks = []
        # The first silence does not start at the beginning: prepend the
        # leading voiced segment.
        if sil_tags[0][0]:
            chunks.append(
                {"slice": False, "split_time": f"0,{min(waveform.shape[0], sil_tags[0][0] * self.hop_size)}"})
        for idx, (sil_begin, sil_end) in enumerate(sil_tags):
            # Mark the voiced segment before this silence (skipped for the first one).
            if idx:
                chunks.append({"slice": False,
                               "split_time": f"{sil_tags[idx - 1][1] * self.hop_size},{min(waveform.shape[0], sil_begin * self.hop_size)}"})
            # Mark every silent segment.
            chunks.append({"slice": True,
                           "split_time": f"{sil_begin * self.hop_size},{min(waveform.shape[0], sil_end * self.hop_size)}"})
        # The last silence does not reach the end: append the trailing segment.
        if sil_tags[-1][1] * self.hop_size < len(waveform):
            chunks.append({"slice": False, "split_time": f"{sil_tags[-1][1] * self.hop_size},{len(waveform)}"})
        return {str(idx): chunk for idx, chunk in enumerate(chunks)}
118
+
119
+
120
def cut(audio_path, db_thresh=-30, min_len=5000):
    """Load ``audio_path`` at its native rate and return its silence-based chunks.

    ``db_thresh`` and ``min_len`` are forwarded to ``Slicer`` as
    ``threshold`` and ``min_length``; the result is what ``Slicer.slice``
    returns.
    """
    audio, sr = librosa.load(audio_path, sr=None)
    return Slicer(sr=sr, threshold=db_thresh, min_length=min_len).slice(audio)
129
+
130
+
131
def chunks2audio(audio_path, chunks):
    """Cut the audio at ``audio_path`` according to ``chunks``.

    ``chunks`` maps keys to ``{"slice": bool, "split_time": "start,end"}``.
    Returns ``(result, sr)`` where ``result`` is a list of
    ``(slice_flag, samples)`` tuples; chunks whose start and end are equal
    are left out.
    """
    chunks = dict(chunks)
    audio, sr = torchaudio.load(audio_path)
    # NOTE(review): this tests shape[1] before averaging over dim 0 —
    # confirm whether the channel axis (shape[0]) was intended.
    if len(audio.shape) == 2 and audio.shape[1] >= 2:
        audio = torch.mean(audio, dim=0).unsqueeze(0)
    audio = audio.cpu().numpy()[0]
    result = []
    for chunk in chunks.values():
        bounds = chunk["split_time"].split(",")
        if bounds[0] == bounds[1]:
            continue
        result.append((chunk["slice"], audio[int(bounds[0]):int(bounds[1])]))
    return result, sr