kevinwang676 committed on
Commit
a22c130
1 Parent(s): ba7d697

Upload vc_infer_pipeline.py

Browse files
Files changed (1) hide show
  1. vc_infer_pipeline.py +320 -0
vc_infer_pipeline.py ADDED
@@ -0,0 +1,320 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np, parselmouth, torch, pdb
2
+ from time import time as ttime
3
+ import torch.nn.functional as F
4
+ import scipy.signal as signal
5
+ import pyworld, os, traceback, faiss
6
+ from scipy import signal
7
+
8
# 5th-order Butterworth high-pass filter coefficients (48 Hz cutoff, designed
# for a 16 kHz sample rate). VC.pipeline applies them to the input audio with
# signal.filtfilt before any further processing.
bh, ah = signal.butter(5, 48, btype="high", fs=16000)
9
+
10
+
11
class VC(object):
    """Voice-conversion inference pipeline.

    Extracts F0, runs the feature model, optionally blends the features with
    neighbours retrieved from a faiss index, and synthesises audio with the
    generator ``net_g``. Long inputs are split at low-energy points and
    converted piece by piece.
    """

    def __init__(self, tgt_sr, config):
        """Derive sample-domain sizes from ``config`` and the target rate.

        Args:
            tgt_sr: Sample rate of the synthesised audio.
            config: Object exposing ``x_pad``, ``x_query``, ``x_center``,
                ``x_max`` (all multiplied by a sample rate below, so
                presumably durations in seconds), ``is_half`` and ``device``.
        """
        self.x_pad, self.x_query, self.x_center, self.x_max, self.is_half = (
            config.x_pad,
            config.x_query,
            config.x_center,
            config.x_max,
            config.is_half,
        )
        self.sr = 16000  # HuBERT input sample rate
        self.window = 160  # samples per frame
        self.t_pad = self.sr * self.x_pad  # padding before/after each piece
        self.t_pad_tgt = tgt_sr * self.x_pad  # same padding at the output rate
        self.t_pad2 = self.t_pad * 2
        self.t_query = self.sr * self.x_query  # search range around a cut point
        self.t_center = self.sr * self.x_center  # spacing of cut-point candidates
        self.t_max = self.sr * self.x_max  # inputs up to this length are not cut
        self.device = config.device

    @staticmethod
    def _coarse_f0(f0, f0_min, f0_max):
        """Quantise an F0 curve in Hz to integer mel-scale bins in [1, 255]."""
        f0_mel_min = 1127 * np.log(1 + f0_min / 700)
        f0_mel_max = 1127 * np.log(1 + f0_max / 700)
        f0_mel = 1127 * np.log(1 + f0 / 700)
        voiced = f0_mel > 0
        f0_mel[voiced] = (f0_mel[voiced] - f0_mel_min) * 254 / (
            f0_mel_max - f0_mel_min
        ) + 1
        f0_mel[f0_mel <= 1] = 1
        f0_mel[f0_mel > 255] = 255
        # ``np.int`` was removed in NumPy 1.24; the builtin ``int`` is the
        # drop-in replacement.
        return np.rint(f0_mel).astype(int)

    def get_f0(self, x, p_len, f0_up_key, f0_method, inp_f0=None):
        """Estimate F0 for ``x`` and return it in coarse and raw form.

        Args:
            x: Mono waveform at ``self.sr``.
            p_len: Expected number of frames; the "pm" result is zero-padded
                up to this length.
            f0_up_key: Transposition in semitones.
            f0_method: ``"pm"`` (parselmouth) or ``"harvest"`` (pyworld).
            inp_f0: Optional 2-column array (column 0 time, column 1 F0) that
                overrides the estimated curve after the leading padding.
                NOTE(review): time is multiplied by 100 below, so it is
                presumably in seconds - confirm against the caller.

        Returns:
            ``(f0_coarse, f0)``: integer bins in [1, 255] and the F0 in Hz.

        Raises:
            ValueError: If ``f0_method`` is not a supported method.
        """
        time_step = self.window / self.sr * 1000
        f0_min = 50
        f0_max = 1100
        if f0_method == "pm":
            f0 = (
                parselmouth.Sound(x, self.sr)
                .to_pitch_ac(
                    time_step=time_step / 1000,
                    voicing_threshold=0.6,
                    pitch_floor=f0_min,
                    pitch_ceiling=f0_max,
                )
                .selected_array["frequency"]
            )
            pad_size = (p_len - len(f0) + 1) // 2
            if pad_size > 0 or p_len - len(f0) - pad_size > 0:
                f0 = np.pad(
                    f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant"
                )
        elif f0_method == "harvest":
            f0, t = pyworld.harvest(
                x.astype(np.double),
                fs=self.sr,
                f0_ceil=f0_max,
                f0_floor=f0_min,
                frame_period=10,
            )
            f0 = pyworld.stonemask(x.astype(np.double), f0, t, self.sr)
            f0 = signal.medfilt(f0, 3)
        else:
            # Previously fell through and failed later on an unbound ``f0``.
            raise ValueError("unsupported f0_method: %r" % (f0_method,))
        f0 *= pow(2, f0_up_key / 12)
        tf0 = self.sr // self.window  # F0 points per second
        if inp_f0 is not None:
            # int64 instead of int16: the frame count overflows int16 for
            # curves longer than 32767 frames.
            delta_t = np.round(
                (inp_f0[:, 0].max() - inp_f0[:, 0].min()) * tf0 + 1
            ).astype("int64")
            replace_f0 = np.interp(
                np.arange(delta_t), inp_f0[:, 0] * 100, inp_f0[:, 1]
            )
            start = self.x_pad * tf0
            shape = f0[start : start + len(replace_f0)].shape[0]
            f0[start : start + len(replace_f0)] = replace_f0[:shape]
        f0bak = f0.copy()
        f0_coarse = self._coarse_f0(f0, f0_min, f0_max)
        return f0_coarse, f0bak

    def vc(
        self,
        model,
        net_g,
        sid,
        audio0,
        pitch,
        pitchf,
        times,
        index,
        big_npy,
        index_rate,
    ):
        """Convert one padded audio piece and return int16 samples.

        Args:
            model: Feature extractor exposing ``extract_features`` and
                ``final_proj``.
            net_g: Generator exposing ``infer``.
            sid: Speaker-id tensor forwarded to ``net_g.infer``.
            audio0: Waveform as a NumPy array (1-D, or 2-D with channels last).
            pitch, pitchf: Coarse / raw F0 tensors, or ``None`` for models
                without F0 conditioning.
            times: Mutable sequence; ``times[0]`` accumulates feature time and
                ``times[2]`` synthesis time.
            index: faiss index or ``None``.
            big_npy: Vectors reconstructed from ``index`` or ``None``.
            index_rate: Blend weight of retrieved features (0 disables).

        Returns:
            1-D ``np.int16`` array of synthesised samples.
        """
        feats = torch.from_numpy(audio0)
        feats = feats.half() if self.is_half else feats.float()
        if feats.dim() == 2:  # two channels: average them
            feats = feats.mean(-1)
        assert feats.dim() == 1, feats.dim()
        feats = feats.view(1, -1)
        padding_mask = torch.BoolTensor(feats.shape).to(self.device).fill_(False)

        inputs = {
            "source": feats.to(self.device),
            "padding_mask": padding_mask,
            "output_layer": 9,  # layer 9
        }
        t0 = ttime()
        with torch.no_grad():
            logits = model.extract_features(**inputs)
            feats = model.final_proj(logits[0])

        if index is not None and big_npy is not None and index_rate != 0:
            npy = feats[0].cpu().numpy()
            if self.is_half:
                npy = npy.astype("float32")

            score, ix = index.search(npy, k=8)
            # An exact match has distance 0; 1/0 would give inf weights and
            # NaN features after normalisation, so substitute a tiny distance
            # in place (keeps the array dtype unchanged).
            score = np.array(score, copy=True)
            score[score == 0] = 1e-9
            weight = np.square(1 / score)
            weight /= weight.sum(axis=1, keepdims=True)
            npy = np.sum(big_npy[ix] * np.expand_dims(weight, axis=2), axis=1)

            if self.is_half:
                npy = npy.astype("float16")
            feats = (
                torch.from_numpy(npy).unsqueeze(0).to(self.device) * index_rate
                + (1 - index_rate) * feats
            )

        feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1)
        t1 = ttime()
        p_len = audio0.shape[0] // self.window
        if feats.shape[1] < p_len:
            p_len = feats.shape[1]
        # Identity checks: ``!= None`` on a tensor is not a reliable None test.
        use_f0 = pitch is not None and pitchf is not None
        if use_f0:
            pitch = pitch[:, :p_len]
            pitchf = pitchf[:, :p_len]
        p_len = torch.tensor([p_len], device=self.device).long()
        with torch.no_grad():
            if use_f0:
                output = net_g.infer(feats, p_len, pitch, pitchf, sid)
            else:
                output = net_g.infer(feats, p_len, sid)
            samples = (output[0][0, 0] * 32768).data.cpu().float().numpy()
        # Clip before the cast so out-of-range samples saturate instead of
        # wrapping around.
        audio1 = np.clip(samples, -32768, 32767).astype(np.int16)
        del feats, p_len, padding_mask
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        t2 = ttime()
        times[0] += t1 - t0
        times[2] += t2 - t1
        return audio1

    def _convert_piece(
        self, model, net_g, sid, piece, pitch, pitchf, times, index, big_npy, index_rate
    ):
        """Run ``vc`` on one piece and strip the padding from both ends."""
        converted = self.vc(
            model, net_g, sid, piece, pitch, pitchf, times, index, big_npy, index_rate
        )
        return converted[self.t_pad_tgt : -self.t_pad_tgt]

    def pipeline(
        self,
        model,
        net_g,
        sid,
        audio,
        times,
        f0_up_key,
        f0_method,
        file_index,
        index_rate,
        if_f0,
        f0_file=None,
    ):
        """Convert a whole recording and return the concatenated int16 audio.

        Args:
            model: Feature extractor passed through to ``vc``.
            net_g: Generator passed through to ``vc``.
            sid: Speaker id (converted to a tensor here).
            audio: Mono waveform at ``self.sr``.
            times: Mutable sequence of timers; ``times[1]`` accumulates F0
                time, ``times[0]`` and ``times[2]`` are updated by ``vc``.
            f0_up_key: Transposition in semitones.
            f0_method: F0 method name, see ``get_f0``.
            file_index: Path to a faiss index file, or ``""`` for none.
            index_rate: Blend weight of retrieved features (0 disables).
            if_f0: ``1`` when the model is F0-conditioned.
            f0_file: Optional object with a ``name`` attribute giving the path
                of a CSV file with one ``time,f0`` pair per line.

        Returns:
            1-D ``np.int16`` array with the converted audio.
        """
        index = big_npy = None
        if file_index != "" and os.path.exists(file_index) and index_rate != 0:
            try:
                index = faiss.read_index(file_index)
                big_npy = index.reconstruct_n(0, index.ntotal)
            except Exception:
                # Best effort: fall back to conversion without retrieval.
                traceback.print_exc()
                index = big_npy = None

        audio = signal.filtfilt(bh, ah, audio)
        audio_pad = np.pad(audio, (self.window // 2, self.window // 2), mode="reflect")
        opt_ts = []
        if audio_pad.shape[0] > self.t_max:
            # Sliding-window sum over ``self.window`` samples; cut points are
            # placed where its magnitude is smallest.
            audio_sum = np.zeros_like(audio)
            for i in range(self.window):
                audio_sum += audio_pad[i : i - self.window]
            for center in range(self.t_center, audio.shape[0], self.t_center):
                region = np.abs(
                    audio_sum[center - self.t_query : center + self.t_query]
                )
                opt_ts.append(center - self.t_query + np.argmin(region))

        s = 0
        audio_opt = []
        t = None
        t1 = ttime()
        audio_pad = np.pad(audio, (self.t_pad, self.t_pad), mode="reflect")
        p_len = audio_pad.shape[0] // self.window

        inp_f0 = None
        if hasattr(f0_file, "name"):
            try:
                with open(f0_file.name, "r") as f:
                    lines = f.read().strip("\n").split("\n")
                # Assigned only after the whole file parsed, so a failure
                # leaves ``inp_f0`` as None rather than a partial list.
                inp_f0 = np.array(
                    [[float(i) for i in line.split(",")] for line in lines],
                    dtype="float32",
                )
            except Exception:
                traceback.print_exc()
                inp_f0 = None

        sid = torch.tensor(sid, device=self.device).unsqueeze(0).long()
        pitch, pitchf = None, None
        if if_f0 == 1:
            pitch, pitchf = self.get_f0(audio_pad, p_len, f0_up_key, f0_method, inp_f0)
            pitch = pitch[:p_len]
            pitchf = pitchf[:p_len]
            pitch = torch.tensor(pitch, device=self.device).unsqueeze(0).long()
            pitchf = torch.tensor(pitchf, device=self.device).unsqueeze(0).float()
        t2 = ttime()
        times[1] += t2 - t1

        for t in opt_ts:
            t = t // self.window * self.window  # align the cut to a frame
            piece_pitch = piece_pitchf = None
            if if_f0 == 1:
                frames = slice(s // self.window, (t + self.t_pad2) // self.window)
                piece_pitch = pitch[:, frames]
                piece_pitchf = pitchf[:, frames]
            audio_opt.append(
                self._convert_piece(
                    model,
                    net_g,
                    sid,
                    audio_pad[s : t + self.t_pad2 + self.window],
                    piece_pitch,
                    piece_pitchf,
                    times,
                    index,
                    big_npy,
                    index_rate,
                )
            )
            s = t

        # Remainder after the last cut, or the whole input when there was no
        # cut (``t`` is still None, so ``audio_pad[t:]`` is the full array).
        piece_pitch = piece_pitchf = None
        if if_f0 == 1:
            piece_pitch = pitch[:, t // self.window :] if t is not None else pitch
            piece_pitchf = pitchf[:, t // self.window :] if t is not None else pitchf
        audio_opt.append(
            self._convert_piece(
                model,
                net_g,
                sid,
                audio_pad[t:],
                piece_pitch,
                piece_pitchf,
                times,
                index,
                big_npy,
                index_rate,
            )
        )
        audio_opt = np.concatenate(audio_opt)
        del pitch, pitchf, sid
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        return audio_opt