Jarod Castillo committed • Commit 9d54d5c • Parent(s): bb70eb3

comments added

Files changed:
- app.py +21 -4
- config.py +38 -0
- lib/infer_pack/models.py +16 -0
- vc_infer_pipeline.py +55 -22
app.py
CHANGED

@@ -30,9 +30,12 @@ from config import Config
 from vocal_isolation.vocal_isolation import isolate_vocals_kim_vocals
 
 config = Config()
-
+# This can be any name, just a way to output logs during runtime
+logging.getLogger("smotto").setLevel(logging.WARNING)
+# Checking if it's a huggingface space that's running this file
 spaces = os.getenv("SYSTEM") == "spaces"
 force_support = None
+# If we're using CPU, disable force_support
 if config.unsupported is False:
     if config.device == "mps" or config.device == "cpu":
         force_support = False
@@ -134,6 +137,7 @@ def create_vc_fn(model_name, tgt_sr, net_g, vc, if_f0, version, file_index):
     return vc_fn
 
 def load_model():
+    logs = []
     categories = []
     if os.path.isfile("weights/folder_info.json"):
         with open("weights/folder_info.json", "r", encoding="utf-8") as f:
@@ -153,9 +157,10 @@ def load_model():
             model_title = info['title']
             model_name = info['model_path']
             model_author = info.get("author", None)
-            model_cover = f"weights/{category_folder}/{character_name}/{info['cover']}"
+            model_cover = f"weights/{category_folder}/{character_name}/{info['cover']}"  # Just a photo of the model
             model_index = f"weights/{category_folder}/{character_name}/{info['feature_retrieval_library']}"
-            cpt = torch.load(f"weights/{category_folder}/{character_name}/{model_name}", map_location="cpu")
+            cpt = torch.load(f"weights/{category_folder}/{character_name}/{model_name}", map_location="cpu")
+
             tgt_sr = cpt["config"][-1]
             cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0]  # n_spk
             if_f0 = cpt.get("f0", 1)
@@ -172,15 +177,27 @@ def load_model():
             else:
                 net_g = SynthesizerTrnMs768NSFsid_nono(*cpt["config"])
                 model_version = "V2"
+
+            # Deleting the posterior encoder, assuming that it's not needed for inference.
             del net_g.enc_q
-            print(net_g.load_state_dict(cpt["weight"], strict=False))
+
+            logs.append(f"Net Generator after posterior encoder deletion: {net_g}\n{info}")
+
+            # Loading weights from the checkpoint into the neural network. strict=False means we can load with missing dictionary keys
+            net_g.load_state_dict(cpt["weight"], strict=False)
+
+            # Prepare the model for inference
             net_g.eval().to(config.device)
+
             if config.is_half:
                 net_g = net_g.half()
             else:
                 net_g = net_g.float()
+
             vc = VC(tgt_sr, config)
+
             print(f"Model loaded: {character_name} / {info['feature_retrieval_library']} | ({model_version})")
+            # Create the voice conversion method
             models.append((character_name, model_title, model_author, model_cover, model_version, create_vc_fn(model_name, tgt_sr, net_g, vc, if_f0, version, model_index)))
         categories.append([category_title, category_folder, description, models])
     else:
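One detail worth spelling out from the hunk above: with strict=False, load_state_dict tolerates checkpoint keys that no longer match the module, which matters once enc_q has been deleted. A minimal sketch, using a hypothetical TinySynth module in place of net_g:

import torch
import torch.nn as nn

# Hypothetical stand-in for net_g: two submodules, one of which (enc_q)
# is deleted before inference, mirroring the commit above.
class TinySynth(nn.Module):
    def __init__(self):
        super().__init__()
        self.dec = nn.Linear(4, 4)
        self.enc_q = nn.Linear(4, 4)  # train-time-only posterior encoder

net = TinySynth()
cpt = {"weight": net.state_dict()}  # stands in for torch.load(..., map_location="cpu")

del net.enc_q  # drop the posterior encoder, as the commit does

# strict=False tolerates keys that no longer match; it returns the lists of
# missing and unexpected keys instead of raising a RuntimeError.
missing, unexpected = net.load_state_dict(cpt["weight"], strict=False)
print(missing)     # []
print(unexpected)  # ['enc_q.weight', 'enc_q.bias']

net.eval()  # switch to inference mode, as in net_g.eval().to(config.device)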
config.py
CHANGED

@@ -4,7 +4,18 @@ import torch
 from multiprocessing import cpu_count
 
 class Config:
+    """
+    The code focuses on adapting the configuration based on available
+    hardware resources and specified command-line arguments,
+    aiming to optimize the performance and capabilities of the voice conversion process.
+    """
     def __init__(self):
+        """
+        Calls the arg_parse() and device_config() methods to set up configuration based on command-line arguments
+        and available hardware.
+
+        Returns: None
+        """
         self.device = "cuda:0"
         self.is_half = True
         self.n_cpu = 0
@@ -19,6 +30,12 @@ class Config:
 
     @staticmethod
     def arg_parse() -> tuple:
+        """
+        Uses the argparse library to parse command-line arguments.
+        Three boolean arguments are defined: --colab, --api, and --unsupported.
+
+        Returns: a tuple indicating whether each argument is specified or not.
+        """
         parser = argparse.ArgumentParser()
         parser.add_argument("--colab", action="store_true", help="Launch in colab")
         parser.add_argument("--api", action="store_true", help="Launch with api")
@@ -35,6 +52,11 @@ class Config:
     # check `getattr` and try it for compatibility
     @staticmethod
     def has_mps() -> bool:
+        """
+        Determines if Metal Performance Shaders (MPS) support is available in the current PyTorch backend.
+        If MPS is available, it checks whether it can actually be used by trying to move a tensor to the "mps" device.
+        Returns a boolean indicating MPS support.
+        """
         if not torch.backends.mps.is_available():
             return False
         try:
@@ -44,6 +66,22 @@ class Config:
         return False
 
     def device_config(self) -> tuple:
+        """
+        Checks if a CUDA-compatible GPU is available.
+
+        If a compatible GPU is found:
+            Determines the GPU's name and memory capacity.
+            Adjusts the is_half parameter based on the GPU's characteristics.
+
+        If no compatible GPU is found and MPS is available, configures the device to use MPS.
+
+        If there is no compatible GPU and no MPS support, configures the device to use the CPU.
+
+        Determines the number of available CPU cores (n_cpu).
+
+        Based on the is_half value and GPU memory capacity, configures several variables related to voice conversion,
+        such as x_pad, x_query, x_center, and x_max.
+        """
         if torch.cuda.is_available():
             i_device = int(self.device.split(":")[-1])
             self.gpu_name = torch.cuda.get_device_name(i_device)
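For reference, the probe the has_mps() docstring describes can be sketched as follows; mps_usable and the final device pick are illustrative names, not part of the commit:

import torch

# Availability alone can lie on some builds, so the probe also tries to
# place a tensor on the device before trusting it.
def mps_usable() -> bool:
    if not torch.backends.mps.is_available():
        return False
    try:
        torch.zeros(1).to("mps")  # round-trip a tensor through the device
        return True
    except Exception:
        return False

# Illustrative device pick in the spirit of device_config():
device = "mps" if mps_usable() else ("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)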
lib/infer_pack/models.py
CHANGED

@@ -158,6 +158,12 @@ class ResidualCouplingBlock(nn.Module):
 
 
 class PosteriorEncoder(nn.Module):
+    """
+    Responsible for transforming input data (possibly spectrogram representations of audio)
+    into a latent representation (or "posterior") that can be used for generating audio samples
+    or for other purposes within a generative model.
+
+    """
     def __init__(
         self,
         in_channels,
@@ -168,6 +174,16 @@ class PosteriorEncoder(nn.Module):
         n_layers,
         gin_channels=0,
     ):
+        """
+        Initializes the parameters and layers of the posterior encoder.
+        in_channels: Number of input channels (e.g., frequency bins in a spectrogram).
+        out_channels: Number of output channels in the latent representation.
+        hidden_channels: Number of hidden channels in intermediate layers.
+        kernel_size: Size of the convolutional kernel.
+        dilation_rate: Dilation rate for dilated convolutions.
+        n_layers: Number of layers in the posterior encoder.
+        gin_channels: Optional number of global conditioning channels (if applicable).
+        """
         super().__init__()
         self.in_channels = in_channels
         self.out_channels = out_channels
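The class docstring above describes mapping input frames to a latent posterior. A schematic sketch of that idea in the VAE style, not the RVC implementation (channel sizes, the ReLU, and the 1x1 convolutions are illustrative):

import torch
import torch.nn as nn

# Schematic sketch of what a VAE-style posterior encoder does: map a
# spectrogram to per-frame mean and log-variance, then sample a latent z
# with the reparameterization trick.
class ToyPosteriorEncoder(nn.Module):
    def __init__(self, in_channels: int, out_channels: int, hidden_channels: int):
        super().__init__()
        self.pre = nn.Conv1d(in_channels, hidden_channels, kernel_size=1)
        # Twice out_channels: one half for the mean, one half for log-variance.
        self.proj = nn.Conv1d(hidden_channels, out_channels * 2, kernel_size=1)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        h = torch.relu(self.pre(x))
        stats = self.proj(h)
        mean, logs = stats.chunk(2, dim=1)
        # Reparameterization: z = mean + sigma * noise
        z = mean + torch.randn_like(mean) * torch.exp(logs)
        return z

enc = ToyPosteriorEncoder(in_channels=513, out_channels=192, hidden_channels=192)
spec = torch.randn(1, 513, 100)  # (batch, freq bins, frames)
print(enc(spec).shape)           # torch.Size([1, 192, 100])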
vc_infer_pipeline.py
CHANGED

@@ -9,7 +9,7 @@ from functools import lru_cache
 now_dir = os.getcwd()
 sys.path.append(now_dir)
 
-bh, ah = signal.butter(N=5, Wn=48, btype="high", fs=16000)
+bh, ah = signal.butter(N=5, Wn=48, btype="high", fs=16000)  # Design for the audio filter
 
 input_audio_path2wav = {}
 
@@ -51,6 +51,9 @@ def change_rms(data1, sr1, data2, sr2, rate):  # 1 is the input audio, 2 the output
 
 
 class VC(object):
+    """
+    Voice Conversion system.
+    """
     def __init__(self, tgt_sr, config):
         self.x_pad, self.x_query, self.x_center, self.x_max, self.is_half = (
             config.x_pad,
@@ -59,14 +62,18 @@ class VC(object):
             config.x_max,
             config.is_half,
         )
-        self.sr = 16000
-        self.window = 160
-        self.t_pad = self.sr * self.x_pad
+        """
+        The class has several parameters that get initialized, such as `x_pad`, `x_query`, and so on, from the configuration object provided.
+        These parameters appear to be related to audio processing, specifying things like sample rate, window size, padding amounts, etc.
+        """
+        self.sr = 16000  # Hubert input sample rate
+        self.window = 160  # Number of points per frame
+        self.t_pad = self.sr * self.x_pad  # Padding time before and after each segment
         self.t_pad_tgt = tgt_sr * self.x_pad
         self.t_pad2 = self.t_pad * 2
-        self.t_query = self.sr * self.x_query
-        self.t_center = self.sr * self.x_center
-        self.t_max = self.sr * self.x_max
+        self.t_query = self.sr * self.x_query  # Query time before and after each query point
+        self.t_center = self.sr * self.x_center  # Query point position
+        self.t_max = self.sr * self.x_max  # Duration threshold for non-query time
         self.device = config.device
 
     def get_f0(
@@ -79,6 +86,11 @@ class VC(object):
         filter_radius,
         inp_f0=None,
    ):
+        """
+        Extracts the fundamental frequency ('F0', or pitch) from a given audio signal.
+        Multiple methods are available, such as 'pm', 'harvest', 'crepe', and 'rmvpe'.
+        The 'parselmouth' and 'torchcrepe' libraries compute pitch, and 'cache_harvest_f0' is used for the 'harvest' method.
+        """
         global input_audio_path2wav
         time_step = self.window / self.sr * 1000
         f0_min = 50
@@ -168,7 +180,7 @@ class VC(object):
         sid,
         audio0,
         pitch,
-        pitchf,
+        pitchf,  # ???
         times,
         index,
         big_npy,
@@ -176,6 +188,13 @@ class VC(object):
         version,
         protect,
     ):  # ,file_index,file_big_npy
+        """
+        The holy grail, the main conversion function.
+        Takes a NumPy audio signal, processes it through a model, spits out a NumPy audio signal.
+        Modifies the pitch (or 'F0') of the audio signal, given the 'pitch' and 'pitchf' parameters.
+        The neural network generator (net_g) infers the voice.
+        'index' and 'big_npy' are used to retrieve similar audio features from a pre-computed database for better conversion quality.
+        """
         feats = torch.from_numpy(audio0)
         if self.is_half:
             feats = feats.half()
@@ -261,6 +280,7 @@ class VC(object):
         t2 = ttime()
         times[0] += t1 - t0
         times[2] += t2 - t1
+
         return audio1
 
     def pipeline(
@@ -285,6 +305,14 @@ class VC(object):
         protect,
         f0_file=None,
     ):
+        """
+        This is a pipeline function that strings together multiple operations for voice conversion.
+        The function does some preprocessing on the input audio (e.g. filtering).
+        It then segments the audio into pieces and processes each segment through the voice conversion ('vc') method.
+        The converted segments are then concatenated to produce the final converted audio.
+        """
+
+        # Phase 1: Load index file
         if (
             file_index != ""
             # and file_big_npy != ""
@@ -293,23 +321,25 @@ class VC(object):
             and index_rate != 0
         ):
             try:
-                index = faiss.read_index(file_index)
+                index = faiss.read_index(file_index)  # Read from the vector store
                 # big_npy = np.load(file_big_npy)
-                big_npy = index.reconstruct_n(0, index.ntotal)
+                big_npy = index.reconstruct_n(0, index.ntotal)  # Convert index into a big numpy array
             except:
                 traceback.print_exc()
                 index = big_npy = None
         else:
-            index = big_npy = None
-        audio = signal.filtfilt(bh, ah, audio)
-        audio_pad = np.pad(audio, (self.window // 2, self.window // 2), mode="reflect")
-        opt_ts = []
+            index = big_npy = None  # If we don't have the index file, it's ok, we won't use it.
+
+        # Phase 2: Filter audio signal
+        audio = signal.filtfilt(bh, ah, audio)  # Avoid phase distortion
+        audio_pad = np.pad(audio, (self.window // 2, self.window // 2), mode="reflect")  # Padding to ensure we calculate the beginning and end accurately
+        optimal_time_shifts = []
         if audio_pad.shape[0] > self.t_max:
-            audio_sum = np.zeros_like(audio)
+            audio_sum = np.zeros_like(audio)  # Create numpy array filled with zeros and same shape as audio
             for i in range(self.window):
                 audio_sum += audio_pad[i : i - self.window]
             for t in range(self.t_center, audio.shape[0], self.t_center):
-                opt_ts.append(
+                optimal_time_shifts.append(
                     t
                     - self.t_query
                     + np.where(
@@ -317,12 +347,13 @@ class VC(object):
                         == np.abs(audio_sum[t - self.t_query : t + self.t_query]).min()
                     )[0][0]
                 )
+
         s = 0
         audio_opt = []
         t = None
         t1 = ttime()
-        audio_pad = np.pad(audio, (self.t_pad, self.t_pad), mode="reflect")
-        p_len = audio_pad.shape[0] // self.window
+        audio_pad = np.pad(audio, (self.t_pad, self.t_pad), mode="reflect")  # Reflection of the signal's edges on both ends
+        number_of_analysis_frames = audio_pad.shape[0] // self.window
         inp_f0 = None
         if hasattr(f0_file, "name") == True:
             try:
@@ -337,24 +368,26 @@ class VC(object):
         sid = torch.tensor(sid, device=self.device).unsqueeze(0).long()
         pitch, pitchf = None, None
         if if_f0 == 1:
+            # Calls the `get_f0` method to calculate pitch values based on audio features.
+            # These calculated pitch values are used for voice conversion.
             pitch, pitchf = self.get_f0(
                 input_audio_path,
                 audio_pad,
-                p_len,
+                number_of_analysis_frames,
                 f0_up_key,
                 f0_method,
                 filter_radius,
                 inp_f0,
             )
-            pitch = pitch[:p_len]
-            pitchf = pitchf[:p_len]
+            pitch = pitch[:number_of_analysis_frames]
+            pitchf = pitchf[:number_of_analysis_frames]
             if self.device == "mps":
                 pitchf = pitchf.astype(np.float32)
             pitch = torch.tensor(pitch, device=self.device).unsqueeze(0).long()
             pitchf = torch.tensor(pitchf, device=self.device).unsqueeze(0).float()
         t2 = ttime()
         times[1] += t2 - t1
-        for t in opt_ts:
+        for t in optimal_time_shifts:
             t = t // self.window * self.window
             if if_f0 == 1:
                 audio_opt.append(
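Two comments added above deserve unpacking: the signal.butter call designs a 5th-order high-pass at 48 Hz for 16 kHz audio, and filtfilt applies it forward and backward so the output has zero phase distortion. A self-contained sketch, with an illustrative test signal (the 10 Hz rumble and 440 Hz tone are assumptions for the demo):

import numpy as np
from scipy import signal

# Same filter design as the module-level line in vc_infer_pipeline.py.
bh, ah = signal.butter(N=5, Wn=48, btype="high", fs=16000)

sr = 16000
t = np.arange(sr) / sr
# 10 Hz rumble plus a 440 Hz tone, one second at 16 kHz.
audio = np.sin(2 * np.pi * 10 * t) + np.sin(2 * np.pi * 440 * t)

# filtfilt runs the filter forward and backward: zero phase distortion.
filtered = signal.filtfilt(bh, ah, audio)

# With 16000 samples the rfft bin spacing is 1 Hz, so bin 10 is the rumble
# and bin 440 the tone: the first is strongly attenuated, the second passes.
print(np.abs(np.fft.rfft(audio))[10], np.abs(np.fft.rfft(filtered))[10])
print(np.abs(np.fft.rfft(audio))[440], np.abs(np.fft.rfft(filtered))[440])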
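On the retrieval step described in the vc() docstring: index and big_npy come from a FAISS vector store of training-set features, and nearby vectors are blended back into the query features. A small sketch of that idea; the dimensions, k, the inverse-distance weighting, and the index_rate value here are illustrative choices, not taken from this commit:

import numpy as np
import faiss  # pip install faiss-cpu

# Build a small index of "training" feature vectors.
dim = 256
train_feats = np.random.rand(1000, dim).astype("float32")
index = faiss.IndexFlatL2(dim)
index.add(train_feats)
big_npy = index.reconstruct_n(0, index.ntotal)  # all stored vectors as one array

# Per-frame features we want to convert.
feats = np.random.rand(50, dim).astype("float32")
k = 8
distances, ids = index.search(feats, k)  # k nearest neighbors per frame

# Weight the k neighbors by inverse squared distance, then mix the retrieved
# features with the originals: index_rate=1.0 means fully retrieved features.
weights = 1.0 / np.maximum(distances, 1e-8) ** 2
weights /= weights.sum(axis=1, keepdims=True)
retrieved = np.sum(big_npy[ids] * weights[..., None], axis=1)

index_rate = 0.75
feats = index_rate * retrieved + (1 - index_rate) * feats
print(feats.shape)  # (50, 256)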