Jarod Castillo committed on
Commit 9d54d5c
1 Parent(s): bb70eb3

comments added

Files changed (4)
  1. app.py +21 -4
  2. config.py +38 -0
  3. lib/infer_pack/models.py +16 -0
  4. vc_infer_pipeline.py +55 -22
app.py CHANGED
@@ -30,9 +30,12 @@ from config import Config
 from vocal_isolation.vocal_isolation import isolate_vocals_kim_vocals
 
 config = Config()
-logging.getLogger("numba").setLevel(logging.WARNING)
+# This can be any name, just a way to output logs during runtime
+logging.getLogger("smotto").setLevel(logging.WARNING)
+# Check whether this file is running inside a Hugging Face Space
 spaces = os.getenv("SYSTEM") == "spaces"
 force_support = None
+# If we're running on CPU or MPS, disable force_support
 if config.unsupported is False:
     if config.device == "mps" or config.device == "cpu":
         force_support = False
@@ -134,6 +137,7 @@ def create_vc_fn(model_name, tgt_sr, net_g, vc, if_f0, version, file_index):
     return vc_fn
 
 def load_model():
+    logs = []
     categories = []
     if os.path.isfile("weights/folder_info.json"):
         with open("weights/folder_info.json", "r", encoding="utf-8") as f:
@@ -153,9 +157,10 @@ def load_model():
             model_title = info['title']
             model_name = info['model_path']
             model_author = info.get("author", None)
-            model_cover = f"weights/{category_folder}/{character_name}/{info['cover']}"
+            model_cover = f"weights/{category_folder}/{character_name}/{info['cover']}"  # Just a photo of the model
             model_index = f"weights/{category_folder}/{character_name}/{info['feature_retrieval_library']}"
             cpt = torch.load(f"weights/{category_folder}/{character_name}/{model_name}", map_location="cpu")
+
             tgt_sr = cpt["config"][-1]
             cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0]  # n_spk
             if_f0 = cpt.get("f0", 1)
@@ -172,15 +177,27 @@ def load_model():
             else:
                 net_g = SynthesizerTrnMs768NSFsid_nono(*cpt["config"])
                 model_version = "V2"
+
+            # Delete the posterior encoder, assuming it is not needed for inference.
             del net_g.enc_q
-            print(net_g.load_state_dict(cpt["weight"], strict=False))
+
+            logs.append(f"Net Generator after posterior encoder deletion: {net_g}\n{info}")
+
+            # Load the checkpoint weights into the network; strict=False tolerates missing dictionary keys
+            net_g.load_state_dict(cpt["weight"], strict=False)
+
+            # Prepare the model for inference
             net_g.eval().to(config.device)
+
             if config.is_half:
                 net_g = net_g.half()
             else:
                 net_g = net_g.float()
+
             vc = VC(tgt_sr, config)
+
             print(f"Model loaded: {character_name} / {info['feature_retrieval_library']} | ({model_version})")
+            # Create the voice conversion function for this model
             models.append((character_name, model_title, model_author, model_cover, model_version, create_vc_fn(model_name, tgt_sr, net_g, vc, if_f0, version, model_index)))
         categories.append([category_title, category_folder, description, models])
     else:
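
As a brief aside on the load_model() change above, here is a minimal sketch of the checkpoint-loading flow for the non-F0 V2 case. It reuses names shown in the diff (torch.load of the checkpoint, deleting enc_q, load_state_dict with strict=False), but the helper name, its parameters, and the standalone structure are illustrative assumptions rather than code from this commit.

import torch
from lib.infer_pack.models import SynthesizerTrnMs768NSFsid_nono

def load_single_model(ckpt_path, device="cpu", is_half=False):
    cpt = torch.load(ckpt_path, map_location="cpu")               # checkpoint: weights + architecture config
    tgt_sr = cpt["config"][-1]                                    # target sample rate is stored last in the config
    cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0]    # n_spk inferred from the speaker embedding
    net_g = SynthesizerTrnMs768NSFsid_nono(*cpt["config"])
    del net_g.enc_q                                               # posterior encoder is only needed for training
    net_g.load_state_dict(cpt["weight"], strict=False)            # tolerate missing/unexpected keys
    net_g = net_g.eval().to(device)
    return (net_g.half() if is_half else net_g.float()), tgt_sr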
config.py CHANGED
@@ -4,7 +4,18 @@ import torch
 from multiprocessing import cpu_count
 
 class Config:
+    """
+    The code focuses on adapting the configuration based on available
+    hardware resources and specified command-line arguments,
+    aiming to optimize the performance and capabilities of the voice conversion process.
+    """
     def __init__(self):
+        """
+        Calls the arg_parse() and device_config() methods to set up the configuration based on
+        command-line arguments and the available hardware.
+
+        Returns: None
+        """
         self.device = "cuda:0"
         self.is_half = True
         self.n_cpu = 0
@@ -19,6 +30,12 @@ class Config:
 
     @staticmethod
     def arg_parse() -> tuple:
+        """
+        Uses the argparse library to parse command-line arguments.
+        Three boolean arguments are defined: --colab, --api, and --unsupported.
+
+        Returns: a tuple indicating whether each argument was specified.
+        """
         parser = argparse.ArgumentParser()
         parser.add_argument("--colab", action="store_true", help="Launch in colab")
         parser.add_argument("--api", action="store_true", help="Launch with api")
@@ -35,6 +52,11 @@ class Config:
     # check `getattr` and try it for compatibility
     @staticmethod
     def has_mps() -> bool:
+        """
+        Determines whether the Metal Performance Shaders (MPS) backend is available in the current PyTorch build.
+        If MPS is reported as available, it verifies it is usable by trying to move a tensor to the "mps" device.
+        Returns a boolean indicating MPS support.
+        """
         if not torch.backends.mps.is_available():
             return False
         try:
@@ -44,6 +66,22 @@ class Config:
             return False
 
     def device_config(self) -> tuple:
+        """
+        Checks if a CUDA-compatible GPU is available.
+
+        If a compatible GPU is found:
+            Determines the GPU's name and memory capacity.
+            Adjusts the is_half parameter based on the GPU's characteristics.
+
+        If no compatible GPU is found and MPS is available, configures the device to use MPS.
+
+        If there is no compatible GPU and no MPS support, configures the device to use the CPU.
+
+        Determines the number of available CPU cores (n_cpu).
+
+        Based on the is_half value and GPU memory capacity, configures several variables related to voice conversion,
+        such as x_pad, x_query, x_center, and x_max.
+        """
         if torch.cuda.is_available():
             i_device = int(self.device.split(":")[-1])
             self.gpu_name = torch.cuda.get_device_name(i_device)
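
For orientation, here is a simplified, standalone illustration of the device-selection order that the has_mps() and device_config() docstrings describe (CUDA first, then MPS, then CPU). It is a sketch of the idea, not the actual body of either method; the function name pick_device is made up for this example.

import torch

def pick_device() -> str:
    if torch.cuda.is_available():
        return "cuda:0"
    if torch.backends.mps.is_available():
        try:
            torch.zeros(1).to(torch.device("mps"))  # verify MPS actually works
            return "mps"
        except Exception:
            pass
    return "cpu"

print(pick_device())  # e.g. "cuda:0" on a CUDA machine, "mps" on Apple silicon, otherwise "cpu"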
lib/infer_pack/models.py CHANGED
@@ -158,6 +158,12 @@ class ResidualCouplingBlock(nn.Module):
 
 
 class PosteriorEncoder(nn.Module):
+    """
+    Responsible for transforming input data (possibly spectrogram representations of audio)
+    into a latent representation (or "posterior") that can be used for generating audio samples
+    or for other purposes within a generative model.
+
+    """
     def __init__(
         self,
         in_channels,
@@ -168,6 +174,16 @@ class PosteriorEncoder(nn.Module):
         n_layers,
         gin_channels=0,
     ):
+        """
+        Initializes the parameters and layers of the posterior encoder.
+        in_channels: Number of input channels (e.g., frequency bins in a spectrogram).
+        out_channels: Number of output channels in the latent representation.
+        hidden_channels: Number of hidden channels in intermediate layers.
+        kernel_size: Size of the convolutional kernel.
+        dilation_rate: Dilation rate for dilated convolutions.
+        n_layers: Number of layers in the posterior encoder.
+        gin_channels: Optional number of global conditioning channels (if applicable).
+        """
         super().__init__()
         self.in_channels = in_channels
         self.out_channels = out_channels
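
To make the documented constructor parameters concrete, here is a hypothetical instantiation of PosteriorEncoder. The hyperparameter values are typical VITS/RVC-style settings chosen purely for illustration, and the forward signature and return values are assumed from the usual VITS layout; none of this is taken from the commit itself.

import torch
from lib.infer_pack.models import PosteriorEncoder

enc_q = PosteriorEncoder(
    in_channels=1025,    # e.g. frequency bins of a linear spectrogram
    out_channels=192,    # channels of the latent representation
    hidden_channels=192,
    kernel_size=5,
    dilation_rate=1,
    n_layers=16,
    gin_channels=256,    # global (speaker) conditioning channels
)

spec = torch.randn(1, 1025, 100)          # (batch, freq_bins, frames)
lengths = torch.tensor([100])
z, m, logs, mask = enc_q(spec, lengths)   # latent sample, mean, log-variance, frame mask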
vc_infer_pipeline.py CHANGED
@@ -9,7 +9,7 @@ from functools import lru_cache
 now_dir = os.getcwd()
 sys.path.append(now_dir)
 
-bh, ah = signal.butter(N=5, Wn=48, btype="high", fs=16000)
+bh, ah = signal.butter(N=5, Wn=48, btype="high", fs=16000)  # High-pass filter design for the input audio
 
 input_audio_path2wav = {}
 
@@ -51,6 +51,9 @@ def change_rms(data1, sr1, data2, sr2, rate):  # 1 is the input audio, 2 is the output
 
 
 class VC(object):
+    """
+    Voice Conversion system.
+    """
     def __init__(self, tgt_sr, config):
         self.x_pad, self.x_query, self.x_center, self.x_max, self.is_half = (
             config.x_pad,
@@ -59,14 +62,18 @@ class VC(object):
             config.x_max,
             config.is_half,
         )
-        self.sr = 16000  # hubert输入采样率
-        self.window = 160  # 每帧点数
-        self.t_pad = self.sr * self.x_pad  # 每条前后pad时间
+        """
+        Several parameters such as `x_pad`, `x_query`, and so on are taken from the configuration object.
+        They relate to audio processing: sample rate, window size, padding amounts, etc.
+        """
+        self.sr = 16000  # Hubert input sample rate
+        self.window = 160  # Number of points per frame
+        self.t_pad = self.sr * self.x_pad  # Padding time before and after each segment
         self.t_pad_tgt = tgt_sr * self.x_pad
         self.t_pad2 = self.t_pad * 2
-        self.t_query = self.sr * self.x_query  # 查询切点前后查询时间
-        self.t_center = self.sr * self.x_center  # 查询切点位置
-        self.t_max = self.sr * self.x_max  # 免查询时长阈值
+        self.t_query = self.sr * self.x_query  # Query time before and after each cut point
+        self.t_center = self.sr * self.x_center  # Position of the query cut point
+        self.t_max = self.sr * self.x_max  # Duration threshold above which cut-point search is skipped
         self.device = config.device
 
     def get_f0(
@@ -79,6 +86,11 @@ class VC(object):
         filter_radius,
         inp_f0=None,
     ):
+        """
+        Extracts the fundamental frequency ('F0', i.e. pitch) from a given audio signal.
+        Multiple methods are available, such as 'pm', 'harvest', 'crepe', and 'rmvpe'.
+        The 'parselmouth' and 'torchcrepe' libraries compute pitch, and 'cache_harvest_f0' is used for the harvest method.
+        """
         global input_audio_path2wav
         time_step = self.window / self.sr * 1000
         f0_min = 50
@@ -168,7 +180,7 @@ class VC(object):
         sid,
         audio0,
         pitch,
-        pitchf,
+        pitchf,  # fine-grained (float) pitch contour, as opposed to the coarse 'pitch'
         times,
         index,
         big_npy,
@@ -176,6 +188,13 @@ class VC(object):
         version,
         protect,
     ):  # ,file_index,file_big_npy
+        """
+        The holy grail: the main conversion function.
+        Takes a NumPy audio signal, processes it through the model, and returns a NumPy audio signal.
+        Modifies the pitch ('F0') of the audio according to the 'pitch' and 'pitchf' parameters.
+        The neural network generator (net_g) infers the converted voice.
+        'index' and 'big_npy' are used to retrieve similar audio features from a pre-computed database for better conversion quality.
+        """
         feats = torch.from_numpy(audio0)
         if self.is_half:
             feats = feats.half()
@@ -261,6 +280,7 @@ class VC(object):
         t2 = ttime()
         times[0] += t1 - t0
         times[2] += t2 - t1
+
         return audio1
 
     def pipeline(
@@ -285,6 +305,14 @@ class VC(object):
         protect,
         f0_file=None,
     ):
+        """
+        A pipeline function that strings together the operations for voice conversion.
+        It preprocesses the input audio (e.g. filtering),
+        segments the audio into pieces and processes each segment through the voice conversion ('vc') method,
+        then concatenates the converted segments to produce the final converted audio.
+        """
+
+        # Phase 1: Load the index file
         if (
             file_index != ""
             # and file_big_npy != ""
@@ -293,23 +321,25 @@ class VC(object):
             and index_rate != 0
         ):
             try:
-                index = faiss.read_index(file_index)
+                index = faiss.read_index(file_index)  # Read from the vector store
                 # big_npy = np.load(file_big_npy)
-                big_npy = index.reconstruct_n(0, index.ntotal)
+                big_npy = index.reconstruct_n(0, index.ntotal)  # Reconstruct the index into one big numpy array
             except:
                 traceback.print_exc()
                 index = big_npy = None
         else:
-            index = big_npy = None
-        audio = signal.filtfilt(bh, ah, audio)
-        audio_pad = np.pad(audio, (self.window // 2, self.window // 2), mode="reflect")
-        opt_ts = []
+            index = big_npy = None  # If there is no index file, that's fine; it just won't be used.
+
+        # Phase 2: Filter the audio signal
+        audio = signal.filtfilt(bh, ah, audio)  # Zero-phase filtering avoids phase distortion
+        audio_pad = np.pad(audio, (self.window // 2, self.window // 2), mode="reflect")  # Pad so the beginning and end are handled accurately
+        optimal_time_shifts = []
         if audio_pad.shape[0] > self.t_max:
-            audio_sum = np.zeros_like(audio)
+            audio_sum = np.zeros_like(audio)  # Numpy array of zeros with the same shape as audio
             for i in range(self.window):
                 audio_sum += audio_pad[i : i - self.window]
             for t in range(self.t_center, audio.shape[0], self.t_center):
-                opt_ts.append(
+                optimal_time_shifts.append(
                     t
                     - self.t_query
                     + np.where(
@@ -317,12 +347,13 @@ class VC(object):
                         == np.abs(audio_sum[t - self.t_query : t + self.t_query]).min()
                     )[0][0]
                 )
+
         s = 0
         audio_opt = []
         t = None
         t1 = ttime()
-        audio_pad = np.pad(audio, (self.t_pad, self.t_pad), mode="reflect")
-        p_len = audio_pad.shape[0] // self.window
+        audio_pad = np.pad(audio, (self.t_pad, self.t_pad), mode="reflect")  # Reflect the signal's edges at both ends
+        number_of_analysis_frames = audio_pad.shape[0] // self.window
         inp_f0 = None
         if hasattr(f0_file, "name") == True:
             try:
@@ -337,24 +368,26 @@ class VC(object):
         sid = torch.tensor(sid, device=self.device).unsqueeze(0).long()
         pitch, pitchf = None, None
         if if_f0 == 1:
+            # Call `get_f0` to calculate pitch values from the audio features;
+            # these pitch values drive the voice conversion.
             pitch, pitchf = self.get_f0(
                 input_audio_path,
                 audio_pad,
-                p_len,
+                number_of_analysis_frames,
                 f0_up_key,
                 f0_method,
                 filter_radius,
                 inp_f0,
             )
-            pitch = pitch[:p_len]
-            pitchf = pitchf[:p_len]
+            pitch = pitch[:number_of_analysis_frames]
+            pitchf = pitchf[:number_of_analysis_frames]
             if self.device == "mps":
                 pitchf = pitchf.astype(np.float32)
             pitch = torch.tensor(pitch, device=self.device).unsqueeze(0).long()
             pitchf = torch.tensor(pitchf, device=self.device).unsqueeze(0).float()
             t2 = ttime()
             times[1] += t2 - t1
-        for t in opt_ts:
+        for t in optimal_time_shifts:
             t = t // self.window * self.window
             if if_f0 == 1:
                 audio_opt.append(
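
As a small illustration of the pre-filtering step commented above: the pipeline designs a 5th-order Butterworth high-pass at 48 Hz for 16 kHz audio and applies it with filtfilt, which filters forward and backward so no phase distortion is introduced. The filter design below is copied from the diff; the test signal and printout are made up for demonstration.

import numpy as np
from scipy import signal

bh, ah = signal.butter(N=5, Wn=48, btype="high", fs=16000)  # same design as in vc_infer_pipeline.py

sr = 16000
t = np.arange(sr) / sr
audio = np.sin(2 * np.pi * 10 * t) + 0.1 * np.sin(2 * np.pi * 440 * t)  # 10 Hz rumble + 440 Hz tone

filtered = signal.filtfilt(bh, ah, audio)  # zero-phase (forward-backward) filtering
print(round(np.abs(audio).max(), 3), round(np.abs(filtered).max(), 3))  # rumble removed, tone preserved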