jlopez00 committed on
Commit
f017d24
1 Parent(s): a376e00

Upload folder using huggingface_hub

Browse files
.devcontainer/docker-compose.yml CHANGED
@@ -1,9 +1,9 @@
1
  services:
2
  dev:
3
  profiles:
4
- - devcontainer
5
  build: dev
6
  volumes:
7
- - ../..:/workspaces:cached
8
- - ..:/workspaces/tts-service:cached
9
  command: sleep infinity
 
1
  services:
2
  dev:
3
  profiles:
4
+ - devcontainer
5
  build: dev
6
  volumes:
7
+ - ../..:/workspaces:cached
8
+ - ..:/workspaces/tts-service:cached
9
  command: sleep infinity
.pre-commit-config.yaml CHANGED
@@ -31,6 +31,7 @@ repos:
31
  - id: mypy
32
  name: mypy
33
  entry: mypy
 
34
  language: system
35
  types: [python]
36
  pass_filenames: false
 
31
  - id: mypy
32
  name: mypy
33
  entry: mypy
34
+ args: ["--explicit-package-bases", "--namespace-packages"]
35
  language: system
36
  types: [python]
37
  pass_filenames: false
.vscode/settings.json CHANGED
@@ -13,6 +13,8 @@
13
  "files.trimTrailingWhitespace": true,
14
  "kubernetes-yaml-formatter-x.indentlessArrays": true,
15
  "kubernetes-yaml-formatter-x.retainLineBreaksSingle": true,
 
 
16
  "python.analysis.importFormat": "relative",
17
  "python.analysis.autoFormatStrings": true,
18
  "python.analysis.autoImportCompletions": true,
@@ -30,9 +32,9 @@
30
  "prettier.tabWidth": 4,
31
  "editor.defaultFormatter": "esbenp.prettier-vscode"
32
  },
33
- "[python]": {
34
- "editor.defaultFormatter": "charliermarsh.ruff"
35
- },
36
  "[yaml]": {
37
  "editor.defaultFormatter": "kiliantyler.kubernetes-yaml-formatter-x"
38
  }
 
13
  "files.trimTrailingWhitespace": true,
14
  "kubernetes-yaml-formatter-x.indentlessArrays": true,
15
  "kubernetes-yaml-formatter-x.retainLineBreaksSingle": true,
16
+ "mypy.runUsingActiveInterpreter": true,
17
+ "mypy.extraArguments": ["--explicit-package-bases", "--namespace-packages"],
18
  "python.analysis.importFormat": "relative",
19
  "python.analysis.autoFormatStrings": true,
20
  "python.analysis.autoImportCompletions": true,
 
32
  "prettier.tabWidth": 4,
33
  "editor.defaultFormatter": "esbenp.prettier-vscode"
34
  },
35
+ // "[python]": {
36
+ // "editor.defaultFormatter": "charliermarsh.ruff"
37
+ // },
38
  "[yaml]": {
39
  "editor.defaultFormatter": "kiliantyler.kubernetes-yaml-formatter-x"
40
  }
assets/flask/server.py CHANGED
@@ -34,7 +34,7 @@ def start_flask():
34
  try:
35
  subprocess.Popen(
36
  [ENV_PATH, FLASK_SCRIPT_PATH],
37
- creationflags=subprocess.CREATE_NEW_CONSOLE,
38
  )
39
  except Exception as error:
40
  print(f"An error occurred starting the Flask server: {error}")
 
34
  try:
35
  subprocess.Popen(
36
  [ENV_PATH, FLASK_SCRIPT_PATH],
37
+ creationflags=getattr(subprocess, "CREATE_NEW_CONSOLE", 0),
38
  )
39
  except Exception as error:
40
  print(f"An error occurred starting the Flask server: {error}")
assets/installation_checker.py CHANGED
@@ -13,7 +13,7 @@ class InstallationError(Exception):
13
 
14
  def check_installation():
15
  try:
16
- system_drive = os.getenv("SystemDrive")
17
  current_drive = os.path.splitdrive(now_dir)[0]
18
  if current_drive.upper() != system_drive.upper():
19
  raise InstallationError(
 
13
 
14
  def check_installation():
15
  try:
16
+ system_drive = os.getenv("SystemDrive", "")
17
  current_drive = os.path.splitdrive(now_dir)[0]
18
  if current_drive.upper() != system_drive.upper():
19
  raise InstallationError(
core/__init__.py CHANGED
@@ -72,7 +72,7 @@ def run_infer_script(
72
  upscale_audio: bool,
73
  f0_file: str,
74
  embedder_model: str,
75
- embedder_model_custom: str = None,
76
  formant_shifting: bool = False,
77
  formant_qfrency: float = 1.0,
78
  formant_timbre: float = 1.0,
@@ -210,7 +210,7 @@ def run_batch_infer_script(
210
  upscale_audio: bool,
211
  f0_file: str,
212
  embedder_model: str,
213
- embedder_model_custom: str = None,
214
  formant_shifting: bool = False,
215
  formant_qfrency: float = 1.0,
216
  formant_timbre: float = 1.0,
@@ -351,7 +351,7 @@ def run_tts_script(
351
  upscale_audio: bool,
352
  f0_file: str,
353
  embedder_model: str,
354
- embedder_model_custom: str = None,
355
  sid: int = 0,
356
  ):
357
 
@@ -470,7 +470,7 @@ def run_extract_script(
470
  gpu: int,
471
  sample_rate: int,
472
  embedder_model: str,
473
- embedder_model_custom: str = None,
474
  ):
475
 
476
  model_path = os.path.join(logs_path, model_name)
@@ -519,8 +519,8 @@ def run_train_script(
519
  index_algorithm: str = "Auto",
520
  cache_data_in_gpu: bool = False,
521
  custom_pretrained: bool = False,
522
- g_pretrained_path: str = None,
523
- d_pretrained_path: str = None,
524
  ):
525
 
526
  if pretrained == True:
@@ -737,15 +737,15 @@ def parse_arguments():
737
  default="rmvpe",
738
  )
739
  infer_parser.add_argument(
740
- "--input_path",
741
  type=str,
742
- help="Full path to the input audio file.",
743
  required=True,
744
  )
745
  infer_parser.add_argument(
746
- "--output_path",
747
  type=str,
748
- help="Full path to the output audio file.",
749
  required=True,
750
  )
751
  pth_path_description = "Full path to the RVC model file (.pth)."
@@ -2440,8 +2440,8 @@ def main():
2440
  protect=args.protect,
2441
  hop_length=args.hop_length,
2442
  f0_method=args.f0_method,
2443
- input_path=args.input_path,
2444
- output_path=args.output_path,
2445
  pth_path=args.pth_path,
2446
  index_path=args.index_path,
2447
  split_audio=args.split_audio,
 
72
  upscale_audio: bool,
73
  f0_file: str,
74
  embedder_model: str,
75
+ embedder_model_custom: str | None = None,
76
  formant_shifting: bool = False,
77
  formant_qfrency: float = 1.0,
78
  formant_timbre: float = 1.0,
 
210
  upscale_audio: bool,
211
  f0_file: str,
212
  embedder_model: str,
213
+ embedder_model_custom: str | None = None,
214
  formant_shifting: bool = False,
215
  formant_qfrency: float = 1.0,
216
  formant_timbre: float = 1.0,
 
351
  upscale_audio: bool,
352
  f0_file: str,
353
  embedder_model: str,
354
+ embedder_model_custom: str | None = None,
355
  sid: int = 0,
356
  ):
357
 
 
470
  gpu: int,
471
  sample_rate: int,
472
  embedder_model: str,
473
+ embedder_model_custom: str | None = None,
474
  ):
475
 
476
  model_path = os.path.join(logs_path, model_name)
 
519
  index_algorithm: str = "Auto",
520
  cache_data_in_gpu: bool = False,
521
  custom_pretrained: bool = False,
522
+ g_pretrained_path: str | None = None,
523
+ d_pretrained_path: str | None = None,
524
  ):
525
 
526
  if pretrained == True:
 
737
  default="rmvpe",
738
  )
739
  infer_parser.add_argument(
740
+ "--output_rvc_path",
741
  type=str,
742
+ help="Full path to the output RVC file.",
743
  required=True,
744
  )
745
  infer_parser.add_argument(
746
+ "--output_tts_path",
747
  type=str,
748
+ help="Full path to the output TTS audio file.",
749
  required=True,
750
  )
751
  pth_path_description = "Full path to the RVC model file (.pth)."
 
2440
  protect=args.protect,
2441
  hop_length=args.hop_length,
2442
  f0_method=args.f0_method,
2443
+ output_rvc_path=args.output_rvc_path,
2444
+ output_tts_path=args.output_tts_path,
2445
  pth_path=args.pth_path,
2446
  index_path=args.index_path,
2447
  split_audio=args.split_audio,
pyproject.toml CHANGED
@@ -112,12 +112,23 @@ select = [
112
 
113
  [tool.mypy]
114
  packages = "assets,core,rvc,tabs,tts_service,tests"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
115
 
116
- #1181 errors
117
  [[tool.mypy.overrides]]
118
  module = [
119
- "core.*",
120
- "rvc.infer.infer",
121
  "rvc.infer.pipeline",
122
  "rvc.lib.algorithm.attentions",
123
  "rvc.lib.algorithm.commons",
@@ -127,6 +138,7 @@ module = [
127
  "rvc.train.train",
128
  "rvc.train.data_utils",
129
  "rvc.train.extract.extract",
 
130
  "rvc.train.preprocess.preprocess",
131
  "rvc.train.preprocess.slicer",
132
  "rvc.train.process.extract_small_model",
@@ -147,6 +159,7 @@ module = [
147
  "local_attention",
148
  "matplotlib.*",
149
  "noisereduce",
 
150
  "pydub",
151
  "pypresence",
152
  "resampy",
 
112
 
113
  [tool.mypy]
114
  packages = "assets,core,rvc,tabs,tts_service,tests"
115
+ check_untyped_defs = true
116
+
117
+ [[tool.mypy.overrides]]
118
+ module = [
119
+ "rvc.lib.algorithm.generators",
120
+ "rvc.lib.algorithm.residuals",
121
+ "rvc.lib.predictors.RMVPE",
122
+ "rvc.lib.tools.gdown",
123
+ "rvc.lib.tools.model_download",
124
+ "rvc.train.losses",
125
+ "rvc.train.process.extract_model",
126
+ "rvc.train.process.model_blender",
127
+ ]
128
+ check_untyped_defs = false
129
 
 
130
  [[tool.mypy.overrides]]
131
  module = [
 
 
132
  "rvc.infer.pipeline",
133
  "rvc.lib.algorithm.attentions",
134
  "rvc.lib.algorithm.commons",
 
138
  "rvc.train.train",
139
  "rvc.train.data_utils",
140
  "rvc.train.extract.extract",
141
+ "rvc.train.mel_processing",
142
  "rvc.train.preprocess.preprocess",
143
  "rvc.train.preprocess.slicer",
144
  "rvc.train.process.extract_small_model",
 
159
  "local_attention",
160
  "matplotlib.*",
161
  "noisereduce",
162
+ "pedalboard_native",
163
  "pydub",
164
  "pypresence",
165
  "resampy",
rvc/infer/infer.py CHANGED
@@ -10,6 +10,8 @@ import soundfile as sf
10
  import noisereduce as nr
11
  from pedalboard import (
12
  Pedalboard,
 
 
13
  Chorus,
14
  Distortion,
15
  Reverb,
@@ -60,7 +62,7 @@ class VoiceConverter:
60
  self.use_f0 = None # Whether the model uses F0
61
  self.loaded_model = None
62
 
63
- def load_hubert(self, embedder_model: str, embedder_model_custom: str = None):
64
  """
65
  Loads the HuBERT model for speaker embedding extraction.
66
 
@@ -201,7 +203,7 @@ class VoiceConverter:
201
  model_path: str,
202
  index_path: str,
203
  pitch: int = 0,
204
- f0_file: str = None,
205
  f0_method: str = "rmvpe",
206
  index_rate: float = 0.75,
207
  volume_envelope: float = 1,
@@ -212,7 +214,7 @@ class VoiceConverter:
212
  f0_autotune_strength: float = 1,
213
  filter_radius: int = 3,
214
  embedder_model: str = "contentvec",
215
- embedder_model_custom: str = None,
216
  clean_audio: bool = False,
217
  clean_strength: float = 0.5,
218
  export_format: str = "WAV",
@@ -294,6 +296,8 @@ class VoiceConverter:
294
 
295
  converted_chunks = []
296
  for c in chunks:
 
 
297
  audio_opt = self.vc.pipeline(
298
  model=self.hubert_model,
299
  net_g=self.net_g,
 
10
  import noisereduce as nr
11
  from pedalboard import (
12
  Pedalboard,
13
+ )
14
+ from pedalboard_native import (
15
  Chorus,
16
  Distortion,
17
  Reverb,
 
62
  self.use_f0 = None # Whether the model uses F0
63
  self.loaded_model = None
64
 
65
+ def load_hubert(self, embedder_model: str, embedder_model_custom: str | None = None):
66
  """
67
  Loads the HuBERT model for speaker embedding extraction.
68
 
 
203
  model_path: str,
204
  index_path: str,
205
  pitch: int = 0,
206
+ f0_file: str | None = None,
207
  f0_method: str = "rmvpe",
208
  index_rate: float = 0.75,
209
  volume_envelope: float = 1,
 
214
  f0_autotune_strength: float = 1,
215
  filter_radius: int = 3,
216
  embedder_model: str = "contentvec",
217
+ embedder_model_custom: str | None = None,
218
  clean_audio: bool = False,
219
  clean_strength: float = 0.5,
220
  export_format: str = "WAV",
 
296
 
297
  converted_chunks = []
298
  for c in chunks:
299
+ if self.vc is None:
300
+ raise Exception("Voice conversion model not loaded.")
301
  audio_opt = self.vc.pipeline(
302
  model=self.hubert_model,
303
  net_g=self.net_g,
rvc/infer/pipeline.py CHANGED
@@ -8,6 +8,7 @@ import torchcrepe
8
  import faiss
9
  import librosa
10
  import numpy as np
 
11
  from scipy import signal
12
  from torch import Tensor
13
 
@@ -29,7 +30,7 @@ bh, ah = signal.butter(
29
  N=FILTER_ORDER, Wn=CUTOFF_FREQUENCY, btype="high", fs=SAMPLE_RATE
30
  )
31
 
32
- input_audio_path2wav = {}
33
 
34
 
35
  class AudioProcessor:
@@ -37,6 +38,7 @@ class AudioProcessor:
37
  A class for processing audio signals, specifically for adjusting RMS levels.
38
  """
39
 
 
40
  def change_rms(
41
  source_audio: np.ndarray,
42
  source_rate: int,
@@ -292,9 +294,10 @@ class Pipeline:
292
  for method in methods:
293
  f0 = None
294
  if method == "crepe":
295
- f0 = self.get_f0_crepe_computation(
296
- x, f0_min, f0_max, p_len, int(hop_length)
297
- )
 
298
  elif method == "rmvpe":
299
  f0 = self.model_rmvpe.infer_from_audio(x, thred=0.03)
300
  f0 = f0[1:]
@@ -323,8 +326,8 @@ class Pipeline:
323
 
324
  def get_f0(
325
  self,
326
- input_audio_path,
327
- x,
328
  p_len,
329
  pitch,
330
  f0_method,
@@ -382,7 +385,7 @@ class Pipeline:
382
  )
383
 
384
  if f0_autotune is True:
385
- f0 = Autotune.autotune_f0(self, f0, f0_autotune_strength)
386
 
387
  f0 *= pow(2, pitch / 12)
388
  tf0 = self.sample_rate // self.window
@@ -404,7 +407,7 @@ class Pipeline:
404
  ) + 1
405
  f0_mel[f0_mel <= 1] = 1
406
  f0_mel[f0_mel > 255] = 255
407
- f0_coarse = np.rint(f0_mel).astype(np.int)
408
 
409
  return f0_coarse, f0bak
410
 
 
8
  import faiss
9
  import librosa
10
  import numpy as np
11
+ import numpy.typing as npt
12
  from scipy import signal
13
  from torch import Tensor
14
 
 
30
  N=FILTER_ORDER, Wn=CUTOFF_FREQUENCY, btype="high", fs=SAMPLE_RATE
31
  )
32
 
33
+ input_audio_path2wav: dict[str, npt.NDArray] = {}
34
 
35
 
36
  class AudioProcessor:
 
38
  A class for processing audio signals, specifically for adjusting RMS levels.
39
  """
40
 
41
+ @staticmethod
42
  def change_rms(
43
  source_audio: np.ndarray,
44
  source_rate: int,
 
294
  for method in methods:
295
  f0 = None
296
  if method == "crepe":
297
+ raise ValueError("Crepe method is not supported in hybrid mode")
298
+ # f0 = self.get_f0_crepe_computation(
299
+ # x, f0_min, f0_max, p_len, int(hop_length)
300
+ # )
301
  elif method == "rmvpe":
302
  f0 = self.model_rmvpe.infer_from_audio(x, thred=0.03)
303
  f0 = f0[1:]
 
326
 
327
  def get_f0(
328
  self,
329
+ input_audio_path: str,
330
+ x: npt.NDArray,
331
  p_len,
332
  pitch,
333
  f0_method,
 
385
  )
386
 
387
  if f0_autotune is True:
388
+ f0 = self.autotune.autotune_f0(f0, f0_autotune_strength)
389
 
390
  f0 *= pow(2, pitch / 12)
391
  tf0 = self.sample_rate // self.window
 
407
  ) + 1
408
  f0_mel[f0_mel <= 1] = 1
409
  f0_mel[f0_mel > 255] = 255
410
+ f0_coarse = np.rint(f0_mel).astype(np.int32)
411
 
412
  return f0_coarse, f0bak
413
 
rvc/lib/algorithm/synthesizers.py CHANGED
@@ -56,6 +56,7 @@ class Synthesizer(torch.nn.Module):
56
  spk_embed_dim,
57
  gin_channels,
58
  sr,
 
59
  use_f0,
60
  text_enc_hidden_dim=768,
61
  **kwargs
 
56
  spk_embed_dim,
57
  gin_channels,
58
  sr,
59
+ *,
60
  use_f0,
61
  text_enc_hidden_dim=768,
62
  **kwargs