Spaces:
Runtime error
Runtime error
Upload folder using huggingface_hub
Browse files
- .devcontainer/docker-compose.yml +3 -3
- .pre-commit-config.yaml +1 -0
- .vscode/settings.json +5 -3
- assets/flask/server.py +1 -1
- assets/installation_checker.py +1 -1
- core/__init__.py +12 -12
- pyproject.toml +16 -3
- rvc/infer/infer.py +7 -3
- rvc/infer/pipeline.py +11 -8
- rvc/lib/algorithm/synthesizers.py +1 -0
.devcontainer/docker-compose.yml
CHANGED
@@ -1,9 +1,9 @@
|
|
1 |
services:
|
2 |
dev:
|
3 |
profiles:
|
4 |
-
|
5 |
build: dev
|
6 |
volumes:
|
7 |
-
|
8 |
-
|
9 |
command: sleep infinity
|
|
|
1 |
services:
|
2 |
dev:
|
3 |
profiles:
|
4 |
+
- devcontainer
|
5 |
build: dev
|
6 |
volumes:
|
7 |
+
- ../..:/workspaces:cached
|
8 |
+
- ..:/workspaces/tts-service:cached
|
9 |
command: sleep infinity
|
.pre-commit-config.yaml
CHANGED
@@ -31,6 +31,7 @@ repos:
|
|
31 |
- id: mypy
|
32 |
name: mypy
|
33 |
entry: mypy
|
|
|
34 |
language: system
|
35 |
types: [python]
|
36 |
pass_filenames: false
|
|
|
31 |
- id: mypy
|
32 |
name: mypy
|
33 |
entry: mypy
|
34 |
+
args: ["--explicit-package-bases", "--namespace-packages"]
|
35 |
language: system
|
36 |
types: [python]
|
37 |
pass_filenames: false
|
.vscode/settings.json
CHANGED
@@ -13,6 +13,8 @@
|
|
13 |
"files.trimTrailingWhitespace": true,
|
14 |
"kubernetes-yaml-formatter-x.indentlessArrays": true,
|
15 |
"kubernetes-yaml-formatter-x.retainLineBreaksSingle": true,
|
|
|
|
|
16 |
"python.analysis.importFormat": "relative",
|
17 |
"python.analysis.autoFormatStrings": true,
|
18 |
"python.analysis.autoImportCompletions": true,
|
@@ -30,9 +32,9 @@
|
|
30 |
"prettier.tabWidth": 4,
|
31 |
"editor.defaultFormatter": "esbenp.prettier-vscode"
|
32 |
},
|
33 |
-
"[python]": {
|
34 |
-
|
35 |
-
},
|
36 |
"[yaml]": {
|
37 |
"editor.defaultFormatter": "kiliantyler.kubernetes-yaml-formatter-x"
|
38 |
}
|
|
|
13 |
"files.trimTrailingWhitespace": true,
|
14 |
"kubernetes-yaml-formatter-x.indentlessArrays": true,
|
15 |
"kubernetes-yaml-formatter-x.retainLineBreaksSingle": true,
|
16 |
+
"mypy.runUsingActiveInterpreter": true,
|
17 |
+
"mypy.extraArguments": ["--explicit-package-bases", "--namespace-packages"],
|
18 |
"python.analysis.importFormat": "relative",
|
19 |
"python.analysis.autoFormatStrings": true,
|
20 |
"python.analysis.autoImportCompletions": true,
|
|
|
32 |
"prettier.tabWidth": 4,
|
33 |
"editor.defaultFormatter": "esbenp.prettier-vscode"
|
34 |
},
|
35 |
+
// "[python]": {
|
36 |
+
// "editor.defaultFormatter": "charliermarsh.ruff"
|
37 |
+
// },
|
38 |
"[yaml]": {
|
39 |
"editor.defaultFormatter": "kiliantyler.kubernetes-yaml-formatter-x"
|
40 |
}
|
assets/flask/server.py
CHANGED
@@ -34,7 +34,7 @@ def start_flask():
|
|
34 |
try:
|
35 |
subprocess.Popen(
|
36 |
[ENV_PATH, FLASK_SCRIPT_PATH],
|
37 |
-
creationflags=subprocess.CREATE_NEW_CONSOLE,
|
38 |
)
|
39 |
except Exception as error:
|
40 |
print(f"An error occurred starting the Flask server: {error}")
|
|
|
34 |
try:
|
35 |
subprocess.Popen(
|
36 |
[ENV_PATH, FLASK_SCRIPT_PATH],
|
37 |
+
creationflags=getattr(subprocess, "CREATE_NEW_CONSOLE", 0),
|
38 |
)
|
39 |
except Exception as error:
|
40 |
print(f"An error occurred starting the Flask server: {error}")
|
assets/installation_checker.py
CHANGED
@@ -13,7 +13,7 @@ class InstallationError(Exception):
|
|
13 |
|
14 |
def check_installation():
|
15 |
try:
|
16 |
-
system_drive = os.getenv("SystemDrive")
|
17 |
current_drive = os.path.splitdrive(now_dir)[0]
|
18 |
if current_drive.upper() != system_drive.upper():
|
19 |
raise InstallationError(
|
|
|
13 |
|
14 |
def check_installation():
|
15 |
try:
|
16 |
+
system_drive = os.getenv("SystemDrive", "")
|
17 |
current_drive = os.path.splitdrive(now_dir)[0]
|
18 |
if current_drive.upper() != system_drive.upper():
|
19 |
raise InstallationError(
|
core/__init__.py
CHANGED
@@ -72,7 +72,7 @@ def run_infer_script(
|
|
72 |
upscale_audio: bool,
|
73 |
f0_file: str,
|
74 |
embedder_model: str,
|
75 |
-
embedder_model_custom: str = None,
|
76 |
formant_shifting: bool = False,
|
77 |
formant_qfrency: float = 1.0,
|
78 |
formant_timbre: float = 1.0,
|
@@ -210,7 +210,7 @@ def run_batch_infer_script(
|
|
210 |
upscale_audio: bool,
|
211 |
f0_file: str,
|
212 |
embedder_model: str,
|
213 |
-
embedder_model_custom: str = None,
|
214 |
formant_shifting: bool = False,
|
215 |
formant_qfrency: float = 1.0,
|
216 |
formant_timbre: float = 1.0,
|
@@ -351,7 +351,7 @@ def run_tts_script(
|
|
351 |
upscale_audio: bool,
|
352 |
f0_file: str,
|
353 |
embedder_model: str,
|
354 |
-
embedder_model_custom: str = None,
|
355 |
sid: int = 0,
|
356 |
):
|
357 |
|
@@ -470,7 +470,7 @@ def run_extract_script(
|
|
470 |
gpu: int,
|
471 |
sample_rate: int,
|
472 |
embedder_model: str,
|
473 |
-
embedder_model_custom: str = None,
|
474 |
):
|
475 |
|
476 |
model_path = os.path.join(logs_path, model_name)
|
@@ -519,8 +519,8 @@ def run_train_script(
|
|
519 |
index_algorithm: str = "Auto",
|
520 |
cache_data_in_gpu: bool = False,
|
521 |
custom_pretrained: bool = False,
|
522 |
-
g_pretrained_path: str = None,
|
523 |
-
d_pretrained_path: str = None,
|
524 |
):
|
525 |
|
526 |
if pretrained == True:
|
@@ -737,15 +737,15 @@ def parse_arguments():
|
|
737 |
default="rmvpe",
|
738 |
)
|
739 |
infer_parser.add_argument(
|
740 |
-
"--
|
741 |
type=str,
|
742 |
-
help="Full path to the
|
743 |
required=True,
|
744 |
)
|
745 |
infer_parser.add_argument(
|
746 |
-
"--
|
747 |
type=str,
|
748 |
-
help="Full path to the output audio file.",
|
749 |
required=True,
|
750 |
)
|
751 |
pth_path_description = "Full path to the RVC model file (.pth)."
|
@@ -2440,8 +2440,8 @@ def main():
|
|
2440 |
protect=args.protect,
|
2441 |
hop_length=args.hop_length,
|
2442 |
f0_method=args.f0_method,
|
2443 |
-
|
2444 |
-
|
2445 |
pth_path=args.pth_path,
|
2446 |
index_path=args.index_path,
|
2447 |
split_audio=args.split_audio,
|
|
|
72 |
upscale_audio: bool,
|
73 |
f0_file: str,
|
74 |
embedder_model: str,
|
75 |
+
embedder_model_custom: str | None = None,
|
76 |
formant_shifting: bool = False,
|
77 |
formant_qfrency: float = 1.0,
|
78 |
formant_timbre: float = 1.0,
|
|
|
210 |
upscale_audio: bool,
|
211 |
f0_file: str,
|
212 |
embedder_model: str,
|
213 |
+
embedder_model_custom: str | None = None,
|
214 |
formant_shifting: bool = False,
|
215 |
formant_qfrency: float = 1.0,
|
216 |
formant_timbre: float = 1.0,
|
|
|
351 |
upscale_audio: bool,
|
352 |
f0_file: str,
|
353 |
embedder_model: str,
|
354 |
+
embedder_model_custom: str | None = None,
|
355 |
sid: int = 0,
|
356 |
):
|
357 |
|
|
|
470 |
gpu: int,
|
471 |
sample_rate: int,
|
472 |
embedder_model: str,
|
473 |
+
embedder_model_custom: str | None = None,
|
474 |
):
|
475 |
|
476 |
model_path = os.path.join(logs_path, model_name)
|
|
|
519 |
index_algorithm: str = "Auto",
|
520 |
cache_data_in_gpu: bool = False,
|
521 |
custom_pretrained: bool = False,
|
522 |
+
g_pretrained_path: str | None = None,
|
523 |
+
d_pretrained_path: str | None = None,
|
524 |
):
|
525 |
|
526 |
if pretrained == True:
|
|
|
737 |
default="rmvpe",
|
738 |
)
|
739 |
infer_parser.add_argument(
|
740 |
+
"--output_rvc_path",
|
741 |
type=str,
|
742 |
+
help="Full path to the output RVC file.",
|
743 |
required=True,
|
744 |
)
|
745 |
infer_parser.add_argument(
|
746 |
+
"--output_tts_path",
|
747 |
type=str,
|
748 |
+
help="Full path to the output TTS audio file.",
|
749 |
required=True,
|
750 |
)
|
751 |
pth_path_description = "Full path to the RVC model file (.pth)."
|
|
|
2440 |
protect=args.protect,
|
2441 |
hop_length=args.hop_length,
|
2442 |
f0_method=args.f0_method,
|
2443 |
+
output_rvc_path=args.output_rvc_path,
|
2444 |
+
output_tts_path=args.output_tts_path,
|
2445 |
pth_path=args.pth_path,
|
2446 |
index_path=args.index_path,
|
2447 |
split_audio=args.split_audio,
|
pyproject.toml
CHANGED
@@ -112,12 +112,23 @@ select = [
|
|
112 |
|
113 |
[tool.mypy]
|
114 |
packages = "assets,core,rvc,tabs,tts_service,tests"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
115 |
|
116 |
-
#1181 errors
|
117 |
[[tool.mypy.overrides]]
|
118 |
module = [
|
119 |
-
"core.*",
|
120 |
-
"rvc.infer.infer",
|
121 |
"rvc.infer.pipeline",
|
122 |
"rvc.lib.algorithm.attentions",
|
123 |
"rvc.lib.algorithm.commons",
|
@@ -127,6 +138,7 @@ module = [
|
|
127 |
"rvc.train.train",
|
128 |
"rvc.train.data_utils",
|
129 |
"rvc.train.extract.extract",
|
|
|
130 |
"rvc.train.preprocess.preprocess",
|
131 |
"rvc.train.preprocess.slicer",
|
132 |
"rvc.train.process.extract_small_model",
|
@@ -147,6 +159,7 @@ module = [
|
|
147 |
"local_attention",
|
148 |
"matplotlib.*",
|
149 |
"noisereduce",
|
|
|
150 |
"pydub",
|
151 |
"pypresence",
|
152 |
"resampy",
|
|
|
112 |
|
113 |
[tool.mypy]
|
114 |
packages = "assets,core,rvc,tabs,tts_service,tests"
|
115 |
+
check_untyped_defs = true
|
116 |
+
|
117 |
+
[[tool.mypy.overrides]]
|
118 |
+
module = [
|
119 |
+
"rvc.lib.algorithm.generators",
|
120 |
+
"rvc.lib.algorithm.residuals",
|
121 |
+
"rvc.lib.predictors.RMVPE",
|
122 |
+
"rvc.lib.tools.gdown",
|
123 |
+
"rvc.lib.tools.model_download",
|
124 |
+
"rvc.train.losses",
|
125 |
+
"rvc.train.process.extract_model",
|
126 |
+
"rvc.train.process.model_blender",
|
127 |
+
]
|
128 |
+
check_untyped_defs = false
|
129 |
|
|
|
130 |
[[tool.mypy.overrides]]
|
131 |
module = [
|
|
|
|
|
132 |
"rvc.infer.pipeline",
|
133 |
"rvc.lib.algorithm.attentions",
|
134 |
"rvc.lib.algorithm.commons",
|
|
|
138 |
"rvc.train.train",
|
139 |
"rvc.train.data_utils",
|
140 |
"rvc.train.extract.extract",
|
141 |
+
"rvc.train.mel_processing",
|
142 |
"rvc.train.preprocess.preprocess",
|
143 |
"rvc.train.preprocess.slicer",
|
144 |
"rvc.train.process.extract_small_model",
|
|
|
159 |
"local_attention",
|
160 |
"matplotlib.*",
|
161 |
"noisereduce",
|
162 |
+
"pedalboard_native",
|
163 |
"pydub",
|
164 |
"pypresence",
|
165 |
"resampy",
|
rvc/infer/infer.py
CHANGED
@@ -10,6 +10,8 @@ import soundfile as sf
|
|
10 |
import noisereduce as nr
|
11 |
from pedalboard import (
|
12 |
Pedalboard,
|
|
|
|
|
13 |
Chorus,
|
14 |
Distortion,
|
15 |
Reverb,
|
@@ -60,7 +62,7 @@ class VoiceConverter:
|
|
60 |
self.use_f0 = None # Whether the model uses F0
|
61 |
self.loaded_model = None
|
62 |
|
63 |
-
def load_hubert(self, embedder_model: str, embedder_model_custom: str = None):
|
64 |
"""
|
65 |
Loads the HuBERT model for speaker embedding extraction.
|
66 |
|
@@ -201,7 +203,7 @@ class VoiceConverter:
|
|
201 |
model_path: str,
|
202 |
index_path: str,
|
203 |
pitch: int = 0,
|
204 |
-
f0_file: str = None,
|
205 |
f0_method: str = "rmvpe",
|
206 |
index_rate: float = 0.75,
|
207 |
volume_envelope: float = 1,
|
@@ -212,7 +214,7 @@ class VoiceConverter:
|
|
212 |
f0_autotune_strength: float = 1,
|
213 |
filter_radius: int = 3,
|
214 |
embedder_model: str = "contentvec",
|
215 |
-
embedder_model_custom: str = None,
|
216 |
clean_audio: bool = False,
|
217 |
clean_strength: float = 0.5,
|
218 |
export_format: str = "WAV",
|
@@ -294,6 +296,8 @@ class VoiceConverter:
|
|
294 |
|
295 |
converted_chunks = []
|
296 |
for c in chunks:
|
|
|
|
|
297 |
audio_opt = self.vc.pipeline(
|
298 |
model=self.hubert_model,
|
299 |
net_g=self.net_g,
|
|
|
10 |
import noisereduce as nr
|
11 |
from pedalboard import (
|
12 |
Pedalboard,
|
13 |
+
)
|
14 |
+
from pedalboard_native import (
|
15 |
Chorus,
|
16 |
Distortion,
|
17 |
Reverb,
|
|
|
62 |
self.use_f0 = None # Whether the model uses F0
|
63 |
self.loaded_model = None
|
64 |
|
65 |
+
def load_hubert(self, embedder_model: str, embedder_model_custom: str | None = None):
|
66 |
"""
|
67 |
Loads the HuBERT model for speaker embedding extraction.
|
68 |
|
|
|
203 |
model_path: str,
|
204 |
index_path: str,
|
205 |
pitch: int = 0,
|
206 |
+
f0_file: str | None = None,
|
207 |
f0_method: str = "rmvpe",
|
208 |
index_rate: float = 0.75,
|
209 |
volume_envelope: float = 1,
|
|
|
214 |
f0_autotune_strength: float = 1,
|
215 |
filter_radius: int = 3,
|
216 |
embedder_model: str = "contentvec",
|
217 |
+
embedder_model_custom: str | None = None,
|
218 |
clean_audio: bool = False,
|
219 |
clean_strength: float = 0.5,
|
220 |
export_format: str = "WAV",
|
|
|
296 |
|
297 |
converted_chunks = []
|
298 |
for c in chunks:
|
299 |
+
if self.vc is None:
|
300 |
+
raise Exception("Voice conversion model not loaded.")
|
301 |
audio_opt = self.vc.pipeline(
|
302 |
model=self.hubert_model,
|
303 |
net_g=self.net_g,
|
rvc/infer/pipeline.py
CHANGED
@@ -8,6 +8,7 @@ import torchcrepe
|
|
8 |
import faiss
|
9 |
import librosa
|
10 |
import numpy as np
|
|
|
11 |
from scipy import signal
|
12 |
from torch import Tensor
|
13 |
|
@@ -29,7 +30,7 @@ bh, ah = signal.butter(
|
|
29 |
N=FILTER_ORDER, Wn=CUTOFF_FREQUENCY, btype="high", fs=SAMPLE_RATE
|
30 |
)
|
31 |
|
32 |
-
input_audio_path2wav = {}
|
33 |
|
34 |
|
35 |
class AudioProcessor:
|
@@ -37,6 +38,7 @@ class AudioProcessor:
|
|
37 |
A class for processing audio signals, specifically for adjusting RMS levels.
|
38 |
"""
|
39 |
|
|
|
40 |
def change_rms(
|
41 |
source_audio: np.ndarray,
|
42 |
source_rate: int,
|
@@ -292,9 +294,10 @@ class Pipeline:
|
|
292 |
for method in methods:
|
293 |
f0 = None
|
294 |
if method == "crepe":
|
295 |
-
|
296 |
-
|
297 |
-
)
|
|
|
298 |
elif method == "rmvpe":
|
299 |
f0 = self.model_rmvpe.infer_from_audio(x, thred=0.03)
|
300 |
f0 = f0[1:]
|
@@ -323,8 +326,8 @@ class Pipeline:
|
|
323 |
|
324 |
def get_f0(
|
325 |
self,
|
326 |
-
input_audio_path,
|
327 |
-
x,
|
328 |
p_len,
|
329 |
pitch,
|
330 |
f0_method,
|
@@ -382,7 +385,7 @@ class Pipeline:
|
|
382 |
)
|
383 |
|
384 |
if f0_autotune is True:
|
385 |
-
f0 =
|
386 |
|
387 |
f0 *= pow(2, pitch / 12)
|
388 |
tf0 = self.sample_rate // self.window
|
@@ -404,7 +407,7 @@ class Pipeline:
|
|
404 |
) + 1
|
405 |
f0_mel[f0_mel <= 1] = 1
|
406 |
f0_mel[f0_mel > 255] = 255
|
407 |
-
f0_coarse = np.rint(f0_mel).astype(np.
|
408 |
|
409 |
return f0_coarse, f0bak
|
410 |
|
|
|
8 |
import faiss
|
9 |
import librosa
|
10 |
import numpy as np
|
11 |
+
import numpy.typing as npt
|
12 |
from scipy import signal
|
13 |
from torch import Tensor
|
14 |
|
|
|
30 |
N=FILTER_ORDER, Wn=CUTOFF_FREQUENCY, btype="high", fs=SAMPLE_RATE
|
31 |
)
|
32 |
|
33 |
+
input_audio_path2wav: dict[str, npt.NDArray] = {}
|
34 |
|
35 |
|
36 |
class AudioProcessor:
|
|
|
38 |
A class for processing audio signals, specifically for adjusting RMS levels.
|
39 |
"""
|
40 |
|
41 |
+
@staticmethod
|
42 |
def change_rms(
|
43 |
source_audio: np.ndarray,
|
44 |
source_rate: int,
|
|
|
294 |
for method in methods:
|
295 |
f0 = None
|
296 |
if method == "crepe":
|
297 |
+
raise ValueError("Crepe method is not supported in hybrid mode")
|
298 |
+
# f0 = self.get_f0_crepe_computation(
|
299 |
+
# x, f0_min, f0_max, p_len, int(hop_length)
|
300 |
+
# )
|
301 |
elif method == "rmvpe":
|
302 |
f0 = self.model_rmvpe.infer_from_audio(x, thred=0.03)
|
303 |
f0 = f0[1:]
|
|
|
326 |
|
327 |
def get_f0(
|
328 |
self,
|
329 |
+
input_audio_path: str,
|
330 |
+
x: npt.NDArray,
|
331 |
p_len,
|
332 |
pitch,
|
333 |
f0_method,
|
|
|
385 |
)
|
386 |
|
387 |
if f0_autotune is True:
|
388 |
+
f0 = self.autotune.autotune_f0(f0, f0_autotune_strength)
|
389 |
|
390 |
f0 *= pow(2, pitch / 12)
|
391 |
tf0 = self.sample_rate // self.window
|
|
|
407 |
) + 1
|
408 |
f0_mel[f0_mel <= 1] = 1
|
409 |
f0_mel[f0_mel > 255] = 255
|
410 |
+
f0_coarse = np.rint(f0_mel).astype(np.int32)
|
411 |
|
412 |
return f0_coarse, f0bak
|
413 |
|
rvc/lib/algorithm/synthesizers.py
CHANGED
@@ -56,6 +56,7 @@ class Synthesizer(torch.nn.Module):
|
|
56 |
spk_embed_dim,
|
57 |
gin_channels,
|
58 |
sr,
|
|
|
59 |
use_f0,
|
60 |
text_enc_hidden_dim=768,
|
61 |
**kwargs
|
|
|
56 |
spk_embed_dim,
|
57 |
gin_channels,
|
58 |
sr,
|
59 |
+
*,
|
60 |
use_f0,
|
61 |
text_enc_hidden_dim=768,
|
62 |
**kwargs
|