Spaces:

akdNIKY
/

nikyGUI-v2

Runtime error

App Files Files Community

Hev832 committed on Feb 5

Commit

9a8d204

•

1 Parent(s): 6c3ed39

Upload 17 files

Browse files

Files changed (17) hide show

tools/app.py +161 -0
tools/calc_rvc_model_similarity.py +96 -0
tools/dlmodels.bat +348 -0
tools/dlmodels.sh +566 -0
tools/download_models.py +79 -0
tools/export_onnx.py +54 -0
tools/infer/infer-pm-index256.py +203 -0
tools/infer/train-index-v2.py +80 -0
tools/infer/train-index.py +43 -0
tools/infer/trans_weights.py +18 -0
tools/infer_batch_rvc.py +72 -0
tools/infer_cli.py +67 -0
tools/onnx_inference_demo.py +23 -0
tools/rvc_for_realtime.py +443 -0
tools/torchgate/__init__.py +13 -0
tools/torchgate/torchgate.py +280 -0
tools/torchgate/utils.py +70 -0

tools/app.py ADDED Viewed

	@@ -0,0 +1,161 @@

+import logging
+import os
+# os.system("wget -P cvec/ https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/hubert_base.pt")
+import gradio as gr
+from dotenv import load_dotenv
+from configs.config import Config
+from i18n.i18n import I18nAuto
+from infer.modules.vc.modules import VC
+logging.getLogger("numba").setLevel(logging.WARNING)
+logging.getLogger("markdown_it").setLevel(logging.WARNING)
+logging.getLogger("urllib3").setLevel(logging.WARNING)
+logging.getLogger("matplotlib").setLevel(logging.WARNING)
+logger = logging.getLogger(__name__)
+i18n = I18nAuto()
+logger.info(i18n)
+load_dotenv()
+config = Config()
+vc = VC(config)
+weight_root = os.getenv("weight_root")
+weight_uvr5_root = os.getenv("weight_uvr5_root")
+index_root = os.getenv("index_root")
+names = []
+hubert_model = None
+for name in os.listdir(weight_root):
+    if name.endswith(".pth"):
+        names.append(name)
+index_paths = []
+for root, dirs, files in os.walk(index_root, topdown=False):
+    for name in files:
+        if name.endswith(".index") and "trained" not in name:
+            index_paths.append("%s/%s" % (root, name))
+# Build the single-tab Gradio demo and wire its controls to the VC object.
+# Components are created in display order, so statement order is significant.
+app = gr.Blocks()
+with app:
+    with gr.Tabs():
+        with gr.TabItem("在线demo"):
+            gr.Markdown(
+                value="""
+                RVC 在线demo
+                """
+            )
+            # Model selector, populated from the *.pth names collected above.
+            sid = gr.Dropdown(label=i18n("推理音色"), choices=sorted(names))
+            with gr.Column():
+                # Speaker-id slider; created hidden (visible=False).
+                spk_item = gr.Slider(
+                    minimum=0,
+                    maximum=2333,
+                    step=1,
+                    label=i18n("请选择说话人id"),
+                    value=0,
+                    visible=False,
+                    interactive=True,
+                )
+            # Changing the model calls vc.get_vc and routes its result to spk_item.
+            sid.change(fn=vc.get_vc, inputs=[sid], outputs=[spk_item])
+            gr.Markdown(
+                value=i18n(
+                    "男转女推荐+12key, 女转男推荐-12key, 如果音域爆炸导致音色失真也可以自己调整到合适音域. "
+                )
+            )
+            # Source audio to convert.
+            vc_input3 = gr.Audio(label="上传音频（长度小于90秒）")
+            # Pitch shift control, default 0.
+            vc_transform0 = gr.Number(
+                label=i18n("变调(整数, 半音数量, 升八度12降八度-12)"), value=0
+            )
+            # Pitch-extraction method choice; defaults to "pm".
+            f0method0 = gr.Radio(
+                label=i18n(
+                    "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU"
+                ),
+                choices=["pm", "harvest", "crepe", "rmvpe"],
+                value="pm",
+                interactive=True,
+            )
+            filter_radius0 = gr.Slider(
+                minimum=0,
+                maximum=7,
+                label=i18n(
+                    ">=3则使用对harvest音高识别的结果使用中值滤波，数值为滤波半径，使用可以削弱哑音"
+                ),
+                value=3,
+                step=1,
+                interactive=True,
+            )
+            with gr.Column():
+                # Hidden, non-interactive textbox: always submits its initial "" value.
+                file_index1 = gr.Textbox(
+                    label=i18n("特征检索库文件路径,为空则使用下拉的选择结果"),
+                    value="",
+                    interactive=False,
+                    visible=False,
+                )
+            # Index selector, populated from the *.index paths collected above.
+            file_index2 = gr.Dropdown(
+                label=i18n("自动检测index路径,下拉式选择(dropdown)"),
+                choices=sorted(index_paths),
+                interactive=True,
+            )
+            index_rate1 = gr.Slider(
+                minimum=0,
+                maximum=1,
+                label=i18n("检索特征占比"),
+                value=0.88,
+                interactive=True,
+            )
+            resample_sr0 = gr.Slider(
+                minimum=0,
+                maximum=48000,
+                label=i18n("后处理重采样至最终采样率，0为不进行重采样"),
+                value=0,
+                step=1,
+                interactive=True,
+            )
+            rms_mix_rate0 = gr.Slider(
+                minimum=0,
+                maximum=1,
+                label=i18n(
+                    "输入源音量包络替换输出音量包络融合比例，越靠近1越使用输出包络"
+                ),
+                value=1,
+                interactive=True,
+            )
+            protect0 = gr.Slider(
+                minimum=0,
+                maximum=0.5,
+                label=i18n(
+                    "保护清辅音和呼吸声，防止电音撕裂等artifact，拉满0.5不开启，调低加大保护力度但可能降低索引效果"
+                ),
+                value=0.33,
+                step=0.01,
+                interactive=True,
+            )
+            # Optional uploaded file passed through as the f0_file input.
+            f0_file = gr.File(
+                label=i18n("F0曲线文件, 可选, 一行一个音高, 代替默认F0及升降调")
+            )
+            but0 = gr.Button(i18n("转换"), variant="primary")
+            # Outputs: a text field and an audio player filled by vc.vc_single.
+            vc_output1 = gr.Textbox(label=i18n("输出信息"))
+            vc_output2 = gr.Audio(label=i18n("输出音频(右下角三个点,点了可以下载)"))
+            # Inputs are passed positionally to vc.vc_single.
+            # NOTE(review): the order presumably has to match that method's
+            # parameter order - confirm against infer.modules.vc.modules.VC.
+            but0.click(
+                vc.vc_single,
+                [
+                    spk_item,
+                    vc_input3,
+                    vc_transform0,
+                    f0_file,
+                    f0method0,
+                    file_index1,
+                    file_index2,
+                    # file_big_npy1,
+                    index_rate1,
+                    filter_radius0,
+                    resample_sr0,
+                    rms_mix_rate0,
+                    protect0,
+                ],
+                [vc_output1, vc_output2],
+            )
+# Start the Gradio server with default launch settings.
+app.launch()

tools/calc_rvc_model_similarity.py ADDED Viewed

	@@ -0,0 +1,96 @@

+# This code references https://huggingface.co/JosephusCheung/ASimilarityCalculatior/blob/main/qwerty.py
+# Fill in the path of the model to be queried and the root directory of the reference models, and this script will return the similarity between the model to be queried and all reference models.
+import os
+import logging
+logger = logging.getLogger(__name__)
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+def cal_cross_attn(to_q, to_k, to_v, rand_input):
+    hidden_dim, embed_dim = to_q.shape
+    attn_to_q = nn.Linear(hidden_dim, embed_dim, bias=False)
+    attn_to_k = nn.Linear(hidden_dim, embed_dim, bias=False)
+    attn_to_v = nn.Linear(hidden_dim, embed_dim, bias=False)
+    attn_to_q.load_state_dict({"weight": to_q})
+    attn_to_k.load_state_dict({"weight": to_k})
+    attn_to_v.load_state_dict({"weight": to_v})
+    return torch.einsum(
+        "ik, jk -> ik",
+        F.softmax(
+            torch.einsum("ij, kj -> ik", attn_to_q(rand_input), attn_to_k(rand_input)),
+            dim=-1,
+        ),
+        attn_to_v(rand_input),
+    )
+def model_hash(filename):
+    try:
+        with open(filename, "rb") as file:
+            import hashlib
+            m = hashlib.sha256()
+            file.seek(0x100000)
+            m.update(file.read(0x10000))
+            return m.hexdigest()[0:8]
+    except FileNotFoundError:
+        return "NOFILE"
+def eval(model, n, input):
+    qk = f"enc_p.encoder.attn_layers.{n}.conv_q.weight"
+    uk = f"enc_p.encoder.attn_layers.{n}.conv_k.weight"
+    vk = f"enc_p.encoder.attn_layers.{n}.conv_v.weight"
+    atoq, atok, atov = model[qk][:, :, 0], model[uk][:, :, 0], model[vk][:, :, 0]
+    attn = cal_cross_attn(atoq, atok, atov, input)
+    return attn
+def main(path, root):
+    torch.manual_seed(114514)
+    model_a = torch.load(path, map_location="cpu")["weight"]
+    logger.info("Query:\t\t%s\t%s" % (path, model_hash(path)))
+    map_attn_a = {}
+    map_rand_input = {}
+    for n in range(6):
+        hidden_dim, embed_dim, _ = model_a[
+            f"enc_p.encoder.attn_layers.{n}.conv_v.weight"
+        ].shape
+        rand_input = torch.randn([embed_dim, hidden_dim])
+        map_attn_a[n] = eval(model_a, n, rand_input)
+        map_rand_input[n] = rand_input
+    del model_a
+    for name in sorted(list(os.listdir(root))):
+        path = "%s/%s" % (root, name)
+        model_b = torch.load(path, map_location="cpu")["weight"]
+        sims = []
+        for n in range(6):
+            attn_a = map_attn_a[n]
+            attn_b = eval(model_b, n, map_rand_input[n])
+            sim = torch.mean(torch.cosine_similarity(attn_a, attn_b))
+            sims.append(sim)
+        logger.info(
+            "Reference:\t%s\t%s\t%s"
+            % (path, model_hash(path), f"{torch.mean(torch.stack(sims)) * 1e2:.2f}%")
+        )
+if __name__ == "__main__":
+    query_path = r"assets\weights\mi v3.pth"
+    reference_root = r"assets\weights"
+    main(query_path, reference_root)

tools/dlmodels.bat ADDED Viewed

	@@ -0,0 +1,348 @@

+@echo off && chcp 65001
+rem Step 1: locate a sub-directory of the working directory whose name
+rem contains "aria2". Matching directory names are written to flag.txt.
+echo working dir is %cd%
+echo downloading requirement aria2 check.
+echo=
+dir /a:d/b | findstr "aria2" > flag.txt
+findstr "aria2" flag.txt >nul
+rem errorlevel 0 means findstr matched; otherwise open the aria2 release page
+rem in the browser and jump to the cleanup label.
+if %errorlevel% ==0 (
+    echo aria2 checked.
+    echo=
+) else (
+    echo failed. please downloading aria2 from webpage!
+    echo unzip it and put in this directory!
+    timeout /T 5
+    start https://github.com/aria2/aria2/releases/tag/release-1.36.0
+    echo=
+    goto end
+)
+echo envfiles checking start.
+echo=
+rem Store the first matching directory name in %aria2%; the goto leaves the
+rem loop after the first line so later matches are ignored.
+for /f %%x in ('findstr /i /c:"aria2" "flag.txt"') do (set aria2=%%x)&goto endSch
+:endSch
+rem ---- pretrained f0 model file names (v1 and v2 share the same names) ----
+set "d32=f0D32k.pth"
+set "d40=f0D40k.pth"
+set "d48=f0D48k.pth"
+set "g32=f0G32k.pth"
+set "g40=f0G40k.pth"
+set "g48=f0G48k.pth"
+set "d40v2=f0D40k.pth"
+set "g40v2=f0G40k.pth"
+rem ---- download URLs for the pretrained f0 models ----
+set "dld32=https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/f0D32k.pth"
+set "dld40=https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/f0D40k.pth"
+set "dld48=https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/f0D48k.pth"
+set "dlg32=https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/f0G32k.pth"
+set "dlg40=https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/f0G40k.pth"
+set "dlg48=https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/f0G48k.pth"
+set "dld40v2=https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained_v2/f0D40k.pth"
+set "dlg40v2=https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained_v2/f0G40k.pth"
+rem ---- uvr5 weight file names ----
+set "hp2_all=HP2_all_vocals.pth"
+set "hp3_all=HP3_all_vocals.pth"
+set "hp5_only=HP5_only_main_vocal.pth"
+set "VR_DeEchoAggressive=VR-DeEchoAggressive.pth"
+set "VR_DeEchoDeReverb=VR-DeEchoDeReverb.pth"
+set "VR_DeEchoNormal=VR-DeEchoNormal.pth"
+set "onnx_dereverb=vocals.onnx"
+rem ---- download URLs for the uvr5 weights ----
+set "dlhp2_all=https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/uvr5_weights/HP2_all_vocals.pth"
+set "dlhp3_all=https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/uvr5_weights/HP3_all_vocals.pth"
+set "dlhp5_only=https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/uvr5_weights/HP5_only_main_vocal.pth"
+set "dlVR_DeEchoAggressive=https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/uvr5_weights/VR-DeEchoAggressive.pth"
+set "dlVR_DeEchoDeReverb=https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/uvr5_weights/VR-DeEchoDeReverb.pth"
+set "dlVR_DeEchoNormal=https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/uvr5_weights/VR-DeEchoNormal.pth"
+set "dlonnx_dereverb=https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/uvr5_weights/onnx_dereverb_By_FoxJoy/vocals.onnx"
+rem ---- hubert base model ----
+set "hb=hubert_base.pt"
+set "dlhb=https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/hubert_base.pt"
+rem Make sure every target directory under <script dir>\assets exists.
+rem The directories are created at the same path that is tested; the previous
+rem version created them in the current directory instead of under assets.
+echo dir check start.
+echo=
+if exist "%~dp0assets\pretrained" (
+        echo dir .\assets\pretrained checked.
+    ) else (
+        echo failed. generating dir .\assets\pretrained.
+        mkdir "%~dp0assets\pretrained"
+    )
+if exist "%~dp0assets\pretrained_v2" (
+        echo dir .\assets\pretrained_v2 checked.
+    ) else (
+        echo failed. generating dir .\assets\pretrained_v2.
+        mkdir "%~dp0assets\pretrained_v2"
+    )
+if exist "%~dp0assets\uvr5_weights" (
+        echo dir .\assets\uvr5_weights checked.
+    ) else (
+        echo failed. generating dir .\assets\uvr5_weights.
+        mkdir "%~dp0assets\uvr5_weights"
+    )
+if exist "%~dp0assets\uvr5_weights\onnx_dereverb_By_FoxJoy" (
+        echo dir .\assets\uvr5_weights\onnx_dereverb_By_FoxJoy checked.
+    ) else (
+        echo failed. generating dir .\assets\uvr5_weights\onnx_dereverb_By_FoxJoy.
+        mkdir "%~dp0assets\uvr5_weights\onnx_dereverb_By_FoxJoy"
+    )
+echo=
+echo dir check finished.
+echo=
+echo required files check start.
+rem Resolve the base paths once in the main flow; the :fetch subroutine
+rem below reuses them instead of re-deriving them from %~dp0.
+set "assets_root=%~dp0assets"
+set "aria2c_exe=%~dp0%aria2%\aria2c"
+rem Each call is: call :fetch <dir under assets> <file name> "<url>"
+call :fetch pretrained D32k.pth "https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/D32k.pth"
+call :fetch pretrained D40k.pth "https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/D40k.pth"
+call :fetch pretrained_v2 D40k.pth "https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained_v2/D40k.pth"
+call :fetch pretrained D48k.pth "https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/D48k.pth"
+call :fetch pretrained G32k.pth "https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/G32k.pth"
+call :fetch pretrained G40k.pth "https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/G40k.pth"
+call :fetch pretrained_v2 G40k.pth "https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained_v2/G40k.pth"
+call :fetch pretrained G48k.pth "https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/G48k.pth"
+call :fetch pretrained %d32% "%dld32%"
+call :fetch pretrained %d40% "%dld40%"
+call :fetch pretrained_v2 %d40v2% "%dld40v2%"
+call :fetch pretrained %d48% "%dld48%"
+call :fetch pretrained %g32% "%dlg32%"
+call :fetch pretrained %g40% "%dlg40%"
+call :fetch pretrained_v2 %g40v2% "%dlg40v2%"
+call :fetch pretrained %g48% "%dlg48%"
+call :fetch uvr5_weights %hp2_all% "%dlhp2_all%"
+call :fetch uvr5_weights %hp3_all% "%dlhp3_all%"
+call :fetch uvr5_weights %hp5_only% "%dlhp5_only%"
+call :fetch uvr5_weights %VR_DeEchoAggressive% "%dlVR_DeEchoAggressive%"
+call :fetch uvr5_weights %VR_DeEchoDeReverb% "%dlVR_DeEchoDeReverb%"
+call :fetch uvr5_weights %VR_DeEchoNormal% "%dlVR_DeEchoNormal%"
+call :fetch uvr5_weights\onnx_dereverb_By_FoxJoy %onnx_dereverb% "%dlonnx_dereverb%"
+call :fetch hubert %hb% "%dlhb%"
+echo required files check finished.
+echo envfiles check complete.
+pause
+rem Jump over the subroutine to the cleanup label that follows this section.
+goto end
+rem ---------------------------------------------------------------------
+rem :fetch  %1 = directory below assets, %2 = file name, %3 = download URL
+rem Reports the file as present, or downloads it with aria2c and reports
+rem whether it exists afterwards. A failed download does not stop the script.
+rem ---------------------------------------------------------------------
+:fetch
+echo checking %~2
+if exist "%assets_root%\%~1\%~2" (
+    echo %~2 in .\assets\%~1 checked.
+    echo=
+    exit /b 0
+)
+echo failed. starting download from huggingface.
+rem Paths are quoted so an install location containing spaces still works.
+"%aria2c_exe%" --console-log-level=error -c -x 16 -s 16 -k 1M %~3 -d "%assets_root%\%~1" -o %~2
+if exist "%assets_root%\%~1\%~2" (
+    echo download successful.
+) else (
+    echo please try again!
+    echo=
+)
+exit /b 0
+rem Cleanup target: reached by falling through after the checks, or by the
+rem "goto end" taken when no aria2 directory was found.
+:end
+rem Remove the temporary file written by the aria2 directory search.
+del flag.txt

tools/dlmodels.sh ADDED Viewed

	@@ -0,0 +1,566 @@

+#!/bin/bash
+echo working dir is $(pwd)
+echo downloading requirement aria2 check.
+if command -v aria2c &> /dev/null
+then
+    echo "aria2c command found"
+else
+    echo failed. please install aria2
+    sleep 5
+    exit 1
+fi
+d32="f0D32k.pth"
+d40="f0D40k.pth"
+d48="f0D48k.pth"
+g32="f0G32k.pth"
+g40="f0G40k.pth"
+g48="f0G48k.pth"
+d40v2="f0D40k.pth"
+g40v2="f0G40k.pth"
+dld32="https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/f0D32k.pth"
+dld40="https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/f0D40k.pth"
+dld48="https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/f0D48k.pth"
+dlg32="https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/f0G32k.pth"
+dlg40="https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/f0G40k.pth"
+dlg48="https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/f0G48k.pth"
+dld40v2="https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained_v2/f0D40k.pth"
+dlg40v2="https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained_v2/f0G40k.pth"
+hp2_all="HP2_all_vocals.pth"
+hp3_all="HP3_all_vocals.pth"
+hp5_only="HP5_only_main_vocal.pth"
+VR_DeEchoAggressive="VR-DeEchoAggressive.pth"
+VR_DeEchoDeReverb="VR-DeEchoDeReverb.pth"
+VR_DeEchoNormal="VR-DeEchoNormal.pth"
+onnx_dereverb="vocals.onnx"
+rmvpe="rmvpe.pt"
+dlhp2_all="https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/uvr5_weights/HP2_all_vocals.pth"
+dlhp3_all="https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/uvr5_weights/HP3_all_vocals.pth"
+dlhp5_only="https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/uvr5_weights/HP5_only_main_vocal.pth"
+dlVR_DeEchoAggressive="https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/uvr5_weights/VR-DeEchoAggressive.pth"
+dlVR_DeEchoDeReverb="https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/uvr5_weights/VR-DeEchoDeReverb.pth"
+dlVR_DeEchoNormal="https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/uvr5_weights/VR-DeEchoNormal.pth"
+dlonnx_dereverb="https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/uvr5_weights/onnx_dereverb_By_FoxJoy/vocals.onnx"
+dlrmvpe="https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/rmvpe.pt"
+hb="hubert_base.pt"
+dlhb="https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/hubert_base.pt"
+echo dir check start.
+if [ -d "./assets/pretrained" ]; then
+    echo dir ./assets/pretrained checked.
+else
+    echo failed. generating dir ./assets/pretrained.
+    mkdir pretrained
+fi
+if [ -d "./assets/pretrained_v2" ]; then
+    echo dir ./assets/pretrained_v2 checked.
+else
+    echo failed. generating dir ./assets/pretrained_v2.
+    mkdir pretrained_v2
+fi
+if [ -d "./assets/uvr5_weights" ]; then
+    echo dir ./assets/uvr5_weights checked.
+else
+    echo failed. generating dir ./assets/uvr5_weights.
+    mkdir uvr5_weights
+fi
+if [ -d "./assets/uvr5_weights/onnx_dereverb_By_FoxJoy" ]; then
+    echo dir ./assets/uvr5_weights/onnx_dereverb_By_FoxJoy checked.
+else
+    echo failed. generating dir ./assets/uvr5_weights/onnx_dereverb_By_FoxJoy.
+    mkdir uvr5_weights/onnx_dereverb_By_FoxJoy
+fi
+echo dir check finished.
+echo required files check start.
+# fetch_if_missing DIR FILE URL
+# Reports FILE as present in DIR, or downloads it from URL with aria2c.
+# Exits the script with status 1 when aria2c is unavailable or the file is
+# still missing after the download attempt.
+fetch_if_missing() {
+    local dir="$1" file="$2" url="$3"
+    echo checking "$file"
+    if [ -f "$dir/$file" ]; then
+        echo "$file in $dir checked."
+        return 0
+    fi
+    echo failed. starting download from huggingface.
+    if ! command -v aria2c &> /dev/null; then
+        echo aria2c command not found. Please install aria2c and try again.
+        exit 1
+    fi
+    aria2c --console-log-level=error -c -x 16 -s 16 -k 1M "$url" -d "$dir" -o "$file"
+    if [ -f "$dir/$file" ]; then
+        echo download successful.
+    else
+        echo please try again!
+        exit 1
+    fi
+}
+fetch_if_missing ./assets/pretrained D32k.pth https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/D32k.pth
+fetch_if_missing ./assets/pretrained D40k.pth https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/D40k.pth
+fetch_if_missing ./assets/pretrained_v2 D40k.pth https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained_v2/D40k.pth
+fetch_if_missing ./assets/pretrained D48k.pth https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/D48k.pth
+fetch_if_missing ./assets/pretrained G32k.pth https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/G32k.pth
+fetch_if_missing ./assets/pretrained G40k.pth https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/G40k.pth
+fetch_if_missing ./assets/pretrained_v2 G40k.pth https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained_v2/G40k.pth
+echo checking G48k.pth
+if [ -f "./assets/pretrained/G48k.pth" ]; then
+    echo G48k.pth in ./assets/pretrained checked.
+else
+    echo failed. starting download from huggingface.
+    if command -v aria2c &> /dev/null; then
+        aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/G48k.pth -d ./assets/pretrained -o G48k.pth
+        if [ -f "./assets/pretrained/G48k.pth" ]; then
+            echo download successful.
+        else
+            echo please try again!
+            exit 1
+        fi
+    else
+        echo aria2c command not found. Please install aria2c and try again.
+        exit 1
+    fi
+fi
+echo checking $d32
+if [ -f "./assets/pretrained/$d32" ]; then
+    echo $d32 in ./assets/pretrained checked.
+else
+    echo failed. starting download from huggingface.
+    if command -v aria2c &> /dev/null; then
+        aria2c --console-log-level=error -c -x 16 -s 16 -k 1M $dld32 -d ./assets/pretrained -o $d32
+        if [ -f "./assets/pretrained/$d32" ]; then
+            echo download successful.
+        else
+            echo please try again!
+            exit 1
+        fi
+    else
+        echo aria2c command not found. Please install aria2c and try again.
+        exit 1
+    fi
+fi
+echo checking $d40
+if [ -f "./assets/pretrained/$d40" ]; then
+    echo $d40 in ./assets/pretrained checked.
+else
+    echo failed. starting download from huggingface.
+    if command -v aria2c &> /dev/null; then
+        aria2c --console-log-level=error -c -x 16 -s 16 -k 1M $dld40 -d ./assets/pretrained -o $d40
+        if [ -f "./assets/pretrained/$d40" ]; then
+            echo download successful.
+        else
+            echo please try again!
+            exit 1
+        fi
+    else
+        echo aria2c command not found. Please install aria2c and try again.
+        exit 1
+    fi
+fi
+echo checking $d40v2
+if [ -f "./assets/pretrained_v2/$d40v2" ]; then
+    echo $d40v2 in ./assets/pretrained_v2 checked.
+else
+    echo failed. starting download from huggingface.
+    if command -v aria2c &> /dev/null; then
+        aria2c --console-log-level=error -c -x 16 -s 16 -k 1M $dld40v2 -d ./assets/pretrained_v2 -o $d40v2
+        if [ -f "./assets/pretrained_v2/$d40v2" ]; then
+            echo download successful.
+        else
+            echo please try again!
+            exit 1
+        fi
+    else
+        echo aria2c command not found. Please install aria2c and try again.
+        exit 1
+    fi
+fi
+echo checking $d48
+if [ -f "./assets/pretrained/$d48" ]; then
+    echo $d48 in ./assets/pretrained checked.
+else
+    echo failed. starting download from huggingface.
+    if command -v aria2c &> /dev/null; then
+        aria2c --console-log-level=error -c -x 16 -s 16 -k 1M $dld48 -d ./assets/pretrained -o $d48
+        if [ -f "./assets/pretrained/$d48" ]; then
+            echo download successful.
+        else
+            echo please try again!
+            exit 1
+        fi
+    else
+        echo aria2c command not found. Please install aria2c and try again.
+        exit 1
+    fi
+fi
+echo checking $g32
+if [ -f "./assets/pretrained/$g32" ]; then
+    echo $g32 in ./assets/pretrained checked.
+else
+    echo failed. starting download from huggingface.
+    if command -v aria2c &> /dev/null; then
+        aria2c --console-log-level=error -c -x 16 -s 16 -k 1M $dlg32 -d ./assets/pretrained -o $g32
+        if [ -f "./assets/pretrained/$g32" ]; then
+            echo download successful.
+        else
+            echo please try again!
+            exit 1
+        fi
+    else
+        echo aria2c command not found. Please install aria2c and try again.
+        exit 1
+    fi
+fi
+echo checking $g40
+if [ -f "./assets/pretrained/$g40" ]; then
+    echo $g40 in ./assets/pretrained checked.
+else
+    echo failed. starting download from huggingface.
+    if command -v aria2c &> /dev/null; then
+        aria2c --console-log-level=error -c -x 16 -s 16 -k 1M $dlg40 -d ./assets/pretrained -o $g40
+        if [ -f "./assets/pretrained/$g40" ]; then
+            echo download successful.
+        else
+            echo please try again!
+            exit 1
+        fi
+    else
+        echo aria2c command not found. Please install aria2c and try again.
+        exit 1
+    fi
+fi
+echo checking $g40v2
+if [ -f "./assets/pretrained_v2/$g40v2" ]; then
+    echo $g40v2 in ./assets/pretrained_v2 checked.
+else
+    echo failed. starting download from huggingface.
+    if command -v aria2c &> /dev/null; then
+        aria2c --console-log-level=error -c -x 16 -s 16 -k 1M $dlg40v2 -d ./assets/pretrained_v2 -o $g40v2
+        if [ -f "./assets/pretrained_v2/$g40v2" ]; then
+            echo download successful.
+        else
+            echo please try again!
+            exit 1
+        fi
+    else
+        echo aria2c command not found. Please install aria2c and try again.
+        exit 1
+    fi
+fi
+echo checking $g48
+if [ -f "./assets/pretrained/$g48" ]; then
+    echo $g48 in ./assets/pretrained checked.
+else
+    echo failed. starting download from huggingface.
+    if command -v aria2c &> /dev/null; then
+        aria2c --console-log-level=error -c -x 16 -s 16 -k 1M $dlg48 -d ./assets/pretrained -o $g48
+        if [ -f "./assets/pretrained/$g48" ]; then
+            echo download successful.
+        else
+            echo please try again!
+            exit 1
+        fi
+    else
+        echo aria2c command not found. Please install aria2c and try again.
+        exit 1
+    fi
+fi
+echo checking $hp2_all
+if [ -f "./assets/uvr5_weights/$hp2_all" ]; then
+    echo $hp2_all in ./assets/uvr5_weights checked.
+else
+    echo failed. starting download from huggingface.
+    if command -v aria2c &> /dev/null; then
+        aria2c --console-log-level=error -c -x 16 -s 16 -k 1M $dlhp2_all -d ./assets/uvr5_weights -o $hp2_all
+        if [ -f "./assets/uvr5_weights/$hp2_all" ]; then
+            echo download successful.
+        else
+            echo please try again!
+            exit 1
+        fi
+    else
+        echo aria2c command not found. Please install aria2c and try again.
+        exit 1
+    fi
+fi
+echo checking $hp3_all
+if [ -f "./assets/uvr5_weights/$hp3_all" ]; then
+    echo $hp3_all in ./assets/uvr5_weights checked.
+else
+    echo failed. starting download from huggingface.
+    if command -v aria2c &> /dev/null; then
+        aria2c --console-log-level=error -c -x 16 -s 16 -k 1M $dlhp3_all -d ./assets/uvr5_weights -o $hp3_all
+        if [ -f "./assets/uvr5_weights/$hp3_all" ]; then
+            echo download successful.
+        else
+            echo please try again!
+            exit 1
+        fi
+    else
+        echo aria2c command not found. Please install aria2c and try again.
+        exit 1
+    fi
+fi
+echo checking $hp5_only
+if [ -f "./assets/uvr5_weights/$hp5_only" ]; then
+    echo $hp5_only in ./assets/uvr5_weights checked.
+else
+    echo failed. starting download from huggingface.
+    if command -v aria2c &> /dev/null; then
+        aria2c --console-log-level=error -c -x 16 -s 16 -k 1M $dlhp5_only -d ./assets/uvr5_weights -o $hp5_only
+        if [ -f "./assets/uvr5_weights/$hp5_only" ]; then
+            echo download successful.
+        else
+            echo please try again!
+            exit 1
+        fi
+    else
+        echo aria2c command not found. Please install aria2c and try again.
+        exit 1
+    fi
+fi
+echo checking $VR_DeEchoAggressive
+if [ -f "./assets/uvr5_weights/$VR_DeEchoAggressive" ]; then
+    echo $VR_DeEchoAggressive in ./assets/uvr5_weights checked.
+else
+    echo failed. starting download from huggingface.
+    if command -v aria2c &> /dev/null; then
+        aria2c --console-log-level=error -c -x 16 -s 16 -k 1M $dlVR_DeEchoAggressive -d ./assets/uvr5_weights -o $VR_DeEchoAggressive
+        if [ -f "./assets/uvr5_weights/$VR_DeEchoAggressive" ]; then
+            echo download successful.
+        else
+            echo please try again!
+            exit 1
+        fi
+    else
+        echo aria2c command not found. Please install aria2c and try again.
+        exit 1
+    fi
+fi
+echo checking $VR_DeEchoDeReverb
+if [ -f "./assets/uvr5_weights/$VR_DeEchoDeReverb" ]; then
+    echo $VR_DeEchoDeReverb in ./assets/uvr5_weights checked.
+else
+    echo failed. starting download from huggingface.
+    if command -v aria2c &> /dev/null; then
+        aria2c --console-log-level=error -c -x 16 -s 16 -k 1M $dlVR_DeEchoDeReverb -d ./assets/uvr5_weights -o $VR_DeEchoDeReverb
+        if [ -f "./assets/uvr5_weights/$VR_DeEchoDeReverb" ]; then
+            echo download successful.
+        else
+            echo please try again!
+            exit 1
+        fi
+    else
+        echo aria2c command not found. Please install aria2c and try again.
+        exit 1
+    fi
+fi
+echo checking $VR_DeEchoNormal
+if [ -f "./assets/uvr5_weights/$VR_DeEchoNormal" ]; then
+    echo $VR_DeEchoNormal in ./assets/uvr5_weights checked.
+else
+    echo failed. starting download from huggingface.
+    if command -v aria2c &> /dev/null; then
+        aria2c --console-log-level=error -c -x 16 -s 16 -k 1M $dlVR_DeEchoNormal -d ./assets/uvr5_weights -o $VR_DeEchoNormal
+        if [ -f "./assets/uvr5_weights/$VR_DeEchoNormal" ]; then
+            echo download successful.
+        else
+            echo please try again!
+            exit 1
+        fi
+    else
+        echo aria2c command not found. Please install aria2c and try again.
+        exit 1
+    fi
+fi
+echo checking $onnx_dereverb
+if [ -f "./assets/uvr5_weights/onnx_dereverb_By_FoxJoy/$onnx_dereverb" ]; then
+    echo $onnx_dereverb in ./assets/uvr5_weights/onnx_dereverb_By_FoxJoy checked.
+else
+    echo failed. starting download from huggingface.
+    if command -v aria2c &> /dev/null; then
+        aria2c --console-log-level=error -c -x 16 -s 16 -k 1M $dlonnx_dereverb -d ./assets/uvr5_weights/onnx_dereverb_By_FoxJoy -o $onnx_dereverb
+        if [ -f "./assets/uvr5_weights/onnx_dereverb_By_FoxJoy/$onnx_dereverb" ]; then
+            echo download successful.
+        else
+            echo please try again!
+            exit 1
+        fi
+    else
+        echo aria2c command not found. Please install aria2c and try again.
+        exit 1
+    fi
+fi
+echo checking $rmvpe
+if [ -f "./assets/rmvpe/$rmvpe" ]; then
+    echo $rmvpe in ./assets/rmvpe checked.
+else
+    echo failed. starting download from huggingface.
+    if command -v aria2c &> /dev/null; then
+        aria2c --console-log-level=error -c -x 16 -s 16 -k 1M $dlrmvpe -d ./assets/rmvpe -o $rmvpe
+        if [ -f "./assets/rmvpe/$rmvpe" ]; then
+            echo download successful.
+        else
+            echo please try again!
+            exit 1
+        fi
+    else
+        echo aria2c command not found. Please install aria2c and try again.
+        exit 1
+    fi
+fi
+echo checking $hb
+if [ -f "./assets/hubert/$hb" ]; then
+    echo $hb in ./assets/hubert/pretrained checked.
+else
+    echo failed. starting download from huggingface.
+    if command -v aria2c &> /dev/null; then
+        aria2c --console-log-level=error -c -x 16 -s 16 -k 1M $dlhb -d ./assets/hubert/ -o $hb
+        if [ -f "./assets/hubert/$hb" ]; then
+            echo download successful.
+        else
+            echo please try again!
+            exit 1
+        fi
+    else
+        echo aria2c command not found. Please install aria2c and try again.
+        exit 1
+    fi
+fi
+echo required files check finished.

tools/download_models.py ADDED Viewed

	@@ -0,0 +1,79 @@

+import os
+from pathlib import Path
+import requests
+RVC_DOWNLOAD_LINK = "https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/"
+BASE_DIR = Path(__file__).resolve().parent.parent
+def dl_model(link, model_name, dir_name):
+    with requests.get(f"{link}{model_name}") as r:
+        r.raise_for_status()
+        os.makedirs(os.path.dirname(dir_name / model_name), exist_ok=True)
+        with open(dir_name / model_name, "wb") as f:
+            for chunk in r.iter_content(chunk_size=8192):
+                f.write(chunk)
+if __name__ == "__main__":
+    print("Downloading hubert_base.pt...")
+    dl_model(RVC_DOWNLOAD_LINK, "hubert_base.pt", BASE_DIR / "assets/hubert")
+    print("Downloading rmvpe.pt...")
+    dl_model(RVC_DOWNLOAD_LINK, "rmvpe.pt", BASE_DIR / "assets/rmvpe")
+    print("Downloading vocals.onnx...")
+    dl_model(
+        RVC_DOWNLOAD_LINK + "uvr5_weights/onnx_dereverb_By_FoxJoy/",
+        "vocals.onnx",
+        BASE_DIR / "assets/uvr5_weights/onnx_dereverb_By_FoxJoy",
+    )
+    rvc_models_dir = BASE_DIR / "assets/pretrained"
+    print("Downloading pretrained models:")
+    model_names = [
+        "D32k.pth",
+        "D40k.pth",
+        "D48k.pth",
+        "G32k.pth",
+        "G40k.pth",
+        "G48k.pth",
+        "f0D32k.pth",
+        "f0D40k.pth",
+        "f0D48k.pth",
+        "f0G32k.pth",
+        "f0G40k.pth",
+        "f0G48k.pth",
+    ]
+    for model in model_names:
+        print(f"Downloading {model}...")
+        dl_model(RVC_DOWNLOAD_LINK + "pretrained/", model, rvc_models_dir)
+    rvc_models_dir = BASE_DIR / "assets/pretrained_v2"
+    print("Downloading pretrained models v2:")
+    for model in model_names:
+        print(f"Downloading {model}...")
+        dl_model(RVC_DOWNLOAD_LINK + "pretrained_v2/", model, rvc_models_dir)
+    print("Downloading uvr5_weights:")
+    rvc_models_dir = BASE_DIR / "assets/uvr5_weights"
+    model_names = [
+        "HP2-%E4%BA%BA%E5%A3%B0vocals%2B%E9%9D%9E%E4%BA%BA%E5%A3%B0instrumentals.pth",
+        "HP2_all_vocals.pth",
+        "HP3_all_vocals.pth",
+        "HP5-%E4%B8%BB%E6%97%8B%E5%BE%8B%E4%BA%BA%E5%A3%B0vocals%2B%E5%85%B6%E4%BB%96instrumentals.pth",
+        "HP5_only_main_vocal.pth",
+        "VR-DeEchoAggressive.pth",
+        "VR-DeEchoDeReverb.pth",
+        "VR-DeEchoNormal.pth",
+    ]
+    for model in model_names:
+        print(f"Downloading {model}...")
+        dl_model(RVC_DOWNLOAD_LINK + "uvr5_weights/", model, rvc_models_dir)
+    print("All models downloaded!")

tools/export_onnx.py ADDED Viewed

	@@ -0,0 +1,54 @@

+import torch
+from infer.lib.infer_pack.models_onnx import SynthesizerTrnMsNSFsidM
+# Script entry point: load a checkpoint, build the ONNX-exportable synthesizer
+# and export it with dummy inputs of sequence length 200.
+if __name__ == "__main__":
+    MoeVS = True  # whether the model is for MoeVoiceStudio (formerly MoeSS); not read below
+    ModelPath = "Shiroha/shiroha.pth"  # model path
+    ExportedPath = "model.onnx"  # output path
+    hidden_channels = 256  # hidden_channels; in preparation for 768Vec
+    cpt = torch.load(ModelPath, map_location="cpu")
+    cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0]  # n_spk
+    print(*cpt["config"])
+    test_phone = torch.rand(1, 200, hidden_channels)  # hidden unit
+    test_phone_lengths = torch.tensor([200]).long()  # hidden unit length (seems to be of little use)
+    test_pitch = torch.randint(size=(1, 200), low=5, high=255)  # fundamental frequency (in Hz)
+    test_pitchf = torch.rand(1, 200)  # NSF fundamental frequency
+    test_ds = torch.LongTensor([0])  # speaker ID
+    test_rnd = torch.rand(1, 192, 200)  # noise (adds a random factor)
+    device = "cpu"  # device used during export (does not affect using the model)
+    net_g = SynthesizerTrnMsNSFsidM(
+        *cpt["config"], is_half=False
+    )  # fp32 export (fp16 in C++ needs a manual memory re-layout, so fp16 is not used for now)
+    # strict=False: keys that do not match between checkpoint and model are tolerated.
+    net_g.load_state_dict(cpt["weight"], strict=False)
+    input_names = ["phone", "phone_lengths", "pitch", "pitchf", "ds", "rnd"]
+    output_names = [
+        "audio",
+    ]
+    # net_g.construct_spkmixmap(n_speaker)  export with a multi-speaker mixing track
+    torch.onnx.export(
+        net_g,
+        (
+            test_phone.to(device),
+            test_phone_lengths.to(device),
+            test_pitch.to(device),
+            test_pitchf.to(device),
+            test_ds.to(device),
+            test_rnd.to(device),
+        ),
+        ExportedPath,
+        # The length-200 axis of each of these inputs is exported as dynamic.
+        dynamic_axes={
+            "phone": [1],
+            "pitch": [1],
+            "pitchf": [1],
+            "rnd": [2],
+        },
+        do_constant_folding=False,
+        opset_version=16,
+        verbose=False,
+        input_names=input_names,
+        output_names=output_names,
+    )

tools/infer/infer-pm-index256.py ADDED Viewed

	@@ -0,0 +1,203 @@

+"""
+对源特征进行检索
+"""
+import os
+import logging
+logger = logging.getLogger(__name__)
+import parselmouth
+import torch
+os.environ["CUDA_VISIBLE_DEVICES"] = "0"
+# import torchcrepe
+from time import time as ttime
+# import pyworld
+import librosa
+import numpy as np
+import soundfile as sf
+import torch.nn.functional as F
+from fairseq import checkpoint_utils
+# from models import SynthesizerTrn256#hifigan_nonsf
+# from lib.infer_pack.models import SynthesizerTrn256NSF as SynthesizerTrn256#hifigan_nsf
+from infer.lib.infer_pack.models import (
+    SynthesizerTrnMs256NSFsid as SynthesizerTrn256,
+)  # hifigan_nsf
+from scipy.io import wavfile
+# from lib.infer_pack.models import SynthesizerTrnMs256NSFsid_sim as SynthesizerTrn256#hifigan_nsf
+# from models import SynthesizerTrn256NSFsim as SynthesizerTrn256#hifigan_nsf
+# from models import SynthesizerTrn256NSFsimFlow as SynthesizerTrn256#hifigan_nsf
+# Module-level setup: pick the device, load the HuBERT checkpoint through fairseq
+# and build the synthesizer, both converted to half precision and put in eval mode.
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+# Hard-coded absolute Windows path; must be adapted before running elsewhere.
+model_path = r"E:\codes\py39\vits_vc_gpu_train\assets\hubert\hubert_base.pt"  #
+logger.info("Load model(s) from {}".format(model_path))
+models, saved_cfg, task = checkpoint_utils.load_model_ensemble_and_task(
+    [model_path],
+    suffix="",
+)
+# Only the first model of the returned ensemble is used.
+model = models[0]
+model = model.to(device)
+model = model.half()
+model.eval()
+# net_g = SynthesizerTrn256(1025,32,192,192,768,2,6,3,0.1,"1", [3,7,11],[[1,3,5], [1,3,5], [1,3,5]],[10,10,2,2],512,[16,16,4,4],183,256,is_half=True)#hifigan#512#256
+# net_g = SynthesizerTrn256(1025,32,192,192,768,2,6,3,0.1,"1", [3,7,11],[[1,3,5], [1,3,5], [1,3,5]],[10,10,2,2],512,[16,16,4,4],109,256,is_half=True)#hifigan#512#256
+# Active configuration; the commented-out variants around it are earlier experiments.
+net_g = SynthesizerTrn256(
+    1025,
+    32,
+    192,
+    192,
+    768,
+    2,
+    6,
+    3,
+    0,
+    "1",
+    [3, 7, 11],
+    [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
+    [10, 10, 2, 2],
+    512,
+    [16, 16, 4, 4],
+    183,
+    256,
+    is_half=True,
+)  # hifigan#512#256#no_dropout
+# net_g = SynthesizerTrn256(1025,32,192,192,768,2,3,3,0.1,"1", [3,7,11],[[1,3,5], [1,3,5], [1,3,5]],[10,10,2,2],512,[16,16,4,4],0)#ts3
+# net_g = SynthesizerTrn256(1025,32,192,192,768,2,6,3,0.1,"1", [3,7,11],[[1,3,5], [1,3,5], [1,3,5]],[10,10,2],512,[16,16,4],0)#hifigan-ps-sr
+#
+# net_g = SynthesizerTrn(1025, 32, 192, 192, 768, 2, 6, 3, 0.1, "1", [3, 7, 11], [[1, 3, 5], [1, 3, 5], [1, 3, 5]], [5,5], 512, [15,15], 0)#ms
+# net_g = SynthesizerTrn(1025, 32, 192, 192, 768, 2, 6, 3, 0.1, "1", [3, 7, 11], [[1, 3, 5], [1, 3, 5], [1, 3, 5]], [10,10], 512, [16,16], 0)#idwt2
+# weights=torch.load("infer/ft-mi_1k-noD.pt")
+# weights=torch.load("infer/ft-mi-freeze-vocoder-flow-enc_q_1k.pt")
+# weights=torch.load("infer/ft-mi-freeze-vocoder_true_1k.pt")
+# weights=torch.load("infer/ft-mi-sim1k.pt")
+# Path is relative to the current working directory.
+weights = torch.load("infer/ft-mi-no_opt-no_dropout.pt")
+# strict=True: every key must match; the load report is logged at debug level.
+logger.debug(net_g.load_state_dict(weights, strict=True))
+net_g.eval().to(device)
+net_g.half()
+def get_f0(x, p_len, f0_up_key=0):
+    time_step = 160 / 16000 * 1000
+    f0_min = 50
+    f0_max = 1100
+    f0_mel_min = 1127 * np.log(1 + f0_min / 700)
+    f0_mel_max = 1127 * np.log(1 + f0_max / 700)
+    f0 = (
+        parselmouth.Sound(x, 16000)
+        .to_pitch_ac(
+            time_step=time_step / 1000,
+            voicing_threshold=0.6,
+            pitch_floor=f0_min,
+            pitch_ceiling=f0_max,
+        )
+        .selected_array["frequency"]
+    )
+    pad_size = (p_len - len(f0) + 1) // 2
+    if pad_size > 0 or p_len - len(f0) - pad_size > 0:
+        f0 = np.pad(f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant")
+    f0 *= pow(2, f0_up_key / 12)
+    f0bak = f0.copy()
+    f0_mel = 1127 * np.log(1 + f0 / 700)
+    f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / (
+        f0_mel_max - f0_mel_min
+    ) + 1
+    f0_mel[f0_mel <= 1] = 1
+    f0_mel[f0_mel > 255] = 255
+    # f0_mel[f0_mel > 188] = 188
+    f0_coarse = np.rint(f0_mel).astype(np.int32)
+    return f0_coarse, f0bak
+import faiss
+# Load the faiss index and the feature matrix it was built from, then convert each
+# listed wav file and accumulate the time spent per stage in ta0/ta1/ta2.
+index = faiss.read_index("infer/added_IVF512_Flat_mi_baseline_src_feat.index")
+big_npy = np.load("infer/big_src_feature_mi.npy")
+ta0 = ta1 = ta2 = 0
+for idx, name in enumerate(
+    [
+        "冬之花clip1.wav",
+    ]
+):  ##
+    wav_path = "todo-songs/%s" % name  #
+    f0_up_key = -2  #
+    audio, sampling_rate = sf.read(wav_path)
+    # Down-mix multi-channel audio and resample to 16000 when needed.
+    if len(audio.shape) > 1:
+        audio = librosa.to_mono(audio.transpose(1, 0))
+    if sampling_rate != 16000:
+        audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=16000)
+    feats = torch.from_numpy(audio).float()
+    if feats.dim() == 2:  # double channels
+        feats = feats.mean(-1)
+    assert feats.dim() == 1, feats.dim()
+    feats = feats.view(1, -1)
+    padding_mask = torch.BoolTensor(feats.shape).fill_(False)
+    inputs = {
+        "source": feats.half().to(device),
+        "padding_mask": padding_mask.to(device),
+        "output_layer": 9,  # layer 9
+    }
+    # Synchronize before each timestamp so pending CUDA work is included in the timing.
+    if torch.cuda.is_available():
+        torch.cuda.synchronize()
+    t0 = ttime()
+    with torch.no_grad():
+        logits = model.extract_features(**inputs)
+        feats = model.final_proj(logits[0])
+    #### index optimisation: replace every frame by its nearest neighbour (k=1) from big_npy
+    npy = feats[0].cpu().numpy().astype("float32")
+    D, I = index.search(npy, 1)
+    feats = (
+        torch.from_numpy(big_npy[I.squeeze()].astype("float16")).unsqueeze(0).to(device)
+    )
+    # Double the length of dimension 1 by interpolation.
+    feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1)
+    if torch.cuda.is_available():
+        torch.cuda.synchronize()
+    t1 = ttime()
+    # p_len = min(feats.shape[1],10000,pitch.shape[0])  # too large -> out of GPU memory
+    p_len = min(feats.shape[1], 10000)  #
+    pitch, pitchf = get_f0(audio, p_len, f0_up_key)
+    p_len = min(feats.shape[1], 10000, pitch.shape[0])  # too large -> out of GPU memory
+    if torch.cuda.is_available():
+        torch.cuda.synchronize()
+    t2 = ttime()
+    # Trim features and both pitch arrays to the common length.
+    feats = feats[:, :p_len, :]
+    pitch = pitch[:p_len]
+    pitchf = pitchf[:p_len]
+    p_len = torch.LongTensor([p_len]).to(device)
+    pitch = torch.LongTensor(pitch).unsqueeze(0).to(device)
+    sid = torch.LongTensor([0]).to(device)
+    pitchf = torch.FloatTensor(pitchf).unsqueeze(0).to(device)
+    with torch.no_grad():
+        audio = (
+            net_g.infer(feats, p_len, pitch, pitchf, sid)[0][0, 0]
+            .data.cpu()
+            .float()
+            .numpy()
+        )  # nsf
+    if torch.cuda.is_available():
+        torch.cuda.synchronize()
+    t3 = ttime()
+    # ta0: feature extraction + index lookup, ta1: get_f0, ta2: synthesis.
+    ta0 += t1 - t0
+    ta1 += t2 - t1
+    ta2 += t3 - t2
+    # wavfile.write("ft-mi_1k-index256-noD-%s.wav"%name, 40000, audio)##
+    # wavfile.write("ft-mi-freeze-vocoder-flow-enc_q_1k-%s.wav"%name, 40000, audio)##
+    # wavfile.write("ft-mi-sim1k-%s.wav"%name, 40000, audio)##
+    # Written to the current working directory with a sample rate of 40000.
+    wavfile.write("ft-mi-no_opt-no_dropout-%s.wav" % name, 40000, audio)  ##
+logger.debug("%.2fs %.2fs %.2fs", ta0, ta1, ta2)  #

tools/infer/train-index-v2.py ADDED Viewed

	@@ -0,0 +1,80 @@

+"""
+格式：直接cid为自带的index位；aid放不下了，通过字典来查，反正就5w个
+"""
+import os
+import traceback
+import logging
+logger = logging.getLogger(__name__)
+from multiprocessing import cpu_count
+import faiss
+import numpy as np
+from sklearn.cluster import MiniBatchKMeans
+# ###########如果是原始特征要先写save
+n_cpu = 0
+if n_cpu == 0:
+    n_cpu = cpu_count()
+inp_root = r"./logs/anz/3_feature768"
+npys = []
+listdir_res = list(os.listdir(inp_root))
+for name in sorted(listdir_res):
+    phone = np.load("%s/%s" % (inp_root, name))
+    npys.append(phone)
+big_npy = np.concatenate(npys, 0)
+big_npy_idx = np.arange(big_npy.shape[0])
+np.random.shuffle(big_npy_idx)
+big_npy = big_npy[big_npy_idx]
+logger.debug(big_npy.shape)  # (6196072, 192)#fp32#4.43G
+if big_npy.shape[0] > 2e5:
+    # if(1):
+    info = "Trying doing kmeans %s shape to 10k centers." % big_npy.shape[0]
+    logger.info(info)
+    try:
+        big_npy = (
+            MiniBatchKMeans(
+                n_clusters=10000,
+                verbose=True,
+                batch_size=256 * n_cpu,
+                compute_labels=False,
+                init="random",
+            )
+            .fit(big_npy)
+            .cluster_centers_
+        )
+    except:
+        info = traceback.format_exc()
+        logger.warning(info)
+np.save("tools/infer/big_src_feature_mi.npy", big_npy)
+##################train+add
+# big_npy=np.load("/bili-coeus/jupyter/jupyterhub-liujing04/vits_ch/inference_f0/big_src_feature_mi.npy")
+n_ivf = min(int(16 * np.sqrt(big_npy.shape[0])), big_npy.shape[0] // 39)
+index = faiss.index_factory(768, "IVF%s,Flat" % n_ivf)  # mi
+logger.info("Training...")
+index_ivf = faiss.extract_index_ivf(index)  #
+index_ivf.nprobe = 1
+index.train(big_npy)
+faiss.write_index(
+    index, "tools/infer/trained_IVF%s_Flat_baseline_src_feat_v2.index" % (n_ivf)
+)
+logger.info("Adding...")
+batch_size_add = 8192
+for i in range(0, big_npy.shape[0], batch_size_add):
+    index.add(big_npy[i : i + batch_size_add])
+faiss.write_index(
+    index, "tools/infer/added_IVF%s_Flat_mi_baseline_src_feat.index" % (n_ivf)
+)
+"""
+大小（都是FP32）
+big_src_feature 2.95G
+    (3098036, 256)
+big_emb         4.43G
+    (6196072, 192)
+big_emb双倍是因为求特征要repeat后再加pitch
+"""

tools/infer/train-index.py ADDED Viewed

	@@ -0,0 +1,43 @@

+"""
+格式：直接cid为自带的index位；aid放不下了，通过字典来查，反正就5w个
+"""
+import os
+import logging
+logger = logging.getLogger(__name__)
+import faiss
+import numpy as np
+# ###########If these are raw features, save them first
+# Script body: concatenate every feature array below inp_root, save the result,
+# then train and fill a 256-dimensional "IVF512,Flat" faiss index from it.
+# Hard-coded absolute Windows path; must be adapted before running elsewhere.
+inp_root = r"E:\codes\py39\dataset\mi\2-co256"
+npys = []
+# Files are loaded in sorted name order.
+for name in sorted(list(os.listdir(inp_root))):
+    phone = np.load("%s/%s" % (inp_root, name))
+    npys.append(phone)
+big_npy = np.concatenate(npys, 0)
+logger.debug(big_npy.shape)  # (6196072, 192)#fp32#4.43G
+np.save("infer/big_src_feature_mi.npy", big_npy)
+##################train+add
+# big_npy=np.load("/bili-coeus/jupyter/jupyterhub-liujing04/vits_ch/inference_f0/big_src_feature_mi.npy")
+logger.debug(big_npy.shape)
+index = faiss.index_factory(256, "IVF512,Flat")  # mi
+logger.info("Training...")
+index_ivf = faiss.extract_index_ivf(index)  #
+index_ivf.nprobe = 9
+index.train(big_npy)
+# Snapshot of the trained index before any vectors are added.
+faiss.write_index(index, "infer/trained_IVF512_Flat_mi_baseline_src_feat.index")
+logger.info("Adding...")
+index.add(big_npy)
+faiss.write_index(index, "infer/added_IVF512_Flat_mi_baseline_src_feat.index")
+"""
+大小（都是FP32）
+big_src_feature 2.95G
+    (3098036, 256)
+big_emb         4.43G
+    (6196072, 192)
+big_emb双倍是因为求特征要repeat后再加pitch
+"""

tools/infer/trans_weights.py ADDED Viewed

	@@ -0,0 +1,18 @@

+import pdb
+import torch
+# Script body: load the "model" entry of a training checkpoint, convert every
+# tensor in it to half precision and save the result under a new file name.
+# a=torch.load(r"E:\codes\py39\vits_vc_gpu_train\logs\ft-mi-suc\G_1000.pth")["model"]#sim_nsf#
+# a=torch.load(r"E:\codes\py39\vits_vc_gpu_train\logs\ft-mi-freeze-vocoder-flow-enc_q\G_1000.pth")["model"]#sim_nsf#
+# a=torch.load(r"E:\codes\py39\vits_vc_gpu_train\logs\ft-mi-freeze-vocoder\G_1000.pth")["model"]#sim_nsf#
+# a=torch.load(r"E:\codes\py39\vits_vc_gpu_train\logs\ft-mi-test\G_1000.pth")["model"]#sim_nsf#
+# Hard-coded absolute Windows path; must be adapted before running elsewhere.
+a = torch.load(
+    r"E:\codes\py39\vits_vc_gpu_train\logs\ft-mi-no_opt-no_dropout\G_1000.pth"
+)[
+    "model"
+]  # sim_nsf#
+# Values are replaced in place; the set of keys does not change while iterating.
+for key in a.keys():
+    a[key] = a[key].half()
+# torch.save(a,"ft-mi-freeze-vocoder_true_1k.pt")#
+# torch.save(a,"ft-mi-sim1k.pt")#
+# Written to the current working directory.
+torch.save(a, "ft-mi-no_opt-no_dropout.pt")  #

tools/infer_batch_rvc.py ADDED Viewed

	@@ -0,0 +1,72 @@

+import argparse
+import os
+import sys
+print("Command-line arguments:", sys.argv)
+now_dir = os.getcwd()
+sys.path.append(now_dir)
+import sys
+import tqdm as tq
+from dotenv import load_dotenv
+from scipy.io import wavfile
+from configs.config import Config
+from infer.modules.vc.modules import VC
+def arg_parse() -> tuple:
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--f0up_key", type=int, default=0)
+    parser.add_argument("--input_path", type=str, help="input path")
+    parser.add_argument("--index_path", type=str, help="index path")
+    parser.add_argument("--f0method", type=str, default="harvest", help="harvest or pm")
+    parser.add_argument("--opt_path", type=str, help="opt path")
+    parser.add_argument("--model_name", type=str, help="store in assets/weight_root")
+    parser.add_argument("--index_rate", type=float, default=0.66, help="index rate")
+    parser.add_argument("--device", type=str, help="device")
+    parser.add_argument("--is_half", type=bool, help="use half -> True")
+    parser.add_argument("--filter_radius", type=int, default=3, help="filter radius")
+    parser.add_argument("--resample_sr", type=int, default=0, help="resample sr")
+    parser.add_argument("--rms_mix_rate", type=float, default=1, help="rms mix rate")
+    parser.add_argument("--protect", type=float, default=0.33, help="protect")
+    args = parser.parse_args()
+    sys.argv = sys.argv[:1]
+    return args
+def main():
+    load_dotenv()
+    args = arg_parse()
+    config = Config()
+    config.device = args.device if args.device else config.device
+    config.is_half = args.is_half if args.is_half else config.is_half
+    vc = VC(config)
+    vc.get_vc(args.model_name)
+    audios = os.listdir(args.input_path)
+    for file in tq.tqdm(audios):
+        if file.endswith(".wav"):
+            file_path = os.path.join(args.input_path, file)
+            _, wav_opt = vc.vc_single(
+                0,
+                file_path,
+                args.f0up_key,
+                None,
+                args.f0method,
+                args.index_path,
+                None,
+                args.index_rate,
+                args.filter_radius,
+                args.resample_sr,
+                args.rms_mix_rate,
+                args.protect,
+            )
+            out_path = os.path.join(args.opt_path, file)
+            wavfile.write(out_path, wav_opt[0], wav_opt[1])
+if __name__ == "__main__":
+    main()

tools/infer_cli.py ADDED Viewed

	@@ -0,0 +1,67 @@

+import argparse
+import os
+import sys
+now_dir = os.getcwd()
+sys.path.append(now_dir)
+from dotenv import load_dotenv
+from scipy.io import wavfile
+from configs.config import Config
+from infer.modules.vc.modules import VC
+####
+# USAGE
+#
+# In your Terminal or CMD or whatever
+def arg_parse() -> tuple:
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--f0up_key", type=int, default=0)
+    parser.add_argument("--input_path", type=str, help="input path")
+    parser.add_argument("--index_path", type=str, help="index path")
+    parser.add_argument("--f0method", type=str, default="harvest", help="harvest or pm")
+    parser.add_argument("--opt_path", type=str, help="opt path")
+    parser.add_argument("--model_name", type=str, help="store in assets/weight_root")
+    parser.add_argument("--index_rate", type=float, default=0.66, help="index rate")
+    parser.add_argument("--device", type=str, help="device")
+    parser.add_argument("--is_half", type=bool, help="use half -> True")
+    parser.add_argument("--filter_radius", type=int, default=3, help="filter radius")
+    parser.add_argument("--resample_sr", type=int, default=0, help="resample sr")
+    parser.add_argument("--rms_mix_rate", type=float, default=1, help="rms mix rate")
+    parser.add_argument("--protect", type=float, default=0.33, help="protect")
+    args = parser.parse_args()
+    sys.argv = sys.argv[:1]
+    return args
+def main():
+    load_dotenv()
+    args = arg_parse()
+    config = Config()
+    config.device = args.device if args.device else config.device
+    config.is_half = args.is_half if args.is_half else config.is_half
+    vc = VC(config)
+    vc.get_vc(args.model_name)
+    _, wav_opt = vc.vc_single(
+        0,
+        args.input_path,
+        args.f0up_key,
+        None,
+        args.f0method,
+        args.index_path,
+        None,
+        args.index_rate,
+        args.filter_radius,
+        args.resample_sr,
+        args.rms_mix_rate,
+        args.protect,
+    )
+    wavfile.write(args.opt_path, wav_opt[0], wav_opt[1])
+if __name__ == "__main__":
+    main()

tools/onnx_inference_demo.py ADDED Viewed

	@@ -0,0 +1,23 @@

+import soundfile
+from ..infer.lib.infer_pack.onnx_inference import OnnxRVC
+# Demo script: run one file through an exported ONNX RVC model and save the result.
+hop_size = 512
+sampling_rate = 40000  # sampling rate
+f0_up_key = 0  # pitch shift (up/down)
+sid = 0  # speaker ID
+f0_method = "dio"  # F0 extraction algorithm
+model_path = "ShirohaRVC.onnx"  # full path of the model
+vec_name = (
+    "vec-256-layer-9"  # expanded internally to f"pretrained/{vec_name}.onnx"; requires the ONNX vec model
+)
+wav_path = "123.wav"  # input path or a bytes-IO instance
+out_path = "out.wav"  # output path or a bytes-IO instance
+model = OnnxRVC(
+    model_path, vec_path=vec_name, sr=sampling_rate, hop_size=hop_size, device="cuda"
+)
+audio = model.inference(wav_path, sid, f0_method=f0_method, f0_up_key=f0_up_key)
+soundfile.write(out_path, audio, sampling_rate)

tools/rvc_for_realtime.py ADDED Viewed

	@@ -0,0 +1,443 @@

+from io import BytesIO
+import os
+import pickle
+import sys
+import traceback
+from infer.lib import jit
+from infer.lib.jit.get_synthesizer import get_synthesizer
+from time import time as ttime
+import fairseq
+import faiss
+import numpy as np
+import parselmouth
+import pyworld
+import scipy.signal as signal
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torchcrepe
+from infer.lib.infer_pack.models import (
+    SynthesizerTrnMs256NSFsid,
+    SynthesizerTrnMs256NSFsid_nono,
+    SynthesizerTrnMs768NSFsid,
+    SynthesizerTrnMs768NSFsid_nono,
+)
+# Append the current working directory to the import path for the
+# project-local imports that follow.
+now_dir = os.getcwd()
+sys.path.append(now_dir)
+from multiprocessing import Manager as M
+from configs.config import Config
+# config = Config()
+# Module-level multiprocessing Manager, created when this module is imported;
+# RVC.get_f0 uses mm.dict() as the shared result container of its parallel
+# harvest path.
+mm = M()
+def printt(strr, *args):
+    if len(args) == 0:
+        print(strr)
+    else:
+        print(strr % args)
+# config.device=torch.device("cpu")  # force CPU for testing
+# config.is_half=False  # force CPU for testing
+class RVC:
+    def __init__(
+        self,
+        key,
+        pth_path,
+        index_path,
+        index_rate,
+        n_cpu,
+        inp_q,
+        opt_q,
+        config: Config,
+        last_rvc=None,
+    ) -> None:
+        """
+        初始化
+        """
+        try:
+            if config.dml == True:
+                def forward_dml(ctx, x, scale):
+                    ctx.scale = scale
+                    res = x.clone().detach()
+                    return res
+                fairseq.modules.grad_multiply.GradMultiply.forward = forward_dml
+            # global config
+            self.config = config
+            self.inp_q = inp_q
+            self.opt_q = opt_q
+            # device="cpu"########强制cpu测试
+            self.device = config.device
+            self.f0_up_key = key
+            self.f0_min = 50
+            self.f0_max = 1100
+            self.f0_mel_min = 1127 * np.log(1 + self.f0_min / 700)
+            self.f0_mel_max = 1127 * np.log(1 + self.f0_max / 700)
+            self.n_cpu = n_cpu
+            self.use_jit = self.config.use_jit
+            self.is_half = config.is_half
+            if index_rate != 0:
+                self.index = faiss.read_index(index_path)
+                self.big_npy = self.index.reconstruct_n(0, self.index.ntotal)
+                printt("Index search enabled")
+            self.pth_path: str = pth_path
+            self.index_path = index_path
+            self.index_rate = index_rate
+            self.cache_pitch: np.ndarray = np.zeros(1024, dtype="int32")
+            self.cache_pitchf = np.zeros(1024, dtype="float32")
+            if last_rvc is None:
+                models, _, _ = fairseq.checkpoint_utils.load_model_ensemble_and_task(
+                    ["assets/hubert/hubert_base.pt"],
+                    suffix="",
+                )
+                hubert_model = models[0]
+                hubert_model = hubert_model.to(self.device)
+                if self.is_half:
+                    hubert_model = hubert_model.half()
+                else:
+                    hubert_model = hubert_model.float()
+                hubert_model.eval()
+                self.model = hubert_model
+            else:
+                self.model = last_rvc.model
+            self.net_g: nn.Module = None
+            def set_default_model():
+                self.net_g, cpt = get_synthesizer(self.pth_path, self.device)
+                self.tgt_sr = cpt["config"][-1]
+                cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0]
+                self.if_f0 = cpt.get("f0", 1)
+                self.version = cpt.get("version", "v1")
+                if self.is_half:
+                    self.net_g = self.net_g.half()
+                else:
+                    self.net_g = self.net_g.float()
+            def set_jit_model():
+                jit_pth_path = self.pth_path.rstrip(".pth")
+                jit_pth_path += ".half.jit" if self.is_half else ".jit"
+                reload = False
+                if str(self.device) == "cuda":
+                    self.device = torch.device("cuda:0")
+                if os.path.exists(jit_pth_path):
+                    cpt = jit.load(jit_pth_path)
+                    model_device = cpt["device"]
+                    if model_device != str(self.device):
+                        reload = True
+                else:
+                    reload = True
+                if reload:
+                    cpt = jit.synthesizer_jit_export(
+                        self.pth_path,
+                        "script",
+                        None,
+                        device=self.device,
+                        is_half=self.is_half,
+                    )
+                self.tgt_sr = cpt["config"][-1]
+                self.if_f0 = cpt.get("f0", 1)
+                self.version = cpt.get("version", "v1")
+                self.net_g = torch.jit.load(
+                    BytesIO(cpt["model"]), map_location=self.device
+                )
+                self.net_g.infer = self.net_g.forward
+                self.net_g.eval().to(self.device)
+            def set_synthesizer():
+                if self.use_jit and not config.dml:
+                    if self.is_half and "cpu" in str(self.device):
+                        printt(
+                            "Use default Synthesizer model. \
+                                    Jit is not supported on the CPU for half floating point"
+                        )
+                        set_default_model()
+                    else:
+                        set_jit_model()
+                else:
+                    set_default_model()
+            if last_rvc is None or last_rvc.pth_path != self.pth_path:
+                set_synthesizer()
+            else:
+                self.tgt_sr = last_rvc.tgt_sr
+                self.if_f0 = last_rvc.if_f0
+                self.version = last_rvc.version
+                self.is_half = last_rvc.is_half
+                if last_rvc.use_jit != self.use_jit:
+                    set_synthesizer()
+                else:
+                    self.net_g = last_rvc.net_g
+            if last_rvc is not None and hasattr(last_rvc, "model_rmvpe"):
+                self.model_rmvpe = last_rvc.model_rmvpe
+            if last_rvc is not None and hasattr(last_rvc, "model_fcpe"):
+                self.device_fcpe = last_rvc.device_fcpe
+                self.model_fcpe = last_rvc.model_fcpe
+        except:
+            printt(traceback.format_exc())
+    def change_key(self, new_key):
+        self.f0_up_key = new_key
+    def change_index_rate(self, new_index_rate):
+        if new_index_rate != 0 and self.index_rate == 0:
+            self.index = faiss.read_index(self.index_path)
+            self.big_npy = self.index.reconstruct_n(0, self.index.ntotal)
+            printt("Index search enabled")
+        self.index_rate = new_index_rate
+    def get_f0_post(self, f0):
+        f0bak = f0.copy()
+        f0_mel = 1127 * np.log(1 + f0 / 700)
+        f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - self.f0_mel_min) * 254 / (
+            self.f0_mel_max - self.f0_mel_min
+        ) + 1
+        f0_mel[f0_mel <= 1] = 1
+        f0_mel[f0_mel > 255] = 255
+        f0_coarse = np.rint(f0_mel).astype(np.int32)
+        return f0_coarse, f0bak
+    def get_f0(self, x, f0_up_key, n_cpu, method="harvest"):
+        n_cpu = int(n_cpu)
+        if method == "crepe":
+            return self.get_f0_crepe(x, f0_up_key)
+        if method == "rmvpe":
+            return self.get_f0_rmvpe(x, f0_up_key)
+        if method == "fcpe":
+            return self.get_f0_fcpe(x, f0_up_key)
+        x = x.cpu().numpy()
+        if method == "pm":
+            p_len = x.shape[0] // 160 + 1
+            f0_min = 65
+            l_pad = int(np.ceil(1.5 / f0_min * 16000))
+            r_pad = l_pad + 1
+            s = parselmouth.Sound(np.pad(x, (l_pad, r_pad)), 16000).to_pitch_ac(
+                time_step=0.01,
+                voicing_threshold=0.6,
+                pitch_floor=f0_min,
+                pitch_ceiling=1100,
+            )
+            assert np.abs(s.t1 - 1.5 / f0_min) < 0.001
+            f0 = s.selected_array["frequency"]
+            if len(f0) < p_len:
+                f0 = np.pad(f0, (0, p_len - len(f0)))
+            f0 = f0[:p_len]
+            f0 *= pow(2, f0_up_key / 12)
+            return self.get_f0_post(f0)
+        if n_cpu == 1:
+            f0, t = pyworld.harvest(
+                x.astype(np.double),
+                fs=16000,
+                f0_ceil=1100,
+                f0_floor=50,
+                frame_period=10,
+            )
+            f0 = signal.medfilt(f0, 3)
+            f0 *= pow(2, f0_up_key / 12)
+            return self.get_f0_post(f0)
+        f0bak = np.zeros(x.shape[0] // 160 + 1, dtype=np.float64)
+        length = len(x)
+        part_length = 160 * ((length // 160 - 1) // n_cpu + 1)
+        n_cpu = (length // 160 - 1) // (part_length // 160) + 1
+        ts = ttime()
+        res_f0 = mm.dict()
+        for idx in range(n_cpu):
+            tail = part_length * (idx + 1) + 320
+            if idx == 0:
+                self.inp_q.put((idx, x[:tail], res_f0, n_cpu, ts))
+            else:
+                self.inp_q.put(
+                    (idx, x[part_length * idx - 320 : tail], res_f0, n_cpu, ts)
+                )
+        while 1:
+            res_ts = self.opt_q.get()
+            if res_ts == ts:
+                break
+        f0s = [i[1] for i in sorted(res_f0.items(), key=lambda x: x[0])]
+        for idx, f0 in enumerate(f0s):
+            if idx == 0:
+                f0 = f0[:-3]
+            elif idx != n_cpu - 1:
+                f0 = f0[2:-3]
+            else:
+                f0 = f0[2:]
+            f0bak[part_length * idx // 160 : part_length * idx // 160 + f0.shape[0]] = (
+                f0
+            )
+        f0bak = signal.medfilt(f0bak, 3)
+        f0bak *= pow(2, f0_up_key / 12)
+        return self.get_f0_post(f0bak)
+    def get_f0_crepe(self, x, f0_up_key):
+        if "privateuseone" in str(
+            self.device
+        ):  ###不支持dml，cpu又太慢用不成���拿fcpe顶替
+            return self.get_f0(x, f0_up_key, 1, "fcpe")
+        # printt("using crepe,device:%s"%self.device)
+        f0, pd = torchcrepe.predict(
+            x.unsqueeze(0).float(),
+            16000,
+            160,
+            self.f0_min,
+            self.f0_max,
+            "full",
+            batch_size=512,
+            # device=self.device if self.device.type!="privateuseone" else "cpu",###crepe不用半精度全部是全精度所以不愁###cpu延迟高到没法用
+            device=self.device,
+            return_periodicity=True,
+        )
+        pd = torchcrepe.filter.median(pd, 3)
+        f0 = torchcrepe.filter.mean(f0, 3)
+        f0[pd < 0.1] = 0
+        f0 = f0[0].cpu().numpy()
+        f0 *= pow(2, f0_up_key / 12)
+        return self.get_f0_post(f0)
+    def get_f0_rmvpe(self, x, f0_up_key):
+        if hasattr(self, "model_rmvpe") == False:
+            from infer.lib.rmvpe import RMVPE
+            printt("Loading rmvpe model")
+            self.model_rmvpe = RMVPE(
+                "assets/rmvpe/rmvpe.pt",
+                is_half=self.is_half,
+                device=self.device,
+                use_jit=self.config.use_jit,
+            )
+        f0 = self.model_rmvpe.infer_from_audio(x, thred=0.03)
+        f0 *= pow(2, f0_up_key / 12)
+        return self.get_f0_post(f0)
+    def get_f0_fcpe(self, x, f0_up_key):
+        if hasattr(self, "model_fcpe") == False:
+            from torchfcpe import spawn_bundled_infer_model
+            printt("Loading fcpe model")
+            if "privateuseone" in str(self.device):
+                self.device_fcpe = "cpu"
+            else:
+                self.device_fcpe = self.device
+            self.model_fcpe = spawn_bundled_infer_model(self.device_fcpe)
+        f0 = self.model_fcpe.infer(
+            x.to(self.device_fcpe).unsqueeze(0).float(),
+            sr=16000,
+            decoder_mode="local_argmax",
+            threshold=0.006,
+        )
+        f0 *= pow(2, f0_up_key / 12)
+        f0 = f0.squeeze().cpu().numpy()
+        return self.get_f0_post(f0)
+    def infer(
+        self,
+        input_wav: torch.Tensor,
+        block_frame_16k,
+        skip_head,
+        return_length,
+        f0method,
+    ) -> np.ndarray:
+        t1 = ttime()
+        with torch.no_grad():
+            if self.config.is_half:
+                feats = input_wav.half().view(1, -1)
+            else:
+                feats = input_wav.float().view(1, -1)
+            padding_mask = torch.BoolTensor(feats.shape).to(self.device).fill_(False)
+            inputs = {
+                "source": feats,
+                "padding_mask": padding_mask,
+                "output_layer": 9 if self.version == "v1" else 12,
+            }
+            logits = self.model.extract_features(**inputs)
+            feats = (
+                self.model.final_proj(logits[0]) if self.version == "v1" else logits[0]
+            )
+            feats = torch.cat((feats, feats[:, -1:, :]), 1)
+        t2 = ttime()
+        try:
+            if hasattr(self, "index") and self.index_rate != 0:
+                npy = feats[0][skip_head // 2 :].cpu().numpy().astype("float32")
+                score, ix = self.index.search(npy, k=8)
+                weight = np.square(1 / score)
+                weight /= weight.sum(axis=1, keepdims=True)
+                npy = np.sum(self.big_npy[ix] * np.expand_dims(weight, axis=2), axis=1)
+                if self.config.is_half:
+                    npy = npy.astype("float16")
+                feats[0][skip_head // 2 :] = (
+                    torch.from_numpy(npy).unsqueeze(0).to(self.device) * self.index_rate
+                    + (1 - self.index_rate) * feats[0][skip_head // 2 :]
+                )
+            else:
+                printt("Index search FAILED or disabled")
+        except:
+            traceback.print_exc()
+            printt("Index search FAILED")
+        t3 = ttime()
+        if self.if_f0 == 1:
+            f0_extractor_frame = block_frame_16k + 800
+            if f0method == "rmvpe":
+                f0_extractor_frame = 5120 * ((f0_extractor_frame - 1) // 5120 + 1) - 160
+            pitch, pitchf = self.get_f0(
+                input_wav[-f0_extractor_frame:], self.f0_up_key, self.n_cpu, f0method
+            )
+            start_frame = block_frame_16k // 160
+            end_frame = len(self.cache_pitch) - (pitch.shape[0] - 4) + start_frame
+            self.cache_pitch[:] = np.append(
+                self.cache_pitch[start_frame:end_frame], pitch[3:-1]
+            )
+            self.cache_pitchf[:] = np.append(
+                self.cache_pitchf[start_frame:end_frame], pitchf[3:-1]
+            )
+        t4 = ttime()
+        p_len = input_wav.shape[0] // 160
+        if self.if_f0 == 1:
+            cache_pitch = (
+                torch.LongTensor(self.cache_pitch[-p_len:]).to(self.device).unsqueeze(0)
+            )
+            cache_pitchf = (
+                torch.FloatTensor(self.cache_pitchf[-p_len:])
+                .to(self.device)
+                .unsqueeze(0)
+            )
+        feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1)
+        feats = feats[:, :p_len, :]
+        p_len = torch.LongTensor([p_len]).to(self.device)
+        sid = torch.LongTensor([0]).to(self.device)
+        skip_head = torch.LongTensor([skip_head])
+        return_length = torch.LongTensor([return_length])
+        with torch.no_grad():
+            if self.if_f0 == 1:
+                infered_audio, _, _ = self.net_g.infer(
+                    feats,
+                    p_len,
+                    cache_pitch,
+                    cache_pitchf,
+                    sid,
+                    skip_head,
+                    return_length,
+                )
+            else:
+                infered_audio, _, _ = self.net_g.infer(
+                    feats, p_len, sid, skip_head, return_length
+                )
+        t5 = ttime()
+        printt(
+            "Spent time: fea = %.3fs, index = %.3fs, f0 = %.3fs, model = %.3fs",
+            t2 - t1,
+            t3 - t2,
+            t4 - t3,
+            t5 - t4,
+        )
+        return infered_audio.squeeze().float()

tools/torchgate/__init__.py ADDED Viewed

	@@ -0,0 +1,13 @@

+"""
+TorchGating is a PyTorch-based implementation of Spectral Gating
+================================================
+Author: Asaf Zorea
+Contents
+--------
+torchgate imports all the functions from PyTorch, and in addition provides:
+ TorchGate         --- A PyTorch module that applies a spectral gate to an input signal
+"""
+from .torchgate import TorchGate

tools/torchgate/torchgate.py ADDED Viewed

	@@ -0,0 +1,280 @@

+import torch
+from infer.lib.rmvpe import STFT
+from torch.nn.functional import conv1d, conv2d
+from typing import Union, Optional
+from .utils import linspace, temperature_sigmoid, amp_to_db
+class TorchGate(torch.nn.Module):
+    """
+    A PyTorch module that applies a spectral gate to an input signal.
+    Arguments:
+        sr {int} -- Sample rate of the input signal.
+        nonstationary {bool} -- Whether to use non-stationary or stationary masking (default: {False}).
+        n_std_thresh_stationary {float} -- Number of standard deviations above mean to threshold noise for
+                                           stationary masking (default: {1.5}).
+        n_thresh_nonstationary {float} -- Number of multiplies above smoothed magnitude spectrogram. for
+                                        non-stationary masking (default: {1.3}).
+        temp_coeff_nonstationary {float} -- Temperature coefficient for non-stationary masking (default: {0.1}).
+        n_movemean_nonstationary {int} -- Number of samples for moving average smoothing in non-stationary masking
+                                          (default: {20}).
+        prop_decrease {float} -- Proportion to decrease signal by where the mask is zero (default: {1.0}).
+        n_fft {int} -- Size of FFT for STFT (default: {1024}).
+        win_length {[int]} -- Window length for STFT. If None, defaults to `n_fft` (default: {None}).
+        hop_length {[int]} -- Hop length for STFT. If None, defaults to `win_length` // 4 (default: {None}).
+        freq_mask_smooth_hz {float} -- Frequency smoothing width for mask (in Hz). If None, no smoothing is applied
+                                     (default: {500}).
+        time_mask_smooth_ms {float} -- Time smoothing width for mask (in ms). If None, no smoothing is applied
+                                     (default: {50}).
+    """
+    @torch.no_grad()
+    def __init__(
+        self,
+        sr: int,
+        nonstationary: bool = False,
+        n_std_thresh_stationary: float = 1.5,
+        n_thresh_nonstationary: float = 1.3,
+        temp_coeff_nonstationary: float = 0.1,
+        n_movemean_nonstationary: int = 20,
+        prop_decrease: float = 1.0,
+        n_fft: int = 1024,
+        win_length: bool = None,
+        hop_length: int = None,
+        freq_mask_smooth_hz: float = 500,
+        time_mask_smooth_ms: float = 50,
+    ):
+        super().__init__()
+        # General Params
+        self.sr = sr
+        self.nonstationary = nonstationary
+        assert 0.0 <= prop_decrease <= 1.0
+        self.prop_decrease = prop_decrease
+        # STFT Params
+        self.n_fft = n_fft
+        self.win_length = self.n_fft if win_length is None else win_length
+        self.hop_length = self.win_length // 4 if hop_length is None else hop_length
+        # Stationary Params
+        self.n_std_thresh_stationary = n_std_thresh_stationary
+        # Non-Stationary Params
+        self.temp_coeff_nonstationary = temp_coeff_nonstationary
+        self.n_movemean_nonstationary = n_movemean_nonstationary
+        self.n_thresh_nonstationary = n_thresh_nonstationary
+        # Smooth Mask Params
+        self.freq_mask_smooth_hz = freq_mask_smooth_hz
+        self.time_mask_smooth_ms = time_mask_smooth_ms
+        self.register_buffer("smoothing_filter", self._generate_mask_smoothing_filter())
+    @torch.no_grad()
+    def _generate_mask_smoothing_filter(self) -> Union[torch.Tensor, None]:
+        """
+        A PyTorch module that applies a spectral gate to an input signal using the STFT.
+        Returns:
+            smoothing_filter (torch.Tensor): a 2D tensor representing the smoothing filter,
+            with shape (n_grad_freq, n_grad_time), where n_grad_freq is the number of frequency
+            bins to smooth and n_grad_time is the number of time frames to smooth.
+            If both self.freq_mask_smooth_hz and self.time_mask_smooth_ms are None, returns None.
+        """
+        if self.freq_mask_smooth_hz is None and self.time_mask_smooth_ms is None:
+            return None
+        n_grad_freq = (
+            1
+            if self.freq_mask_smooth_hz is None
+            else int(self.freq_mask_smooth_hz / (self.sr / (self.n_fft / 2)))
+        )
+        if n_grad_freq < 1:
+            raise ValueError(
+                f"freq_mask_smooth_hz needs to be at least {int((self.sr / (self._n_fft / 2)))} Hz"
+            )
+        n_grad_time = (
+            1
+            if self.time_mask_smooth_ms is None
+            else int(self.time_mask_smooth_ms / ((self.hop_length / self.sr) * 1000))
+        )
+        if n_grad_time < 1:
+            raise ValueError(
+                f"time_mask_smooth_ms needs to be at least {int((self.hop_length / self.sr) * 1000)} ms"
+            )
+        if n_grad_time == 1 and n_grad_freq == 1:
+            return None
+        v_f = torch.cat(
+            [
+                linspace(0, 1, n_grad_freq + 1, endpoint=False),
+                linspace(1, 0, n_grad_freq + 2),
+            ]
+        )[1:-1]
+        v_t = torch.cat(
+            [
+                linspace(0, 1, n_grad_time + 1, endpoint=False),
+                linspace(1, 0, n_grad_time + 2),
+            ]
+        )[1:-1]
+        smoothing_filter = torch.outer(v_f, v_t).unsqueeze(0).unsqueeze(0)
+        return smoothing_filter / smoothing_filter.sum()
+    @torch.no_grad()
+    def _stationary_mask(
+        self, X_db: torch.Tensor, xn: Optional[torch.Tensor] = None
+    ) -> torch.Tensor:
+        """
+        Computes a stationary binary mask to filter out noise in a log-magnitude spectrogram.
+        Arguments:
+            X_db (torch.Tensor): 2D tensor of shape (frames, freq_bins) containing the log-magnitude spectrogram.
+            xn (torch.Tensor): 1D tensor containing the audio signal corresponding to X_db.
+        Returns:
+            sig_mask (torch.Tensor): Binary mask of the same shape as X_db, where values greater than the threshold
+            are set to 1, and the rest are set to 0.
+        """
+        if xn is not None:
+            if "privateuseone" in str(xn.device):
+                if not hasattr(self, "stft"):
+                    self.stft = STFT(
+                        filter_length=self.n_fft,
+                        hop_length=self.hop_length,
+                        win_length=self.win_length,
+                        window="hann",
+                    ).to(xn.device)
+                XN = self.stft.transform(xn)
+            else:
+                XN = torch.stft(
+                    xn,
+                    n_fft=self.n_fft,
+                    hop_length=self.hop_length,
+                    win_length=self.win_length,
+                    return_complex=True,
+                    pad_mode="constant",
+                    center=True,
+                    window=torch.hann_window(self.win_length).to(xn.device),
+                )
+            XN_db = amp_to_db(XN).to(dtype=X_db.dtype)
+        else:
+            XN_db = X_db
+        # calculate mean and standard deviation along the frequency axis
+        std_freq_noise, mean_freq_noise = torch.std_mean(XN_db, dim=-1)
+        # compute noise threshold
+        noise_thresh = mean_freq_noise + std_freq_noise * self.n_std_thresh_stationary
+        # create binary mask by thresholding the spectrogram
+        sig_mask = X_db > noise_thresh.unsqueeze(2)
+        return sig_mask
+    @torch.no_grad()
+    def _nonstationary_mask(self, X_abs: torch.Tensor) -> torch.Tensor:
+        """
+        Computes a non-stationary binary mask to filter out noise in a log-magnitude spectrogram.
+        Arguments:
+            X_abs (torch.Tensor): 2D tensor of shape (frames, freq_bins) containing the magnitude spectrogram.
+        Returns:
+            sig_mask (torch.Tensor): Binary mask of the same shape as X_abs, where values greater than the threshold
+            are set to 1, and the rest are set to 0.
+        """
+        X_smoothed = (
+            conv1d(
+                X_abs.reshape(-1, 1, X_abs.shape[-1]),
+                torch.ones(
+                    self.n_movemean_nonstationary,
+                    dtype=X_abs.dtype,
+                    device=X_abs.device,
+                ).view(1, 1, -1),
+                padding="same",
+            ).view(X_abs.shape)
+            / self.n_movemean_nonstationary
+        )
+        # Compute slowness ratio and apply temperature sigmoid
+        slowness_ratio = (X_abs - X_smoothed) / (X_smoothed + 1e-6)
+        sig_mask = temperature_sigmoid(
+            slowness_ratio, self.n_thresh_nonstationary, self.temp_coeff_nonstationary
+        )
+        return sig_mask
+    def forward(
+        self, x: torch.Tensor, xn: Optional[torch.Tensor] = None
+    ) -> torch.Tensor:
+        """
+        Apply the proposed algorithm to the input signal.
+        Arguments:
+            x (torch.Tensor): The input audio signal, with shape (batch_size, signal_length).
+            xn (Optional[torch.Tensor]): The noise signal used for stationary noise reduction. If `None`, the input
+                                         signal is used as the noise signal. Default: `None`.
+        Returns:
+            torch.Tensor: The denoised audio signal, with the same shape as the input signal.
+        """
+        # Compute short-time Fourier transform (STFT)
+        if "privateuseone" in str(x.device):
+            if not hasattr(self, "stft"):
+                self.stft = STFT(
+                    filter_length=self.n_fft,
+                    hop_length=self.hop_length,
+                    win_length=self.win_length,
+                    window="hann",
+                ).to(x.device)
+            X, phase = self.stft.transform(x, return_phase=True)
+        else:
+            X = torch.stft(
+                x,
+                n_fft=self.n_fft,
+                hop_length=self.hop_length,
+                win_length=self.win_length,
+                return_complex=True,
+                pad_mode="constant",
+                center=True,
+                window=torch.hann_window(self.win_length).to(x.device),
+            )
+        # Compute signal mask based on stationary or nonstationary assumptions
+        if self.nonstationary:
+            sig_mask = self._nonstationary_mask(X.abs())
+        else:
+            sig_mask = self._stationary_mask(amp_to_db(X), xn)
+        # Propagate decrease in signal power
+        sig_mask = self.prop_decrease * (sig_mask.float() - 1.0) + 1.0
+        # Smooth signal mask with 2D convolution
+        if self.smoothing_filter is not None:
+            sig_mask = conv2d(
+                sig_mask.unsqueeze(1),
+                self.smoothing_filter.to(sig_mask.dtype),
+                padding="same",
+            )
+        # Apply signal mask to STFT magnitude and phase components
+        Y = X * sig_mask.squeeze(1)
+        # Inverse STFT to obtain time-domain signal
+        if "privateuseone" in str(Y.device):
+            y = self.stft.inverse(Y, phase)
+        else:
+            y = torch.istft(
+                Y,
+                n_fft=self.n_fft,
+                hop_length=self.hop_length,
+                win_length=self.win_length,
+                center=True,
+                window=torch.hann_window(self.win_length).to(Y.device),
+            )
+        return y.to(dtype=x.dtype)

tools/torchgate/utils.py ADDED Viewed

	@@ -0,0 +1,70 @@

+import torch
+from torch.types import Number
+@torch.no_grad()
+def amp_to_db(
+    x: torch.Tensor, eps=torch.finfo(torch.float64).eps, top_db=40
+) -> torch.Tensor:
+    """
+    Convert the input tensor from amplitude to decibel scale.
+    Arguments:
+        x {[torch.Tensor]} -- [Input tensor.]
+    Keyword Arguments:
+        eps {[float]} -- [Small value to avoid numerical instability.]
+                          (default: {torch.finfo(torch.float64).eps})
+        top_db {[float]} -- [threshold the output at ``top_db`` below the peak]
+            `             (default: {40})
+    Returns:
+        [torch.Tensor] -- [Output tensor in decibel scale.]
+    """
+    x_db = 20 * torch.log10(x.abs() + eps)
+    return torch.max(x_db, (x_db.max(-1).values - top_db).unsqueeze(-1))
+@torch.no_grad()
+def temperature_sigmoid(x: torch.Tensor, x0: float, temp_coeff: float) -> torch.Tensor:
+    """
+    Apply a sigmoid function with temperature scaling.
+    Arguments:
+        x {[torch.Tensor]} -- [Input tensor.]
+        x0 {[float]} -- [Parameter that controls the threshold of the sigmoid.]
+        temp_coeff {[float]} -- [Parameter that controls the slope of the sigmoid.]
+    Returns:
+        [torch.Tensor] -- [Output tensor after applying the sigmoid with temperature scaling.]
+    """
+    return torch.sigmoid((x - x0) / temp_coeff)
+@torch.no_grad()
+def linspace(
+    start: Number, stop: Number, num: int = 50, endpoint: bool = True, **kwargs
+) -> torch.Tensor:
+    """
+    Generate a linearly spaced 1-D tensor.
+    Arguments:
+        start {[Number]} -- [The starting value of the sequence.]
+        stop {[Number]} -- [The end value of the sequence, unless `endpoint` is set to False.
+                            In that case, the sequence consists of all but the last of ``num + 1``
+                            evenly spaced samples, so that `stop` is excluded. Note that the step
+                            size changes when `endpoint` is False.]
+    Keyword Arguments:
+        num {[int]} -- [Number of samples to generate. Default is 50. Must be non-negative.]
+        endpoint {[bool]} -- [If True, `stop` is the last sample. Otherwise, it is not included.
+                              Default is True.]
+        **kwargs -- [Additional arguments to be passed to the underlying PyTorch `linspace` function.]
+    Returns:
+        [torch.Tensor] -- [1-D tensor of `num` equally spaced samples from `start` to `stop`.]
+    """
+    if endpoint:
+        return torch.linspace(start, stop, num, **kwargs)
+    else:
+        return torch.linspace(start, stop, num + 1, **kwargs)[:-1]