get models from hf_hub
Browse files
- Dockerfile +4 -0
- infer_onnx.py +4 -3
- matcha_hifigan_multispeaker_cat.onnx +0 -3
- matcha_multispeaker_cat_opset_15.onnx +0 -3
- matcha_multispeaker_cat_opset_15_10_steps.onnx +0 -3
- mel_spec_22khz.onnx +0 -3
- mel_spec_22khz_v2.onnx +0 -3
- requirements.txt +2 -1
Dockerfile
CHANGED
@@ -40,6 +40,10 @@ COPY --chown=user requirements.txt $HOME/app/
|
|
40 |
|
41 |
RUN pip install -r requirements.txt
|
42 |
|
|
|
|
|
|
|
|
|
43 |
COPY --chown=user . $HOME/app/
|
44 |
|
45 |
# Fix ownership issues
|
|
|
40 |
|
41 |
RUN pip install -r requirements.txt
|
42 |
|
43 |
+
RUN huggingface-cli download BSC-LT/matcha-tts-cat-onnx matcha_multispeaker_cat_opset_15_10_steps_lastwords.onnx --local-dir $HOME/app/
|
44 |
+
|
45 |
+
RUN huggingface-cli download BSC-LT/vocos-mel-22khz-onnx mel_spec_22khz_v2.onnx --local-dir $HOME/app/
|
46 |
+
|
47 |
COPY --chown=user . $HOME/app/
|
48 |
|
49 |
# Fix ownership issues
|
infer_onnx.py
CHANGED
@@ -30,7 +30,7 @@ def process_text(i: int, text: str, device: torch.device):
|
|
30 |
print(x_phones)
|
31 |
return x.numpy(), x_lengths.numpy()
|
32 |
|
33 |
-
MODEL_PATH_MATCHA_MEL="
|
34 |
MODEL_PATH_MATCHA="matcha_hifigan_multispeaker_cat.onnx"
|
35 |
MODEL_PATH_VOCOS="mel_spec_22khz_v2.onnx"
|
36 |
CONFIG_PATH="config_22khz.yaml"
|
@@ -39,7 +39,8 @@ SPEAKER_ID_DICT="spk_to_id.json"
|
|
39 |
sess_options = onnxruntime.SessionOptions()
|
40 |
model_matcha_mel= onnxruntime.InferenceSession(str(MODEL_PATH_MATCHA_MEL), sess_options=sess_options, providers=["CPUExecutionProvider"])
|
41 |
model_vocos = onnxruntime.InferenceSession(str(MODEL_PATH_VOCOS), sess_options=sess_options, providers=["CPUExecutionProvider"])
|
42 |
-
model_matcha = onnxruntime.InferenceSession(str(MODEL_PATH_MATCHA), sess_options=sess_options, providers=["CPUExecutionProvider"])
|
|
|
43 |
speaker_id_dict = json.load(open(SPEAKER_ID_DICT))
|
44 |
speakers = [sp for sp in speaker_id_dict.keys()]
|
45 |
speakers.sort()
|
@@ -90,7 +91,7 @@ def vocos_inference(mel,denoise):
|
|
90 |
mag_spec_bias = torch.sqrt(spec_bias.pow(2).sum(-1))
|
91 |
|
92 |
# substract
|
93 |
-
strength = 0.
|
94 |
mag_spec_denoised = mag_spec - mag_spec_bias * strength
|
95 |
mag_spec_denoised = torch.clamp(mag_spec_denoised, 0.0)
|
96 |
|
|
|
30 |
print(x_phones)
|
31 |
return x.numpy(), x_lengths.numpy()
|
32 |
|
33 |
+
MODEL_PATH_MATCHA_MEL="matcha_multispeaker_cat_opset_15_10_steps_lastwords.onnx"
|
34 |
MODEL_PATH_MATCHA="matcha_hifigan_multispeaker_cat.onnx"
|
35 |
MODEL_PATH_VOCOS="mel_spec_22khz_v2.onnx"
|
36 |
CONFIG_PATH="config_22khz.yaml"
|
|
|
39 |
sess_options = onnxruntime.SessionOptions()
|
40 |
model_matcha_mel= onnxruntime.InferenceSession(str(MODEL_PATH_MATCHA_MEL), sess_options=sess_options, providers=["CPUExecutionProvider"])
|
41 |
model_vocos = onnxruntime.InferenceSession(str(MODEL_PATH_VOCOS), sess_options=sess_options, providers=["CPUExecutionProvider"])
|
42 |
+
#model_matcha = onnxruntime.InferenceSession(str(MODEL_PATH_MATCHA), sess_options=sess_options, providers=["CPUExecutionProvider"])
|
43 |
+
|
44 |
speaker_id_dict = json.load(open(SPEAKER_ID_DICT))
|
45 |
speakers = [sp for sp in speaker_id_dict.keys()]
|
46 |
speakers.sort()
|
|
|
91 |
mag_spec_bias = torch.sqrt(spec_bias.pow(2).sum(-1))
|
92 |
|
93 |
# substract
|
94 |
+
strength = 0.0025
|
95 |
mag_spec_denoised = mag_spec - mag_spec_bias * strength
|
96 |
mag_spec_denoised = torch.clamp(mag_spec_denoised, 0.0)
|
97 |
|
matcha_hifigan_multispeaker_cat.onnx
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:c5927b5a9a5f7890d4a8c353266ff00a1d9c4376eb1294020ffe43afa622b72f
|
3 |
-
size 142073725
|
|
|
|
|
|
|
|
matcha_multispeaker_cat_opset_15.onnx
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:e5b53370f69b8f4ca3d510634b644f6d815f34ee7a2944d0fb3a5588f6286b88
|
3 |
-
size 102285286
|
|
|
|
|
|
|
|
matcha_multispeaker_cat_opset_15_10_steps.onnx
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:f7ab5fb2e8d590d5cb610912d3c4e6480b32322cc4fa4bedf94eb0f8b8ce7570
|
3 |
-
size 86049399
|
|
|
|
|
|
|
|
mel_spec_22khz.onnx
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:15485817350df1e1cf50f75058497ec4b5273acb8903591bb41c6b5fb62daf2b
|
3 |
-
size 53870258
|
|
|
|
|
|
|
|
mel_spec_22khz_v2.onnx
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:2b02c479881f89a8320024436e986f64b11e82b1fd48046d4b695c5fd9fb84e7
|
3 |
-
size 53883652
|
|
|
|
|
|
|
|
requirements.txt
CHANGED
@@ -3,4 +3,5 @@ phonemizer
|
|
3 |
torch
|
4 |
unidecode
|
5 |
gradio
|
6 |
-
soundfile
|
|
|
|
3 |
torch
|
4 |
unidecode
|
5 |
gradio
|
6 |
+
soundfile
|
7 |
+
huggingface_hub[cli]
|