wetdog committed on
Commit
bea1338
1 Parent(s): 40b17fc

get models from hf_hub

Browse files
Dockerfile CHANGED
@@ -40,6 +40,10 @@ COPY --chown=user requirements.txt $HOME/app/
40
 
41
  RUN pip install -r requirements.txt
42
 
 
 
 
 
43
  COPY --chown=user . $HOME/app/
44
 
45
  # Fix ownership issues
 
40
 
41
  RUN pip install -r requirements.txt
42
 
43
+ RUN huggingface-cli download BSC-LT/matcha-tts-cat-onnx matcha_multispeaker_cat_opset_15_10_steps_lastwords.onnx --local-dir $HOME/app/
44
+
45
+ RUN huggingface-cli download BSC-LT/vocos-mel-22khz-onnx mel_spec_22khz_v2.onnx --local-dir $HOME/app/
46
+
47
  COPY --chown=user . $HOME/app/
48
 
49
  # Fix ownership issues
infer_onnx.py CHANGED
@@ -30,7 +30,7 @@ def process_text(i: int, text: str, device: torch.device):
30
  print(x_phones)
31
  return x.numpy(), x_lengths.numpy()
32
 
33
- MODEL_PATH_MATCHA_MEL="matcha_multispeaker_cat_opset_15_10_steps.onnx"
34
  MODEL_PATH_MATCHA="matcha_hifigan_multispeaker_cat.onnx"
35
  MODEL_PATH_VOCOS="mel_spec_22khz_v2.onnx"
36
  CONFIG_PATH="config_22khz.yaml"
@@ -39,7 +39,8 @@ SPEAKER_ID_DICT="spk_to_id.json"
39
  sess_options = onnxruntime.SessionOptions()
40
  model_matcha_mel= onnxruntime.InferenceSession(str(MODEL_PATH_MATCHA_MEL), sess_options=sess_options, providers=["CPUExecutionProvider"])
41
  model_vocos = onnxruntime.InferenceSession(str(MODEL_PATH_VOCOS), sess_options=sess_options, providers=["CPUExecutionProvider"])
42
- model_matcha = onnxruntime.InferenceSession(str(MODEL_PATH_MATCHA), sess_options=sess_options, providers=["CPUExecutionProvider"])
 
43
  speaker_id_dict = json.load(open(SPEAKER_ID_DICT))
44
  speakers = [sp for sp in speaker_id_dict.keys()]
45
  speakers.sort()
@@ -90,7 +91,7 @@ def vocos_inference(mel,denoise):
90
  mag_spec_bias = torch.sqrt(spec_bias.pow(2).sum(-1))
91
 
92
  # substract
93
- strength = 0.0005
94
  mag_spec_denoised = mag_spec - mag_spec_bias * strength
95
  mag_spec_denoised = torch.clamp(mag_spec_denoised, 0.0)
96
 
 
30
  print(x_phones)
31
  return x.numpy(), x_lengths.numpy()
32
 
33
+ MODEL_PATH_MATCHA_MEL="matcha_multispeaker_cat_opset_15_10_steps_lastwords.onnx"
34
  MODEL_PATH_MATCHA="matcha_hifigan_multispeaker_cat.onnx"
35
  MODEL_PATH_VOCOS="mel_spec_22khz_v2.onnx"
36
  CONFIG_PATH="config_22khz.yaml"
 
39
  sess_options = onnxruntime.SessionOptions()
40
  model_matcha_mel= onnxruntime.InferenceSession(str(MODEL_PATH_MATCHA_MEL), sess_options=sess_options, providers=["CPUExecutionProvider"])
41
  model_vocos = onnxruntime.InferenceSession(str(MODEL_PATH_VOCOS), sess_options=sess_options, providers=["CPUExecutionProvider"])
42
+ #model_matcha = onnxruntime.InferenceSession(str(MODEL_PATH_MATCHA), sess_options=sess_options, providers=["CPUExecutionProvider"])
43
+
44
  speaker_id_dict = json.load(open(SPEAKER_ID_DICT))
45
  speakers = [sp for sp in speaker_id_dict.keys()]
46
  speakers.sort()
 
91
  mag_spec_bias = torch.sqrt(spec_bias.pow(2).sum(-1))
92
 
93
  # substract
94
+ strength = 0.0025
95
  mag_spec_denoised = mag_spec - mag_spec_bias * strength
96
  mag_spec_denoised = torch.clamp(mag_spec_denoised, 0.0)
97
 
matcha_hifigan_multispeaker_cat.onnx DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:c5927b5a9a5f7890d4a8c353266ff00a1d9c4376eb1294020ffe43afa622b72f
3
- size 142073725
 
 
 
 
matcha_multispeaker_cat_opset_15.onnx DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:e5b53370f69b8f4ca3d510634b644f6d815f34ee7a2944d0fb3a5588f6286b88
3
- size 102285286
 
 
 
 
matcha_multispeaker_cat_opset_15_10_steps.onnx DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:f7ab5fb2e8d590d5cb610912d3c4e6480b32322cc4fa4bedf94eb0f8b8ce7570
3
- size 86049399
 
 
 
 
mel_spec_22khz.onnx DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:15485817350df1e1cf50f75058497ec4b5273acb8903591bb41c6b5fb62daf2b
3
- size 53870258
 
 
 
 
mel_spec_22khz_v2.onnx DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:2b02c479881f89a8320024436e986f64b11e82b1fd48046d4b695c5fd9fb84e7
3
- size 53883652
 
 
 
 
requirements.txt CHANGED
@@ -3,4 +3,5 @@ phonemizer
3
  torch
4
  unidecode
5
  gradio
6
- soundfile
 
 
3
  torch
4
  unidecode
5
  gradio
6
+ soundfile
7
+ huggingface_hub[cli]