Spaces:

chnk58hoang
/

VietnameseVITS

Build error

chnk58hoang committed on Dec 15, 2023

Commit

db3dea6

•

1 Parent(s): 5ab552b

convert onnx

Files changed (6) hide show

.gitignore CHANGED Viewed

app.py CHANGED Viewed

@@ -2,16 +2,18 @@ import gradio as gr
 import numpy as np
 from utils import load_model, normalize_text
-vits_model = load_model()
-vits_model.tts('Alo')
 def text_to_speech(text):
-    text = normalize_text(text)
-    audio = vits_model.tts(text)
-    audio = np.array(audio)
     return 16000, audio
@@ -27,4 +29,3 @@ gr.Interface(
     ],
     theme="default",
 ).launch(debug=False)

 import numpy as np
 from utils import load_model, normalize_text
+vits = load_model()
 def text_to_speech(text):
+    """ Text to speech
+    """
+    text_inputs = np.asarray(
+        vits.tokenizer.text_to_ids(text),
+        dtype=np.int64,
+    )[None, :]
+    audio = vits.inference_onnx(text_inputs)
     return 16000, audio
     ],
     theme="default",
 ).launch(debug=False)

requirements.txt CHANGED Viewed

@@ -1,4 +1,4 @@
-TTS
 gradio
 numpy
 regex

+TTS==0.17.5
 gradio
 numpy
 regex

utils.py CHANGED Viewed

@@ -1,8 +1,9 @@
-from TTS.api import TTS
 import unicodedata
 import regex
 num_re = regex.compile(r"([0-9.,]*[0-9])")
 digits = ["không", "một", "hai", "ba", "bốn", "năm", "sáu", "bảy", "tám", "chín"]
@@ -38,7 +39,7 @@ def read_number(num: str) -> str:
             return digits[n // 100] + " trăm lẻ " + digits[n % 100]
         else:
             return digits[n // 100] + " trăm " + read_number(num[1:])
-    elif len(num) >= 4 and len(num) <= 6 and num.isdigit():
         n = int(num)
         n1 = n // 1000
         return read_number(str(n1)) + " ngàn " + read_number(num[-3:])
@@ -57,24 +58,31 @@ def read_number(num: str) -> str:
                 return read_number(parts[0]) + " ngàn " + read_number(parts[1])
         elif len(parts) == 3:
             return (
-                read_number(parts[0])
-                + " triệu "
-                + read_number(parts[1])
-                + " ngàn "
-                + read_number(parts[2])
             )
     return num
 def load_model():
-    config_path = 'vits/vits_config.json'
-    checkpoint_path = 'vits/best_model_vits_22951.pth'
-    tts = TTS(model_name='my_tts',
-              model_path=checkpoint_path,
-              config_path=config_path)
-    return tts
 def normalize_text(text):

+from TTS.tts.models.vits import Vits
+from TTS.tts.configs.vits_config import VitsConfig
+import numpy as np
 import unicodedata
 import regex
 num_re = regex.compile(r"([0-9.,]*[0-9])")
 digits = ["không", "một", "hai", "ba", "bốn", "năm", "sáu", "bảy", "tám", "chín"]
             return digits[n // 100] + " trăm lẻ " + digits[n % 100]
         else:
             return digits[n // 100] + " trăm " + read_number(num[1:])
+    elif 4 <= len(num) <= 6 and num.isdigit():
         n = int(num)
         n1 = n // 1000
         return read_number(str(n1)) + " ngàn " + read_number(num[-3:])
                 return read_number(parts[0]) + " ngàn " + read_number(parts[1])
         elif len(parts) == 3:
             return (
+                    read_number(parts[0])
+                    + " triệu "
+                    + read_number(parts[1])
+                    + " ngàn "
+                    + read_number(parts[2])
             )
     return num
 def load_model():
+    config = VitsConfig()
+    config.load_json("vits/config.json")
+    vits = Vits.init_from_config(config)
+    vits.load_onnx("vits/coqui_vits.onnx")
+    text = "xin chào tôi là hoàng đây"
+    text_inputs = np.asarray(
+        vits.tokenizer.text_to_ids(text),
+        dtype=np.int64,
+    )[None, :]
+    audio = vits.inference_onnx(text_inputs)
+    return vits
 def normalize_text(text):

vits/{vits_config.json → config.json} RENAMED Viewed

@@ -1,7 +1,7 @@
 {
     "output_path": "/kaggle/working/",
     "logger_uri": null,
-    "run_name": "vits_viet",
     "project_name": null,
     "run_description": "\ud83d\udc38Coqui trainer run.",
     "print_step": 25,
@@ -113,7 +113,21 @@
         }
     ],
     "test_sentences": [
-        "xin ch\u00e0o, t\u1ea5t c\u1ea3 m\u1ecdi ng\u01b0\u1eddi"
     ],
     "eval_split_max_size": null,
     "eval_split_size": 0.01,
@@ -218,8 +232,8 @@
         "reinit_DP": false,
         "reinit_text_encoder": false
     },
-    "lr_gen": 0.0001,
-    "lr_disc": 0.0001,
     "lr_scheduler_gen": "ExponentialLR",
     "lr_scheduler_gen_params": {
         "gamma": 0.999875,
@@ -251,5 +265,6 @@
     "use_d_vector_file": false,
     "d_vector_file": null,
     "d_vector_dim": 0,
-    "github_branch": "* master"
 }

 {
     "output_path": "/kaggle/working/",
     "logger_uri": null,
+    "run_name": "vits_viettts",
     "project_name": null,
     "run_description": "\ud83d\udc38Coqui trainer run.",
     "print_step": 25,
         }
     ],
     "test_sentences": [
+        [
+            "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent."
+        ],
+        [
+            "Be a voice, not an echo."
+        ],
+        [
+            "I'm sorry Dave. I'm afraid I can't do that."
+        ],
+        [
+            "This cake is great. It's so delicious and moist."
+        ],
+        [
+            "Prior to November 22, 1963."
+        ]
     ],
     "eval_split_max_size": null,
     "eval_split_size": 0.01,
         "reinit_DP": false,
         "reinit_text_encoder": false
     },
+    "lr_gen": 0.0002,
+    "lr_disc": 0.0002,
     "lr_scheduler_gen": "ExponentialLR",
     "lr_scheduler_gen_params": {
         "gamma": 0.999875,
     "use_d_vector_file": false,
     "d_vector_file": null,
     "d_vector_dim": 0,
+    "restore_path": "/kaggle/input/pretrain-glow/checkpoint_80000.pth",
+    "github_branch": "* hoang"
 }

vits/{best_model_vits_22951.pth → coqui_vits.onnx} RENAMED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:0dfc9c3865ccf2359258c5c6e2145365e643ba3be0208b57efedb6bfac20e428
-size 997817797

 version https://git-lfs.github.com/spec/v1
+oid sha256:debdbe5d25d926fae95180670253a23ffb65047d40da86f4cf7a6e205614d90b
+size 131520541