Spaces:

thinhlpg
/

vixtts-demo

Running on Zero

App Files Files Community

thinhlpg committed on May 6

Commit

f58d262

•

1 Parent(s): 376b5d9

feat: add vietnamese normalize

Browse files

Files changed (2) hide show

.gitignore +1 -0
app.py +55 -34

.gitignore CHANGED Viewed

@@ -1,6 +1,7 @@
 vixtts-demo.code-workspace
 output.wav
 model/
 # Byte-compiled / optimized / DLL files
 __pycache__/

 vixtts-demo.code-workspace
 output.wav
 model/
+test_api.ipynb
 # Byte-compiled / optimized / DLL files
 __pycache__/

app.py CHANGED Viewed

@@ -1,14 +1,3 @@
-import os
-import time
-import uuid
-import torch
-import torchaudio
-# download for mecab
-os.system("python -m unidic download")
 import csv
 import datetime
 import os
@@ -68,36 +57,55 @@ if not "vi" in supported_languages:
     supported_languages.append("vi")
 def predict(
     prompt,
     language,
     audio_file_pth,
-    voice_cleanup,
 ):
     if language not in supported_languages:
-        gr.Warning(
             f"Language you put {language} in is not in is not in our Supported Languages, please choose from dropdown"
         )
-        return (
-            None,
-            None,
-            None,
-            None,
-        )
     speaker_wav = audio_file_pth
     if len(prompt) < 2:
-        gr.Warning("Please give a longer prompt text")
-        return (None, None, None, None)
-    if len(prompt) > 200:
-        gr.Warning(
-            "Text length limited to 200 characters for this demo, please try shorter text. You can clone this space and edit code for your own usage"
-        )
-        return (None, None, None, None)
     try:
         metrics_text = ""
         t_latent = time.time()
@@ -115,13 +123,16 @@ def predict(
         except Exception as e:
             print("Speaker encoding error", str(e))
-            gr.Warning(
                 "It appears something wrong with reference, did you unmute your microphone?"
             )
-            return (None, None, None, None)
         prompt = re.sub("([^\x00-\x7F]|\w)(\.|\。|\?)", r"\1 \2\2", prompt)
         print("I: Generating new audio...")
         t0 = time.time()
         out = MODEL.inference(
@@ -131,6 +142,7 @@ def predict(
             speaker_embedding,
             repetition_penalty=5.0,
             temperature=0.75,
         )
         inference_time = time.time() - t0
         print(f"I: Time to generate audio: {round(inference_time*1000)} milliseconds")
@@ -140,6 +152,11 @@ def predict(
         real_time_factor = (time.time() - t0) / out["wav"].shape[-1] * 24000
         print(f"Real-time factor (RTF): {real_time_factor}")
         metrics_text += f"Real-time factor (RTF): {real_time_factor:.2f}\n"
         torchaudio.save("output.wav", torch.tensor(out["wav"]).unsqueeze(0), 24000)
     except RuntimeError as e:
@@ -158,7 +175,6 @@ def predict(
                 prompt,
                 language,
                 audio_file_pth,
-                voice_cleanup,
             ]
             error_data = [str(e) if type(e) != str else e for e in error_data]
             print(error_data)
@@ -198,8 +214,8 @@ def predict(
         else:
             if "Failed to decode" in str(e):
                 print("Speaker encoding error", str(e))
-                gr.Warning(
-                    "It appears something wrong with reference, did you unmute your microphone?"
                 )
             else:
                 print("RuntimeError: non device-side assert error:", str(e))
@@ -230,7 +246,7 @@ with gr.Blocks(analytics_enabled=False) as demo:
             input_text_gr = gr.Textbox(
                 label="Text Prompt",
                 info="One or two sentences at a time is better. Up to 200 text characters.",
-                value="Hi there, I'm your new voice clone. Try your best to upload quality audio.",
             )
             language_gr = gr.Dropdown(
                 label="Language",
@@ -258,6 +274,11 @@ with gr.Blocks(analytics_enabled=False) as demo:
                 max_choices=1,
                 value="vi",
             )
             ref_gr = gr.Audio(
                 label="Reference Audio",
                 info="Click on the ✎ button to upload your own target speaker audio",

 import csv
 import datetime
 import os
     supported_languages.append("vi")
def normalize_vietnamese_text(text):
    """Normalize Vietnamese text and tidy the result for speech synthesis.

    The text is first passed through ``TTSnorm`` (called with
    ``unknown=False, lower=False, rule=True``). The output is then cleaned up:

    * ``".."``, ``"!."`` and ``"?."`` are reduced to a single mark,
    * a space before ``.`` or ``,`` is removed,
    * single and double quote characters are removed,
    * the standalone tokens ``AI`` / ``A.I`` are spelled out as ``"Ây Ai"``.

    Args:
        text: The raw input text.

    Returns:
        The normalized string.
    """
    # Local import so the block does not depend on a file-level import.
    import re

    normalized = TTSnorm(text, unknown=False, lower=False, rule=True)

    # Applied in this exact order; each step is a single left-to-right pass.
    cleanup_steps = (
        ("..", "."),
        ("!.", "!"),
        ("?.", "?"),
        (" .", "."),
        (" ,", ","),
        ('"', ""),
        ("'", ""),
    )
    for old, new in cleanup_steps:
        normalized = normalized.replace(old, new)

    # Match "AI" / "A.I" only as whole tokens. A plain substring replace also
    # rewrote the letters inside longer uppercase words (e.g. "MAI").
    return re.sub(r"\bA\.?I\b", "Ây Ai", normalized)
def calculate_keep_len(text, lang):
    """Estimate how much generated audio to keep for a short sentence.

    Simple hack for short sentences: the estimate grows with the number of
    whitespace-separated words and with the number of ``. ! ? ,`` marks.
    The visible caller uses the result as the end index of a slice over the
    generated waveform (presumably samples at 24 kHz -- confirm against the
    caller).

    Args:
        text: The prompt text that was synthesized.
        lang: Language code of the prompt.

    Returns:
        ``-1`` (the "no estimate" sentinel) when ``lang`` is ``"ja"`` or
        ``"zh-cn"``, when ``text`` contains no words, or when it has ten or
        more words. Otherwise ``15000`` per word for fewer than five words,
        or ``13000`` per word for five to nine words, plus ``2000`` per
        punctuation mark.
    """
    # Word counting relies on whitespace, so these languages are skipped.
    if lang in ("ja", "zh-cn"):
        return -1

    word_count = len(text.split())
    if word_count == 0:
        # Empty / whitespace-only text would otherwise yield 0, and slicing
        # with [:0] discards the whole waveform.
        return -1
    if word_count >= 10:
        return -1

    num_punct = sum(text.count(mark) for mark in ".!?,")
    per_word = 15000 if word_count < 5 else 13000
    return per_word * word_count + 2000 * num_punct
 def predict(
     prompt,
     language,
     audio_file_pth,
+    normalize_text=True,
 ):
     if language not in supported_languages:
+        metrics_text = gr.Warning(
             f"Language you put {language} in is not in is not in our Supported Languages, please choose from dropdown"
         )
+        return (None, metrics_text)
     speaker_wav = audio_file_pth
     if len(prompt) < 2:
+        metrics_text = gr.Warning("Please give a longer prompt text")
+        return (None, metrics_text)
     try:
         metrics_text = ""
         t_latent = time.time()
         except Exception as e:
             print("Speaker encoding error", str(e))
+            metrics_text = gr.Warning(
                 "It appears something wrong with reference, did you unmute your microphone?"
             )
+            return (None, metrics_text)
         prompt = re.sub("([^\x00-\x7F]|\w)(\.|\。|\?)", r"\1 \2\2", prompt)
+        if normalize_text and language == "vi":
+            prompt = normalize_vietnamese_text(prompt)
         print("I: Generating new audio...")
         t0 = time.time()
         out = MODEL.inference(
             speaker_embedding,
             repetition_penalty=5.0,
             temperature=0.75,
+            enable_text_splitting=True,
         )
         inference_time = time.time() - t0
         print(f"I: Time to generate audio: {round(inference_time*1000)} milliseconds")
         real_time_factor = (time.time() - t0) / out["wav"].shape[-1] * 24000
         print(f"Real-time factor (RTF): {real_time_factor}")
         metrics_text += f"Real-time factor (RTF): {real_time_factor:.2f}\n"
+        # Temporary hack for short sentences
+        keep_len = calculate_keep_len(prompt, language)
+        out["wav"] = out["wav"][:keep_len]
         torchaudio.save("output.wav", torch.tensor(out["wav"]).unsqueeze(0), 24000)
     except RuntimeError as e:
                 prompt,
                 language,
                 audio_file_pth,
             ]
             error_data = [str(e) if type(e) != str else e for e in error_data]
             print(error_data)
         else:
             if "Failed to decode" in str(e):
                 print("Speaker encoding error", str(e))
+                metrics_text = gr.Warning(
+                    metrics_text="It appears something wrong with reference, did you unmute your microphone?"
                 )
             else:
                 print("RuntimeError: non device-side assert error:", str(e))
             input_text_gr = gr.Textbox(
                 label="Text Prompt",
                 info="One or two sentences at a time is better. Up to 200 text characters.",
+                value="Xin chào, tôi là một mô hình chuyển đổi văn bản thành giọng nói tiếng Việt",
             )
             language_gr = gr.Dropdown(
                 label="Language",
                 max_choices=1,
                 value="vi",
             )
+            normalize_text = gr.Checkbox(
+                label="Normalize Vietnamese Text",
+                info="Normalize Vietnamese Text",
+                default=True,
+            )
             ref_gr = gr.Audio(
                 label="Reference Audio",
                 info="Click on the ✎ button to upload your own target speaker audio",