Spaces:

projecte-aina
/

matxa-alvocat-tts-ca

Running

App Files Community

wetdog committed on Mar 8

Commit

6b0bcdf

•

1 Parent(s): 62f951d

Set a fixed path for temp files and expose the temperature and length-scale parameters in the interface

Browse files

Files changed (1) hide show

infer_onnx.py +23 -5

infer_onnx.py CHANGED Viewed

@@ -92,7 +92,7 @@ def vocos_inference(mel):
     return y
-def tts(text:str, spk_id:int):
     sid = np.array([int(spk_id)]) if spk_id is not None else None
     text_matcha , text_lengths = process_text(0,text,"cpu")
@@ -100,7 +100,7 @@ def tts(text:str, spk_id:int):
     inputs = {
         "x": text_matcha,
         "x_lengths": text_lengths,
-        "scales": np.array([0.667, 1.0], dtype=np.float32),
         "spks": sid
     }
     mel_t0 = perf_counter()
@@ -115,7 +115,7 @@ def tts(text:str, spk_id:int):
     vocos_infer_secs = perf_counter() - vocos_t0
     print("Vocos inference time", vocos_infer_secs)
-    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp_matcha_vocos:
         sf.write(fp_matcha_vocos.name, wavs_vocos.squeeze(0), 22050, "PCM_24")
     #MATCHA HIFIGAN
@@ -123,7 +123,7 @@ def tts(text:str, spk_id:int):
     inputs = {
         "x": text_matcha,
         "x_lengths": text_lengths,
-        "scales": np.array([0.667, 1.0], dtype=np.float32),
         "spks": sid
     }
     hifigan_t0 = perf_counter()
@@ -132,9 +132,11 @@ def tts(text:str, spk_id:int):
     hifigan_infer_secs = perf_counter() - hifigan_t0
     print("Matcha + Hifigan",hifigan_infer_secs)
-    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp_matcha:
         sf.write(fp_matcha.name, wavs.squeeze(0), 22050, "PCM_24")
     return fp_matcha_vocos.name, fp_matcha.name
 ## GUI space
@@ -176,6 +178,22 @@ vits2_inference = gr.Interface(
             label="Speaker id",
             info=f"Models are trained on 47 speakers. You can prompt the model using one of these speaker ids.",
         ),
     ],
     outputs=[gr.Audio(label="Matcha vocos", interactive=False, type="filepath"),
              gr.Audio(label="Matcha hifigan", interactive=False, type="filepath")]

     return y
+def tts(text:str, spk_id:int, temperature:float, length_scale:float):
     sid = np.array([int(spk_id)]) if spk_id is not None else None
     text_matcha , text_lengths = process_text(0,text,"cpu")
     inputs = {
         "x": text_matcha,
         "x_lengths": text_lengths,
+        "scales": np.array([temperature, length_scale], dtype=np.float32),
         "spks": sid
     }
     mel_t0 = perf_counter()
     vocos_infer_secs = perf_counter() - vocos_t0
     print("Vocos inference time", vocos_infer_secs)
+    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False, dir="/home/user/app") as fp_matcha_vocos:
         sf.write(fp_matcha_vocos.name, wavs_vocos.squeeze(0), 22050, "PCM_24")
     #MATCHA HIFIGAN
     inputs = {
         "x": text_matcha,
         "x_lengths": text_lengths,
+        "scales": np.array([temperature, length_scale], dtype=np.float32),
         "spks": sid
     }
     hifigan_t0 = perf_counter()
     hifigan_infer_secs = perf_counter() - hifigan_t0
     print("Matcha + Hifigan",hifigan_infer_secs)
+    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False, dir="/home/user/app") as fp_matcha:
         sf.write(fp_matcha.name, wavs.squeeze(0), 22050, "PCM_24")
+    print(f"RTF matcha + hifigan { hifigan_infer_secs/ (wavs.shape[1]/22050) }")
+    print(f"RTF matcha + vocos { (mel_infer_secs + vocos_infer_secs) / (wavs.shape[1]/22050) }")
     return fp_matcha_vocos.name, fp_matcha.name
 ## GUI space
             label="Speaker id",
             info=f"Models are trained on 47 speakers. You can prompt the model using one of these speaker ids.",
         ),
+        gr.Slider(
+            0.1,
+            2.0,
+            value=0.667,
+            step=0.01,
+            label="Temperature",
+            info=f"Temperature",
+        ),
+        gr.Slider(
+            0.5,
+            2.0,
+            value=1.0,
+            step=0.01,
+            label="Length scale",
+            info=f"Controls speech pace, larger values for slower pace and smaller values for faster pace",
+        )
     ],
     outputs=[gr.Audio(label="Matcha vocos", interactive=False, type="filepath"),
              gr.Audio(label="Matcha hifigan", interactive=False, type="filepath")]