Set a fixed path for temp files and expose parameters in the interface
Browse files- infer_onnx.py +23 -5
infer_onnx.py
CHANGED
@@ -92,7 +92,7 @@ def vocos_inference(mel):
|
|
92 |
return y
|
93 |
|
94 |
|
95 |
-
def tts(text:str, spk_id:int):
|
96 |
sid = np.array([int(spk_id)]) if spk_id is not None else None
|
97 |
text_matcha , text_lengths = process_text(0,text,"cpu")
|
98 |
|
@@ -100,7 +100,7 @@ def tts(text:str, spk_id:int):
|
|
100 |
inputs = {
|
101 |
"x": text_matcha,
|
102 |
"x_lengths": text_lengths,
|
103 |
-
"scales": np.array([
|
104 |
"spks": sid
|
105 |
}
|
106 |
mel_t0 = perf_counter()
|
@@ -115,7 +115,7 @@ def tts(text:str, spk_id:int):
|
|
115 |
vocos_infer_secs = perf_counter() - vocos_t0
|
116 |
print("Vocos inference time", vocos_infer_secs)
|
117 |
|
118 |
-
with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp_matcha_vocos:
|
119 |
sf.write(fp_matcha_vocos.name, wavs_vocos.squeeze(0), 22050, "PCM_24")
|
120 |
|
121 |
#MATCHA HIFIGAN
|
@@ -123,7 +123,7 @@ def tts(text:str, spk_id:int):
|
|
123 |
inputs = {
|
124 |
"x": text_matcha,
|
125 |
"x_lengths": text_lengths,
|
126 |
-
"scales": np.array([
|
127 |
"spks": sid
|
128 |
}
|
129 |
hifigan_t0 = perf_counter()
|
@@ -132,9 +132,11 @@ def tts(text:str, spk_id:int):
|
|
132 |
hifigan_infer_secs = perf_counter() - hifigan_t0
|
133 |
print("Matcha + Hifigan",hifigan_infer_secs)
|
134 |
|
135 |
-
with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp_matcha:
|
136 |
sf.write(fp_matcha.name, wavs.squeeze(0), 22050, "PCM_24")
|
137 |
|
|
|
|
|
138 |
return fp_matcha_vocos.name, fp_matcha.name
|
139 |
|
140 |
## GUI space
|
@@ -176,6 +178,22 @@ vits2_inference = gr.Interface(
|
|
176 |
label="Speaker id",
|
177 |
info=f"Models are trained on 47 speakers. You can prompt the model using one of these speaker ids.",
|
178 |
),
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
179 |
],
|
180 |
outputs=[gr.Audio(label="Matcha vocos", interactive=False, type="filepath"),
|
181 |
gr.Audio(label="Matcha hifigan", interactive=False, type="filepath")]
|
|
|
92 |
return y
|
93 |
|
94 |
|
95 |
+
def tts(text:str, spk_id:int, temperature:float, length_scale:float):
|
96 |
sid = np.array([int(spk_id)]) if spk_id is not None else None
|
97 |
text_matcha , text_lengths = process_text(0,text,"cpu")
|
98 |
|
|
|
100 |
inputs = {
|
101 |
"x": text_matcha,
|
102 |
"x_lengths": text_lengths,
|
103 |
+
"scales": np.array([temperature, length_scale], dtype=np.float32),
|
104 |
"spks": sid
|
105 |
}
|
106 |
mel_t0 = perf_counter()
|
|
|
115 |
vocos_infer_secs = perf_counter() - vocos_t0
|
116 |
print("Vocos inference time", vocos_infer_secs)
|
117 |
|
118 |
+
with tempfile.NamedTemporaryFile(suffix=".wav", delete=False, dir="/home/user/app") as fp_matcha_vocos:
|
119 |
sf.write(fp_matcha_vocos.name, wavs_vocos.squeeze(0), 22050, "PCM_24")
|
120 |
|
121 |
#MATCHA HIFIGAN
|
|
|
123 |
inputs = {
|
124 |
"x": text_matcha,
|
125 |
"x_lengths": text_lengths,
|
126 |
+
"scales": np.array([temperature, length_scale], dtype=np.float32),
|
127 |
"spks": sid
|
128 |
}
|
129 |
hifigan_t0 = perf_counter()
|
|
|
132 |
hifigan_infer_secs = perf_counter() - hifigan_t0
|
133 |
print("Matcha + Hifigan",hifigan_infer_secs)
|
134 |
|
135 |
+
with tempfile.NamedTemporaryFile(suffix=".wav", delete=False, dir="/home/user/app") as fp_matcha:
|
136 |
sf.write(fp_matcha.name, wavs.squeeze(0), 22050, "PCM_24")
|
137 |
|
138 |
+
print(f"RTF matcha + hifigan { hifigan_infer_secs/ (wavs.shape[1]/22050) }")
|
139 |
+
print(f"RTF matcha + vocos { (mel_infer_secs + vocos_infer_secs) / (wavs.shape[1]/22050) }")
|
140 |
return fp_matcha_vocos.name, fp_matcha.name
|
141 |
|
142 |
## GUI space
|
|
|
178 |
label="Speaker id",
|
179 |
info=f"Models are trained on 47 speakers. You can prompt the model using one of these speaker ids.",
|
180 |
),
|
181 |
+
gr.Slider(
|
182 |
+
0.1,
|
183 |
+
2.0,
|
184 |
+
value=0.667,
|
185 |
+
step=0.01,
|
186 |
+
label="Temperature",
|
187 |
+
info=f"Temperature",
|
188 |
+
),
|
189 |
+
gr.Slider(
|
190 |
+
0.5,
|
191 |
+
2.0,
|
192 |
+
value=1.0,
|
193 |
+
step=0.01,
|
194 |
+
label="Length scale",
|
195 |
+
info=f"Controls speech pace, larger values for slower pace and smaller values for faster pace",
|
196 |
+
)
|
197 |
],
|
198 |
outputs=[gr.Audio(label="Matcha vocos", interactive=False, type="filepath"),
|
199 |
gr.Audio(label="Matcha hifigan", interactive=False, type="filepath")]
|