wetdog committed on
Commit
6b0bcdf
1 Parent(s): 62f951d

set fixed path for temp files and expose parameters in the interface

Browse files
Files changed (1) hide show
  1. infer_onnx.py +23 -5
infer_onnx.py CHANGED
@@ -92,7 +92,7 @@ def vocos_inference(mel):
92
  return y
93
 
94
 
95
- def tts(text:str, spk_id:int):
96
  sid = np.array([int(spk_id)]) if spk_id is not None else None
97
  text_matcha , text_lengths = process_text(0,text,"cpu")
98
 
@@ -100,7 +100,7 @@ def tts(text:str, spk_id:int):
100
  inputs = {
101
  "x": text_matcha,
102
  "x_lengths": text_lengths,
103
- "scales": np.array([0.667, 1.0], dtype=np.float32),
104
  "spks": sid
105
  }
106
  mel_t0 = perf_counter()
@@ -115,7 +115,7 @@ def tts(text:str, spk_id:int):
115
  vocos_infer_secs = perf_counter() - vocos_t0
116
  print("Vocos inference time", vocos_infer_secs)
117
 
118
- with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp_matcha_vocos:
119
  sf.write(fp_matcha_vocos.name, wavs_vocos.squeeze(0), 22050, "PCM_24")
120
 
121
  #MATCHA HIFIGAN
@@ -123,7 +123,7 @@ def tts(text:str, spk_id:int):
123
  inputs = {
124
  "x": text_matcha,
125
  "x_lengths": text_lengths,
126
- "scales": np.array([0.667, 1.0], dtype=np.float32),
127
  "spks": sid
128
  }
129
  hifigan_t0 = perf_counter()
@@ -132,9 +132,11 @@ def tts(text:str, spk_id:int):
132
  hifigan_infer_secs = perf_counter() - hifigan_t0
133
  print("Matcha + Hifigan",hifigan_infer_secs)
134
 
135
- with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp_matcha:
136
  sf.write(fp_matcha.name, wavs.squeeze(0), 22050, "PCM_24")
137
 
 
 
138
  return fp_matcha_vocos.name, fp_matcha.name
139
 
140
  ## GUI space
@@ -176,6 +178,22 @@ vits2_inference = gr.Interface(
176
  label="Speaker id",
177
  info=f"Models are trained on 47 speakers. You can prompt the model using one of these speaker ids.",
178
  ),
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
179
  ],
180
  outputs=[gr.Audio(label="Matcha vocos", interactive=False, type="filepath"),
181
  gr.Audio(label="Matcha hifigan", interactive=False, type="filepath")]
 
92
  return y
93
 
94
 
95
+ def tts(text:str, spk_id:int, temperature:float, length_scale:float):
96
  sid = np.array([int(spk_id)]) if spk_id is not None else None
97
  text_matcha , text_lengths = process_text(0,text,"cpu")
98
 
 
100
  inputs = {
101
  "x": text_matcha,
102
  "x_lengths": text_lengths,
103
+ "scales": np.array([temperature, length_scale], dtype=np.float32),
104
  "spks": sid
105
  }
106
  mel_t0 = perf_counter()
 
115
  vocos_infer_secs = perf_counter() - vocos_t0
116
  print("Vocos inference time", vocos_infer_secs)
117
 
118
+ with tempfile.NamedTemporaryFile(suffix=".wav", delete=False, dir="/home/user/app") as fp_matcha_vocos:
119
  sf.write(fp_matcha_vocos.name, wavs_vocos.squeeze(0), 22050, "PCM_24")
120
 
121
  #MATCHA HIFIGAN
 
123
  inputs = {
124
  "x": text_matcha,
125
  "x_lengths": text_lengths,
126
+ "scales": np.array([temperature, length_scale], dtype=np.float32),
127
  "spks": sid
128
  }
129
  hifigan_t0 = perf_counter()
 
132
  hifigan_infer_secs = perf_counter() - hifigan_t0
133
  print("Matcha + Hifigan",hifigan_infer_secs)
134
 
135
+ with tempfile.NamedTemporaryFile(suffix=".wav", delete=False, dir="/home/user/app") as fp_matcha:
136
  sf.write(fp_matcha.name, wavs.squeeze(0), 22050, "PCM_24")
137
 
138
+ print(f"RTF matcha + hifigan { hifigan_infer_secs/ (wavs.shape[1]/22050) }")
139
+ print(f"RTF matcha + vocos { (mel_infer_secs + vocos_infer_secs) / (wavs.shape[1]/22050) }")
140
  return fp_matcha_vocos.name, fp_matcha.name
141
 
142
  ## GUI space
 
178
  label="Speaker id",
179
  info=f"Models are trained on 47 speakers. You can prompt the model using one of these speaker ids.",
180
  ),
181
+ gr.Slider(
182
+ 0.1,
183
+ 2.0,
184
+ value=0.667,
185
+ step=0.01,
186
+ label="Temperature",
187
+ info=f"Temperature",
188
+ ),
189
+ gr.Slider(
190
+ 0.5,
191
+ 2.0,
192
+ value=1.0,
193
+ step=0.01,
194
+ label="Length scale",
195
+ info=f"Controls speech pace, larger values for slower pace and smaller values for faster pace",
196
+ )
197
  ],
198
  outputs=[gr.Audio(label="Matcha vocos", interactive=False, type="filepath"),
199
  gr.Audio(label="Matcha hifigan", interactive=False, type="filepath")]