fffiloni committed on
Commit
32d04b2
1 Parent(s): 8d18137

add MaskGCT voice cloning option

Browse files
Files changed (1) hide show
  1. webgui.py +38 -0
webgui.py CHANGED
@@ -212,6 +212,22 @@ def process_video(uploaded_img, uploaded_audio, width, height, length, seed, fac
212
  video_clip.write_videofile(str(final_output_path), codec="libx264", audio_codec="aac")
213
 
214
  return final_output_path
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
215
 
216
  with gr.Blocks() as demo:
217
  gr.Markdown('# EchoMimic')
@@ -228,6 +244,20 @@ with gr.Blocks() as demo:
228
  with gr.Column():
229
  uploaded_img = gr.Image(type="filepath", label="Reference Image")
230
  uploaded_audio = gr.Audio(type="filepath", label="Input Audio")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
231
  with gr.Accordion(label=advanced_settings_label, open=False):
232
  with gr.Row():
233
  width = gr.Slider(label="Width", minimum=128, maximum=1024, value=default_values["width"], interactive=available_property)
@@ -297,6 +327,14 @@ with gr.Blocks() as demo:
297
  output_video= final_output_path
298
  return final_output_path
299
 
 
 
 
 
 
 
 
 
300
  generate_button.click(
301
  generate_video,
302
  inputs=[
 
212
  video_clip.write_videofile(str(final_output_path), codec="libx264", audio_codec="aac")
213
 
214
  return final_output_path
215
+
216
def get_maskGCT_TTS(prompt_audio_maskGCT, audio_to_clone):
    """Synthesize speech in a cloned voice through the "amphion/maskgct" Space.

    Args:
        prompt_audio_maskGCT: Text to synthesize; forwarded to the remote
            endpoint as ``target_text``.
        audio_to_clone: Reference voice recording; wrapped with
            ``handle_file`` and forwarded as ``prompt_wav``.
            (Presumably a local file path, since the UI component uses
            ``type="filepath"`` -- confirm against the caller.)

    Returns:
        A 2-tuple ``(result, update)`` where ``result`` is whatever the
        remote ``/predict`` endpoint returned and ``update`` is a
        ``gr.update`` carrying the same value with ``visible=True``.

    Raises:
        gr.Error: If either input is missing, or if the client for the
            Space cannot be created.
    """
    # Fail early with a readable message instead of letting an empty input
    # surface as an opaque error from handle_file() or the remote API.
    if not audio_to_clone:
        raise gr.Error("Please provide a voice sample to clone.")
    if not prompt_audio_maskGCT or not prompt_audio_maskGCT.strip():
        raise gr.Error("Please enter the text to synthesize.")

    try:
        client = Client("amphion/maskgct")
    except Exception as err:
        # `except Exception` (not a bare `except`) so KeyboardInterrupt and
        # SystemExit still propagate; chain the cause to keep the traceback.
        raise gr.Error(
            "amphion/maskgct space's api might not be ready, please wait, or upload an audio instead."
        ) from err

    result = client.predict(
        prompt_wav=handle_file(audio_to_clone),
        target_text=prompt_audio_maskGCT,
        target_len=-1,
        n_timesteps=25,
        api_name="/predict",
    )
    print(result)
    return result, gr.update(value=result, visible=True)
231
 
232
  with gr.Blocks() as demo:
233
  gr.Markdown('# EchoMimic')
 
244
  with gr.Column():
245
  uploaded_img = gr.Image(type="filepath", label="Reference Image")
246
  uploaded_audio = gr.Audio(type="filepath", label="Input Audio")
247
+ preprocess_audio_file = gr.File(visible=False)
248
+ with gr.Accordion(label="Voice cloning with MaskGCT", open=False):
249
+ prompt_audio_maskGCT = gr.Textbox(
250
+ label = "Text to synthetize",
251
+ lines = 2,
252
+ max_lines = 2,
253
+ elem_id = "text-synth-maskGCT"
254
+ )
255
+ audio_to_clone_maskGCT = gr.Audio(
256
+ label = "Voice to clone",
257
+ type = "filepath",
258
+ elem_id = "audio-clone-elm-maskGCT"
259
+ )
260
+ gen_maskGCT_voice_btn = gr.Button("Generate voice clone (optional)")
261
  with gr.Accordion(label=advanced_settings_label, open=False):
262
  with gr.Row():
263
  width = gr.Slider(label="Width", minimum=128, maximum=1024, value=default_values["width"], interactive=available_property)
 
327
  output_video= final_output_path
328
  return final_output_path
329
 
330
+ gen_maskGCT_voice_btn.click(
331
+ fn = get_maskGCT_TTS,
332
+ inputs = [prompt_audio_maskGCT, audio_to_clone_maskGCT],
333
+ outputs = [voice, preprocess_audio_file],
334
+ queue = False,
335
+ show_api = False
336
+ )
337
+
338
  generate_button.click(
339
  generate_video,
340
  inputs=[