fffiloni committed
Commit c9a6087 · verified · Parent: e079364

English translation

Files changed (1): app.py (+22, −22)
app.py CHANGED
@@ -131,7 +131,7 @@ def generate(image_input, audio_input, pose_input, width, height, length, steps,
     vae = AutoencoderKL.from_pretrained("./pretrained_weights/sd-vae-ft-mse").to(device, dtype=dtype)
     if quantization_input:
         quantize_(vae, int8_weight_only())
-        print("使用int8量化")
+        print("Use int8 quantization.")
 
     ## reference net init
     reference_unet = UNet2DConditionModel.from_pretrained("./pretrained_weights/sd-image-variations-diffusers", subfolder="unet", use_safetensors=False).to(dtype=dtype, device=device)
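
For context on the hunk above: `quantize_` and `int8_weight_only` come from the torchao library, and the call rewrites a module's weights to int8 in place, trading a little precision for a large cut in VRAM. A minimal, self-contained sketch of the same pattern, assuming torchao is installed and using a stand-in module rather than the Space's VAE:

import torch
from torch import nn
from torchao.quantization import quantize_, int8_weight_only

# Stand-in module for illustration; the Space applies the same call to its VAE.
model = nn.Sequential(nn.Linear(64, 64), nn.ReLU(), nn.Linear(64, 8))

quantize_(model, int8_weight_only())  # replaces float weights with int8, in place
out = model(torch.randn(1, 64))       # inference runs as before, with smaller weights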
@@ -287,7 +287,7 @@ def generate(image_input, audio_input, pose_input, width, height, length, steps,
     return video_output, seed_text
 
 
-with gr.Blocks(theme=gr.themes.Soft()) as demo:
+with gr.Blocks() as demo:
     gr.Markdown("""
     <div>
     <h2 style="font-size: 30px;text-align: center;">EchoMimicV2</h2>
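
A note on the hunk above: dropping `theme=gr.themes.Soft()` simply reverts the UI to Gradio's default theme; both constructor forms are standard Gradio. A minimal sketch for comparison:

import gradio as gr

with gr.Blocks(theme=gr.themes.Soft()) as themed_demo:  # the form the commit removes
    gr.Markdown("Soft theme")

with gr.Blocks() as plain_demo:                         # the form the commit keeps
    gr.Markdown("Default theme")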
@@ -297,7 +297,7 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
     <a href="https://arxiv.org/abs/2411.10061">📜 arXiv </a>
     </div>
     <div style="text-align: center; font-weight: bold; color: red;">
-    ⚠️ 该演示仅供学术研究和体验使用。
+    ⚠️ This demo is intended for academic research and trial use only.
     </div>
 
     """)
@@ -305,29 +305,29 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
     with gr.Row():
         with gr.Column():
             with gr.Group():
-                image_input = gr.Image(label="图像输入(自动缩放)", type="filepath")
-                audio_input = gr.Audio(label="音频输入", type="filepath")
-                pose_input = gr.Textbox(label="姿态输入(目录地址)", placeholder="请输入姿态数据的目录地址", value="assets/halfbody_demo/pose/01")
-            with gr.Group():
+                image_input = gr.Image(label="Image Input (Auto Scaling)", type="filepath")
+                audio_input = gr.Audio(label="Audio Input", type="filepath")
+                pose_input = gr.Textbox(label="Pose Input (Directory Path)", placeholder="Please enter the directory path for pose data.", value="assets/halfbody_demo/pose/01")
+            with gr.Accordion("Advanced Settings", open=False):
                 with gr.Row():
-                    width = gr.Number(label="宽度(16的倍数,推荐768", value=768)
-                    height = gr.Number(label="高度(16的倍数,推荐768", value=768)
-                    length = gr.Number(label="视频长度,推荐240)", value=240)
+                    width = gr.Number(label="Width (multiple of 16, recommended: 768)", value=768)
+                    height = gr.Number(label="Height (multiple of 16, recommended: 768)", value=768)
+                    length = gr.Number(label="Video Length (recommended: 240)", value=240)
                 with gr.Row():
-                    steps = gr.Number(label="步骤(推荐30", value=20)
-                    sample_rate = gr.Number(label="采样率(推荐16000", value=16000)
-                    cfg = gr.Number(label="cfg(推荐2.5", value=2.5, step=0.1)
+                    steps = gr.Number(label="Steps (recommended: 30)", value=20)
+                    sample_rate = gr.Number(label="Sampling Rate (recommended: 16000)", value=16000)
+                    cfg = gr.Number(label="CFG (recommended: 2.5)", value=2.5, step=0.1)
                 with gr.Row():
-                    fps = gr.Number(label="帧率(推荐24", value=24)
-                    context_frames = gr.Number(label="上下文框架(推荐12", value=12)
-                    context_overlap = gr.Number(label="上下文重叠(推荐3", value=3)
+                    fps = gr.Number(label="Frame Rate (recommended: 24)", value=24)
+                    context_frames = gr.Number(label="Context Frames (recommended: 12)", value=12)
+                    context_overlap = gr.Number(label="Context Overlap (recommended: 3)", value=3)
                 with gr.Row():
-                    quantization_input = gr.Checkbox(label="int8量化(推荐显存12G的用户开启,并使用不超过5秒的音频)", value=False)
-                    seed = gr.Number(label="种子(-1为随机)", value=-1)
-            generate_button = gr.Button("🎬 生成视频")
+                    quantization_input = gr.Checkbox(label="Int8 Quantization (recommended for users with 12GB VRAM, use audio no longer than 5 seconds)", value=False)
+                    seed = gr.Number(label="Seed (-1 for random)", value=-1)
+            generate_button = gr.Button("🎬 Generate Video")
         with gr.Column():
-            video_output = gr.Video(label="输出视频")
-            seed_text = gr.Textbox(label="种子", interactive=False, visible=False)
+            video_output = gr.Video(label="Output Video")
+            seed_text = gr.Textbox(label="Seed", interactive=False, visible=False)
     gr.Examples(
         examples=[
             ["EMTD_dataset/ref_imgs_by_FLUX/man/0001.png", "assets/halfbody_demo/audio/chinese/echomimicv2_man.wav"],
@@ -339,7 +339,7 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
             ["EMTD_dataset/ref_imgs_by_FLUX/woman/0057.png", "assets/halfbody_demo/audio/chinese/ultraman.wav"]
         ],
         inputs=[image_input, audio_input],
-        label="预设人物及音频",
+        label="Preset Characters and Audio",
     )
 
     generate_button.click(
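
The last context line cuts off at `generate_button.click(`; the full call lies outside this diff. For orientation only, a hypothetical wiring consistent with the components above and with `return video_output, seed_text` might look like this (the argument list is an assumption, not the commit's code):

# Hypothetical sketch -- the actual argument list is outside the diff context.
generate_button.click(
    fn=generate,
    inputs=[image_input, audio_input, pose_input, width, height, length,
            steps, sample_rate, cfg, fps, context_frames, context_overlap,
            quantization_input, seed],
    outputs=[video_output, seed_text],
)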
 