english translation
app.py
CHANGED
@@ -131,7 +131,7 @@ def generate(image_input, audio_input, pose_input, width, height, length, steps,
     vae = AutoencoderKL.from_pretrained("./pretrained_weights/sd-vae-ft-mse").to(device, dtype=dtype)
     if quantization_input:
         quantize_(vae, int8_weight_only())
-        print("
+        print("Use int8 quantization.")

     ## reference net init
     reference_unet = UNet2DConditionModel.from_pretrained("./pretrained_weights/sd-image-variations-diffusers", subfolder="unet", use_safetensors=False).to(dtype=dtype, device=device)
@@ -287,7 +287,7 @@ def generate(image_input, audio_input, pose_input, width, height, length, steps,
     return video_output, seed_text


-with gr.Blocks(theme=gr.themes.Soft()) as demo:
+with gr.Blocks() as demo:
    gr.Markdown("""
    <div>
        <h2 style="font-size: 30px;text-align: center;">EchoMimicV2</h2>
@@ -297,7 +297,7 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
        <a href="https://arxiv.org/abs/2411.10061">📜 arXiv </a>
    </div>
    <div style="text-align: center; font-weight: bold; color: red;">
-        ⚠️
+        ⚠️ This demonstration is for academic research and experiential use only.
    </div>

    """)
@@ -305,29 +305,29 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
    with gr.Row():
        with gr.Column():
            with gr.Group():
-                image_input = gr.Image(label="
-                audio_input = gr.Audio(label="
-                pose_input = gr.Textbox(label="
-            with gr.
+                image_input = gr.Image(label="Image Input (Auto Scaling)", type="filepath")
+                audio_input = gr.Audio(label="Audio Input", type="filepath")
+                pose_input = gr.Textbox(label="Pose Input (Directory Path)", placeholder="Please enter the directory path for pose data.", value="assets/halfbody_demo/pose/01")
+            with gr.Accordion("Advanced Settings", open=False):
                with gr.Row():
-                    width = gr.Number(label="
-                    height = gr.Number(label="
-                    length = gr.Number(label="
+                    width = gr.Number(label="Width (multiple of 16, recommended: 768)", value=768)
+                    height = gr.Number(label="Height (multiple of 16, recommended: 768)", value=768)
+                    length = gr.Number(label="Video Length (recommended: 240)", value=240)
                with gr.Row():
-                    steps = gr.Number(label="
-                    sample_rate = gr.Number(label="
-                    cfg = gr.Number(label="
+                    steps = gr.Number(label="Steps (recommended: 30)", value=20)
+                    sample_rate = gr.Number(label="Sampling Rate (recommended: 16000)", value=16000)
+                    cfg = gr.Number(label="CFG (recommended: 2.5)", value=2.5, step=0.1)
                with gr.Row():
-                    fps = gr.Number(label="
-                    context_frames = gr.Number(label="
-                    context_overlap = gr.Number(label="
+                    fps = gr.Number(label="Frame Rate (recommended: 24)", value=24)
+                    context_frames = gr.Number(label="Context Frames (recommended: 12)", value=12)
+                    context_overlap = gr.Number(label="Context Overlap (recommended: 3)", value=3)
                with gr.Row():
-                    quantization_input = gr.Checkbox(label="
-                    seed = gr.Number(label="
-            generate_button = gr.Button("🎬
+                    quantization_input = gr.Checkbox(label="Int8 Quantization (recommended for users with 12GB VRAM, use audio no longer than 5 seconds)", value=False)
+                    seed = gr.Number(label="Seed (-1 for random)", value=-1)
+            generate_button = gr.Button("🎬 Generate Video")
        with gr.Column():
-            video_output = gr.Video(label="
-            seed_text = gr.Textbox(label="
+            video_output = gr.Video(label="Output Video")
+            seed_text = gr.Textbox(label="Seed", interactive=False, visible=False)
            gr.Examples(
                examples=[
                    ["EMTD_dataset/ref_imgs_by_FLUX/man/0001.png", "assets/halfbody_demo/audio/chinese/echomimicv2_man.wav"],
@@ -339,7 +339,7 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
                    ["EMTD_dataset/ref_imgs_by_FLUX/woman/0057.png", "assets/halfbody_demo/audio/chinese/ultraman.wav"]
                ],
                inputs=[image_input, audio_input],
-                label="
+                label="Preset Characters and Audio",
            )

    generate_button.click(
|