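"""
Gradio demo for MotionCLR: text-to-motion generation with attention-based editing
(word (de-)emphasizing, example-based generation, and style transfer).
"""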
import spaces
import gradio as gr
import sys
import os
import torch
import numpy as np
import yaml
import time
import hashlib

from os.path import join as pjoin
from box import Box
from huggingface_hub import hf_hub_download
from accelerate.utils import set_seed

import utils.paramUtil as paramUtil
from utils.plot_script import *
from utils.utils import *
from utils.motion_process import recover_from_ric
from utils.model_load import load_model_weights
from models.gaussian_diffusion import DiffusePipeline
from models import build_models
from options.generate_options import GenerateOptions
from motion_loader import get_dataset_loader

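# Fetch the normalization statistics, model checkpoint, and generation options
# from the Hugging Face Hub into the local checkpoint directory.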
ckptdir = './checkpoints/t2m/release'
os.makedirs(ckptdir, exist_ok=True)

mean_path = hf_hub_download(
    repo_id="EvanTHU/MotionCLR",
    filename="meta/mean.npy",
    local_dir=ckptdir,
    local_dir_use_symlinks=False,
)

std_path = hf_hub_download(
    repo_id="EvanTHU/MotionCLR",
    filename="meta/std.npy",
    local_dir=ckptdir,
    local_dir_use_symlinks=False,
)

model_path = hf_hub_download(
    repo_id="EvanTHU/MotionCLR",
    filename="model/latest.tar",
    local_dir=ckptdir,
    local_dir_use_symlinks=False,
)

opt_path = hf_hub_download(
    repo_id="EvanTHU/MotionCLR",
    filename="opt.txt",
    local_dir=ckptdir,
    local_dir_use_symlinks=False,
)

os.makedirs("tmp", exist_ok=True)
os.environ['GRADIO_TEMP_DIR'] = './tmp'

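# Small helpers: unique output file names, resetting the edit config, and YAML loading.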
def generate_md5(input_string):
    """Return the MD5 hex digest of a string (used for unique output file names)."""
    md5_hash = hashlib.md5(input_string.encode())
    return md5_hash.hexdigest()


def set_all_use_to_false(data):
    """Recursively switch every 'use' flag in the edit config back to False."""
    for key, value in data.items():
        if isinstance(value, Box):
            set_all_use_to_false(value)
        elif key == 'use':
            data[key] = False
    return data


def yaml_to_box(yaml_file):
    """Load a YAML file into a Box for attribute-style access."""
    with open(yaml_file, 'r') as file:
        yaml_data = yaml.safe_load(file)
    return Box(yaml_data)


HEAD = """<div class="embed_hidden">
<h1 style='text-align: center'> MotionCLR User Interaction Demo </h1>
</div>
"""

edit_config = yaml_to_box('options/edit.yaml')

CSS = """
.retrieved_video {
    position: relative;
    margin: 0;
    box-shadow: var(--block-shadow);
    border-width: var(--block-border-width);
    border-color: #000000;
    border-radius: var(--block-radius);
    background: var(--block-background-fill);
    width: 100%;
    line-height: var(--line-sm);
}
.contour_video {
    display: flex;
    flex-direction: column;
    justify-content: center;
    align-items: center;
    z-index: var(--layer-5);
    border-radius: var(--block-radius);
    background: var(--background-fill-primary);
    padding: 0 var(--size-6);
    max-height: var(--size-screen-h);
    overflow: hidden;
}
"""

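# Gradio callbacks. generate_video_from_text renders a plain generation; the three
# editing callbacks (reweighting, example-based generation, style transfer) enable the
# corresponding flags in edit_config and rebuild the model/pipeline before sampling.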
def generate_video_from_text(text, opt, pipeline):
    """Generate one motion from a text prompt, render it to MP4, and return HTML snippets for the UI."""
    width = 500
    height = 500
    texts = [text]
    motion_lens = [opt.motion_length * opt.fps for _ in range(opt.num_samples)]

    save_dir = './tmp/gen/'
    filename = generate_md5(str(time.time())) + ".mp4"
    save_path = pjoin(save_dir, str(filename))
    os.makedirs(save_dir, exist_ok=True)

    start_time = time.perf_counter()
    gr.Info("Generating motion...", duration=3)
    pred_motions, _ = pipeline.generate(texts, torch.LongTensor([int(x) for x in motion_lens]))
    end_time = time.perf_counter()
    exc = end_time - start_time
    gr.Info(f"Generating time cost: {exc:.2f} s, rendering starts...", duration=3)

    start_time = time.perf_counter()
    mean = np.load(pjoin(opt.meta_dir, 'mean.npy'))
    std = np.load(pjoin(opt.meta_dir, 'std.npy'))

    samples = []
    for i, motion in enumerate(pred_motions):
        # De-normalize and recover joint positions from the RIC representation.
        motion = motion.cpu().numpy() * std + mean
        motion = recover_from_ric(torch.from_numpy(motion).float(), opt.joints_num)

        # Place the lowest joint on the floor, then smooth the trajectory.
        floor_height = motion.min(dim=0)[0].min(dim=0)[0][1]
        motion[:, :, 1] -= floor_height
        motion = motion.numpy()
        motion = motion_temporal_filter(motion, sigma=1)
        samples.append(motion)

    i = 0
    title = texts[i]
    motion = samples[i]
    kinematic_tree = paramUtil.t2m_kinematic_chain if (opt.dataset_name == 't2m') else paramUtil.kit_kinematic_chain
    plot_3d_motion(save_path, kinematic_tree, motion, title=title, fps=opt.fps, radius=opt.radius)

    gr.Info("Rendered motion...", duration=3)
    end_time = time.perf_counter()
    exc = end_time - start_time
    gr.Info(f"Rendering time cost: {exc:.2f} s", duration=3)

    video_dis = f'<video controls playsinline width="{width}" style="display: block; margin: 0 auto;" src="./file={save_path}"></video>'
    style_dis = video_dis + """<br> <p align="center"> Content Reference </p>"""
    global edit_config
    edit_config = set_all_use_to_false(edit_config)
    # One value per registered output: main display, reweighting reference,
    # style-transfer content reference, and a visibility update for the editing tabs.
    return video_dis, video_dis, style_dis, gr.update(visible=True)

def reweighting(text, idx, weight, opt, pipeline):
    """(De-)emphasize the word at index `idx` in the prompt by reweighting its attention."""
    global edit_config
    edit_config.reweighting_attn.use = True
    edit_config.reweighting_attn.idx = idx
    edit_config.reweighting_attn.reweighting_attn_weight = weight

    # Rebuild the model and pipeline so the updated edit config takes effect.
    gr.Info("Loading Configurations...", duration=3)
    model = build_models(opt, edit_config=edit_config)
    ckpt_path = pjoin(opt.model_dir, opt.which_ckpt + '.tar')
    niter = load_model_weights(model, ckpt_path, use_ema=not opt.no_ema)

    pipeline = DiffusePipeline(
        opt=opt,
        model=model,
        diffuser_name=opt.diffuser_name,
        device=opt.device,
        num_inference_steps=opt.num_inference_steps,
        torch_dtype=torch.float16,
    )

    print(edit_config)

    width = 500
    height = 500
    texts = [text, text]
    motion_lens = [opt.motion_length * opt.fps for _ in range(opt.num_samples)]

    save_dir = './tmp/gen/'
    filenames = [generate_md5(str(time.time())) + ".mp4", generate_md5(str(time.time())) + ".mp4"]
    save_paths = [pjoin(save_dir, str(filenames[0])), pjoin(save_dir, str(filenames[1]))]
    os.makedirs(save_dir, exist_ok=True)

    start_time = time.perf_counter()
    gr.Info("Generating motion...", duration=3)
    pred_motions, _ = pipeline.generate(texts, torch.LongTensor([int(x) for x in motion_lens]))
    end_time = time.perf_counter()
    exc = end_time - start_time
    gr.Info(f"Generating time cost: {exc:.2f} s, rendering starts...", duration=3)

    start_time = time.perf_counter()
    mean = np.load(pjoin(opt.meta_dir, 'mean.npy'))
    std = np.load(pjoin(opt.meta_dir, 'std.npy'))

    samples = []
    for i, motion in enumerate(pred_motions):
        motion = motion.cpu().numpy() * std + mean
        motion = recover_from_ric(torch.from_numpy(motion).float(), opt.joints_num)
        floor_height = motion.min(dim=0)[0].min(dim=0)[0][1]
        motion[:, :, 1] -= floor_height
        motion = motion.numpy()
        motion = motion_temporal_filter(motion, sigma=1)
        samples.append(motion)

    # Render only the edited sample (index 1).
    i = 1
    title = texts[i]
    motion = samples[i]
    kinematic_tree = paramUtil.t2m_kinematic_chain if (opt.dataset_name == 't2m') else paramUtil.kit_kinematic_chain
    plot_3d_motion(save_paths[1], kinematic_tree, motion, title=title, fps=opt.fps, radius=opt.radius)

    gr.Info("Rendered motion...", duration=3)
    end_time = time.perf_counter()
    exc = end_time - start_time
    gr.Info(f"Rendering time cost: {exc:.2f} s", duration=3)

    video_dis = f'<video controls playsinline width="{width}" style="display: block; margin: 0 auto;" src="./file={save_paths[1]}"></video>'

    edit_config = set_all_use_to_false(edit_config)
    return video_dis

def generate_example_based_motion(text, chunk_size, example_based_steps_end, temp_seed, temp_seed_bar, num_motion, opt, pipeline):
    """Generate `num_motion` variations of one prompt with the example-based editing mode enabled."""
    global edit_config
    edit_config.example_based.use = True
    edit_config.example_based.chunk_size = chunk_size
    edit_config.example_based.example_based_steps_end = example_based_steps_end
    edit_config.example_based.temp_seed = temp_seed
    edit_config.example_based.temp_seed_bar = temp_seed_bar

    # Rebuild the model and pipeline so the updated edit config takes effect.
    gr.Info("Loading Configurations...", duration=3)
    model = build_models(opt, edit_config=edit_config)
    ckpt_path = pjoin(opt.model_dir, opt.which_ckpt + '.tar')
    niter = load_model_weights(model, ckpt_path, use_ema=not opt.no_ema)

    pipeline = DiffusePipeline(
        opt=opt,
        model=model,
        diffuser_name=opt.diffuser_name,
        device=opt.device,
        num_inference_steps=opt.num_inference_steps,
        torch_dtype=torch.float16,
    )

    width = 500
    height = 500
    texts = [text for _ in range(num_motion)]
    motion_lens = [opt.motion_length * opt.fps for _ in range(opt.num_samples)]

    save_dir = './tmp/gen/'
    filenames = [generate_md5(str(time.time())) + ".mp4" for _ in range(num_motion)]
    save_paths = [pjoin(save_dir, str(filenames[i])) for i in range(num_motion)]
    os.makedirs(save_dir, exist_ok=True)

    start_time = time.perf_counter()
    gr.Info("Generating motion...", duration=3)
    pred_motions, _ = pipeline.generate(texts, torch.LongTensor([int(x) for x in motion_lens]))
    end_time = time.perf_counter()
    exc = end_time - start_time
    gr.Info(f"Generating time cost: {exc:.2f} s, rendering starts...", duration=3)

    start_time = time.perf_counter()
    mean = np.load(pjoin(opt.meta_dir, 'mean.npy'))
    std = np.load(pjoin(opt.meta_dir, 'std.npy'))

    samples = []
    progress = gr.Progress()
    progress(0, desc="Starting...")
    for i, motion in enumerate(pred_motions):
        motion = motion.cpu().numpy() * std + mean
        motion = recover_from_ric(torch.from_numpy(motion).float(), opt.joints_num)
        floor_height = motion.min(dim=0)[0].min(dim=0)[0][1]
        motion[:, :, 1] -= floor_height
        motion = motion.numpy()
        motion = motion_temporal_filter(motion, sigma=1)
        samples.append(motion)

    # Render each sample and wrap it in an HTML <video> tag for the gallery.
    video_dis = []
    i = 0
    for title in progress.tqdm(texts):
        print(save_paths[i])
        title = texts[i]
        motion = samples[i]
        kinematic_tree = paramUtil.t2m_kinematic_chain if (opt.dataset_name == 't2m') else paramUtil.kit_kinematic_chain
        plot_3d_motion(save_paths[i], kinematic_tree, motion, title=title, fps=opt.fps, radius=opt.radius)
        video_html = f'''
        <video class="retrieved_video" width="{width}" height="{height}" preload="auto" muted playsinline onpause="this.load()" autoplay loop disablepictureinpicture src="./file={save_paths[i]}"> </video>
        '''
        video_dis.append(video_html)
        i += 1

    # Pad the remaining gallery slots (24 in total) with empty outputs.
    for _ in range(24 - num_motion):
        video_dis.append(None)

    gr.Info("Rendered motion...", duration=3)
    end_time = time.perf_counter()
    exc = end_time - start_time
    gr.Info(f"Rendering time cost: {exc:.2f} s", duration=3)

    edit_config = set_all_use_to_false(edit_config)
    return video_dis

def transfer_style(text, style_text, style_transfer_steps_end, opt, pipeline):
    """Transfer the style described by `style_text` onto the content prompt `text`."""
    global edit_config
    # NOTE: the key is spelled 'style_tranfer' to match options/edit.yaml.
    edit_config.style_tranfer.use = True
    edit_config.style_tranfer.style_transfer_steps_end = style_transfer_steps_end

    # Rebuild the model and pipeline so the updated edit config takes effect.
    gr.Info("Loading Configurations...", duration=3)
    model = build_models(opt, edit_config=edit_config)
    ckpt_path = pjoin(opt.model_dir, opt.which_ckpt + '.tar')
    niter = load_model_weights(model, ckpt_path, use_ema=not opt.no_ema)

    pipeline = DiffusePipeline(
        opt=opt,
        model=model,
        diffuser_name=opt.diffuser_name,
        device=opt.device,
        num_inference_steps=opt.num_inference_steps,
        torch_dtype=torch.float16,
    )

    print(edit_config)

    width = 500
    height = 500
    # Sample order: [style reference, transferred result, content reference].
    texts = [style_text, text, text]
    motion_lens = [opt.motion_length * opt.fps for _ in range(opt.num_samples)]

    save_dir = './tmp/gen/'
    filenames = [generate_md5(str(time.time())) + ".mp4" for _ in range(3)]
    save_paths = [pjoin(save_dir, str(filenames[i])) for i in range(3)]
    os.makedirs(save_dir, exist_ok=True)

    start_time = time.perf_counter()
    gr.Info("Generating motion...", duration=3)
    pred_motions, _ = pipeline.generate(texts, torch.LongTensor([int(x) for x in motion_lens]))
    end_time = time.perf_counter()
    exc = end_time - start_time
    gr.Info(f"Generating time cost: {exc:.2f} s, rendering starts...", duration=3)

    start_time = time.perf_counter()
    mean = np.load(pjoin(opt.meta_dir, 'mean.npy'))
    std = np.load(pjoin(opt.meta_dir, 'std.npy'))

    samples = []
    for i, motion in enumerate(pred_motions):
        motion = motion.cpu().numpy() * std + mean
        motion = recover_from_ric(torch.from_numpy(motion).float(), opt.joints_num)
        floor_height = motion.min(dim=0)[0].min(dim=0)[0][1]
        motion[:, :, 1] -= floor_height
        motion = motion.numpy()
        motion = motion_temporal_filter(motion, sigma=1)
        samples.append(motion)

    for i, title in enumerate(texts):
        motion = samples[i]
        kinematic_tree = paramUtil.t2m_kinematic_chain if (opt.dataset_name == 't2m') else paramUtil.kit_kinematic_chain
        plot_3d_motion(save_paths[i], kinematic_tree, motion, title=title, fps=opt.fps, radius=opt.radius)

    gr.Info("Rendered motion...", duration=3)
    end_time = time.perf_counter()
    exc = end_time - start_time
    gr.Info(f"Rendering time cost: {exc:.2f} s", duration=3)

    video_dis0 = f"""<video controls playsinline width="{width}" style="display: block; margin: 0 auto;" src="./file={save_paths[0]}"></video> <br> <p align="center"> Style Reference </p>"""
    video_dis1 = f"""<video controls playsinline width="{width}" style="display: block; margin: 0 auto;" src="./file={save_paths[2]}"></video> <br> <p align="center"> Content Reference </p>"""
    video_dis2 = f"""<video controls playsinline width="{width}" style="display: block; margin: 0 auto;" src="./file={save_paths[1]}"></video> <br> <p align="center"> Transferred Result </p>"""

    edit_config = set_all_use_to_false(edit_config)
    return video_dis0, video_dis2

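# Entry point: parse options, build the model and diffusion pipeline once, and wire up the Gradio UI.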
@spaces.GPU
def main():
    parser = GenerateOptions()
    opt = parser.parse_app()
    set_seed(opt.seed)
    device_id = opt.gpu_id
    device = torch.device('cuda:%d' % device_id if torch.cuda.is_available() else 'cpu')
    opt.device = device

    model = build_models(opt, edit_config=edit_config)
    ckpt_path = pjoin(opt.model_dir, opt.which_ckpt + '.tar')
    niter = load_model_weights(model, ckpt_path, use_ema=not opt.no_ema)

    pipeline = DiffusePipeline(
        opt=opt,
        model=model,
        diffuser_name=opt.diffuser_name,
        device=device,
        num_inference_steps=opt.num_inference_steps,
        torch_dtype=torch.float16,
    )

    # Pass the custom CSS so the .retrieved_video class used by the gallery takes effect.
    with gr.Blocks(css=CSS) as demo:
        gr.Markdown(HEAD)
        with gr.Row():
            with gr.Column(scale=7):
                text_input = gr.Textbox(label="Input the text prompt to generate motion...")
            with gr.Column(scale=3):
                sequence_length = gr.Slider(minimum=1, maximum=9.6, step=0.1, label="Motion length", value=8)
        with gr.Row():
            generate_button = gr.Button("Generate motion")

        with gr.Row():
            video_display = gr.HTML(label="Generated video", visible=True)

        tabs = gr.Tabs(visible=True)
        with tabs:
            with gr.Tab("Motion (de-)emphasizing"):
                with gr.Row():
                    int_input = gr.Number(label="Editing word index", minimum=0, maximum=70)
                    weight_input = gr.Slider(minimum=-1, maximum=1, step=0.01, label="Input weight for (de-)emphasizing [-1, 1]", value=0)

                trim_button = gr.Button("Edit reweighting")

                with gr.Row():
                    original_video1 = gr.HTML(label="before editing", visible=False)
                    edited_video = gr.HTML(label="after editing")

                trim_button.click(
                    fn=lambda x, int_input, weight_input: reweighting(x, int_input, weight_input, opt, pipeline),
                    inputs=[text_input, int_input, weight_input],
                    outputs=edited_video,
                )

            with gr.Tab("Example-based motion generation"):
                with gr.Row():
                    with gr.Column(scale=4):
                        chunk_size = gr.Number(minimum=10, maximum=20, step=10, label="Chunk size (#frames)", value=20)
                        example_based_steps_end = gr.Number(minimum=0, maximum=9, label="Ending step of manipulation", value=6)
                    with gr.Column(scale=3):
                        temp_seed = gr.Number(label="Seed for random", value=200, minimum=0)
                        temp_seed_bar = gr.Slider(minimum=0, maximum=100, step=1, label="Seed for random bar", value=15)
                    with gr.Column(scale=3):
                        num_motion = gr.Radio(choices=[4, 8, 12, 16, 24], value=8, label="Select number of motions")

                gen_button = gr.Button("Generate example-based motion")

                # 6 rows x 4 columns of HTML slots (24 in total) for the generated variations.
                example_video_display = []
                for _ in range(6):
                    with gr.Row():
                        for _ in range(4):
                            video = gr.HTML(label="Example-based motion", visible=True)
                            example_video_display.append(video)

                gen_button.click(
                    fn=lambda text, chunk_size, example_based_steps_end, temp_seed, temp_seed_bar, num_motion: generate_example_based_motion(text, chunk_size, example_based_steps_end, temp_seed, temp_seed_bar, num_motion, opt, pipeline),
                    inputs=[text_input, chunk_size, example_based_steps_end, temp_seed, temp_seed_bar, num_motion],
                    outputs=example_video_display,
                )

            with gr.Tab("Style transfer"):
                with gr.Row():
                    style_text = gr.Textbox(label="Reference prompt (e.g. 'a man walks.')", value="a man walks.")
                    style_transfer_steps_end = gr.Number(label="The end step of diffusion (0~9)", minimum=0, maximum=9, value=5)

                style_transfer_button = gr.Button("Transfer style")

                with gr.Row():
                    style_reference = gr.HTML(label="style reference")
                    original_video4 = gr.HTML(label="before style transfer", visible=False)
                    styled_video = gr.HTML(label="after style transfer")

                style_transfer_button.click(
                    fn=lambda text, style_text, style_transfer_steps_end: transfer_style(text, style_text, style_transfer_steps_end, opt, pipeline),
                    inputs=[text_input, style_text, style_transfer_steps_end],
                    outputs=[style_reference, styled_video],
                )

        def update_motion_length(sequence_length):
            opt.motion_length = sequence_length

        def on_generate(text, length, pipeline):
            update_motion_length(length)
            return generate_video_from_text(text, opt, pipeline)

        generate_button.click(
            fn=lambda text, length: on_generate(text, length, pipeline),
            inputs=[text_input, sequence_length],
            outputs=[
                video_display,
                original_video1,
                original_video4,
                tabs,
            ],
            show_progress=True,
        )

        # Reveal the output components once a generation has been requested.
        generate_button.click(
            fn=lambda: [gr.update(visible=True), gr.update(visible=True), gr.update(visible=True)],
            inputs=None,
            outputs=[video_display, original_video1, original_video4],
        )

    demo.launch()


if __name__ == '__main__':
    main()