Spaces:

yhzhai
/

mcm

Running on Zero

File size: 17,265 Bytes

import spaces
import os
import random
from datetime import datetime
from typing import Optional

import gradio as gr
import numpy as np
import torch
from diffusers import (
    AnimateDiffPipeline,
    DiffusionPipeline,
    LCMScheduler,
    MotionAdapter,
)
from diffusers.utils import export_to_video
from peft import PeftModel

# Device every pipeline is moved to, and the repo id passed to
# PeftModel.from_pretrained when loading the MCM LoRA weights.
device = "cuda"
mcm_id = "yhzhai/mcm"

# Videos are written to <cwd>/samples/Gradio-<timestamp>/. The timestamp is
# evaluated once, when this module is imported.
basedir = os.getcwd()
_launch_stamp = datetime.now().strftime("Gradio-%Y-%m-%dT%H-%M-%S")
savedir = os.path.join(basedir, "samples", _launch_stamp)

# Upper bound for the seed slider and for randomly drawn seeds (int32 max).
MAX_SEED = np.iinfo(np.int32).max


def get_modelscope_pipeline(
    mcm_variant: Optional[str] = "WebVid",
):
    """Build the ModelScope text-to-video pipeline with an MCM LoRA applied.

    Args:
        mcm_variant: One of "WebVid", "LAION-aes", "Anime", "Realistic" or
            "3D Cartoon". Any other value (including ``None``) falls back to
            the LAION weights.

    Returns:
        The float16 pipeline, using an ``LCMScheduler`` and VAE slicing,
        moved to ``device``.
    """
    base_repo = "ali-vilab/text-to-video-ms-1.7b"

    # Sub-folder of the `mcm_id` repo that holds each variant's LoRA weights.
    fallback_subfolder = "modelscopet2v-laion"
    lora_subfolders = {
        "WebVid": "modelscopet2v-webvid",
        "LAION-aes": "modelscopet2v-laion",
        "Anime": "modelscopet2v-anime",
        "Realistic": "modelscopet2v-real",
        "3D Cartoon": "modelscopet2v-3d-cartoon",
    }

    pipe = DiffusionPipeline.from_pretrained(
        base_repo, torch_dtype=torch.float16, variant="fp16"
    )
    pipe.scheduler = LCMScheduler.from_pretrained(
        base_repo,
        subfolder="scheduler",
        timestep_scaling=4.0,
    )
    pipe.enable_vae_slicing()

    peft_unet = PeftModel.from_pretrained(
        pipe.unet,
        model_id=mcm_id,
        subfolder=lora_subfolders.get(mcm_variant, fallback_subfolder),
        adapter_name="lora",
        torch_device="cpu",
    )
    # merge_and_unload() is called for its in-place effect; the wrapper
    # object itself (not the call's return value) becomes the pipeline's UNet.
    peft_unet.merge_and_unload()
    pipe.unet = peft_unet

    return pipe.to(device)


def get_animatediff_pipeline(
    real_variant: Optional[str] = "realvision",
    motion_module_path: str = "guoyww/animatediff-motion-adapter-v1-5-2",
    mcm_variant: Optional[str] = "WebVid",
):
    """Build an AnimateDiff pipeline with an MCM LoRA applied to its UNet.

    Args:
        real_variant: Selects the image checkpoint to animate. ``None`` uses
            ``runwayml/stable-diffusion-v1-5``, ``"epicrealism"`` uses
            ``emilianJR/epiCRealism`` and ``"realvision"`` uses
            ``SG161222/Realistic_Vision_V6.0_B1_noVAE``.
        motion_module_path: Repo id or path handed to
            ``MotionAdapter.from_pretrained``.
        mcm_variant: ``"WebVid"`` selects the WebVid LoRA; every other value
            (including ``"LAION-aes"`` and ``None``) selects the LAION LoRA.

    Returns:
        The float16 pipeline, using an ``LCMScheduler`` and VAE slicing,
        moved to ``device``.

    Raises:
        ValueError: If ``real_variant`` is not one of the values above.
    """
    base_checkpoints = {
        None: "runwayml/stable-diffusion-v1-5",
        "epicrealism": "emilianJR/epiCRealism",
        "realvision": "SG161222/Realistic_Vision_V6.0_B1_noVAE",
    }
    try:
        model_id = base_checkpoints[real_variant]
    except (KeyError, TypeError):
        # TypeError covers unhashable arguments so callers still see the
        # same ValueError as for any other unsupported value.
        raise ValueError(f"Unknown real_variant {real_variant}") from None

    # Load the motion adapter in the same half precision as the rest of the
    # pipeline instead of the loader's default dtype.
    adapter = MotionAdapter.from_pretrained(
        motion_module_path, torch_dtype=torch.float16
    )
    pipe = AnimateDiffPipeline.from_pretrained(
        model_id,
        motion_adapter=adapter,
        torch_dtype=torch.float16,
    )
    pipe.scheduler = LCMScheduler.from_pretrained(
        model_id,
        subfolder="scheduler",
        timestep_scaling=4.0,
        clip_sample=False,
        timestep_spacing="linspace",
        beta_schedule="linear",
        beta_start=0.00085,
        beta_end=0.012,
        steps_offset=1,
    )
    pipe.enable_vae_slicing()

    # Only two LoRA sets exist for AnimateDiff; anything that is not WebVid
    # uses the LAION one.
    if mcm_variant == "WebVid":
        subfolder = "animatediff-webvid"
    else:
        subfolder = "animatediff-laion"

    lora = PeftModel.from_pretrained(
        pipe.unet,
        model_id=mcm_id,
        subfolder=subfolder,
        adapter_name="lora",
        torch_device="cpu",
    )
    # merge_and_unload() is called for its in-place effect; the wrapper
    # object itself (not the call's return value) becomes the pipeline's UNet.
    lora.merge_and_unload()
    pipe.unet = lora

    pipe = pipe.to(device)
    return pipe


# One empty slot per (base model, MCM variant) pair. In the visible source
# this table is only referenced from commented-out code.
pipe_dict = {
    "ModelScope T2V": dict.fromkeys(
        ["WebVid", "LAION-aes", "Anime", "Realistic", "3D Cartoon"]
    ),
    **{
        animatediff_name: dict.fromkeys(["WebVid", "LAION-aes"])
        for animatediff_name in (
            "AnimateDiff (SD1.5)",
            "AnimateDiff (RealisticVision)",
            "AnimateDiff (epiCRealism)",
        )
    },
}

# Single-entry cache: the most recently built pipeline together with the
# base model / variant it was built for. All three fields start as None.
cache_pipeline = dict.fromkeys(["base_model", "variant", "pipeline"])

# def init_pipelines():
#     for base_model in variants.keys():
#         for variant in variants[base_model]:
#             if pipe_dict[base_model][variant] is None:
#                 if base_model == "ModelScope T2V":
#                     pipe_dict[base_model][variant] = get_modelscope_pipeline(mcm_variant=variant)
#                 elif base_model == "AnimateDiff (SD1.5)":
#                     pipe_dict[base_model][variant] = get_animatediff_pipeline(
#                         real_variant=None,
#                         motion_module_path="guoyww/animatediff-motion-adapter-v1-5-2",
#                         mcm_variant=variant,
#                     )
#                 elif base_model == "AnimateDiff (RealisticVision)":
#                     pipe_dict[base_model][variant] = get_animatediff_pipeline(
#                         real_variant="realvision",
#                         motion_module_path="guoyww/animatediff-motion-adapter-v1-5-2",
#                         mcm_variant=variant,
#                     )
#                 elif base_model == "AnimateDiff (epiCRealism)":
#                     pipe_dict[base_model][variant] = get_animatediff_pipeline(
#                         real_variant="epicrealism",
#                         motion_module_path="guoyww/animatediff-motion-adapter-v1-5-2",
#                         mcm_variant=variant,
#                     )
#                 else:
#                     raise ValueError(f"Unknown base_model {base_model}")


# AnimateDiff base-model labels mapped to the `real_variant` argument of
# get_animatediff_pipeline.
_ANIMATEDIFF_REAL_VARIANTS = {
    "AnimateDiff (SD1.5)": None,
    "AnimateDiff (RealisticVision)": "realvision",
    "AnimateDiff (epiCRealism)": "epicrealism",
}


def _pipeline_loader(base_model, variant):
    """Return a zero-argument callable that builds the requested pipeline.

    Raises ValueError for an unknown ``base_model`` without loading anything.
    """
    if base_model == "ModelScope T2V":
        return lambda: get_modelscope_pipeline(mcm_variant=variant)
    try:
        real_variant = _ANIMATEDIFF_REAL_VARIANTS[base_model]
    except (KeyError, TypeError):
        raise ValueError(f"Unknown base_model {base_model}") from None
    return lambda: get_animatediff_pipeline(
        real_variant=real_variant,
        motion_module_path="guoyww/animatediff-motion-adapter-v1-5-2",
        mcm_variant=variant,
    )


def _ensure_pipeline(base_model, variant):
    """Return the pipeline for this pair, rebuilding the cache on a miss."""
    # The cache starts as (None, None, None); requiring a stored pipeline
    # keeps a (None, None) request from "hitting" the empty cache.
    cache_hit = (
        cache_pipeline["pipeline"] is not None
        and cache_pipeline["base_model"] == base_model
        and cache_pipeline["variant"] == variant
    )
    if not cache_hit:
        # Validate first so a bad request leaves the current cache intact.
        load = _pipeline_loader(base_model, variant)
        # Drop our reference to the previous pipeline before building the
        # next one, so the two do not have to be alive at the same time.
        cache_pipeline.update(base_model=None, variant=None, pipeline=None)
        pipeline = load()
        cache_pipeline.update(
            base_model=base_model, variant=variant, pipeline=pipeline
        )
    return cache_pipeline["pipeline"]


@spaces.GPU(duration=60)
def infer(
    base_model,
    variant,
    prompt,
    num_inference_steps=4,
    height=256,
    width=256,
    seed=0,
    randomize_seed=True,
    progress = gr.Progress(track_tqdm=True),
):
    """Generate a 16-frame video for ``prompt`` and save it as an MP4.

    Args:
        base_model: "ModelScope T2V" or one of the "AnimateDiff (...)" labels.
        variant: MCM variant name forwarded to the pipeline builder.
        prompt: Text prompt.
        num_inference_steps: Number of sampling steps.
        height: Frame height in pixels.
        width: Frame width in pixels.
        seed: Seed used when ``randomize_seed`` is false.
        randomize_seed: If true, ``seed`` is replaced by a random value in
            ``[0, MAX_SEED]``.
        progress: Gradio progress tracker; not used directly in the body.

    Returns:
        ``(save_path, seed)``: the path of the written MP4 and the seed that
        was actually used.

    Raises:
        ValueError: If ``base_model`` is not a known label.
    """
    pipeline = _ensure_pipeline(base_model, variant)

    if randomize_seed:
        seed = random.randint(0, MAX_SEED)
    # Slider values may arrive as floats; manual_seed and the pipeline's
    # size/step arguments need integers.
    seed = int(seed)
    generator = torch.Generator("cpu").manual_seed(seed)

    frames = pipeline(
        prompt=prompt,
        num_frames=16,
        guidance_scale=1.0,
        num_inference_steps=int(num_inference_steps),
        height=int(height),
        width=int(width),
        generator=generator,
    ).frames

    os.makedirs(savedir, exist_ok=True)
    save_path = os.path.join(
        savedir, f"sample_{base_model}_{variant}_{seed}.mp4".replace(" ", "_")
    )
    # Index 0 selects the first (only) generated video whether `frames` is a
    # list or an array-like batched on its first axis.
    export_to_video(
        frames[0],
        save_path,
        fps=7,
    )
    print(f"Saved to {save_path}")
    return save_path, seed


# Example gallery rows. Each row is
# [base model, MCM variant, prompt, inference steps, height, width],
# matching the `inputs` list given to gr.Examples.
_MODELSCOPE_PROMPTS = [
    (
        "LAION-aes",
        "Aerial uhd 4k view. mid-air flight over fresh and clean mountain river at sunny summer morning. Green trees and sun rays on horizon. Direct on sun.",
    ),
    (
        "Anime",
        "Timelapse misty mountain landscape",
    ),
    (
        "WebVid",
        "Back of woman in shorts going near pure creek in beautiful mountains.",
    ),
    (
        "3D Cartoon",
        "A rotating pandoro (a traditional italian sweet yeast bread, most popular around christmas and new year) being eaten in time-lapse.",
    ),
    (
        "Realistic",
        "Slow motion avocado with a stone falls and breaks into 2 parts with splashes",
    ),
]
_EPICREALISM_PROMPTS = [
    (
        "LAION-aes",
        "Slow motion of delicious salmon sachimi set with green vegetables leaves served on wood plate. make homemade japanese food at home.-dan",
    ),
    (
        "WebVid",
        "Blooming meadow panorama zoom-out shot heavenly clouds and upcoming thunderstorm in mountain range harz, germany.",
    ),
    (
        "LAION-aes",
        "A young woman in a yellow sweater uses vr glasses, sitting on the shore of a pond on a background of dark waves. a strong wind develops her hair, the sun's rays are reflected from the water.",
    ),
    (
        "LAION-aes",
        "Female running at sunset. healthy fitness concept",
    ),
]

# ModelScope rows use 4 steps at 256x256; AnimateDiff rows use 8 steps at
# 512x512.
examples = [
    ["ModelScope T2V", variant_name, prompt_text, 4, 256, 256]
    for variant_name, prompt_text in _MODELSCOPE_PROMPTS
] + [
    ["AnimateDiff (epiCRealism)", variant_name, prompt_text, 8, 512, 512]
    for variant_name, prompt_text in _EPICREALISM_PROMPTS
]

# Stylesheet passed to gr.Blocks: centres the element with id "col-container".
css = (
    "\n"
    "#col-container {\n"
    "    margin: 0 auto;\n"
    "}\n"
)

# MCM variants selectable for each base model; this is the source of the
# variant dropdown's choices.
variants = {
    "ModelScope T2V": ["WebVid", "LAION-aes", "Anime", "Realistic", "3D Cartoon"],
    **{
        animatediff_name: ["WebVid", "LAION-aes"]
        for animatediff_name in (
            "AnimateDiff (SD1.5)",
            "AnimateDiff (RealisticVision)",
            "AnimateDiff (epiCRealism)",
        )
    },
}


def update_variant(rs):
    """Return a Gradio update that swaps in the variant choices for ``rs``.

    ``rs`` is the selected base-model label (a key of ``variants``). The
    current selection is cleared by setting the value to ``None``.
    """
    available = variants[rs]
    return gr.update(value=None, choices=available)


# init_pipelines()

# Gradio UI: build the page layout, wire the events to `infer`, and launch.
# Components appear on the page in the order they are created inside the
# nested `with` contexts.
with gr.Blocks(css=css) as demo:

    with gr.Column(elem_id="col-container"):
        # Page header: paper title plus project / arXiv / checkpoint badges.
        gr.HTML(
            """
        <div style="text-align: center; margin-bottom: 20px;">
            <h1 align="center">
              <a href="https://yhzhai.github.io/mcm/"><b>Motion Consistency Model: Accelerating Video Diffusion with Disentangled Motion-Appearance Distillation</b></a>
            </h1>
            <h4>Our motion consistency model not only accelerates text2video diffusion model sampling process, but also can benefit from an additional high-quality image dataset to improve the frame quality of generated videos.</h4>
            <div style="display: flex; justify-content: center; align-items: center; text-align: center;">
                <a href='https://yhzhai.github.io/mcm/'><img src='https://img.shields.io/badge/Project-Page-Green'></a> &nbsp;
                <a href='https://arxiv.org/abs/2406.06890'><img src='https://img.shields.io/badge/Paper-arXiv-red'></a> &nbsp;
                <a href='https://huggingface.co/yhzhai/mcm'><img src='https://img.shields.io/badge/HF-checkpoint-yellow'></a> 
            </div>
        </div>
        """
        )

        # Status note; `device` is interpolated once, when the UI is built.
        gr.Markdown(
            f"""
        <p align="center">Currently running on {device}.</p>
        <p align="center">Model loading takes extra time.</p>
        """
        )

        # <p align="center">ModelScope T2V works the best for resolution 256x256, and AnimateDiff works the best for 512x512.</p>
        # Model selection: the variant dropdown starts with the ModelScope
        # choices and no selection.
        with gr.Row():
            base_model = gr.Dropdown(
                label="Base model",
                choices=[
                    "ModelScope T2V",
                    "AnimateDiff (SD1.5)",
                    "AnimateDiff (RealisticVision)",
                    "AnimateDiff (epiCRealism)",
                ],
                value="ModelScope T2V",
                interactive=True,
            )
            variant_dropdown = gr.Dropdown(
                variants["ModelScope T2V"],
                label="MCM Variant",
                interactive=True,
                value=None,
            )
            # Changing the base model replaces the variant choices and
            # clears the selected variant (see update_variant).
            base_model.change(
                update_variant, inputs=[base_model], outputs=[variant_dropdown]
            )

        # Prompt box and the button that triggers generation.
        with gr.Row():
            prompt = gr.Text(
                label="Prompt",
                show_label=False,
                max_lines=1,
                placeholder="Enter your prompt",
                container=False,
            )

            run_button = gr.Button("Run", scale=0)

        with gr.Row():
            # Left column: generation settings.
            with gr.Column():
                with gr.Accordion("Advanced Settings", open=True):

                    # Also an output of `infer`, so it shows the seed that
                    # was actually used after each run.
                    seed = gr.Slider(
                        label="Seed",
                        minimum=0,
                        maximum=MAX_SEED,
                        step=1,
                        value=0,
                    )

                    randomize_seed = gr.Checkbox(label="Randomize seed", value=True)

                    with gr.Row():
                        num_inference_steps = gr.Slider(
                            label="Number of inference steps",
                            minimum=1,
                            maximum=12,
                            step=1,
                            value=4,
                        )
                    
                    with gr.Group():
                        # Read-only hint shown above the resolution sliders.
                        with gr.Row():
                            text_hint = gr.Textbox(
                                "Hint: ModelScope T2V works the best for resolution 256x256, and AnimateDiff works the best for resolution 512x512.",
                                interactive=False,
                                label="Hint",
                                container=False,

                            )
                        # NOTE: both sliders start at 512, whereas `infer`'s
                        # own keyword defaults are 256.
                        with gr.Row():
                            height = gr.Slider(
                                label="Height",
                                minimum=256,
                                maximum=1024,
                                step=64,
                                value=512,
                                interactive=True,
                            )
                            width = gr.Slider(
                                label="Width",
                                minimum=256,
                                maximum=1024,
                                step=64,
                                value=512,
                                interactive=True,
                            )



            # Right column: the generated video.
            with gr.Column(show_progress=True):
                # result = gr.Video(label="Result", show_label=False, interactive=False, height=512, width=512, autoplay=True)
                result = gr.Video(
                    label="Result",
                    show_label=False,
                    interactive=False,
                    autoplay=True,
                    # height=512,
                    # width=512,
                )

        # Example rows supply only the six inputs listed here; seed and
        # randomize_seed are not part of an example row.
        gr.Examples(
            examples=examples,
            inputs=[base_model, variant_dropdown, prompt, num_inference_steps, height, width],
            fn=infer,
            outputs=[result, seed],
        )

    # Run button: call `infer` with all eight inputs (positional order matches
    # its signature) and show the returned video path and seed.
    run_button.click(
        fn=infer,
        inputs=[
            base_model,
            variant_dropdown,
            prompt,
            num_inference_steps,
            height,
            width,
            seed,
            randomize_seed,
        ],
        outputs=[result, seed],
    )

# Enable request queuing and start the app; this runs at import time.
demo.queue().launch()