Spaces: Runtime error
Commit d56d267 (parent: 5e1ee6f): Update app.py

app.py
CHANGED
@@ -8,6 +8,7 @@ import cv2
 import numpy as np
 import torch
 from einops import rearrange, repeat
+from fire import Fire
 from omegaconf import OmegaConf
 from PIL import Image
 from torchvision.transforms import ToTensor
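The only functional change in this first hunk is the new `from fire import Fire` import. `Fire` is never called in the app itself; it appears to be carried over from the standalone `simple_video_sample.py` script (whose import is dropped in the next hunk), where python-fire is the usual way to expose `sample` as a command-line entry point. A minimal sketch of that pattern, with a hypothetical function body:

    # Minimal sketch of the python-fire pattern (hypothetical body):
    # each keyword argument of sample() becomes a CLI flag.
    from fire import Fire

    def sample(input_path: str = "assets/test_image.png", seed: int = 23):
        print(f"sampling {input_path} with seed {seed}")

    if __name__ == "__main__":
        Fire(sample)  # e.g. python simple_video_sample.py --input_path img.png --seed 42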
@@ -16,37 +17,198 @@ from scripts.util.detection.nsfw_and_watermark_dectection import \
     DeepFloydDataFiltering
 from sgm.inference.helpers import embed_watermark
 from sgm.util import default, instantiate_from_config
-from huggingface_hub import hf_hub_download
 
-
-import uuid
-
-from simple_video_sample import sample
+hf_hub_download(repo_id="stabilityai/stable-video-diffusion-img2vid-xt", filename="svd_xt.safetensors", local_dir="checkpoints")
 
-
-num_steps = 30
-model_config = "scripts/sampling/configs/svd_xt.yaml"
+version = "svd_xt"
 device = "cuda"
+def load_model(
+    config: str,
+    device: str,
+    num_frames: int,
+    num_steps: int,
+):
+    config = OmegaConf.load(config)
+    if device == "cuda":
+        config.model.params.conditioner_config.params.emb_models[
+            0
+        ].params.open_clip_embedding_config.params.init_device = device
+
+    config.model.params.sampler_config.params.num_steps = num_steps
+    config.model.params.sampler_config.params.guider_config.params.num_frames = (
+        num_frames
+    )
+    if device == "cuda":
+        with torch.device(device):
+            model = instantiate_from_config(config.model).to(device).eval()
+    else:
+        model = instantiate_from_config(config.model).to(device).eval()
 
-
+    filter = DeepFloydDataFiltering(verbose=False, device=device)
+    return model, filter
 
-
-
-
-
+if version == "svd_xt":
+    num_frames = 25
+    num_steps = 30
+    model_config = "scripts/sampling/configs/svd_xt.yaml"
+else:
+    raise ValueError(f"Version {version} does not exist.")
+
+model, filter = load_model(
+    model_config,
+    device,
+    num_frames,
+    num_steps,
+)
+
+def sample(
+    input_path: str = "assets/test_image.png",  # Can either be image file or folder with image files
     version: str = "svd_xt",
     fps_id: int = 6,
     motion_bucket_id: int = 127,
     cond_aug: float = 0.02,
     seed: int = 23,
     decoding_t: int = 7,  # Number of frames decoded at a time! This eats most VRAM. Reduce if necessary.
+    device: str = "cuda",
+    output_folder: str = "outputs",
 ):
-
-
-
-
-
-
+    """
+    Simple script to generate a single sample conditioned on an image `input_path` or multiple images, one for each
+    image file in folder `input_path`. If you run out of VRAM, try decreasing `decoding_t`.
+    """
+    torch.manual_seed(seed)
+
+    path = Path(input_path)
+    all_img_paths = []
+    if path.is_file():
+        if any([input_path.endswith(x) for x in ["jpg", "jpeg", "png"]]):
+            all_img_paths = [input_path]
+        else:
+            raise ValueError("Path is not valid image file.")
+    elif path.is_dir():
+        all_img_paths = sorted(
+            [
+                f
+                for f in path.iterdir()
+                if f.is_file() and f.suffix.lower() in [".jpg", ".jpeg", ".png"]
+            ]
+        )
+        if len(all_img_paths) == 0:
+            raise ValueError("Folder does not contain any images.")
+    else:
+        raise ValueError
+
+    for input_img_path in all_img_paths:
+        with Image.open(input_img_path) as image:
+            if image.mode == "RGBA":
+                image = image.convert("RGB")
+            w, h = image.size
+
+            if h % 64 != 0 or w % 64 != 0:
+                width, height = map(lambda x: x - x % 64, (w, h))
+                image = image.resize((width, height))
+                print(
+                    f"WARNING: Your image is of size {h}x{w} which is not divisible by 64. We are resizing to {height}x{width}!"
+                )
+
+            image = ToTensor()(image)
+            image = image * 2.0 - 1.0
+
+        image = image.unsqueeze(0).to(device)
+        H, W = image.shape[2:]
+        assert image.shape[1] == 3
+        F = 8
+        C = 4
+        shape = (num_frames, C, H // F, W // F)
+        if (H, W) != (576, 1024):
+            print(
+                "WARNING: The conditioning frame you provided is not 576x1024. This leads to suboptimal performance as model was only trained on 576x1024. Consider increasing `cond_aug`."
+            )
+        if motion_bucket_id > 255:
+            print(
+                "WARNING: High motion bucket! This may lead to suboptimal performance."
+            )
+
+        if fps_id < 5:
+            print("WARNING: Small fps value! This may lead to suboptimal performance.")
+
+        if fps_id > 30:
+            print("WARNING: Large fps value! This may lead to suboptimal performance.")
+
+        value_dict = {}
+        value_dict["motion_bucket_id"] = motion_bucket_id
+        value_dict["fps_id"] = fps_id
+        value_dict["cond_aug"] = cond_aug
+        value_dict["cond_frames_without_noise"] = image
+        value_dict["cond_frames"] = image + cond_aug * torch.randn_like(image)
+        value_dict["cond_aug"] = cond_aug
+
+        with torch.no_grad():
+            with torch.autocast(device):
+                batch, batch_uc = get_batch(
+                    get_unique_embedder_keys_from_conditioner(model.conditioner),
+                    value_dict,
+                    [1, num_frames],
+                    T=num_frames,
+                    device=device,
+                )
+                c, uc = model.conditioner.get_unconditional_conditioning(
+                    batch,
+                    batch_uc=batch_uc,
+                    force_uc_zero_embeddings=[
+                        "cond_frames",
+                        "cond_frames_without_noise",
+                    ],
+                )
+
+                for k in ["crossattn", "concat"]:
+                    uc[k] = repeat(uc[k], "b ... -> b t ...", t=num_frames)
+                    uc[k] = rearrange(uc[k], "b t ... -> (b t) ...", t=num_frames)
+                    c[k] = repeat(c[k], "b ... -> b t ...", t=num_frames)
+                    c[k] = rearrange(c[k], "b t ... -> (b t) ...", t=num_frames)
+
+                randn = torch.randn(shape, device=device)
+
+                additional_model_inputs = {}
+                additional_model_inputs["image_only_indicator"] = torch.zeros(
+                    2, num_frames
+                ).to(device)
+                additional_model_inputs["num_video_frames"] = batch["num_video_frames"]
+
+                def denoiser(input, sigma, c):
+                    return model.denoiser(
+                        model.model, input, sigma, c, **additional_model_inputs
+                    )
+
+                samples_z = model.sampler(denoiser, randn, cond=c, uc=uc)
+                model.en_and_decode_n_samples_a_time = decoding_t
+                samples_x = model.decode_first_stage(samples_z)
+                samples = torch.clamp((samples_x + 1.0) / 2.0, min=0.0, max=1.0)
+
+                os.makedirs(output_folder, exist_ok=True)
+                base_count = len(glob(os.path.join(output_folder, "*.mp4")))
+                video_path = os.path.join(output_folder, f"{base_count:06d}.mp4")
+                writer = cv2.VideoWriter(
+                    video_path,
+                    cv2.VideoWriter_fourcc(*"mp4v"),
+                    fps_id + 1,
+                    (samples.shape[-1], samples.shape[-2]),
+                )
+
+                samples = embed_watermark(samples)
+                samples = filter(samples)
+                vid = (
+                    (rearrange(samples, "t c h w -> t h w c") * 255)
+                    .cpu()
+                    .numpy()
+                    .astype(np.uint8)
+                )
+                for frame in vid:
+                    frame = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
+                    writer.write(frame)
+                writer.release()
+
+    return video_path
 
 def get_unique_embedder_keys_from_conditioner(conditioner):
     return list(set([x.input_key for x in conditioner.embedders]))
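A note on the shape bookkeeping inside the new `sample()`: the VAE downsamples each frame by F=8 spatially and encodes C=4 latent channels, so the sampler works on a `(num_frames, 4, H/8, W/8)` tensor, which is also why the code snaps inputs to multiples of 64. A quick sanity check at the trained resolution:

    # Latent-shape arithmetic from sample() above: F=8 spatial downsampling,
    # C=4 latent channels, one latent per output frame.
    def latent_shape(num_frames: int, H: int, W: int, C: int = 4, F: int = 8):
        return (num_frames, C, H // F, W // F)

    print(latent_shape(25, 576, 1024))  # (25, 4, 72, 128) at the trained 576x1024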
@@ -92,58 +254,52 @@ def get_batch(keys, value_dict, N, T, device):
         batch_uc[key] = torch.clone(batch[key])
     return batch, batch_uc
 
+import gradio as gr
+import uuid
 def resize_image(image_path, output_size=(1024, 576)):
-
-
-
-
-
-    # Resize then crop if the original image is larger
-    if image_aspect > target_aspect:
-        # Resize the image to match the target height, maintaining aspect ratio
-        new_height = output_size[1]
-        new_width = int(new_height * image_aspect)
-        resized_image = image.resize((new_width, new_height), Image.Resampling.LANCZOS)
-        # Calculate coordinates for cropping
-        left = (new_width - output_size[0]) / 2
-        top = 0
-        right = (new_width + output_size[0]) / 2
-        bottom = output_size[1]
-    else:
-        # Resize the image to match the target width, maintaining aspect ratio
-        new_width = output_size[0]
-        new_height = int(new_width / image_aspect)
-        resized_image = image.resize((new_width, new_height), Image.Resampling.LANCZOS)
-        # Calculate coordinates for cropping
-        left = 0
-        top = (new_height - output_size[1]) / 2
-        right = output_size[0]
-        bottom = (new_height + output_size[1]) / 2
-
-    # Crop the image
-    cropped_image = resized_image.crop((left, top, right, bottom))
+    image = Image.open(image_path)
+    # Calculate aspect ratios
+    target_aspect = output_size[0] / output_size[1]  # Aspect ratio of the desired size
+    image_aspect = image.width / image.height  # Aspect ratio of the original image
 
-
+    # Resize then crop if the original image is larger
+    if image_aspect > target_aspect:
+        # Resize the image to match the target height, maintaining aspect ratio
+        new_height = output_size[1]
+        new_width = int(new_height * image_aspect)
+        resized_image = image.resize((new_width, new_height), Image.LANCZOS)
+        # Calculate coordinates for cropping
+        left = (new_width - output_size[0]) / 2
+        top = 0
+        right = (new_width + output_size[0]) / 2
+        bottom = output_size[1]
+    else:
+        # Resize the image to match the target width, maintaining aspect ratio
+        new_width = output_size[0]
+        new_height = int(new_width / image_aspect)
+        resized_image = image.resize((new_width, new_height), Image.LANCZOS)
+        # Calculate coordinates for cropping
+        left = 0
+        top = (new_height - output_size[1]) / 2
+        right = output_size[0]
+        bottom = (new_height + output_size[1]) / 2
 
-
-
-
+    # Crop the image
+    cropped_image = resized_image.crop((left, top, right, bottom))
+
+    return cropped_image
 
-with gr.Blocks(
+with gr.Blocks() as demo:
     gr.Markdown('''# Stable Video Diffusion - Image2Video - XT
-Generate 25 frames of video from a single image
+Generate 25 frames of video from a single image using SDV-XT.
 ''')
     with gr.Column():
        image = gr.Image(label="Upload your image (it will be center cropped to 1024x576)", type="filepath")
        generate_btn = gr.Button("Generate")
-        #with gr.Accordion("Advanced options", open=False):
-        #    cond_aug = gr.Slider(label="Conditioning augmentation", value=0.02, minimum=0.0)
-        #    seed = gr.Slider(label="Seed", value=42, minimum=0, maximum=int(1e9), step=1)
-        #decoding_t = gr.Slider(label="Decode frames at a time", value=6, minimum=1, maximum=14, interactive=False)
-        #    saving_fps = gr.Slider(label="Saving FPS", value=6, minimum=6, maximum=48, step=6)
     with gr.Column():
        video = gr.Video()
-    image.upload(fn=resize_image, inputs=image, outputs=image)
-    generate_btn.click(fn=
+    image.upload(fn=resize_image, inputs=image, outputs=image, queue=False)
+    generate_btn.click(fn=sample, inputs=image, outputs=video, api_name="video")
 
-
+if __name__ == "__main__":
+    demo.launch(share=True)
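To make the crop arithmetic in the new `resize_image` concrete, take a hypothetical 4000x3000 upload. Its aspect ratio (about 1.33) is below the 16:9 target (about 1.78), so the `else` branch resizes to the target width and center-crops vertically:

    # Worked example of resize_image's else branch for a 4000x3000 upload.
    target_w, target_h = 1024, 576
    w, h = 4000, 3000
    image_aspect = w / h                        # 1.333... < 1024/576 = 1.777...
    new_width = target_w                        # 1024
    new_height = int(new_width / image_aspect)  # 768
    top = (new_height - target_h) / 2           # 96.0
    bottom = (new_height + target_h) / 2        # 672.0
    print((0, top, target_w, bottom))           # crop box: a centered 1024x576 window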
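Because the click handler is registered with `api_name="video"`, the endpoint can also be called programmatically. A hedged sketch with `gradio_client` (the Space id and image path are placeholders; recent gradio_client versions may want `handle_file()` around the path). Note that a direct API call skips the `image.upload` resize hook, so the image should already be 1024x576:

    # Hypothetical client call against the /video endpoint defined above.
    from gradio_client import Client

    client = Client("<user>/<this-space>")  # placeholder Space id
    video_path = client.predict(
        "my_image.png",     # the single image input; pre-crop it to 1024x576
        api_name="/video",
    )
    print(video_path)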
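The micro-conditioning in `sample()` deserves a callout: `fps_id` and `motion_bucket_id` are numeric conditioning signals fed to the model rather than post-processing options, and the conditioning frames are the input image plus Gaussian noise scaled by `cond_aug`. A standalone sketch of the `value_dict` construction:

    # Standalone sketch of the conditioning dict built in sample();
    # the zeros tensor stands in for the preprocessed input image.
    import torch

    image = torch.zeros(1, 3, 576, 1024)
    cond_aug = 0.02
    value_dict = {
        "motion_bucket_id": 127,  # higher asks for more motion; >255 triggers a warning
        "fps_id": 6,              # values outside 5..30 trigger warnings
        "cond_aug": cond_aug,
        "cond_frames_without_noise": image,
        "cond_frames": image + cond_aug * torch.randn_like(image),
    }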
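On the output side, the OpenCV writer is created with `fps_id + 1` as the real frame rate, so the default `fps_id=6` produces a 7 fps file and the 25 frames last roughly 3.6 seconds. A self-contained sketch of that writing step with dummy frames:

    # Self-contained sketch of the mp4 writing step (dummy black frames).
    import cv2
    import numpy as np

    frames = np.zeros((25, 576, 1024, 3), dtype=np.uint8)  # (t, h, w, c), RGB
    writer = cv2.VideoWriter(
        "out.mp4", cv2.VideoWriter_fourcc(*"mp4v"), 6 + 1, (1024, 576)
    )
    for frame in frames:
        writer.write(cv2.cvtColor(frame, cv2.COLOR_RGB2BGR))  # OpenCV expects BGR
    writer.release()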
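Finally, the docstring's VRAM advice maps to a single knob: `decoding_t` sets `model.en_and_decode_n_samples_a_time`, i.e. how many of the 25 latent frames the VAE decodes per pass, which dominates peak memory. A hedged usage sketch, assuming `sample()` from this app.py is in scope:

    # Hypothetical direct call; smaller decoding_t trades speed for lower peak VRAM.
    video_path = sample(
        "assets/test_image.png",
        fps_id=6,
        motion_bucket_id=127,
        decoding_t=2,  # decode 2 frames per VAE pass instead of the default 7
    )
    print(video_path)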