File size: 6,720 Bytes
5c6a167
9ac31b8
 
3b06696
 
 
 
775329e
9ac31b8
 
3b06696
a9235bb
25d3956
e3d310b
 
dc2eed4
 
 
 
 
 
43fad66
9ac31b8
43fad66
 
20b7858
43fad66
 
57eee7d
0ad3cbd
 
 
57eee7d
7c47fc0
20b7858
43fad66
 
 
 
 
 
d56d267
9ac31b8
 
 
d56d267
dc2eed4
d56d267
0cd72ee
9ac31b8
25d3956
3b06696
492fffc
 
3b06696
9ac31b8
d56d267
 
3b06696
9ac31b8
 
 
25d3956
 
9ac31b8
d56d267
9ac31b8
 
 
0cd72ee
5c6a167
9ac31b8
 
 
0cd72ee
3b06696
0cd72ee
d56d267
 
 
3b06696
d56d267
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3b06696
d56d267
 
 
ff46702
d56d267
dc2eed4
25d3956
 
0cd72ee
25d3956
 
c8d4706
 
3780d1e
492fffc
 
25d3956
d56d267
492fffc
9ac31b8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d56d267
decf237
ba7dc43
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
import spaces
import gradio as gr
import torch
import os
from glob import glob
from pathlib import Path
from typing import Optional
from huggingface_hub import HfFolder
from diffusers import StableVideoDiffusionPipeline
from diffusers.utils import load_image, export_to_video
from PIL import Image
import uuid
import random
from huggingface_hub import hf_hub_download

# Markdown banner rendered at the top of the demo page.
# The emoji were previously stored as mojibake (UTF-8 bytes decoded through a
# single-byte codepage); they are restored to the intended characters here.
# NOTE(review): the glyph before "Github" had lost a byte and could not be
# recovered exactly; the globe emoji below is a best guess — confirm.
title = '''# 👋🏻Welcome to Tonic's🌟🎥StableVideo XT-1-1 
🌟🎥StableVideo XT-1-1 (SVD) Image-to-Video is a latent diffusion model trained to generate short video clips from an image conditioning. Check out the [Community demo for Stable Video Diffusion](https://huggingface.co/spaces/multimodalart/stable-video-diffusion) - Img2Vid - XT ([model](https://huggingface.co/stabilityai/stable-video-diffusion-img2vid-xt-1-1), [paper](https://stability.ai/research/stable-video-diffusion-scaling-latent-video-diffusion-models-to-large-datasets), [stability's ui waitlist](https://stability.ai/contact))
#### Research release ([_non-commercial_](https://huggingface.co/stabilityai/stable-video-diffusion-img2vid-xt/blob/main/LICENSE)): generate `4s` vid from a single image at (`25 frames` at `6 fps`). this demo uses [🧨 diffusers for low VRAM and fast generation](https://huggingface.co/docs/diffusers/main/en/using-diffusers/svd).
Join us : 🌟TeamTonic🌟 is always making cool demos! Join our active builder's🛠️community 👻  [![Join us on Discord](https://img.shields.io/discord/1109943800132010065?label=Discord&logo=discord&style=flat-square)](https://discord.gg/GWpVpekp) On 🤗Huggingface: [TeamTonic](https://huggingface.co/TeamTonic) & [MultiTransformer](https://huggingface.co/MultiTransformer) On 🌐Github: [Tonic-AI](https://github.com/tonic-ai) & contribute to 🌟 [SciTonic](https://github.com/Tonic-AI/scitonic) 🤗Big thanks to Yuvi Sharma and all the folks at huggingface for the community grant 🤗
'''

# --- Model setup: everything below runs once, at import time -----------------

# Hub id of the base checkpoint; also reused below to derive a local directory name.
original_model_id = "stabilityai/stable-video-diffusion-img2vid-xt"
# First load: build the pipeline from `original_model_id` using the fp16 variant.
# NOTE(review): `pipe` is reassigned further down, so this instance is thrown
# away; presumably it only exists to trigger the download — confirm it is needed.
pipe = StableVideoDiffusionPipeline.from_pretrained(
    original_model_id,
    torch_dtype=torch.float16,
    variant="fp16",
)

# The replacement weights must already be present in the current working
# directory; nothing in this file downloads them.
downloaded_safetensors_path = "svd_xt_1_1.safetensors"
if not os.path.exists(downloaded_safetensors_path):
    raise FileNotFoundError(f"The file {downloaded_safetensors_path} was not found.")

# Target location: ~/cache/<model id with "/" replaced by "--">/svd_xt.safetensors
# NOTE(review): this directory is never created here, and os.replace does not
# create missing parent directories — confirm it exists in the deployment image.
# NOTE(review): "~/cache/..." does not look like the hub's default cache layout,
# so the first from_pretrained call above probably does not populate it — verify.
cache_dir = os.path.join(os.path.expanduser("~"), "cache")
model_cache_dir = os.path.join(cache_dir, original_model_id.replace("/", "--"))
original_safetensors_path = os.path.join(model_cache_dir, "svd_xt.safetensors")
# Move the local file over the target path (overwrites any file already there).
os.replace(downloaded_safetensors_path, original_safetensors_path)

# Second load: rebuild the pipeline from the local directory that now holds the
# swapped-in weights file.
# NOTE(review): loading from a directory presumably needs the full pipeline
# layout there, not just the one .safetensors file — confirm.
pipe = StableVideoDiffusionPipeline.from_pretrained(
    model_cache_dir,
    torch_dtype=torch.float16,
    variant="fp16"
)
pipe.to("cuda")
# Wrap the UNet with torch.compile using the settings given here.
pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True)
# Largest signed 64-bit integer; used as the upper bound for random seeds.
max_64_bit_int = 2**63 - 1

@spaces.GPU(enable_queue=True)
def sample(
    image: Image.Image,
    seed: Optional[int] = 42,
    randomize_seed: bool = True,
    motion_bucket_id: int = 127,
    fps_id: int = 6,
    version: str = "svd_xt",
    cond_aug: float = 0.02,
    decoding_t: int = 3,  # Number of frames decoded at a time! This eats most VRAM. Reduce if necessary.
    device: str = "cuda",
    output_folder: str = "outputs",
):
    """Generate a short video from a single conditioning image.

    Args:
        image: Conditioning picture. RGBA input is converted to RGB first.
        seed: Seed for the generator. Ignored when ``randomize_seed`` is true;
            a random seed is also drawn when this is ``None``.
        randomize_seed: Draw a fresh seed in ``[0, max_64_bit_int]``.
        motion_bucket_id: Forwarded to the pipeline unchanged.
        fps_id: Frame rate used when writing the video file.
        version: Currently unused; kept so existing callers keep working.
        cond_aug: Currently unused; kept so existing callers keep working.
        decoding_t: Forwarded to the pipeline as ``decode_chunk_size``.
        device: Currently unused; kept so existing callers keep working.
        output_folder: Directory the ``.mp4`` is written to (created on demand).

    Returns:
        ``(video_path, seed)``: the path of the written video and the seed
        that was actually used.

    Raises:
        gr.Error: If no image was supplied.
    """
    # Clicking "Generate" with an empty image component passes None; surface a
    # readable message instead of an AttributeError on `image.mode`.
    if image is None:
        raise gr.Error("Please upload an image before generating a video.")

    if image.mode == "RGBA":
        image = image.convert("RGB")

    # A missing seed cannot be fed to torch.manual_seed, so treat it like a
    # request for a random one.
    if randomize_seed or seed is None:
        seed = random.randint(0, max_64_bit_int)
    generator = torch.manual_seed(seed)

    os.makedirs(output_folder, exist_ok=True)
    base_count = len(glob(os.path.join(output_folder, "*.mp4")))
    # The running count alone is not unique (overlapping requests, or files
    # removed from the folder, can repeat a number), so add a random suffix to
    # avoid overwriting an earlier result.
    video_path = os.path.join(
        output_folder, f"{base_count:06d}_{uuid.uuid4().hex}.mp4"
    )

    # NOTE(review): `cond_aug` is not forwarded; noise_aug_strength is fixed at
    # 0.1 here — confirm whether that is intentional.
    # 14 frames are requested, so the clip lasts roughly 14 / fps_id seconds.
    frames = pipe(
        image,
        decode_chunk_size=decoding_t,
        generator=generator,
        motion_bucket_id=motion_bucket_id,
        noise_aug_strength=0.1,
        num_frames=14,
    ).frames[0]
    export_to_video(frames, video_path, fps=fps_id)

    return video_path, seed

def resize_image(image, output_size=(1024, 576)):
    """Scale ``image`` until it covers ``output_size``, then crop around the centre.

    The aspect ratio is preserved while scaling. Whichever dimension ends up
    larger than the target is trimmed equally on both sides.

    Args:
        image: Object exposing ``width``, ``height``, ``resize`` and ``crop``
            (a PIL image in this app).
        output_size: ``(width, height)`` of the crop box.

    Returns:
        The result of cropping the rescaled image to the centred box.
    """
    goal_w, goal_h = output_size[0], output_size[1]
    goal_ratio = goal_w / goal_h
    source_ratio = image.width / image.height

    if source_ratio > goal_ratio:
        # Proportionally wider than the target: pin the height, then cut the
        # horizontal overflow evenly from the left and right.
        scaled_w = int(goal_h * source_ratio)
        scaled_dims = (scaled_w, goal_h)
        crop_box = ((scaled_w - goal_w) / 2, 0, (scaled_w + goal_w) / 2, goal_h)
    else:
        # Same ratio or proportionally taller: pin the width, then cut the
        # vertical overflow evenly from the top and bottom.
        scaled_h = int(goal_w / source_ratio)
        scaled_dims = (goal_w, scaled_h)
        crop_box = (0, (scaled_h - goal_h) / 2, goal_w, (scaled_h + goal_h) / 2)

    # Box coordinates can be fractional (x.5); they are handed to crop() as-is.
    return image.resize(scaled_dims, Image.LANCZOS).crop(crop_box)

# Gradio UI: an image input with a "Generate" button next to the video output,
# plus a collapsed accordion holding the generation settings.
with gr.Blocks() as demo:
    gr.Markdown(title)
    with gr.Row():
        with gr.Column():
            image = gr.Image(label="Upload your image", type="pil")
            generate_btn = gr.Button("Generate")
        video = gr.Video()
    with gr.Accordion("Advanced options", open=False):
        seed = gr.Slider(
            label="Seed",
            value=42,
            randomize=True,
            minimum=0,
            maximum=max_64_bit_int,
            step=1,
        )
        randomize_seed = gr.Checkbox(label="Randomize seed", value=True)
        motion_bucket_id = gr.Slider(
            label="Motion bucket id",
            info="Controls how much motion to add/remove from the image",
            value=127,
            minimum=1,
            maximum=255,
        )
        # `sample` requests 14 frames from the pipeline, so the clip length is
        # 14 / fps (the text used to say 25).
        fps_id = gr.Slider(
            label="Frames per second",
            info="The length of your video in seconds will be 14/fps",
            value=6,
            minimum=5,
            maximum=30,
        )

    # Replace every freshly uploaded image with its resized/cropped version.
    image.upload(fn=resize_image, inputs=image, outputs=image, queue=False)
    # `sample` returns (video_path, seed); the seed is written back to the
    # slider so the user can see which value was used.
    generate_btn.click(
        fn=sample,
        inputs=[image, seed, randomize_seed, motion_bucket_id, fps_id],
        outputs=[video, seed],
        api_name="video",
    )
    # Only the image is passed for examples, so `sample` runs with its default
    # values for every other argument.
    gr.Examples(
        examples=[
            "images/blink_meme.png",
            "images/confused2_meme.png",
            "images/disaster_meme.png",
            "images/distracted_meme.png",
            "images/hide_meme.png",
            "images/nazare_meme.png",
            "images/success_meme.png",
            "images/willy_meme.png",
            "images/wink_meme.png",
        ],
        inputs=image,
        outputs=[video, seed],
        fn=sample,
        cache_examples=True,
    )

if __name__ == "__main__":
    demo.queue(max_size=20)
    demo.launch(share=True)