stable-video-diffusion-xt-1

Runtime error

File size: 6,510 Bytes

5c6a167
9ac31b8
 
3b06696
 
 
 
775329e
9ac31b8
 
3b06696
a9235bb
25d3956
e3d310b
 
dc2eed4
 
 
 
 
 
775329e
 
 
 
 
 
 
 
 
dc2eed4
9ac31b8
775329e
d56d267
9ac31b8
 
8010ebe
9ac31b8
 
d56d267
dc2eed4
d56d267
0cd72ee
9ac31b8
25d3956
3b06696
492fffc
 
3b06696
9ac31b8
d56d267
 
3b06696
9ac31b8
 
 
25d3956
 
9ac31b8
d56d267
9ac31b8
 
 
0cd72ee
5c6a167
9ac31b8
 
 
0cd72ee
3b06696
0cd72ee
d56d267
 
 
3b06696
d56d267
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3b06696
d56d267
 
 
ff46702
d56d267
dc2eed4
25d3956
 
0cd72ee
25d3956
 
c8d4706
 
3780d1e
492fffc
 
25d3956
d56d267
492fffc
9ac31b8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d56d267
decf237
ba7dc43

import spaces
import gradio as gr
import torch
import os
from glob import glob
from pathlib import Path
from typing import Optional
from huggingface_hub import HfFolder
from diffusers import StableVideoDiffusionPipeline
from diffusers.utils import load_image, export_to_video
from PIL import Image
import uuid
import random
from huggingface_hub import hf_hub_download

# Markdown header for the demo page; passed to gr.Markdown() when the UI is built.
title = '''# 👋🏻Welcome to Tonic's🌟🎥StableVideo XT-1-1 
🌟🎥StableVideo XT-1-1 (SVD) Image-to-Video is a latent diffusion model trained to generate short video clips from an image conditioning. Check out the [Community demo for Stable Video Diffusion](https://huggingface.co/spaces/multimodalart/stable-video-diffusion) - Img2Vid - XT ([model](https://huggingface.co/stabilityai/stable-video-diffusion-img2vid-xt-1-1), [paper](https://stability.ai/research/stable-video-diffusion-scaling-latent-video-diffusion-models-to-large-datasets), [stability's ui waitlist](https://stability.ai/contact))
#### Research release ([_non-commercial_](https://huggingface.co/stabilityai/stable-video-diffusion-img2vid-xt/blob/main/LICENSE)): generate `4s` vid from a single image at (`25 frames` at `6 fps`). this demo uses [🧨 diffusers for low VRAM and fast generation](https://huggingface.co/docs/diffusers/main/en/using-diffusers/svd).
Join us : 🌟TeamTonic🌟 is always making cool demos! Join our active builder's🛠️community 👻  [![Join us on Discord](https://img.shields.io/discord/1109943800132010065?label=Discord&logo=discord&style=flat-square)](https://discord.gg/GWpVpekp) On 🤗Huggingface: [TeamTonic](https://huggingface.co/TeamTonic) & [MultiTransformer](https://huggingface.co/MultiTransformer) On 🌐Github: [Tonic-AI](https://github.com/tonic-ai) & contribute to 🌟 [SciTonic](https://github.com/Tonic-AI/scitonic) 🤗Big thanks to Yuvi Sharma and all the folks at huggingface for the community grant 🤗
'''

# --- Module-level setup: runs once at import time, before the UI is built. ---

# Read the Hugging Face access token from the HF_TOKEN environment variable.
hf_token = os.getenv("HF_TOKEN")

# Fail fast at startup when the token is missing or empty.
if not hf_token:
    raise ValueError("Hugging Face token not found. Please set the HF_TOKEN environment variable.")

# Hand the token to huggingface_hub's HfFolder helper before loading the model.
HfFolder.save_token(hf_token)

# Load the fp16 variant of the SVD-XT 1.1 image-to-video pipeline.
# NOTE(review): `use_auth_token` is the older spelling of the auth kwarg; newer
# diffusers / huggingface_hub releases appear to prefer `token=` — confirm
# against the versions pinned for this Space before changing it.
pipe = StableVideoDiffusionPipeline.from_pretrained(
    "stabilityai/stable-video-diffusion-img2vid-xt-1-1", torch_dtype=torch.float16, variant="fp16", use_auth_token=hf_token
)
# Move the whole pipeline to the CUDA device.
pipe.to("cuda")
# Replace the UNet with its torch.compile()-wrapped version; the VAE is left
# uncompiled (see the disabled line below).
pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True)
#pipe.vae = torch.compile(pipe.vae, mode="reduce-overhead", fullgraph=True)

# Largest signed 64-bit integer; used as the upper bound for seeds.
max_64_bit_int = 2**63 - 1

@spaces.GPU(enable_queue=True)
def sample(
    image: Image.Image,
    seed: Optional[int] = 42,
    randomize_seed: bool = True,
    motion_bucket_id: int = 127,
    fps_id: int = 6,
    version: str = "svd_xt",
    cond_aug: float = 0.02,
    decoding_t: int = 3,  # Number of frames decoded at a time! This eats most VRAM. Reduce if necessary.
    device: str = "cuda",
    output_folder: str = "outputs",
):
    """Generate a short video from a single conditioning image.

    Args:
        image: Conditioning image as a PIL image. Any mode other than RGB
            is converted to RGB before it is handed to the pipeline.
        seed: Seed for the torch RNG. Ignored when ``randomize_seed`` is
            true; when it is ``None`` a random seed is drawn instead.
        randomize_seed: When true, draw a fresh seed in
            ``[0, max_64_bit_int]``.
        motion_bucket_id: Forwarded to the pipeline as ``motion_bucket_id``.
        fps_id: Frame rate passed to ``export_to_video``.
        version: Accepted for interface compatibility; not used by the body.
        cond_aug: Accepted for interface compatibility; not used by the body
            (the pipeline call uses a fixed ``noise_aug_strength=0.1``).
        decoding_t: Forwarded to the pipeline as ``decode_chunk_size``.
        device: Accepted for interface compatibility; not used by the body.
        output_folder: Directory the mp4 is written to; created if missing.

    Returns:
        A ``(video_path, seed)`` tuple: the path of the written mp4 and the
        seed that was actually used.

    Raises:
        gr.Error: If no image was supplied.
    """
    # The Generate button can fire before anything is uploaded, in which case
    # Gradio passes None; surface a readable error instead of AttributeError.
    if image is None:
        raise gr.Error("Please upload an image before generating a video.")

    # Normalise every non-RGB mode (RGBA, palette, grayscale, ...) to RGB.
    if image.mode != "RGB":
        image = image.convert("RGB")

    if randomize_seed or seed is None:
        seed = random.randint(0, max_64_bit_int)
    # UI sliders may deliver floats; report back exactly the integer seed used.
    seed = int(seed)
    generator = torch.manual_seed(seed)

    os.makedirs(output_folder, exist_ok=True)
    # Name the file after the number of mp4 files already in the folder.
    base_count = len(glob(os.path.join(output_folder, "*.mp4")))
    video_path = os.path.join(output_folder, f"{base_count:06d}.mp4")

    # NOTE(review): num_frames and noise_aug_strength are fixed here rather
    # than derived from the function arguments — confirm this is intended.
    frames = pipe(
        image,
        decode_chunk_size=decoding_t,
        generator=generator,
        motion_bucket_id=motion_bucket_id,
        noise_aug_strength=0.1,
        num_frames=14,
    ).frames[0]
    export_to_video(frames, video_path, fps=fps_id)
    # Kept from the original: re-seed torch's global RNG after generation.
    torch.manual_seed(seed)

    return video_path, seed

def resize_image(image, output_size=(1024, 576)):
    """Scale *image* so it covers ``output_size``, then center-crop the excess.

    Args:
        image: PIL image to fit.
        output_size: Target ``(width, height)`` in pixels.

    Returns:
        The resized and cropped image.
    """
    target_w = output_size[0]
    target_h = output_size[1]
    target_aspect = target_w / target_h
    image_aspect = image.width / image.height

    if image_aspect > target_aspect:
        # Source is relatively wider than the target: match the target
        # height and trim equal amounts from the left and right edges.
        scaled_h = target_h
        scaled_w = int(scaled_h * image_aspect)
        crop_box = (
            (scaled_w - target_w) / 2,
            0,
            (scaled_w + target_w) / 2,
            target_h,
        )
    else:
        # Source is relatively taller (or the same shape): match the target
        # width and trim equal amounts from the top and bottom edges.
        scaled_w = target_w
        scaled_h = int(scaled_w / image_aspect)
        crop_box = (
            0,
            (scaled_h - target_h) / 2,
            target_w,
            (scaled_h + target_h) / 2,
        )

    scaled = image.resize((scaled_w, scaled_h), Image.LANCZOS)
    return scaled.crop(crop_box)

# Build the Gradio interface: header, image input + video output, advanced
# options, event wiring and cached examples.
with gr.Blocks() as demo:
    gr.Markdown(title)

    # Left column holds the upload widget and trigger; the video sits beside it.
    with gr.Row():
        with gr.Column():
            image = gr.Image(label="Upload your image", type="pil")
            generate_btn = gr.Button("Generate")
        video = gr.Video()

    with gr.Accordion("Advanced options", open=False):
        seed = gr.Slider(
            label="Seed",
            value=42,
            randomize=True,
            minimum=0,
            maximum=max_64_bit_int,
            step=1,
        )
        randomize_seed = gr.Checkbox(label="Randomize seed", value=True)
        motion_bucket_id = gr.Slider(
            label="Motion bucket id",
            info="Controls how much motion to add/remove from the image",
            value=127,
            minimum=1,
            maximum=255,
        )
        fps_id = gr.Slider(
            label="Frames per second",
            info="The length of your video in seconds will be 25/fps",
            value=6,
            minimum=5,
            maximum=30,
        )

    # Fit every uploaded image to the model's input size right away.
    image.upload(fn=resize_image, inputs=image, outputs=image, queue=False)

    # Run generation on click; the seed that was used is written back to the slider.
    generate_btn.click(
        fn=sample,
        inputs=[image, seed, randomize_seed, motion_bucket_id, fps_id],
        outputs=[video, seed],
        api_name="video",
    )

    gr.Examples(
        examples=[
            "images/blink_meme.png",
            "images/confused2_meme.png",
            "images/disaster_meme.png",
            "images/distracted_meme.png",
            "images/hide_meme.png",
            "images/nazare_meme.png",
            "images/success_meme.png",
            "images/willy_meme.png",
            "images/wink_meme.png",
        ],
        inputs=image,
        outputs=[video, seed],
        fn=sample,
        cache_examples=True,
    )

# Script entry point: enable the request queue (at most 20 waiting), then serve.
if __name__ == "__main__":
    demo.queue(max_size=20)
    demo.launch(share=True)