import ast

import torch
import gradio as gr
from diffusers import AnimateDiffSparseControlNetPipeline, MotionAdapter, DPMSolverMultistepScheduler, AutoencoderKL, SparseControlNetModel
from diffusers.utils import export_to_gif, load_image
from transformers import pipeline
from PIL import Image

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

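# Korean-to-English translation model; lets users write prompts in Korean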
translator = pipeline("translation", model="Helsinki-NLP/opus-mt-ko-en")

def translate_korean_to_english(text):
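    # Translate only when the text contains Hangul (compatibility jamo U+3131–U+3163 or syllables U+AC00–U+D7A3)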
    if any('\u3131' <= char <= '\u3163' or '\uac00' <= char <= '\ud7a3' for char in text):
        translated = translator(text)[0]['translation_text']
        return translated
    return text

def generate_video(prompt, negative_prompt, num_inference_steps, conditioning_frame_indices, controlnet_conditioning_scale, width, height, num_frames):
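    # Prompts may be Korean or English; Korean text is translated to English before generation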
    prompt = translate_korean_to_english(prompt)
    negative_prompt = translate_korean_to_english(negative_prompt)

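    # Load the AnimateDiff motion adapter, sparse scribble ControlNet, and fine-tuned VAE in fp16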
    motion_adapter = MotionAdapter.from_pretrained("guoyww/animatediff-motion-adapter-v1-5-3", torch_dtype=torch.float16).to(device)
    controlnet = SparseControlNetModel.from_pretrained("guoyww/animatediff-sparsectrl-scribble", torch_dtype=torch.float16).to(device)
    vae = AutoencoderKL.from_pretrained("stabilityai/sd-vae-ft-mse", torch_dtype=torch.float16).to(device)
    
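    # Assemble the sparse-ControlNet AnimateDiff pipeline on top of a Realistic Vision (SD 1.5) checkpoint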
    pipe = AnimateDiffSparseControlNetPipeline.from_pretrained(
        "SG161222/Realistic_Vision_V6.0_B1_noVAE",
        motion_adapter=motion_adapter,
        controlnet=controlnet,
        vae=vae,
        torch_dtype=torch.float16,
    ).to(device)
    
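    # Sample with DPM-Solver++ using Karras sigmas and a linear beta schedule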
    pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config, beta_schedule="linear", algorithm_type="dpmsolver++", use_karras_sigmas=True)
    
    image_files = [
        "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/animatediff-scribble-1.png",
        "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/animatediff-scribble-2.png",
        "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/animatediff-scribble-3.png"
    ]
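    # Scribble conditioning images from the diffusers documentation assets; the frame indices below decide where they are applied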
    conditioning_frames = [load_image(img_file) for img_file in image_files]

    conditioning_frame_indices = ast.literal_eval(conditioning_frame_indices)
    controlnet_conditioning_scale = float(controlnet_conditioning_scale)
    num_inference_steps = int(num_inference_steps)
    width, height, num_frames = int(width), int(height), int(num_frames)

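    # Generate the video; the conditioning images are injected at the specified frame indices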
    video = pipe(
        prompt=prompt,
        negative_prompt=negative_prompt,
        num_inference_steps=num_inference_steps,
        conditioning_frames=conditioning_frames,
        controlnet_conditioning_scale=controlnet_conditioning_scale,
        controlnet_frame_indices=conditioning_frame_indices,
        width=width,
        height=height,
        num_frames=num_frames,
        generator=torch.Generator().manual_seed(1337),
    ).frames[0]
    
    # Post-processing: insert a 50/50 blend between consecutive frames for smoother transitions (roughly doubles the frame count)
    interpolated_frames = []
    for i in range(len(video) - 1):
        interpolated_frames.append(video[i])
        interpolated_frames.append(Image.blend(video[i], video[i+1], 0.5))
    interpolated_frames.append(video[-1])
    
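    # Save the frames as an animated GIF and return the path for Gradio to display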
    export_to_gif(interpolated_frames, "output.gif")
    return "output.gif"

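# Gradio UI: prompt fields (Korean or English), sampling and ControlNet controls, and the resulting GIF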
demo = gr.Interface(
    fn=generate_video,
    inputs=[
        gr.Textbox(label="Prompt (ํ•œ๊ธ€ ๋˜๋Š” ์˜์–ด)", value="๊ท€์—ฌ์šด ๊ฐ•์•„์ง€๊ฐ€ ์กฐ์šฉํžˆ ์ง–๊ณ ์žˆ, ๊ฑธ์ž‘, ๊ณ ํ’ˆ์งˆ"),
        gr.Textbox(label="Negative Prompt (ํ•œ๊ธ€ ๋˜๋Š” ์˜์–ด)", value="์ €ํ’ˆ์งˆ, ์ตœ์•…์˜ ํ’ˆ์งˆ, ๋ ˆํ„ฐ๋ฐ•์Šค"),
        gr.Slider(label="Number of Inference Steps", minimum=1, maximum=200, step=1, value=150),
        gr.Textbox(label="Conditioning Frame Indices", value="[0, 8, 15]"),
        gr.Slider(label="ControlNet Conditioning Scale", minimum=0.1, maximum=2.0, step=0.1, value=1.0),
        gr.Slider(label="Width", minimum=256, maximum=1024, step=64, value=512),
        gr.Slider(label="Height", minimum=256, maximum=1024, step=64, value=512),
        gr.Slider(label="Number of Frames", minimum=16, maximum=128, step=16, value=64)
    ],
    outputs=gr.Image(label="Generated Video"),
    title="AnimateDiffSparseControlNetPipeline์„ ์‚ฌ์šฉํ•œ ๊ณ ํ’ˆ์งˆ ๋น„๋””์˜ค ์ƒ์„ฑ",
    description="AnimateDiffSparseControlNetPipeline์„ ์‚ฌ์šฉํ•˜์—ฌ ๊ณ ํ’ˆ์งˆ ๋น„๋””์˜ค๋ฅผ ์ƒ์„ฑํ•ฉ๋‹ˆ๋‹ค. ํ•œ๊ธ€ ๋˜๋Š” ์˜์–ด๋กœ ํ”„๋กฌํ”„ํŠธ๋ฅผ ์ž…๋ ฅํ•  ์ˆ˜ ์žˆ์Šต๋‹ˆ๋‹ค."
)

demo.launch()