from diffusers import DiffusionPipeline | |
from .invert import Inverter | |
from .generate import Generator | |
from .utils import init_model, seed_everything, get_frame_ids | |
import torch | |
from omegaconf import OmegaConf | |
class VidToMePipeline(DiffusionPipeline):
    """Video editing pipeline that runs an inversion stage and then a generation stage.

    The model and scheduler are created once in the constructor via
    ``init_model``. Each call builds an OmegaConf configuration from the call
    arguments and passes it to ``Inverter`` and then to ``Generator``.
    """

    def __init__(self, device="cuda", sd_version="1.5", float_precision="fp16", height=512, width=512):
        """Register the constructor arguments and load the core components.

        Args:
            device: Device string handed to ``init_model`` (e.g. ``"cuda"``).
            sd_version: Model version identifier handed to ``init_model``.
            float_precision: Precision identifier handed to ``init_model``
                (e.g. ``"fp16"``).
            height: Frame height written into the per-call configuration.
            width: Frame width written into the per-call configuration.
        """
        # Register configuration parameters. ``device`` is only kept here (and
        # read back through ``self.config``) because ``DiffusionPipeline``
        # already exposes ``device`` as a read-only property.
        self.register_to_config(
            device=device,
            sd_version=sd_version,
            float_precision=float_precision,
            height=height,
            width=width,
        )
        self.sd_version = sd_version
        self.float_precision = float_precision
        self.height = height
        self.width = width

        # Initialize the core pipeline components. The model key is left as
        # None and the control mode as "none" at load time.
        pipe, scheduler, model_key = init_model(device, sd_version, None, "none", float_precision)
        self.pipe = pipe
        self.scheduler = scheduler
        self.model_key = model_key

        super().__init__()

    def __call__(self, video_path=None, video_prompt=None, edit_prompt=None,
                 control_type="none", n_timesteps=50, guidance_scale=7.5,
                 negative_prompt="ugly, blurry, low res", frame_range=None,
                 use_lora=False, seed=123, local_merge_ratio=0.9, global_merge_ratio=0.8):
        """Run inversion followed by generation for one video.

        Args:
            video_path: Path of the input video; stored as ``input_path`` and
                passed to both stages.
            video_prompt: Prompt used for inversion. Falls back to
                ``"Default video prompt."`` when empty or None.
            edit_prompt: Prompt used for generation; None when empty.
            control_type: Stored as ``inversion.control`` (``"none"`` when
                empty). NOTE(review): ``generation.control`` is always
                ``"pnp"`` regardless of this argument - confirm this is
                intended.
            n_timesteps: Stored as ``generation.n_timesteps``.
            guidance_scale: Stored as ``generation.guidance_scale``.
            negative_prompt: Stored as ``generation.negative_prompt``; falls
                back to ``"ugly, blurry, low res"`` when empty or None.
            frame_range: Stored as ``generation.frame_range``; defaults to
                ``[0, 32, 1]`` when empty or None.
            use_lora: Stored as ``generation.use_lora``.
            seed: Seed passed to ``seed_everything`` before either stage runs.
            local_merge_ratio: Stored as ``generation.local_merge_ratio``.
            global_merge_ratio: Stored as ``generation.global_merge_ratio``.

        Returns:
            None. The output location (``generation.output_path``) is printed.
        """
        # Dynamic config built from user inputs.
        config = self._build_config(
            video_path, video_prompt, edit_prompt, control_type,
            n_timesteps, guidance_scale, negative_prompt,
            frame_range, use_lora, seed, local_merge_ratio, global_merge_ratio,
        )

        # Seed for reproducibility.
        seed_everything(config['seed'])

        # Inversion stage.
        print("Start inversion!")
        inversion = Inverter(self.pipe, self.scheduler, config)
        inversion(config['input_path'], config['inversion']['save_path'])

        # Generation stage.
        print("Start generation!")
        generator = Generator(self.pipe, self.scheduler, config)
        # ``frame_ids`` is None in the config built below, so this is
        # equivalent to passing None explicitly.
        frame_ids = get_frame_ids(config['generation']['frame_range'],
                                  config['generation']['frame_ids'])
        generator(config['input_path'], config['generation']['latents_path'],
                  config['generation']['output_path'], frame_ids=frame_ids)
        print(f"Output generated at: {config['generation']['output_path']}")

    def _build_config(self, video_path, video_prompt, edit_prompt, control_type,
                      n_timesteps, guidance_scale, negative_prompt, frame_range,
                      use_lora, seed, local_merge_ratio, global_merge_ratio):
        """Build the OmegaConf configuration consumed by ``Inverter`` and ``Generator``.

        Call arguments are combined with fixed defaults. The device and float
        precision are taken from the values given to the constructor so the
        configuration matches the model loaded by ``init_model``.

        Returns:
            An OmegaConf config. Paths written as ``${work_dir}/...`` are
            OmegaConf interpolations that resolve against ``work_dir`` on access.
        """
        config = OmegaConf.create({
            'sd_version': self.sd_version,
            'model_key': self.model_key or None,  # Normalise a falsy model key to None
            'input_path': video_path,  # Path to the video
            'work_dir': "workdir",  # Base directory for latents and outputs
            'height': self.height,
            'width': self.width,
            'inversion': {
                'save_path': "${work_dir}/latents",  # Where inversion saves latents
                'prompt': video_prompt or "Default video prompt.",
                'n_frames': None,  # None to invert all frames
                'steps': 50,  # Default inversion steps
                'save_intermediate': False,
                'save_steps': 50,
                'use_blip': False,  # BLIP prompt creation disabled
                'recon': False,  # Do not reconstruct the input video from latents
                'control': control_type or "none",  # e.g. 'none', 'tile', 'softedge'
                'control_scale': 1.0,
                'batch_size': 8,  # Default batch size for inversion
                'force': False,  # Do not force inversion when latents already exist
            },
            'generation': {
                'control': "pnp",  # Plug-and-Play is always used for generation control
                'pnp_attn_t': 0.5,  # PnP args
                'pnp_f_t': 0.8,  # PnP args
                'control_scale': 1.0,  # Scale for ControlNet-like controls
                'guidance_scale': guidance_scale,  # Guidance scale for CFG
                'n_timesteps': n_timesteps,  # Number of diffusion timesteps
                'negative_prompt': negative_prompt or "ugly, blurry, low res",
                'prompt': edit_prompt or None,  # Edit prompt during generation
                'latents_path': "${work_dir}/latents",  # Latents produced by inversion
                'output_path': "${work_dir}",  # Output directory for results
                'chunk_size': 4,  # Number of frames processed per chunk
                'chunk_ord': "mix-4",  # Processing order for video chunks
                'local_merge_ratio': local_merge_ratio,
                'merge_global': True,  # Enable global merging
                'global_merge_ratio': global_merge_ratio,
                'global_rand': 0.5,  # Randomness in global merge
                'align_batch': True,
                'frame_range': frame_range or [0, 32, 1],  # Default frame range
                'frame_ids': None,  # Specific frame IDs to edit (unused by default)
                'save_frame': True,  # Save individual frames
                'use_lora': use_lora,
                # Additional LoRA configuration
                'lora': {
                    'pretrained_model_name_or_path_or_dict': None,
                    'lora_weight_name': None,
                    'lora_adapter': None,
                    'lora_weight': 1.0,
                },
            },
            'seed': seed,  # Seed for reproducibility
            # Use the device/precision the model was actually loaded with
            # instead of hard-coded values.
            'device': self.config.device,
            'float_precision': self.float_precision,
            'enable_xformers_memory_efficient_attention': True,
        })
        return config
# Sample usage:
# pipeline = VidToMePipeline(device="cuda", sd_version="2.1", float_precision="fp16")
# pipeline(video_path="path/to/video.mp4", video_prompt="A beautiful scene of a sunset",
#          edit_prompt="Make the sunset look more vibrant", control_type="depth", n_timesteps=50)