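"""VidToMe pipeline wrapper.

Exposes the VidToMe video-editing workflow (inversion of an input video followed by
prompt-guided generation) behind a single DiffusionPipeline-style interface.
"""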
from diffusers import DiffusionPipeline
from .invert import Inverter
from .generate import Generator
from .utils import init_model, seed_everything, get_frame_ids
import torch
from omegaconf import OmegaConf

class VidToMePipeline(DiffusionPipeline):
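    """DiffusionPipeline-style wrapper around the VidToMe video editing workflow.

    On construction it loads a Stable Diffusion backbone via init_model; calling the
    pipeline runs inversion of the input video followed by prompt-guided generation.
    """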

    def __init__(self, device="cuda", sd_version="1.5", float_precision="fp16", height=512, width=512):
        super().__init__()
        # Register configuration parameters so they are tracked in the pipeline config
        self.register_to_config(device=device, sd_version=sd_version, float_precision=float_precision, height=height, width=width)
        # DiffusionPipeline.device is a read-only property, so keep the requested device separately
        self._device = device
        self.sd_version = sd_version
        self.float_precision = float_precision
        self.height = height
        self.width = width
        # Initialize the core pipeline components (backbone pipeline, scheduler, model key)
        pipe, scheduler, model_key = init_model(device, sd_version, None, "none", float_precision)
        self.pipe = pipe
        self.scheduler = scheduler
        self.model_key = model_key

    def __call__(self, video_path=None, video_prompt=None, edit_prompt=None, 
                 control_type="none", n_timesteps=50, guidance_scale=7.5, 
                 negative_prompt="ugly, blurry, low res", frame_range=None, 
                 use_lora=False, seed=123, local_merge_ratio=0.9, global_merge_ratio=0.8):
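        """Edit the video at `video_path` according to `edit_prompt`.

        Runs two stages: inversion of the input video to latents, then
        prompt-guided generation from those latents. Results are written under
        the work directory defined in the generated config.
        """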
        
        # dynamic config built from user inputs
        config = self._build_config(video_path, video_prompt, edit_prompt, control_type, 
                                    n_timesteps, guidance_scale, negative_prompt, 
                                    frame_range, use_lora, seed, local_merge_ratio, global_merge_ratio)
        
        # Seed everything for reproducibility (override via the `seed` argument)
        seed_everything(config['seed'])

        # inversion stage
        print("Start inversion!")
        inversion = Inverter(self.pipe, self.scheduler, config)
        inversion(config['input_path'], config['inversion']['save_path'])

        # generation stage
        print("Start generation!")
        generator = Generator(self.pipe, self.scheduler, config)
        frame_ids = get_frame_ids(config['generation']['frame_range'], None)
        generator(config['input_path'], config['generation']['latents_path'], 
                  config['generation']['output_path'], frame_ids=frame_ids)
        print(f"Output generated at: {config['generation']['output_path']}")

    def _build_config(self, video_path, video_prompt, edit_prompt, control_type,
                      n_timesteps, guidance_scale, negative_prompt, frame_range,
                      use_lora, seed, local_merge_ratio, global_merge_ratio):
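        """Assemble the OmegaConf config consumed by the Inverter and Generator.

        Arguments exposed through __call__ override the corresponding fields;
        everything else keeps the defaults hard-coded below.
        """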
        # Build config using OmegaConf, abstracting as much as possible
        config = OmegaConf.create({
            'sd_version': self.sd_version,  # Default sd_version
            'model_key': self.model_key or None,  # Optionally allow model_key to be None
            'input_path': video_path,  # Path to the video
            'work_dir': "workdir",  # Default workdir, can be abstracted further
            'height': self.height,
            'width': self.width,
            'inversion': {
                'save_path': "${work_dir}/latents",  # Save latents during inversion
                'prompt': video_prompt or "Default video prompt.",
                'n_frames': None,  # None to invert all frames
                'steps': 50,  # Default inversion steps
                'save_intermediate': False,  # Default, but can be abstracted to user
                'save_steps': 50,  # Default
                'use_blip': False,  # Abstract BLIP prompt creation
                'recon': False,  # Reconstruct the input video from latents
                'control': control_type or "none",  # Default to 'none', can use 'tile', 'softedge', etc.
                'control_scale': 1.0,  # Default control scale
                'batch_size': 8,  # Default batch size for inversion
                'force': False,  # Default, force inversion even if latents exist
            },
            'generation': {
                'control': "pnp",  # Default to Plug-and-Play for generation control
                'pnp_attn_t': 0.5,  # PnP args
                'pnp_f_t': 0.8,  # PnP args
                'control_scale': 1.0,  # Scale for ControlNet-like controls
                'guidance_scale': guidance_scale,  # Guidance scale for CFG
                'n_timesteps': n_timesteps,  # Number of diffusion timesteps
                'negative_prompt': negative_prompt or "ugly, blurry, low res",  # Negative prompt to avoid undesired generations
                'prompt': edit_prompt or None,  # Edit prompt during generation
                'latents_path': "${work_dir}/latents",  # Latents path from inversion
                'output_path': "${work_dir}",  # Output directory for final images
                'chunk_size': 4,  # Number of frames processed per chunk
                'chunk_ord': "mix-4",  # Processing order for video chunks
                'local_merge_ratio': local_merge_ratio,  # Merge ratio for blending
                'merge_global': True,  # Enable global merging
                'global_merge_ratio': global_merge_ratio,  # Global merge ratio
                'global_rand': 0.5,  # Randomness in global merge
                'align_batch': True,  # Align batch processing
                'frame_range': frame_range or [0, 32, 1],  # Default frame range
                'frame_ids': None,  # Specify frame IDs to edit
                'save_frame': True,  # Save individual frames
                'use_lora': use_lora,  # Enable LoRA if applicable
                # Additional LoRA configurations
                'lora': {
                    'pretrained_model_name_or_path_or_dict': None,  # Default LoRA model path
                    'lora_weight_name': None,
                    'lora_adapter': None,
                    'lora_weight': 1.0
                }
            },
            'seed': seed,  # Seed for reproducibility
            'device': "cuda",  # Default to CUDA
            'float_precision': "fp16",  # Enable mixed-precision
            'enable_xformers_memory_efficient_attention': True  # Default to enable xformers memory-efficient attention
        })
        
        return config


# Sample usage
# pipeline = VidToMePipeline(device="cuda", sd_version="2.1", float_precision="fp16")
# pipeline(video_path="path/to/video.mp4", video_prompt="A beautiful scene of a sunset",
#          edit_prompt="Make the sunset look more vibrant", control_type="depth", n_timesteps=50)