# Copyright (c) Alibaba, Inc. and its affiliates. import logging import os import os.path as osp from datetime import datetime import torch from easydict import EasyDict cfg = EasyDict(__name__='Config: VideoLDM Decoder') # ---------------------------work dir-------------------------- cfg.work_dir = 'workspace/' # ---------------------------Global Variable----------------------------------- cfg.resolution = [448, 256] cfg.max_frames = 32 # ----------------------------------------------------------------------------- # ---------------------------Dataset Parameter--------------------------------- cfg.mean = [0.5, 0.5, 0.5] cfg.std = [0.5, 0.5, 0.5] cfg.max_words = 1000 # PlaceHolder cfg.vit_out_dim = 1024 cfg.vit_resolution = [224, 224] cfg.depth_clamp = 10.0 cfg.misc_size = 384 cfg.depth_std = 20.0 cfg.frame_lens = 32 cfg.sample_fps = 8 cfg.batch_sizes = 1 # ----------------------------------------------------------------------------- # ---------------------------Mode Parameters----------------------------------- # Diffusion cfg.schedule = 'cosine' cfg.num_timesteps = 1000 cfg.mean_type = 'v' cfg.var_type = 'fixed_small' cfg.loss_type = 'mse' cfg.ddim_timesteps = 50 cfg.ddim_eta = 0.0 cfg.clamp = 1.0 cfg.share_noise = False cfg.use_div_loss = False cfg.noise_strength = 0.1 # classifier-free guidance cfg.p_zero = 0.1 cfg.guide_scale = 3.0 # clip vision encoder cfg.vit_mean = [0.48145466, 0.4578275, 0.40821073] cfg.vit_std = [0.26862954, 0.26130258, 0.27577711] # Model cfg.scale_factor = 0.18215 cfg.use_fp16 = True cfg.temporal_attention = True cfg.decoder_bs = 8 cfg.UNet = { 'type': 'Vid2VidSDUNet', 'in_dim': 4, 'dim': 320, 'y_dim': cfg.vit_out_dim, 'context_dim': 1024, 'out_dim': 8 if cfg.var_type.startswith('learned') else 4, 'dim_mult': [1, 2, 4, 4], 'num_heads': 8, 'head_dim': 64, 'num_res_blocks': 2, 'attn_scales': [1 / 1, 1 / 2, 1 / 4], 'dropout': 0.1, 'temporal_attention': cfg.temporal_attention, 'temporal_attn_times': 1, 'use_checkpoint': False, 'use_fps_condition': False, 'use_sim_mask': False, 'num_tokens': 4, 'default_fps': 8, 'input_dim': 1024 } cfg.guidances = [] # auotoencoder from stabel diffusion cfg.auto_encoder = { 'type': 'AutoencoderKL', 'ddconfig': { 'double_z': True, 'z_channels': 4, 'resolution': 256, 'in_channels': 3, 'out_ch': 3, 'ch': 128, 'ch_mult': [1, 2, 4, 4], 'num_res_blocks': 2, 'attn_resolutions': [], 'dropout': 0.0 }, 'embed_dim': 4, 'pretrained': 'models/v2-1_512-ema-pruned.ckpt' } # clip embedder cfg.embedder = { 'type': 'FrozenOpenCLIPEmbedder', 'layer': 'penultimate', 'vit_resolution': [224, 224], 'pretrained': 'open_clip_pytorch_model.bin' } # ----------------------------------------------------------------------------- # ---------------------------Training Settings--------------------------------- # training and optimizer cfg.ema_decay = 0.9999 cfg.num_steps = 600000 cfg.lr = 5e-5 cfg.weight_decay = 0.0 cfg.betas = (0.9, 0.999) cfg.eps = 1.0e-8 cfg.chunk_size = 16 cfg.alpha = 0.7 cfg.save_ckp_interval = 1000 # ----------------------------------------------------------------------------- # ----------------------------Pretrain Settings--------------------------------- # Default: load 2d pretrain cfg.fix_weight = False cfg.load_match = False cfg.pretrained_checkpoint = 'v2-1_512-ema-pruned.ckpt' cfg.pretrained_image_keys = 'stable_diffusion_image_key_temporal_attention_x1.json' cfg.resume_checkpoint = 'img2video_ldm_0779000.pth' # ----------------------------------------------------------------------------- # -----------------------------Visual------------------------------------------- # Visual videos cfg.viz_interval = 1000 cfg.visual_train = { 'type': 'VisualVideoTextDuringTrain', } cfg.visual_inference = { 'type': 'VisualGeneratedVideos', } cfg.inference_list_path = '' # logging cfg.log_interval = 100 # Default log_dir cfg.log_dir = 'workspace/output_data' # ----------------------------------------------------------------------------- # ---------------------------Others-------------------------------------------- # seed cfg.seed = 8888 cfg.negative_prompt = 'painting, oil painting, illustration, drawing, art, sketch, oil painting, cartoon, \ CG Style, 3D render, unreal engine, blurring, dirty, messy, worst quality, low quality, frames, watermark, \ signature, jpeg artifacts, deformed, lowres, over-smooth' cfg.positive_prompt = 'Cinematic, High Contrast, highly detailed, taken using a Canon EOS R camera, \ hyper detailed photo - realistic maximum detail, 32k, Color Grading, ultra HD, extreme meticulous detailing, \ skin pore detailing, hyper sharpness, perfect without deformations.'