import importlib
import os
from collections import OrderedDict

import numpy as np
import cv2
import torch
import torch.distributed as dist
from einops import rearrange, repeat

from lvdm.models.samplers.ddim import DDIMSampler
from lvdm.models.samplers.ddim_multiplecond import DDIMSampler as DDIMSampler_multicond


def count_params(model, verbose=False):
    """Return the total number of elements across ``model.parameters()``.

    When *verbose* is true, the count is also printed in millions together
    with the model's class name.
    """
    total_params = sum(p.numel() for p in model.parameters())
    if verbose:
        print(f"{model.__class__.__name__} has {total_params*1.e-6:.2f} M params.")
    return total_params


def check_istarget(name, para_list):
    """Return True if any entry of *para_list* occurs inside *name*.

    name: full name of source para
    para_list: partial name of target para
    """
    return any(para in name for para in para_list)


def instantiate_from_config(config):
    """Build the object described by *config*.

    *config* is expected to provide a dotted import path under ``"target"``
    and, optionally, constructor keyword arguments under ``"params"``.
    The sentinel strings ``'__is_first_stage__'`` and
    ``"__is_unconditional__"`` yield ``None``.

    Raises:
        KeyError: if *config* has no ``"target"`` and is not a sentinel.
    """
    if "target" not in config:
        if config == '__is_first_stage__':
            return None
        elif config == "__is_unconditional__":
            return None
        raise KeyError("Expected key `target` to instantiate.")
    return get_obj_from_str(config["target"])(**config.get("params", dict()))


def get_obj_from_str(string, reload=False):
    """Resolve a dotted path ``"pkg.module.Name"`` to the attribute ``Name``.

    When *reload* is true the module is re-imported via ``importlib.reload``
    before the attribute lookup.
    """
    module, cls = string.rsplit(".", 1)
    if reload:
        module_imp = importlib.import_module(module)
        importlib.reload(module_imp)
    return getattr(importlib.import_module(module, package=None), cls)


def _load_arr_0(path):
    """Read the ``'arr_0'`` entry of the archive at *path* and close the file."""
    # np.load on an .npz keeps the underlying file open until closed; the
    # context manager releases the handle once the array has been read.
    with np.load(path) as archive:
        return archive['arr_0']


def load_npz_from_dir(data_dir):
    """Concatenate (axis 0) the ``'arr_0'`` arrays of every file in *data_dir*.

    Files are visited in sorted name order so the result is reproducible;
    ``os.listdir`` alone returns entries in arbitrary order.
    """
    data = [
        _load_arr_0(os.path.join(data_dir, data_name))
        for data_name in sorted(os.listdir(data_dir))
    ]
    return np.concatenate(data, axis=0)


def load_npz_from_paths(data_paths):
    """Concatenate (axis 0) the ``'arr_0'`` arrays of the files in *data_paths*.

    The order of *data_paths* is preserved.
    """
    data = [_load_arr_0(data_path) for data_path in data_paths]
    return np.concatenate(data, axis=0)


def resize_numpy_image(image, max_resolution=512 * 512, resize_short_edge=None):
    """Resize *image* so that both sides are multiples of 64.

    If *resize_short_edge* is given, the scale factor maps the shorter side
    to that length; otherwise the scale is chosen so that the area is about
    *max_resolution* pixels. Each side is then rounded to the nearest
    multiple of 64 (never below 64) and the image is resampled with Lanczos
    interpolation.
    """
    h, w = image.shape[:2]
    if resize_short_edge is not None:
        k = resize_short_edge / min(h, w)
    else:
        # Area ratio -> linear scale factor.
        k = (max_resolution / (h * w)) ** 0.5
    # Clamp to one 64-pixel unit: for extreme aspect ratios the rounded size
    # could otherwise be 0, which cv2.resize rejects.
    h = max(64, int(np.round(h * k / 64)) * 64)
    w = max(64, int(np.round(w * k / 64)) * 64)
    return cv2.resize(image, (w, h), interpolation=cv2.INTER_LANCZOS4)


def setup_dist(args):
    """Initialise the NCCL process group unless one is already initialised.

    Selects the CUDA device ``args.local_rank`` and reads the rendezvous
    configuration from the environment (``init_method='env://'``).
    """
    if dist.is_initialized():
        return
    torch.cuda.set_device(args.local_rank)
    dist.init_process_group(
        'nccl',
        init_method='env://'
    )


def load_model_checkpoint(model, ckpt):
    """Load the weights stored at *ckpt* into *model* and return *model*.

    Two layouts are handled:

    * a mapping with a ``"state_dict"`` entry: loaded strictly; if that fails,
      keys containing ``"framestride_embed"`` are renamed to
      ``"fps_embedding"`` and the strict load is retried;
    * otherwise (deepspeed): weights are read from ``state_dict['module']``
      with the first 16 characters of every key dropped.
    """
    # NOTE(review): torch.load unpickles the file; only load checkpoints from
    # trusted sources (consider ``weights_only=True`` where supported).
    state_dict = torch.load(ckpt, map_location="cpu")
    if "state_dict" in state_dict:
        state_dict = state_dict["state_dict"]
        try:
            model.load_state_dict(state_dict, strict=True)
        except RuntimeError:
            ## rename the keys for 256x256 model
            new_pl_sd = OrderedDict(
                (k.replace("framestride_embed", "fps_embedding"), v)
                for k, v in state_dict.items()
            )
            model.load_state_dict(new_pl_sd, strict=True)
    else:
        # deepspeed
        # Presumably the 16 dropped characters are a wrapper prefix such as
        # "_forward_module." -- TODO confirm against the checkpoints in use.
        new_pl_sd = OrderedDict(
            (key[16:], value) for key, value in state_dict['module'].items()
        )
        model.load_state_dict(new_pl_sd)
    print('>>> model checkpoint loaded.')
    return model


def get_latent_z(model, videos):
    """Encode every frame of *videos* with ``model.encode_first_stage``.

    *videos* must be 5-D, laid out as ``b c t h w``. Frames are folded into
    the batch axis for encoding and unfolded again, so the result is also
    ``b c t h w`` (with the encoder's own channel/spatial sizes).
    """
    b, _, t, _, _ = videos.shape
    x = rearrange(videos, 'b c t h w -> (b t) c h w')
    z = model.encode_first_stage(x)
    return rearrange(z, '(b t) c h w -> b c t h w', b=b, t=t)


def _unconditional_text_embedding(model, cond_emb, batch_size):
    """Return the text embedding used for unconditional guidance.

    The choice follows ``model.uncond_type``: ``"empty_seq"`` embeds empty
    prompts, ``"zero_embed"`` uses zeros shaped like *cond_emb*.
    """
    if model.uncond_type == "empty_seq":
        return model.get_learned_conditioning(batch_size * [""])
    if model.uncond_type == "zero_embed":
        return torch.zeros_like(cond_emb)
    raise ValueError(f"Unsupported uncond_type: {model.uncond_type!r}")


def image_guided_synthesis(model, prompts, videos, noise_shape, n_samples=1, ddim_steps=50, ddim_eta=1.,
                           unconditional_guidance_scale=1.0, cfg_img=None, fs=None, text_input=False,
                           multiple_cond_cfg=False, timestep_spacing='uniform', guidance_rescale=0.0,
                           condition_index=None, **kwargs):
    """Sample videos with DDIM, conditioned on text prompts and a reference frame.

    Args:
        model: diffusion model providing ``embedder``, ``image_proj_model``,
            ``get_learned_conditioning``, ``encode_first_stage``,
            ``decode_first_stage``, ``uncond_type``, ``device`` and
            ``model.conditioning_key``.
        prompts: text prompts; replaced by empty strings unless *text_input*.
        videos: conditioning tensor laid out as ``b c t h w``.
        noise_shape: latent shape; ``noise_shape[0]`` is the batch size and
            ``noise_shape[1:]`` is handed to the sampler.
        n_samples: number of independent sampling runs per batch item.
        ddim_steps, ddim_eta, timestep_spacing, guidance_rescale: forwarded
            to the sampler.
        unconditional_guidance_scale: when not 1.0, an unconditional
            conditioning (unconditional text + zero image) is built and
            passed to the sampler.
        cfg_img: forwarded to the sampler; together with *multiple_cond_cfg*
            it controls the extra "image yes, text empty" conditioning.
        fs: value repeated per batch item into a ``long`` tensor; callers are
            expected to pass a number.
        text_input: whether *prompts* should be used.
        multiple_cond_cfg: selects the multi-condition DDIM sampler.
        condition_index: sequence whose first element selects the frame of
            *videos* used as the image condition. Required.
        **kwargs: extra sampler arguments.

    Returns:
        Decoded samples with axes ``batch, variants, c, t, h, w``.
    """
    ddim_sampler = DDIMSampler_multicond(model) if multiple_cond_cfg else DDIMSampler(model)
    batch_size = noise_shape[0]
    fs = torch.tensor([fs] * batch_size, dtype=torch.long, device=model.device)

    if not text_input:
        prompts = [""] * batch_size
    assert condition_index is not None, "Error: condition index is None!"

    img = videos[:, :, condition_index[0]]  # b c h w
    img_emb = model.image_proj_model(model.embedder(img))  # b l c

    cond_emb = model.get_learned_conditioning(prompts)
    cond = {"c_crossattn": [torch.cat([cond_emb, img_emb], dim=1)]}

    is_hybrid = model.model.conditioning_key == 'hybrid'
    if is_hybrid:
        # The latents of all frames are used as the concat condition.
        img_cat_cond = get_latent_z(model, videos)  # b c t h w
        cond["c_concat"] = [img_cat_cond]

    need_uc = unconditional_guidance_scale != 1.0
    ## we need one more unconditioning image=yes, text=""
    need_uc_img = multiple_cond_cfg and cfg_img != 1.0

    # The unconditional text embedding is shared by both branches below, so
    # it is computed whenever either needs it; previously the second branch
    # failed with an unbound name when the guidance scale was exactly 1.0.
    uc_emb = None
    if need_uc or need_uc_img:
        uc_emb = _unconditional_text_embedding(model, cond_emb, batch_size)

    if need_uc:
        uc_img_emb = model.image_proj_model(model.embedder(torch.zeros_like(img)))  # b l c
        uc = {"c_crossattn": [torch.cat([uc_emb, uc_img_emb], dim=1)]}
        if is_hybrid:
            uc["c_concat"] = [img_cat_cond]
    else:
        uc = None

    if need_uc_img:
        uc_2 = {"c_crossattn": [torch.cat([uc_emb, img_emb], dim=1)]}
        if is_hybrid:
            uc_2["c_concat"] = [img_cat_cond]
    else:
        uc_2 = None
    kwargs["unconditional_conditioning_img_nonetext"] = uc_2

    batch_variants = []
    for _ in range(n_samples):
        # No inpainting mask or clean latents are used, hence mask/x0 = None.
        samples, _ = ddim_sampler.sample(S=ddim_steps,
                                         conditioning=cond,
                                         batch_size=batch_size,
                                         shape=noise_shape[1:],
                                         verbose=False,
                                         unconditional_guidance_scale=unconditional_guidance_scale,
                                         unconditional_conditioning=uc,
                                         eta=ddim_eta,
                                         cfg_img=cfg_img,
                                         mask=None,
                                         x0=None,
                                         fs=fs,
                                         timestep_spacing=timestep_spacing,
                                         guidance_rescale=guidance_rescale,
                                         **kwargs
                                         )

        ## reconstruct from latent to pixel space
        batch_variants.append(model.decode_first_stage(samples))

    ## variants, batch, c, t, h, w
    batch_variants = torch.stack(batch_variants)
    return batch_variants.permute(1, 0, 2, 3, 4, 5)