# from utils.args import parse_args
"""Run Lotus depth / normal prediction on images or videos.

The module exposes helpers for single-image inference (``lotus``), video
inference (``lotus_video``) and a command-line entry point (``main``) that
processes every ``.png`` / ``.jpg`` file found under an input directory.
"""
import logging
import os
import argparse
from pathlib import Path
from contextlib import nullcontext

from PIL import Image
import numpy as np
import torch
from tqdm.auto import tqdm
from diffusers.utils import check_min_version
import cv2

from pipeline import LotusGPipeline, LotusDPipeline
from utils.image_utils import colorize_depth_map
from utils.seed_all import seed_all

check_min_version('0.28.0.dev0')

# Side-length limits (in pixels) applied by ``infer_pipe`` before inference.
_MAX_IMAGE_SIDE = 1024
_MIN_IMAGE_SIDE = 384


def _make_generator(seed, device):
    """Return a seeded ``torch.Generator`` on *device*, or ``None`` if *seed* is ``None``."""
    if seed is None:
        return None
    return torch.Generator(device=device).manual_seed(seed)


def _autocast_context(pipe):
    """Return the context manager wrapped around a pipeline call.

    A no-op context is used whenever the MPS backend is available; otherwise
    ``torch.autocast`` is enabled for the pipeline's device type.
    """
    if torch.backends.mps.is_available():
        return nullcontext()
    return torch.autocast(pipe.device.type)


def _resize_for_inference(image):
    """Rescale an HxWxC array so its sides respect the module's size limits.

    The longest side is capped at ``_MAX_IMAGE_SIDE``; otherwise, if the
    shortest side is below ``_MIN_IMAGE_SIDE``, the image is upscaled to reach
    it. The aspect ratio is preserved (up to integer truncation).
    """
    height, width = image.shape[:2]
    longest, shortest = max(height, width), min(height, width)
    if longest > _MAX_IMAGE_SIDE:
        scale = _MAX_IMAGE_SIDE / longest
    elif shortest < _MIN_IMAGE_SIDE:
        scale = _MIN_IMAGE_SIDE / shortest
    else:
        scale = 1.0
    # cv2.resize expects the target size as (width, height).
    new_shape = (int(width * scale), int(height * scale))
    return cv2.resize(image, new_shape)


def _to_model_input(image, device):
    """Convert an HxWxC image into a 1xCxHxW float16 tensor on *device*.

    Pixel values are rescaled with ``x / 127.5 - 1.0`` (i.e. 0..255 is mapped
    onto -1..1).
    """
    array = np.asarray(image).astype(np.float16)
    tensor = torch.tensor(array).permute(2, 0, 1).unsqueeze(0)
    tensor = tensor / 127.5 - 1.0
    return tensor.to(device)


def _make_task_emb(device):
    """Build the fixed task embedding ``[sin(1), sin(0), cos(1), cos(0)]`` with shape (1, 4)."""
    base = torch.tensor([[1.0, 0.0]], dtype=torch.float32).to(device)
    return torch.cat([torch.sin(base), torch.cos(base)], dim=-1)


def _prediction_to_outputs(pred, task_name, **colorize_kwargs):
    """Turn a raw pipeline prediction into ``(output_npy, output_color)``.

    For ``task_name == 'depth'`` the prediction is averaged over its last axis
    and colorized with ``colorize_depth_map`` (extra keyword arguments are
    forwarded to it). For any other task the prediction is kept as is and
    converted to an 8-bit PIL image.
    """
    if task_name == 'depth':
        output_npy = pred.mean(axis=-1)
        output_color = colorize_depth_map(output_npy, **colorize_kwargs)
    else:
        output_npy = pred
        output_color = Image.fromarray((output_npy * 255).astype(np.uint8))
    return output_npy, output_color


def _read_frames(cap):
    """Yield frames from an opened ``cv2.VideoCapture`` until reading fails."""
    while True:
        ok, frame = cap.read()
        if not ok:
            return
        yield frame


def infer_pipe(pipe, test_image, task_name, seed, device, video_depth=False):
    """Run a single-step Lotus prediction on one image.

    Args:
        pipe: Pipeline object to call (as returned by ``load_pipe``).
        test_image: Path / file object accepted by ``PIL.Image.open`` when
            ``video_depth`` is false; otherwise an already decoded HxWxC
            array (a video frame).
        task_name: ``'depth'`` selects depth post-processing; any other value
            is treated as an image-valued prediction.
        seed: Integer seed for the generator, or ``None`` for no generator.
        device: Device the inputs are moved to.
        video_depth: Set to true when ``test_image`` is an in-memory frame.

    Returns:
        The colorized prediction.
    """
    generator = _make_generator(seed, device)

    with _autocast_context(pipe):
        if not video_depth:
            # Close the file handle as soon as the pixels are loaded.
            with Image.open(test_image) as opened:
                test_image = opened.convert('RGB')

        image = np.array(test_image).astype(np.float32)
        image = _resize_for_inference(image)
        rgb_in = _to_model_input(image, device)
        task_emb = _make_task_emb(device)

        # Run
        pred = pipe(
            rgb_in=rgb_in,
            prompt='',
            num_inference_steps=1,
            generator=generator,
            # guidance_scale=0,
            output_type='np',
            timesteps=[999],
            task_emb=task_emb,
        ).images[0]

        # Post-process the prediction
        _, output_color = _prediction_to_outputs(pred, task_name, reverse_color=True)

    return output_color


def infer_pipe_video(pipe, test_image, task_name, generator, device, latents=None):
    """Run a single-step Lotus prediction on one video frame.

    Unlike ``infer_pipe`` the frame is not resized, and explicit initial
    ``latents`` may be supplied.

    Args:
        pipe: Pipeline object to call.
        test_image: Decoded HxWxC frame.
        task_name: ``'depth'`` selects depth post-processing.
        generator: ``torch.Generator`` forwarded to the pipeline, or ``None``.
        device: Device the inputs are moved to.
        latents: Optional initial latents forwarded to the pipeline.

    Returns:
        ``(output_color, last_frame_latent)`` where ``last_frame_latent`` is
        element 2 of the tuple returned by the pipeline.
    """
    with _autocast_context(pipe):
        rgb_in = _to_model_input(test_image, device)
        task_emb = _make_task_emb(device)

        # Run
        output = pipe(
            rgb_in=rgb_in,
            prompt='',
            num_inference_steps=1,
            generator=generator,
            latents=latents,
            # guidance_scale=0,
            output_type='np',
            timesteps=[999],
            task_emb=task_emb,
            return_dict=False,
        )
        pred = output[0][0]
        last_frame_latent = output[2]

        # Post-process the prediction
        _, output_color = _prediction_to_outputs(pred, task_name, reverse_color=True)

    return output_color, last_frame_latent


def load_pipe(task_name, device):
    """Load the generative and discriminative Lotus pipelines for *task_name*.

    ``'depth'`` selects the disparity depth checkpoints; any other value
    selects the normal checkpoints. Both pipelines are loaded in float16,
    moved to *device* and have their progress bars disabled.

    Returns:
        ``(pipe_g, pipe_d)``.
    """
    if task_name == 'depth':
        model_g = 'jingheya/lotus-depth-g-v2-0-disparity'
        model_d = 'jingheya/lotus-depth-d-v2-0-disparity'
    else:
        model_g = 'jingheya/lotus-normal-g-v1-0'
        model_d = 'jingheya/lotus-normal-d-v1-0'

    dtype = torch.float16
    pipe_g = LotusGPipeline.from_pretrained(
        model_g,
        torch_dtype=dtype,
    )
    pipe_d = LotusDPipeline.from_pretrained(
        model_d,
        torch_dtype=dtype,
    )
    pipe_g.to(device)
    pipe_d.to(device)
    pipe_g.set_progress_bar_config(disable=True)
    pipe_d.set_progress_bar_config(disable=True)
    logging.info(f"Successfully loading pipeline from {model_g} and {model_d}.")
    return pipe_g, pipe_d


def lotus_video(input_video, task_name, seed, device):
    """Run both Lotus pipelines on every frame of a video.

    A single noise tensor is sampled once and reused for all frames of the
    generative pipeline; from the second frame on it is blended (0.9 / 0.1)
    with the latent returned for the previous frame.

    Args:
        input_video: Video source accepted by ``cv2.VideoCapture``.
        task_name: ``'depth'`` or another task name (see ``load_pipe``).
        seed: Integer seed, or ``None``.
        device: Device used for inference.

    Returns:
        ``(output_g, output_d, fps)``: per-frame outputs of the two pipelines
        and the frame rate reported by the capture.
    """
    pipe_g, pipe_d = load_pipe(task_name, device)

    output_g = []
    output_d = []

    cap = cv2.VideoCapture(input_video)
    try:
        fps = cap.get(cv2.CAP_PROP_FPS)
        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

        # generate latents_common for lotus-g
        generator = _make_generator(seed, device)
        latent_common = torch.randn(
            (1, 4, height // pipe_g.vae_scale_factor, width // pipe_g.vae_scale_factor),
            generator=generator,
            dtype=pipe_g.dtype,
            device=device,
        )

        last_frame_latent = None
        # Frames are processed as they are decoded, so the whole video never
        # has to be held in memory.
        # NOTE(review): OpenCV decodes frames in BGR channel order, whereas
        # still images are loaded as RGB in infer_pipe — confirm whether a
        # BGR->RGB conversion is expected here.
        for frame in _read_frames(cap):
            latents = latent_common
            if last_frame_latent is not None:
                latents = 0.9 * latents + 0.1 * last_frame_latent
            # Pass the generator object (not the integer seed): the parameter
            # is forwarded to the pipeline as ``generator``.
            output_frame_g, last_frame_latent = infer_pipe_video(
                pipe_g, frame, task_name, generator, device, latents
            )
            output_frame_d = infer_pipe(
                pipe_d, frame, task_name, seed, device, video_depth=True
            )
            output_g.append(output_frame_g)
            output_d.append(output_frame_d)
    finally:
        # Always free the capture, even if decoding or inference fails.
        cap.release()

    return output_g, output_d, fps


def lotus(image_input, task_name, seed, device):
    """Run both Lotus pipelines on a single image and return ``(output_g, output_d)``."""
    pipe_g, pipe_d = load_pipe(task_name, device)
    output_g = infer_pipe(pipe_g, image_input, task_name, seed, device)
    output_d = infer_pipe(pipe_d, image_input, task_name, seed, device)
    return output_g, output_d


def parse_args():
    '''Set the Args'''
    parser = argparse.ArgumentParser(
        description="Run Lotus..."
    )
    # model settings
    parser.add_argument(
        "--pretrained_model_name_or_path",
        type=str,
        default=None,
        help="pretrained model path from hugging face or local dir",
    )
    parser.add_argument(
        "--prediction_type",
        type=str,
        default="sample",
        help="The used prediction_type. ",
    )
    parser.add_argument(
        "--timestep",
        type=int,
        default=999,
    )
    parser.add_argument(
        "--mode",
        type=str,
        default="regression",  # "generation"
        help="Whether to use the generation or regression pipeline."
    )
    parser.add_argument(
        "--task_name",
        type=str,
        default="depth",  # "normal"
    )
    parser.add_argument(
        "--enable_xformers_memory_efficient_attention",
        action="store_true",
        help="Whether or not to use xformers."
    )

    # inference settings
    parser.add_argument("--seed", type=int, default=None, help="Random seed.")
    parser.add_argument(
        "--output_dir", type=str, required=True, help="Output directory."
    )
    parser.add_argument(
        "--input_dir", type=str, required=True, help="Input directory."
    )
    parser.add_argument(
        "--half_precision",
        action="store_true",
        help="Run with half-precision (16-bit float), might lead to suboptimal result.",
    )

    args = parser.parse_args()
    return args


def main():
    """Command-line entry point: predict for every image under ``--input_dir``.

    For each ``.png`` / ``.jpg`` found recursively, a colorized ``.png`` is
    written to ``<output_dir>/<task_name>_vis`` and the raw prediction is
    saved as ``.npy`` in ``<output_dir>/<task_name>``.

    Raises:
        ValueError: If ``--mode`` is neither ``generation`` nor ``regression``,
            or if ``--pretrained_model_name_or_path`` was not given.
    """
    logging.basicConfig(level=logging.INFO)
    logging.info("Run inference...")

    args = parse_args()

    # Fail early with a clear message instead of an obscure error raised from
    # inside ``from_pretrained``.
    if args.pretrained_model_name_or_path is None:
        raise ValueError('--pretrained_model_name_or_path must be provided.')

    # -------------------- Preparation --------------------
    # Random seed
    if args.seed is not None:
        seed_all(args.seed)

    # Output directories
    os.makedirs(args.output_dir, exist_ok=True)
    logging.info(f"Output dir = {args.output_dir}")

    output_dir_color = os.path.join(args.output_dir, f'{args.task_name}_vis')
    output_dir_npy = os.path.join(args.output_dir, f'{args.task_name}')
    os.makedirs(output_dir_color, exist_ok=True)
    os.makedirs(output_dir_npy, exist_ok=True)

    # half_precision
    # NOTE(review): float16 is used whether or not --half_precision is given
    # (the input tensor below is also hard-coded to float16), so the flag only
    # controls the log message. Confirm whether a float32 path is intended.
    dtype = torch.float16
    if args.half_precision:
        logging.info(f"Running with half precision ({dtype}).")

    # -------------------- Device --------------------
    if torch.cuda.is_available():
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")
        logging.warning("CUDA is not available. Running on CPU will be slow.")
    logging.info(f"Device = {device}")

    # -------------------- Data --------------------
    root_dir = Path(args.input_dir)
    test_images = sorted(list(root_dir.rglob('*.png')) + list(root_dir.rglob('*.jpg')))
    print('==> There are', len(test_images), 'images for validation.')

    # -------------------- Model --------------------
    if args.mode == 'generation':
        pipeline_cls = LotusGPipeline
    elif args.mode == 'regression':
        pipeline_cls = LotusDPipeline
    else:
        raise ValueError(f'Invalid mode: {args.mode}')
    pipeline = pipeline_cls.from_pretrained(
        args.pretrained_model_name_or_path,
        torch_dtype=dtype,
    )
    logging.info(f"Successfully loading pipeline from {args.pretrained_model_name_or_path}.")

    pipeline = pipeline.to(device)
    pipeline.set_progress_bar_config(disable=True)

    if args.enable_xformers_memory_efficient_attention:
        pipeline.enable_xformers_memory_efficient_attention()

    generator = _make_generator(args.seed, device)

    # -------------------- Inference and saving --------------------
    with torch.no_grad():
        for image_path in tqdm(test_images):
            # Preprocess validation image
            with Image.open(image_path) as opened:
                rgb_image = opened.convert('RGB')
            rgb_in = _to_model_input(rgb_image, device)
            task_emb = _make_task_emb(device)

            # Run
            pred = pipeline(
                rgb_in=rgb_in,
                prompt='',
                num_inference_steps=1,
                generator=generator,
                # guidance_scale=0,
                output_type='np',
                timesteps=[args.timestep],
                task_emb=task_emb,
            ).images[0]

            # Post-process the prediction
            # NOTE(review): outputs are keyed by file stem only, so two inputs
            # with the same stem (different sub-directory or extension) will
            # overwrite each other.
            save_file_name = image_path.stem
            output_npy, output_color = _prediction_to_outputs(pred, args.task_name)

            output_color.save(os.path.join(output_dir_color, f'{save_file_name}.png'))
            np.save(os.path.join(output_dir_npy, f'{save_file_name}.npy'), output_npy)

    print('==> Inference is done. \n==> Results saved to:', args.output_dir)


if __name__ == '__main__':
    main()