import sys
sys.path.append('./')

import os
import random

import gradio as gr
import spaces
import numpy as np
from PIL import Image
import cv2
import torch
from transformers import pipeline

# Install the bundled controlnet_aux package before importing from it.
os.system("pip install -e ./controlnet_aux")
from controlnet_aux import OpenposeDetector, CannyDetector
from depth_anything_v2.dpt import DepthAnythingV2
from huggingface_hub import hf_hub_download, login

hf_token = os.environ.get("HF_TOKEN_GATED")
if hf_token:
    login(token=hf_token)

MAX_SEED = np.iinfo(np.int32).max

# Translator: Korean prompts are translated to English before generation.
translator = pipeline("translation", model="Helsinki-NLP/opus-mt-ko-en")

def translate_to_english(text):
    # Only translate if the prompt contains Hangul syllables (U+AC00 to U+D7A3).
    if any('\uAC00' <= char <= '\uD7A3' for char in text):
        return translator(text, max_length=512)[0]['translation_text']
    return text

def randomize_seed_fn(seed: int, randomize_seed: bool) -> int:
    if randomize_seed:
        seed = random.randint(0, MAX_SEED)
    return seed

DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

# Depth-Anything-V2 encoder configurations.
model_configs = {
    'vits': {'encoder': 'vits', 'features': 64, 'out_channels': [48, 96, 192, 384]},
    'vitb': {'encoder': 'vitb', 'features': 128, 'out_channels': [96, 192, 384, 768]},
    'vitl': {'encoder': 'vitl', 'features': 256, 'out_channels': [256, 512, 1024, 1024]},
    'vitg': {'encoder': 'vitg', 'features': 384, 'out_channels': [1536, 1536, 1536, 1536]},
}

encoder = 'vitl'
model = DepthAnythingV2(**model_configs[encoder])
filepath = hf_hub_download(
    repo_id="depth-anything/Depth-Anything-V2-Large",
    filename="depth_anything_v2_vitl.pth",
    repo_type="model",
)
state_dict = torch.load(filepath, map_location="cpu")
model.load_state_dict(state_dict)
model = model.to(DEVICE).eval()

from diffusers.utils import load_image
from diffusers import FluxControlNetPipeline, FluxControlNetModel
from diffusers.models import FluxMultiControlNetModel

base_model = 'black-forest-labs/FLUX.1-dev'
controlnet_model = 'Shakker-Labs/FLUX.1-dev-ControlNet-Union-Pro'

controlnet = FluxControlNetModel.from_pretrained(controlnet_model, torch_dtype=torch.bfloat16)
controlnet = FluxMultiControlNetModel([controlnet])

pipe = FluxControlNetPipeline.from_pretrained(base_model, controlnet=controlnet, torch_dtype=torch.bfloat16)
pipe.to("cuda")

# Keys must match the Radio choices defined below; values are the Union-Pro control mode indices.
mode_mapping = {"Canny": 0, "Tile": 1, "Depth": 2, "Blur": 3, "OpenPose": 4, "Grayscale": 5, "LowQuality": 6}
strength_mapping = {"Canny": 0.65, "Tile": 0.45, "Depth": 0.55, "Blur": 0.45, "OpenPose": 0.55, "Grayscale": 0.45, "LowQuality": 0.4}

canny = CannyDetector()
open_pose = OpenposeDetector.from_pretrained("lllyasviel/Annotators")

torch.backends.cuda.matmul.allow_tf32 = True
pipe.vae.enable_tiling()
pipe.vae.enable_slicing()
pipe.enable_model_cpu_offload()  # for saving memory

def convert_from_image_to_cv2(img: Image.Image) -> np.ndarray:
    return cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR)

def convert_from_cv2_to_image(img: np.ndarray) -> Image.Image:
    return Image.fromarray(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))

def extract_depth(image):
    image = np.asarray(image)
    # Depth-Anything-V2 expects a BGR array; normalize the output to an 8-bit grayscale map.
    depth = model.infer_image(image[:, :, ::-1])
    depth = (depth - depth.min()) / (depth.max() - depth.min()) * 255.0
    depth = depth.astype(np.uint8)
    gray_depth = Image.fromarray(depth).convert('RGB')
    return gray_depth

def extract_openpose(img):
    processed_image_open_pose = open_pose(img, hand_and_face=True)
    return processed_image_open_pose

def extract_canny(image):
    processed_image_canny = canny(image)
    return processed_image_canny

def apply_gaussian_blur(image, kernel_size=(21, 21)):
    image = convert_from_image_to_cv2(image)
    blurred_image = convert_from_cv2_to_image(cv2.GaussianBlur(image, kernel_size, 0))
    return blurred_image
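
# Optional sanity check (a sketch, not part of the Gradio flow): the preprocessors above can
# be run standalone. The PREPROCESS_DEMO flag and "sample.jpg" are placeholders introduced
# here for illustration, not part of the original app.
if os.environ.get("PREPROCESS_DEMO"):
    _ref = Image.open("sample.jpg").convert("RGB")
    extract_canny(_ref).save("sample_canny.png")
    extract_depth(_ref).save("sample_depth.png")
    apply_gaussian_blur(_ref).save("sample_blur.png")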
def convert_to_grayscale(image):
    image = convert_from_image_to_cv2(image)
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    # Back to 3 channels so the BGR->RGB conversion in convert_from_cv2_to_image succeeds.
    gray_image = convert_from_cv2_to_image(cv2.cvtColor(gray, cv2.COLOR_GRAY2BGR))
    return gray_image

def add_gaussian_noise(image, mean=0, sigma=10):
    image = convert_from_image_to_cv2(image)
    noise = np.random.normal(mean, sigma, image.shape)
    noisy_image = convert_from_cv2_to_image(np.clip(image.astype(np.float32) + noise, 0, 255).astype(np.uint8))
    return noisy_image

def tile(input_image, resolution=768):
    input_image = convert_from_image_to_cv2(input_image)
    H, W, C = input_image.shape
    H = float(H)
    W = float(W)
    k = float(resolution) / min(H, W)
    H *= k
    W *= k
    # Snap the short side to `resolution` and both sides to multiples of 64.
    H = int(np.round(H / 64.0)) * 64
    W = int(np.round(W / 64.0)) * 64
    img = cv2.resize(input_image, (W, H), interpolation=cv2.INTER_LANCZOS4 if k > 1 else cv2.INTER_AREA)
    img = convert_from_cv2_to_image(img)
    return img

def resize_img(input_image, max_side=768, min_side=512, size=None,
               pad_to_max_side=False, mode=Image.BILINEAR, base_pixel_number=64):
    w, h = input_image.size
    if size is not None:
        w_resize_new, h_resize_new = size
    else:
        ratio = min_side / min(h, w)
        w, h = round(ratio * w), round(ratio * h)
        ratio = max_side / max(h, w)
        input_image = input_image.resize([round(ratio * w), round(ratio * h)], mode)
        w_resize_new = (round(ratio * w) // base_pixel_number) * base_pixel_number
        h_resize_new = (round(ratio * h) // base_pixel_number) * base_pixel_number
    input_image = input_image.resize([w_resize_new, h_resize_new], mode)

    if pad_to_max_side:
        res = np.ones([max_side, max_side, 3], dtype=np.uint8) * 255
        offset_x = (max_side - w_resize_new) // 2
        offset_y = (max_side - h_resize_new) // 2
        res[offset_y:offset_y + h_resize_new, offset_x:offset_x + w_resize_new] = np.array(input_image)
        input_image = Image.fromarray(res)
    return input_image

@spaces.GPU()
def infer(cond_in, image_in, prompt, inference_steps, guidance_scale, control_mode,
          control_strength, seed, progress=gr.Progress(track_tqdm=True)):

    control_mode_num = mode_mapping[control_mode]
    prompt = translate_to_english(prompt)

    if cond_in is None:
        if image_in is None:
            raise gr.Error("Please upload either a processed control image or a reference image.")
        # Derive the condition image from the reference image with the selected preprocessor.
        image_in = resize_img(load_image(image_in))
        if control_mode == "Canny":
            control_image = extract_canny(image_in)
        elif control_mode == "Depth":
            control_image = extract_depth(image_in)
        elif control_mode == "OpenPose":
            control_image = extract_openpose(image_in)
        elif control_mode == "Blur":
            control_image = apply_gaussian_blur(image_in)
        elif control_mode == "LowQuality":
            control_image = add_gaussian_noise(image_in)
        elif control_mode == "Grayscale":
            control_image = convert_to_grayscale(image_in)
        elif control_mode == "Tile":
            control_image = tile(image_in)
    else:
        control_image = resize_img(load_image(cond_in))

    width, height = control_image.size

    image = pipe(
        prompt,
        control_image=[control_image],
        control_mode=[control_mode_num],
        width=width,
        height=height,
        controlnet_conditioning_scale=[control_strength],
        num_inference_steps=inference_steps,
        guidance_scale=guidance_scale,
        generator=torch.manual_seed(seed),
    ).images[0]

    torch.cuda.empty_cache()

    return image, control_image
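
# Illustrative smoke test (a sketch, not part of the app; skipped unless INFER_DEMO is set).
# It shows the positional argument order the Gradio wiring below relies on; the INFER_DEMO
# flag and "pose_ref.jpg" are placeholders introduced here, not files shipped with this Space.
if os.environ.get("INFER_DEMO"):
    demo_image, demo_cond = infer(None, "pose_ref.jpg", "a standing figure, studio lighting",
                                  24, 3.5, "OpenPose", 0.55, 42)
    demo_image.save("demo_result.png")
    demo_cond.save("demo_condition.png")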
css = """
footer {
    visibility: hidden;
}
"""

with gr.Blocks(theme="Yntec/HaleyCH_Theme_Orange", css=css) as demo:
    with gr.Column(elem_id="col-container"):
        with gr.Column():
            with gr.Row():
                with gr.Column():
                    with gr.Row(equal_height=True):
                        cond_in = gr.Image(label="Upload Processed Control Image", sources=["upload"], type="filepath")
                        image_in = gr.Image(label="Extract Condition from Reference Image (Optional)", sources=["upload"], type="filepath")

                    prompt = gr.Textbox(label="Prompt", value="Highest Quality")

                    with gr.Accordion("ControlNet"):
                        control_mode = gr.Radio(
                            ["Canny", "Depth", "OpenPose", "Grayscale", "Blur", "Tile", "LowQuality"],
                            label="Mode",
                            value="Grayscale",
                            info="Select control mode, applies to all images",
                        )

                        control_strength = gr.Slider(
                            label="Control Strength",
                            minimum=0,
                            maximum=1.0,
                            step=0.05,
                            value=0.50,
                        )

                        seed = gr.Slider(
                            label="Seed",
                            minimum=0,
                            maximum=MAX_SEED,
                            step=1,
                            value=42,
                        )
                        randomize_seed = gr.Checkbox(label="Randomize Seed", value=True)

                    with gr.Accordion("Advanced Settings", open=False):
                        with gr.Column():
                            with gr.Row():
                                inference_steps = gr.Slider(label="Inference Steps", minimum=1, maximum=50, step=1, value=24)
                                guidance_scale = gr.Slider(label="Guidance Scale", minimum=1.0, maximum=10.0, step=0.1, value=3.5)

                    submit_btn = gr.Button("Submit")

                with gr.Column():
                    result = gr.Image(label="Result")
                    processed_cond = gr.Image(label="Preprocessed Condition")

    submit_btn.click(
        fn=randomize_seed_fn,
        inputs=[seed, randomize_seed],
        outputs=seed,
        queue=False,
        api_name=False,
    ).then(
        fn=infer,
        inputs=[cond_in, image_in, prompt, inference_steps, guidance_scale, control_mode, control_strength, seed],
        outputs=[result, processed_cond],
        show_api=False,
    )

demo.queue(api_open=False)
demo.launch()
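
# To run this app locally (a sketch of the assumed setup): you need a CUDA GPU with enough
# memory for FLUX.1-dev in bfloat16 plus the Union-Pro ControlNet, and a Hugging Face token
# that has been granted access to the gated black-forest-labs/FLUX.1-dev repository.
# "app.py" below stands for whatever name this file is saved under.
#
#   export HF_TOKEN_GATED=<your_hf_token>
#   python app.py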