Spaces:

Ashoka74
/

Demo_Refurnish

Running on Zero

File size: 63,918 Bytes

import spaces
import argparse
import random

import os
import math
import gradio as gr
import numpy as np
import torch
import safetensors.torch as sf
import datetime
from pathlib import Path
from io import BytesIO



from PIL import Image
from diffusers import StableDiffusionPipeline, StableDiffusionImg2ImgPipeline
from diffusers import AutoencoderKL, UNet2DConditionModel, DDIMScheduler, EulerAncestralDiscreteScheduler, DPMSolverMultistepScheduler
from diffusers.models.attention_processor import AttnProcessor2_0
from transformers import CLIPTextModel, CLIPTokenizer
import dds_cloudapi_sdk
from dds_cloudapi_sdk import Config, Client, TextPrompt
from dds_cloudapi_sdk.tasks.dinox import DinoxTask
from dds_cloudapi_sdk.tasks import DetectionTarget
from dds_cloudapi_sdk.tasks.detection import DetectionTask

from enum import Enum
from torch.hub import download_url_to_file
import tempfile

from sam2.build_sam import build_sam2

from sam2.sam2_image_predictor import SAM2ImagePredictor
import cv2

from transformers import AutoModelForImageSegmentation
from inference_i2mv_sdxl import prepare_pipeline, remove_bg, run_pipeline
from torchvision import transforms


from typing import Optional

from depth_anything_v2.dpt import DepthAnythingV2

import httpx

# Module-wide HTTP client; the single Timeout value applies a 10 second limit
# to requests issued through it.
client = httpx.Client(timeout=httpx.Timeout(10.0))  # Set timeout to 10 seconds
# Number of views requested from the multi-view pipeline (passed as
# ``num_views`` to prepare_pipeline / run_pipeline below).
NUM_VIEWS = 6
# Output resolution handed to run_pipeline in infer().
HEIGHT = 768
WIDTH = 768
# Largest seed value infer() will draw when asked to randomize the seed.
MAX_SEED = np.iinfo(np.int32).max



import supervision as sv
import torch
from PIL import Image

import logging

# Configure root logging once for the whole app: INFO level, timestamped lines.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(message)s')

# Preprocessing applied to an image before it is fed to the background-removal
# model in run_rmbg(): fixed 1024x1024 resize, tensor conversion, then a
# three-channel mean/std normalisation.
# NOTE(review): the constants look like the usual ImageNet statistics - confirm
# they match what the segmentation checkpoint expects.
transform_image = transforms.Compose(
    [
        transforms.Resize((1024, 1024)),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
    ]
)

# Local checkpoint / config locations

# Local paths (relative to the working directory) where each asset is expected.
model_path = './models/iclight_sd15_fc.safetensors'
model_path2 = './checkpoints/depth_anything_v2_vits.pth'
model_path3 = './checkpoints/sam2_hiera_large.pt'
model_path4 = './checkpoints/config.json'
model_path5 = './checkpoints/preprocessor_config.json'
model_path6 = './configs/sam2_hiera_l.yaml'
model_path7 = './mvadapter_i2mv_sdxl.safetensors'

# Base URL for the repository; a remote file name is appended to it verbatim.
BASE_URL = 'https://huggingface.co/Ashoka74/Placement/resolve/main/'

# Maps each local path to the remote file name (relative to BASE_URL) it is
# downloaded from when missing.
model_urls = {
    model_path: 'iclight_sd15_fc.safetensors',
    model_path2: 'depth_anything_v2_vits.pth',
    model_path3: 'sam2_hiera_large.pt',
    model_path4: 'config.json',
    model_path5: 'preprocessor_config.json',
    model_path6: 'sam2_hiera_l.yaml',
    model_path7: 'mvadapter_i2mv_sdxl.safetensors'
}

# Ensure directories exist
def ensure_directories():
    """Create the parent directory of every local path listed in ``model_urls``.

    Existing directories are left untouched. A path without a directory
    component (``os.path.dirname`` returns ``''``) is skipped, because
    ``os.makedirs('')`` raises ``FileNotFoundError``.
    """
    for path in model_urls:
        parent = os.path.dirname(path)
        if parent:
            os.makedirs(parent, exist_ok=True)

# Download models
def download_models():
    """Download every file in ``model_urls`` that is not already on disk.

    The remote location is ``BASE_URL`` followed by the mapped file name.
    This is best effort: a failed download is reported on stdout and the loop
    moves on to the next file instead of raising.
    """
    for target, filename in model_urls.items():
        if os.path.exists(target):
            # Already present locally; nothing to fetch.
            continue
        try:
            remote = f"{BASE_URL}{filename}"
            print(f"Downloading {filename}")
            download_url_to_file(remote, target)
            print(f"Successfully downloaded {filename}")
        except Exception as e:
            print(f"Error downloading {filename}: {e}")

# Prepare the local folders, then fetch any missing checkpoints, before any
# model below tries to read them.
ensure_directories()

download_models()





# Probe for the optional xformers package; the flag is consulted later by
# enable_efficient_attention().
try:
    import xformers
    import xformers.ops
    XFORMERS_AVAILABLE = True
    print("xformers is available - Using memory efficient attention")
except ImportError:
    XFORMERS_AVAILABLE = False
    print("xformers not available - Using default attention")

# Memory optimizations for RTX 2070
torch.backends.cudnn.benchmark = True
if torch.cuda.is_available():
    torch.backends.cuda.matmul.allow_tf32 = True
    torch.backends.cudnn.allow_tf32 = True
    # Set a smaller attention slice size for RTX 2070
    # NOTE(review): this only sets an ad-hoc attribute on the module; the CUDA
    # caching allocator's max_split_size_mb is normally configured through the
    # PYTORCH_CUDA_ALLOC_CONF environment variable - confirm this line has the
    # intended effect.
    torch.backends.cuda.max_split_size_mb = 512
    device = torch.device('cuda')
else:
    device = torch.device('cpu')



# Base Stable Diffusion 1.5-style checkpoint; the commented names are the
# alternatives that have been used here.
# 'stablediffusionapi/realistic-vision-v51'
# 'runwayml/stable-diffusion-v1-5'
sd15_name = 'stablediffusionapi/realistic-vision-v51'
tokenizer = CLIPTokenizer.from_pretrained(sd15_name, subfolder="tokenizer")
text_encoder = CLIPTextModel.from_pretrained(sd15_name, subfolder="text_encoder")
vae = AutoencoderKL.from_pretrained(sd15_name, subfolder="vae")
unet = UNet2DConditionModel.from_pretrained(sd15_name, subfolder="unet")

# Background-removal model (used by run_rmbg).
from transformers import AutoModelForImageSegmentation
# rmbg = AutoModelForImageSegmentation.from_pretrained("briaai/RMBG-2.0", trust_remote_code=True)#, token=os.getenv('token'))
# rmbg = rmbg.to(device=device, dtype=torch.float32)  # Keep this as float32

# NOTE(review): trust_remote_code=True lets the model repository supply Python
# code that is executed locally - keep the repository pinned/trusted.
rmbg = AutoModelForImageSegmentation.from_pretrained(
            "ZhengPeng7/BiRefNet", trust_remote_code=True
        )
rmbg = rmbg.to(device=device, dtype=torch.float32)

# remove bg
# rmbg = AutoModelForImageSegmentation.from_pretrained(
#     "ZhengPeng7/BiRefNet", trust_remote_code=True
# )
# rmbg = rmbg.to(device)

# Depth estimator ('vits' encoder); the weights are read from the same file
# that model_path2 points at, and the module is put in eval mode.
model = DepthAnythingV2(encoder='vits', features=64, out_channels=[48, 96, 192, 384])
model.load_state_dict(torch.load('checkpoints/depth_anything_v2_vits.pth', map_location=device))
model = model.to(device)
model.eval()

# Change UNet: replace the input convolution with an 8-input-channel one so
# that conditioning latents can be concatenated to the noisy latents (see
# hooked_unet_forward).


with torch.no_grad():
    new_conv_in = torch.nn.Conv2d(8, unet.conv_in.out_channels, unet.conv_in.kernel_size, unet.conv_in.stride, unet.conv_in.padding)
    # Start from all-zero weights, then copy the pretrained kernel into the
    # first 4 input channels; the extra input channels therefore contribute
    # nothing until the offsets are merged into the UNet further down.
    new_conv_in.weight.zero_()
    new_conv_in.weight[:, :4, :, :].copy_(unet.conv_in.weight)
    new_conv_in.bias = unet.conv_in.bias
    unet.conv_in = new_conv_in


# Keep a handle on the unpatched forward; hooked_unet_forward delegates to it.
unet_original_forward = unet.forward


def enable_efficient_attention():
    """Select the attention implementation for the module-level UNet and VAE.

    When xformers was importable, its memory-efficient attention is switched
    on for both models. If that fails, or xformers is unavailable, both models
    get ``AttnProcessor2_0`` instead. Progress is reported on stdout.

    NOTE(review): the printed messages mention "sliced attention", but the
    fallback actually installed is ``AttnProcessor2_0``.
    """
    if XFORMERS_AVAILABLE:
        try:
            # RTX 2070 specific settings
            unet.set_use_memory_efficient_attention_xformers(True)
            vae.set_use_memory_efficient_attention_xformers(True)
            print("Enabled xformers memory efficient attention")
            return
        except Exception as e:
            print(f"Xformers error: {e}")
            print("Falling back to sliced attention")
    else:
        # Fallback for when xformers is not available
        print("Using sliced attention")

    # Shared fallback path for both "xformers failed" and "xformers missing".
    for module in (unet, vae):
        module.set_attn_processor(AttnProcessor2_0())

# Add memory clearing function
def clear_memory():
    """Release cached CUDA memory and wait for pending GPU work.

    Does nothing when CUDA is not available.
    """
    if not torch.cuda.is_available():
        return
    torch.cuda.empty_cache()
    torch.cuda.synchronize()

# Pick the attention implementation for the UNet and VAE once, at import time.
enable_efficient_attention()


def hooked_unet_forward(sample, timestep, encoder_hidden_states, **kwargs):
    """Forward wrapper that appends conditioning latents to the UNet input.

    The conditioning tensor is read from
    ``kwargs['cross_attention_kwargs']['concat_conds']``, moved to the device
    and dtype of ``sample``, repeated along the batch axis until it matches
    ``sample``'s batch size, and concatenated to ``sample`` on the channel
    axis. ``cross_attention_kwargs`` is then replaced by an empty dict before
    delegating to the original forward.
    """
    cond_latents = kwargs['cross_attention_kwargs']['concat_conds'].to(sample)
    # The batch may be larger than the conditioning batch (several samples
    # and/or guidance duplicates), so tile the conditioning to match.
    copies = sample.shape[0] // cond_latents.shape[0]
    cond_latents = torch.cat([cond_latents] * copies, dim=0)
    kwargs['cross_attention_kwargs'] = {}
    widened_sample = torch.cat([sample, cond_latents], dim=1)
    return unet_original_forward(widened_sample, timestep, encoder_hidden_states, **kwargs)


# Route every UNet call through the hook so the conditioning latents supplied
# via cross_attention_kwargs are concatenated to the input.
unet.forward = hooked_unet_forward




# The safetensors file stores per-parameter offsets: each tensor is added to
# the base UNet weight of the same name. strict=True makes the load fail if
# the merged dict does not match the UNet's parameter names.
sd_offset = sf.load_file(model_path)
sd_origin = unet.state_dict()
keys = sd_origin.keys()
sd_merged = {k: sd_origin[k] + sd_offset[k] for k in sd_origin.keys()}
unet.load_state_dict(sd_merged, strict=True)
# Drop the temporary state dicts so they do not stay resident.
del sd_offset, sd_origin, sd_merged, keys

# Device

# device = torch.device('cuda')
# text_encoder = text_encoder.to(device=device, dtype=torch.float16)
# vae = vae.to(device=device, dtype=torch.bfloat16)
# unet = unet.to(device=device, dtype=torch.float16)
# rmbg = rmbg.to(device=device, dtype=torch.float32)


# Device and dtype setup
# NOTE(review): this unconditionally overrides the CUDA/CPU choice made
# earlier in the file, so the CPU fallback is never effective from here on.
device = torch.device('cuda')
#dtype = torch.float16  # RTX 2070 works well with float16
dtype = torch.bfloat16


# Multi-view pipeline used by infer(); built for NUM_VIEWS views on the
# device/dtype selected above.
pipe = prepare_pipeline(
    base_model="stabilityai/stable-diffusion-xl-base-1.0",
    vae_model="madebyollin/sdxl-vae-fp16-fix",
    unet_model=None,
    lora_model=None,
    adapter_path="huanngzh/mv-adapter",
    scheduler=None,
    num_views=NUM_VIEWS,
    device=device,
    dtype=dtype,
)

# Memory optimizations for RTX 2070
# torch.backends.cudnn.benchmark = True
# if torch.cuda.is_available():
#     torch.backends.cuda.matmul.allow_tf32 = True
#     torch.backends.cudnn.allow_tf32 = True
#     # Set a very small attention slice size for RTX 2070 to avoid OOM
#     torch.backends.cuda.max_split_size_mb = 128

# Move models to device with consistent dtype
text_encoder = text_encoder.to(device=device, dtype=dtype)
vae = vae.to(device=device, dtype=dtype)  # uses the shared dtype above (currently bfloat16)
unet = unet.to(device=device, dtype=dtype)
#rmbg = rmbg.to(device=device, dtype=torch.float32)  # Keep this as float32
# Only the device is changed here; rmbg keeps the float32 dtype it was given
# when it was loaded.
rmbg = rmbg.to(device)

# Three schedulers are constructed with the same beta range; in the visible
# part of this file only dpmpp_2m_sde_karras_scheduler is wired into the
# pipelines below.
ddim_scheduler = DDIMScheduler(
    num_train_timesteps=1000,
    beta_start=0.00085,
    beta_end=0.012,
    beta_schedule="scaled_linear",
    clip_sample=False,
    set_alpha_to_one=False,
    steps_offset=1,
)

euler_a_scheduler = EulerAncestralDiscreteScheduler(
    num_train_timesteps=1000,
    beta_start=0.00085,
    beta_end=0.012,
    steps_offset=1
)

dpmpp_2m_sde_karras_scheduler = DPMSolverMultistepScheduler(
    num_train_timesteps=1000,
    beta_start=0.00085,
    beta_end=0.012,
    algorithm_type="sde-dpmsolver++",
    use_karras_sigmas=True,
    steps_offset=1
)

# Pipelines
# Both pipelines share the same VAE, text encoder, tokenizer, patched UNet and
# scheduler instance; the safety checker is disabled.

# Text-to-image: used when generation starts from pure noise.
t2i_pipe = StableDiffusionPipeline(
    vae=vae,
    text_encoder=text_encoder,
    tokenizer=tokenizer,
    unet=unet,
    scheduler=dpmpp_2m_sde_karras_scheduler,
    safety_checker=None,
    requires_safety_checker=False,
    feature_extractor=None,
    image_encoder=None
)

# Image-to-image: used when generation starts from existing latents.
i2i_pipe = StableDiffusionImg2ImgPipeline(
    vae=vae,
    text_encoder=text_encoder,
    tokenizer=tokenizer,
    unet=unet,
    scheduler=dpmpp_2m_sde_karras_scheduler,
    safety_checker=None,
    requires_safety_checker=False,
    feature_extractor=None,
    image_encoder=None
)


@torch.inference_mode()
def encode_prompt_inner(txt: str):
    """Encode a prompt of arbitrary length with the module-level text encoder.

    The prompt is tokenized without truncation and split into chunks of
    ``tokenizer.model_max_length - 2`` tokens. Every chunk is wrapped in
    BOS/EOS and padded (with EOS) or cut to ``model_max_length`` before all
    chunks are encoded as one batch.

    An empty prompt still produces exactly one chunk (BOS + EOS + padding), so
    the result is never an empty batch.

    Args:
        txt: Prompt text.

    Returns:
        ``last_hidden_state`` of the text encoder, one row per chunk.
    """
    max_length = tokenizer.model_max_length
    chunk_length = max_length - 2  # room for the BOS and EOS tokens
    id_start = tokenizer.bos_token_id
    id_end = tokenizer.eos_token_id
    id_pad = id_end

    def pad(x, p, i):
        return x[:i] if len(x) >= i else x + [p] * (i - len(x))

    tokens = tokenizer(txt, truncation=False, add_special_tokens=False)["input_ids"]
    # max(..., 1) guarantees one iteration for an empty token list; without it
    # an empty prompt yields zero chunks and the callers break on the empty
    # batch (division by zero when balancing chunk counts).
    chunk_starts = range(0, max(len(tokens), 1), chunk_length)
    chunks = [[id_start] + tokens[i: i + chunk_length] + [id_end] for i in chunk_starts]
    chunks = [pad(ck, id_pad, max_length) for ck in chunks]

    token_ids = torch.tensor(chunks).to(device=device, dtype=torch.int64)
    conds = text_encoder(token_ids).last_hidden_state

    return conds


@torch.inference_mode()
def encode_prompt_pair(positive_prompt, negative_prompt):
    """Encode a positive/negative prompt pair into equally long embeddings.

    Each prompt is encoded chunk-wise by ``encode_prompt_inner``. The prompt
    with fewer chunks is repeated (and cut) until both have the same chunk
    count, then the chunks of each prompt are laid end to end along the
    sequence axis so each result has a leading batch dimension of 1.

    Returns:
        Tuple ``(positive_embeddings, negative_embeddings)``.
    """
    positive = encode_prompt_inner(positive_prompt)
    negative = encode_prompt_inner(negative_prompt)

    target_chunks = max(len(positive), len(negative))

    def _tile_and_flatten(embeds):
        # Repeat whole-prompt embeddings often enough to reach the target
        # chunk count, trim the excess, then join chunks on the sequence axis.
        repeats = int(math.ceil(float(target_chunks) / float(len(embeds))))
        tiled = torch.cat([embeds] * repeats, dim=0)[:target_chunks]
        return torch.cat([chunk[None, ...] for chunk in tiled], dim=1)

    return _tile_and_flatten(positive), _tile_and_flatten(negative)

# @spaces.GPU(duration=60)
# @torch.inference_mode()
@spaces.GPU(duration=60)
@torch.inference_mode()
def infer(
    prompt,
    image,  # This is already RGBA with background removed
    do_rembg=True,
    seed=42,
    randomize_seed=False,
    guidance_scale=3.0,
    num_inference_steps=50,
    reference_conditioning_scale=1.0,
    negative_prompt="watermark, ugly, deformed, noisy, blurry, low contrast",
    progress=gr.Progress(track_tqdm=True),
):
    """Generate ``NUM_VIEWS`` views of the reference image with the multi-view pipeline.

    Args:
        prompt: Text prompt forwarded to ``run_pipeline``.
        image: Reference image; a NumPy array is converted to a PIL image
            (RGBA when it has four channels, RGB otherwise), anything else is
            passed through unchanged.
        do_rembg: Accepted for interface compatibility; not used, because no
            background-removal callback is handed to the pipeline.
        seed: Seed used unless ``randomize_seed`` is set.
        randomize_seed: Draw a fresh seed in ``[0, MAX_SEED]`` when true.
        guidance_scale, num_inference_steps, reference_conditioning_scale,
        negative_prompt: Forwarded to ``run_pipeline`` unchanged.
        progress: Gradio progress tracker.

    Returns:
        The generated images returned by ``run_pipeline``; the preprocessed
        reference image it also returns is discarded.
    """
    if isinstance(image, np.ndarray):
        # Four channels means an alpha channel is present.
        pil_mode = 'RGBA' if image.shape[-1] == 4 else 'RGB'
        image = Image.fromarray(image, pil_mode)

    if randomize_seed:
        seed = random.randint(0, MAX_SEED)

    pipeline_args = dict(
        num_views=NUM_VIEWS,
        text=prompt,
        image=image,
        height=HEIGHT,
        width=WIDTH,
        num_inference_steps=num_inference_steps,
        guidance_scale=guidance_scale,
        seed=seed,
        remove_bg_fn=None,  # preprocessing is expected to be done already
        reference_conditioning_scale=reference_conditioning_scale,
        negative_prompt=negative_prompt,
        device=device,
    )
    views, _preprocessed = run_pipeline(pipe, **pipeline_args)
    return views


@spaces.GPU(duration=60)
@torch.inference_mode()
def pytorch2numpy(imgs, quant=True):
    """Convert channel-first image tensors into channel-last NumPy arrays.

    Args:
        imgs: Iterable of tensors whose first axis is the channel axis
            (iterating a batched tensor yields exactly that).
        quant: When true, map values from [-1, 1] to uint8 [0, 255];
            otherwise map them to float32 [0, 1]. Out-of-range values are
            clipped in both cases.

    Returns:
        List with one array per input image.
    """
    if quant:
        gain, offset, upper, out_dtype = 127.5, 127.5, 255, np.uint8
    else:
        gain, offset, upper, out_dtype = 0.5, 0.5, 1, np.float32

    return [
        (img.movedim(0, -1) * gain + offset)
        .detach().float().cpu().numpy()
        .clip(0, upper).astype(out_dtype)
        for img in imgs
    ]

@spaces.GPU(duration=60)
@torch.inference_mode()
def numpy2pytorch(imgs):
    """Stack channel-last images into one channel-first float tensor.

    Pixel values are divided by 127 and shifted by -1, so the value 127 maps
    to exactly 0.0.

    Args:
        imgs: Sequence of equally shaped channel-last arrays.

    Returns:
        Float tensor with the batch axis first and the channel axis second.
    """
    stacked = np.stack(imgs, axis=0)
    tensor = torch.from_numpy(stacked).float()
    tensor = tensor / 127.0 - 1.0  # so that 127 must be strictly 0.0
    return tensor.movedim(-1, 1)


def resize_and_center_crop(image, target_width, target_height):
    """Scale an image to cover the target size, then crop it around the centre.

    The image is resized with LANCZOS so that both sides are at least as large
    as the target (aspect ratio preserved), and the centred
    ``target_width`` x ``target_height`` window is returned.

    The crop box is built from an integer top-left corner plus the target
    size. Computing the four edges as independent fractions lets PIL round
    each edge separately, which can produce a crop one pixel larger than
    requested.

    Args:
        image: Array accepted by ``Image.fromarray``.
        target_width: Width of the result in pixels.
        target_height: Height of the result in pixels.

    Returns:
        The cropped image as a NumPy array.
    """
    pil_image = Image.fromarray(image)
    original_width, original_height = pil_image.size
    scale_factor = max(target_width / original_width, target_height / original_height)
    resized_width = int(round(original_width * scale_factor))
    resized_height = int(round(original_height * scale_factor))
    resized_image = pil_image.resize((resized_width, resized_height), Image.LANCZOS)

    # Anchor the window on an integer corner so width/height are exact.
    left = (resized_width - target_width) // 2
    top = (resized_height - target_height) // 2
    box = (left, top, left + target_width, top + target_height)
    return np.array(resized_image.crop(box))


def resize_without_crop(image, target_width, target_height):
    """Resize an image array to exactly the target size, ignoring aspect ratio.

    Uses LANCZOS resampling and returns a NumPy array.
    """
    target_size = (target_width, target_height)
    return np.array(Image.fromarray(image).resize(target_size, Image.LANCZOS))

# @spaces.GPU(duration=60)
# @torch.inference_mode()
# def run_rmbg(img, sigma=0.0):
#     # Convert RGBA to RGB if needed
#     if img.shape[-1] == 4:
#         # Use white background for alpha composition
#         alpha = img[..., 3:] / 255.0
#         rgb = img[..., :3]
#         white_bg = np.ones_like(rgb) * 255
#         img = (rgb * alpha + white_bg * (1 - alpha)).astype(np.uint8)
    
#     H, W, C = img.shape
#     assert C == 3
#     k = (256.0 / float(H * W)) ** 0.5
#     feed = resize_without_crop(img, int(64 * round(W * k)), int(64 * round(H * k)))
#     feed = numpy2pytorch([feed]).to(device=device, dtype=torch.float32)
#     alpha = rmbg(feed)[0][0]
#     alpha = torch.nn.functional.interpolate(alpha, size=(H, W), mode="bilinear")
#     alpha = alpha.movedim(1, -1)[0]
#     alpha = alpha.detach().float().cpu().numpy().clip(0, 1)
    
#     # Create RGBA image
#     rgba = np.dstack((img, alpha * 255)).astype(np.uint8)
#     result = 127 + (img.astype(np.float32) - 127 + sigma) * alpha
#     return result.clip(0, 255).astype(np.uint8), rgba

@spaces.GPU
@torch.inference_mode()
def run_rmbg(image):
    """Remove the background of a PIL image by predicting an alpha matte.

    The image is converted to RGB, preprocessed with ``transform_image`` and
    passed through the ``rmbg`` segmentation model. The sigmoid of the model's
    last output is resized to the input size and attached as alpha channel.

    The caller's image is not modified: the matte is attached to an RGB copy,
    so the result is always RGBA regardless of the input mode.

    Args:
        image: PIL image of any mode convertible to RGB.

    Returns:
        A new RGBA PIL image with the predicted matte as alpha channel.
    """
    # The normalisation in transform_image is defined for three channels, so
    # feed an RGB copy (this also keeps the input image untouched).
    rgb_image = image.convert("RGB")
    # Use the module-level device the model was moved to instead of a
    # hard-coded "cuda" string.
    input_images = transform_image(rgb_image).unsqueeze(0).to(device)

    # Gradients are already disabled by the inference_mode decorator.
    preds = rmbg(input_images)[-1].sigmoid().cpu()
    pred = preds[0].squeeze()

    mask = transforms.ToPILImage()(pred).resize(rgb_image.size)
    rgb_image.putalpha(mask)
    return rgb_image



def preprocess_image(image: Image.Image, height=768, width=768):
    """Centre the opaque part of an image on a grey ``height`` x ``width`` canvas.

    The bounding box of all pixels with non-zero alpha is cropped, scaled so
    that its longer side fills 90% of the canvas, pasted in the centre and
    alpha-composited over mid grey (0.5).

    Args:
        image: Image with an alpha channel. A PIL image in another mode is
            converted to RGBA first (fully opaque).
        height: Canvas height in pixels.
        width: Canvas width in pixels.

    Returns:
        RGB PIL image of size ``width`` x ``height``. If the input has no
        opaque pixel at all, a uniform grey canvas is returned.
    """
    if isinstance(image, Image.Image) and image.mode != "RGBA":
        # Without an alpha channel the mask lookup below would fail.
        image = image.convert("RGBA")
    image = np.array(image)
    alpha = image[..., 3] > 0
    H, W = alpha.shape
    # get the bounding box of alpha
    y, x = np.where(alpha)
    if y.size == 0:
        # Nothing is visible: min()/max() of an empty array would raise.
        # Return what compositing an empty canvas over 0.5 grey produces.
        return Image.fromarray(np.full((height, width, 3), 127, dtype=np.uint8))
    y0, y1 = max(y.min() - 1, 0), min(y.max() + 1, H)
    x0, x1 = max(x.min() - 1, 0), min(x.max() + 1, W)
    image_center = image[y0:y1, x0:x1]
    # resize the longer side to 90% of the canvas; keep the shorter side at
    # least one pixel so extremely thin objects do not resize to zero
    H, W, _ = image_center.shape
    if H > W:
        W = max(1, int(W * (height * 0.9) / H))
        H = int(height * 0.9)
    else:
        H = max(1, int(H * (width * 0.9) / W))
        W = int(width * 0.9)
    image_center = np.array(Image.fromarray(image_center).resize((W, H)))
    # pad to the canvas size, keeping the object centred
    start_h = (height - H) // 2
    start_w = (width - W) // 2
    image = np.zeros((height, width, 4), dtype=np.uint8)
    image[start_h : start_h + H, start_w : start_w + W] = image_center
    # composite over 0.5 grey using the alpha channel
    image = image.astype(np.float32) / 255.0
    image = image[:, :, :3] * image[:, :, 3:4] + (1 - image[:, :, 3:4]) * 0.5
    image = (image * 255).clip(0, 255).astype(np.uint8)
    image = Image.fromarray(image)
    return image
  

@spaces.GPU(duration=60)
@torch.inference_mode()
def process(input_fg, prompt, image_width, image_height, num_samples, seed, steps, a_prompt, n_prompt, cfg, highres_scale, highres_denoise, lowres_denoise, bg_source):
    """Relight a foreground image with the foreground-conditioned UNet.

    Generation runs in two passes. The first pass works at the resolution of
    ``input_fg``: it starts from pure noise when no initial background exists,
    otherwise it runs img2img on a synthetic lighting background. The result
    is upscaled by ``highres_scale`` (snapped to multiples of 64), refined with
    a second img2img pass, and resized back to the input resolution.

    Args:
        input_fg: Foreground image array; its height/width define the working
            resolution. Presumably H x W x 3 uint8 - confirm against callers.
        prompt: Main prompt; ``a_prompt`` is appended after a comma.
        image_width, image_height: Accepted for interface compatibility; the
            size of ``input_fg`` is used instead.
        num_samples: Images generated per prompt.
        seed: Seed for the torch generator (converted with ``int``).
        steps: Base number of inference steps.
        a_prompt: Text appended to ``prompt``.
        n_prompt: Negative prompt.
        cfg: Guidance scale.
        highres_scale: Upscale factor applied before the second pass.
        highres_denoise: img2img strength of the second pass.
        lowres_denoise: img2img strength of the first pass (only used when an
            initial background exists).
        bg_source: ``BGSource`` member or its string value selecting the
            initial background.

    Returns:
        Stacked uint8 array with one relit image per sample, at the resolution
        of ``input_fg``.

    Raises:
        ValueError: If ``bg_source`` is not a valid ``BGSource``.
    """
    clear_memory()

    # Get input dimensions
    input_height, input_width = input_fg.shape[:2]

    bg_source = BGSource(bg_source)

    if bg_source in (BGSource.UPLOAD, BGSource.UPLOAD_FLIP):
        # This entry point receives no background image, so the upload modes
        # have nothing to initialise from: start from pure noise instead.
        input_bg = None
    elif bg_source == BGSource.GREY:
        input_bg = np.zeros(shape=(input_height, input_width, 3), dtype=np.uint8) + 64
    elif bg_source == BGSource.LEFT:
        gradient = np.linspace(255, 0, input_width)
        image = np.tile(gradient, (input_height, 1))
        input_bg = np.stack((image,) * 3, axis=-1).astype(np.uint8)
    elif bg_source == BGSource.RIGHT:
        gradient = np.linspace(0, 255, input_width)
        image = np.tile(gradient, (input_height, 1))
        input_bg = np.stack((image,) * 3, axis=-1).astype(np.uint8)
    elif bg_source == BGSource.TOP:
        gradient = np.linspace(255, 0, input_height)[:, None]
        image = np.tile(gradient, (1, input_width))
        input_bg = np.stack((image,) * 3, axis=-1).astype(np.uint8)
    elif bg_source == BGSource.BOTTOM:
        gradient = np.linspace(0, 255, input_height)[:, None]
        image = np.tile(gradient, (1, input_width))
        input_bg = np.stack((image,) * 3, axis=-1).astype(np.uint8)
    else:
        raise ValueError('Wrong initial latent!')

    rng = torch.Generator(device=device).manual_seed(int(seed))

    # Use input dimensions directly
    fg = resize_without_crop(input_fg, input_width, input_height)

    # The encoded foreground is the conditioning that the UNet hook
    # concatenates to the noisy latents.
    concat_conds = numpy2pytorch([fg]).to(device=vae.device, dtype=vae.dtype)
    concat_conds = vae.encode(concat_conds).latent_dist.mode() * vae.config.scaling_factor

    conds, unconds = encode_prompt_pair(positive_prompt=prompt + ', ' + a_prompt, negative_prompt=n_prompt)

    if input_bg is None:
        latents = t2i_pipe(
            prompt_embeds=conds,
            negative_prompt_embeds=unconds,
            width=input_width,
            height=input_height,
            num_inference_steps=steps,
            num_images_per_prompt=num_samples,
            generator=rng,
            output_type='latent',
            guidance_scale=cfg,
            cross_attention_kwargs={'concat_conds': concat_conds},
        ).images.to(vae.dtype) / vae.config.scaling_factor
    else:
        bg = resize_without_crop(input_bg, input_width, input_height)
        bg_latent = numpy2pytorch([bg]).to(device=vae.device, dtype=vae.dtype)
        bg_latent = vae.encode(bg_latent).latent_dist.mode() * vae.config.scaling_factor
        latents = i2i_pipe(
            image=bg_latent,
            strength=lowres_denoise,
            prompt_embeds=conds,
            negative_prompt_embeds=unconds,
            width=input_width,
            height=input_height,
            # The step count is divided by the strength so that roughly
            # `steps` denoising steps are actually executed.
            num_inference_steps=int(round(steps / lowres_denoise)),
            num_images_per_prompt=num_samples,
            generator=rng,
            output_type='latent',
            guidance_scale=cfg,
            cross_attention_kwargs={'concat_conds': concat_conds},
        ).images.to(vae.dtype) / vae.config.scaling_factor

    # Decode, upscale in pixel space (sizes snapped to multiples of 64), and
    # re-encode for the high-resolution refinement pass.
    pixels = vae.decode(latents).sample
    pixels = pytorch2numpy(pixels)
    pixels = [resize_without_crop(
        image=p,
        target_width=int(round(input_width * highres_scale / 64.0) * 64),
        target_height=int(round(input_height * highres_scale / 64.0) * 64))
    for p in pixels]

    pixels = numpy2pytorch(pixels).to(device=vae.device, dtype=vae.dtype)
    latents = vae.encode(pixels).latent_dist.mode() * vae.config.scaling_factor
    latents = latents.to(device=unet.device, dtype=unet.dtype)

    highres_height, highres_width = latents.shape[2] * 8, latents.shape[3] * 8

    # Re-encode the foreground conditioning at the high resolution.
    fg = resize_without_crop(input_fg, highres_width, highres_height)
    concat_conds = numpy2pytorch([fg]).to(device=vae.device, dtype=vae.dtype)
    concat_conds = vae.encode(concat_conds).latent_dist.mode() * vae.config.scaling_factor

    latents = i2i_pipe(
        image=latents,
        strength=highres_denoise,
        prompt_embeds=conds,
        negative_prompt_embeds=unconds,
        width=highres_width,
        height=highres_height,
        num_inference_steps=int(round(steps / highres_denoise)),
        num_images_per_prompt=num_samples,
        generator=rng,
        output_type='latent',
        guidance_scale=cfg,
        cross_attention_kwargs={'concat_conds': concat_conds},
    ).images.to(vae.dtype) / vae.config.scaling_factor

    pixels = vae.decode(latents).sample
    pixels = pytorch2numpy(pixels)

    # Resize back to input dimensions
    pixels = [resize_without_crop(p, input_width, input_height) for p in pixels]
    pixels = np.stack(pixels)

    return pixels

def extract_foreground(image):
    """Cut out the foreground of an uploaded image for the UI.

    Args:
        image: Image handed to ``run_rmbg``, or ``None`` when nothing is
            selected.

    Returns:
        Tuple of the processed foreground (``None`` when no image was given)
        and two Gradio updates that make their target components visible.
    """
    if image is None:
        foreground = None
    else:
        # Background removal first, then centring on the grey canvas.
        foreground = preprocess_image(run_rmbg(image))
    return foreground, gr.update(visible=True), gr.update(visible=True)

def update_extracted_fg_height(selected_image: gr.SelectData):
    """Return a Gradio update that matches a component's height to the selection.

    Falls back to a height of 480 when the selection event is falsy.
    NOTE(review): assumes ``selected_image.value`` is a mapping with an
    ``['image']['shape']`` entry whose first item is the height - confirm
    against the component that emits the event.
    """
    if not selected_image:
        # Default height if no image is selected
        return gr.update(height=480)
    selected_height = selected_image.value['image']['shape'][0]
    return gr.update(height=selected_height)

@torch.inference_mode()
def process_bg(input_fg, input_bg, prompt, image_width, image_height, num_samples, seed, steps, a_prompt, n_prompt, cfg, highres_scale, highres_denoise, bg_source):
    """Relight a foreground against a background image or synthetic lighting.

    Foreground and background are centre-cropped to the requested size,
    encoded with the VAE and used together as conditioning for a text-to-image
    pass. The result is upscaled by ``highres_scale`` (snapped to multiples of
    64) and refined with an img2img pass.

    NOTE(review): the conditioning here stacks two latents (foreground and
    background) on the channel axis, while the input convolution built in this
    file has 8 input channels in total - confirm the loaded UNet weights match
    this conditioning layout.

    Args:
        input_fg: Foreground image array.
        input_bg: Background image array; required for the upload modes,
            ignored (replaced) for the synthetic lighting modes.
        prompt: Main prompt; ``a_prompt`` is appended after a comma.
        image_width, image_height: Working resolution of the first pass.
        num_samples: Images generated per prompt.
        seed: Seed for the torch generator (converted with ``int``).
        steps: Base number of inference steps.
        a_prompt: Text appended to ``prompt``.
        n_prompt: Negative prompt.
        cfg: Guidance scale.
        highres_scale: Upscale factor applied before the second pass.
        highres_denoise: img2img strength of the second pass.
        bg_source: ``BGSource`` member or its string value.

    Returns:
        Tuple ``(pixels, [fg, bg])``: ``pixels`` is a list of float32 images
        in [0, 1]; ``fg`` and ``bg`` are the high-resolution crops used as
        conditioning.

    Raises:
        ValueError: If ``bg_source`` is invalid, or an upload mode is selected
            without a background image.
    """
    clear_memory()
    bg_source = BGSource(bg_source)

    if bg_source in (BGSource.UPLOAD, BGSource.UPLOAD_FLIP):
        if input_bg is None:
            # Fail with a clear message instead of an obscure error further
            # down when the image is converted.
            raise ValueError(f'{bg_source.value!r} requires a background image.')
        if bg_source == BGSource.UPLOAD_FLIP:
            input_bg = np.fliplr(input_bg)
    elif bg_source == BGSource.GREY:
        input_bg = np.zeros(shape=(image_height, image_width, 3), dtype=np.uint8) + 64
    elif bg_source == BGSource.LEFT:
        gradient = np.linspace(224, 32, image_width)
        image = np.tile(gradient, (image_height, 1))
        input_bg = np.stack((image,) * 3, axis=-1).astype(np.uint8)
    elif bg_source == BGSource.RIGHT:
        gradient = np.linspace(32, 224, image_width)
        image = np.tile(gradient, (image_height, 1))
        input_bg = np.stack((image,) * 3, axis=-1).astype(np.uint8)
    elif bg_source == BGSource.TOP:
        gradient = np.linspace(224, 32, image_height)[:, None]
        image = np.tile(gradient, (1, image_width))
        input_bg = np.stack((image,) * 3, axis=-1).astype(np.uint8)
    elif bg_source == BGSource.BOTTOM:
        gradient = np.linspace(32, 224, image_height)[:, None]
        image = np.tile(gradient, (1, image_width))
        input_bg = np.stack((image,) * 3, axis=-1).astype(np.uint8)
    else:
        raise ValueError('Wrong background source!')

    # UI widgets may deliver the seed as a float; manual_seed needs an int.
    rng = torch.Generator(device=device).manual_seed(int(seed))

    fg = resize_and_center_crop(input_fg, image_width, image_height)
    bg = resize_and_center_crop(input_bg, image_width, image_height)
    concat_conds = numpy2pytorch([fg, bg]).to(device=vae.device, dtype=vae.dtype)
    concat_conds = vae.encode(concat_conds).latent_dist.mode() * vae.config.scaling_factor
    # Move the two encoded images from the batch axis onto the channel axis.
    concat_conds = torch.cat([c[None, ...] for c in concat_conds], dim=1)

    conds, unconds = encode_prompt_pair(positive_prompt=prompt + ', ' + a_prompt, negative_prompt=n_prompt)

    latents = t2i_pipe(
        prompt_embeds=conds,
        negative_prompt_embeds=unconds,
        width=image_width,
        height=image_height,
        num_inference_steps=steps,
        num_images_per_prompt=num_samples,
        generator=rng,
        output_type='latent',
        guidance_scale=cfg,
        cross_attention_kwargs={'concat_conds': concat_conds},
    ).images.to(vae.dtype) / vae.config.scaling_factor

    # Decode, upscale in pixel space (sizes snapped to multiples of 64), and
    # re-encode for the high-resolution refinement pass.
    pixels = vae.decode(latents).sample
    pixels = pytorch2numpy(pixels)
    pixels = [resize_without_crop(
        image=p,
        target_width=int(round(image_width * highres_scale / 64.0) * 64),
        target_height=int(round(image_height * highres_scale / 64.0) * 64))
    for p in pixels]

    pixels = numpy2pytorch(pixels).to(device=vae.device, dtype=vae.dtype)
    latents = vae.encode(pixels).latent_dist.mode() * vae.config.scaling_factor
    latents = latents.to(device=unet.device, dtype=unet.dtype)

    # From here on the working size is the high-resolution one.
    image_height, image_width = latents.shape[2] * 8, latents.shape[3] * 8
    fg = resize_and_center_crop(input_fg, image_width, image_height)
    bg = resize_and_center_crop(input_bg, image_width, image_height)
    concat_conds = numpy2pytorch([fg, bg]).to(device=vae.device, dtype=vae.dtype)
    concat_conds = vae.encode(concat_conds).latent_dist.mode() * vae.config.scaling_factor
    concat_conds = torch.cat([c[None, ...] for c in concat_conds], dim=1)

    latents = i2i_pipe(
        image=latents,
        strength=highres_denoise,
        prompt_embeds=conds,
        negative_prompt_embeds=unconds,
        width=image_width,
        height=image_height,
        num_inference_steps=int(round(steps / highres_denoise)),
        num_images_per_prompt=num_samples,
        generator=rng,
        output_type='latent',
        guidance_scale=cfg,
        cross_attention_kwargs={'concat_conds': concat_conds},
    ).images.to(vae.dtype) / vae.config.scaling_factor

    pixels = vae.decode(latents).sample
    pixels = pytorch2numpy(pixels, quant=False)

    clear_memory()
    return pixels, [fg, bg]


@torch.inference_mode()
def process_relight(input_fg, prompt, image_width, image_height, num_samples, seed, steps, a_prompt, n_prompt, cfg, highres_scale, highres_denoise, lowres_denoise, bg_source):
    """UI entry point for foreground relighting.

    Accepts the foreground as a PIL image or NumPy array, logs the shape and
    dtype of the input and of the result, and delegates the actual work to
    ``process`` with all other arguments passed through unchanged.

    Returns:
        Whatever ``process`` returns (an array exposing ``shape``/``dtype``).
    """
    # process() works on arrays, so unwrap PIL input first.
    input_fg = np.array(input_fg) if isinstance(input_fg, Image.Image) else input_fg
    logging.info(f"Input foreground shape: {input_fg.shape}, dtype: {input_fg.dtype}")

    generation_args = (
        prompt, image_width, image_height, num_samples, seed, steps,
        a_prompt, n_prompt, cfg, highres_scale, highres_denoise,
        lowres_denoise, bg_source,
    )
    results = process(input_fg, *generation_args)

    logging.info(f"Results shape: {results.shape}, dtype: {results.dtype}")
    return results



@torch.inference_mode()
def process_relight_bg(input_fg, input_bg, prompt, image_width, image_height, num_samples, seed, steps, a_prompt, n_prompt, cfg, highres_scale, highres_denoise, bg_source):
    """UI entry point for background-conditioned relighting.

    Normalises the numeric UI inputs, removes the background of the
    foreground image, runs ``process_bg`` and saves the results to disk.

    The background for the selected ``bg_source`` is built by ``process_bg``
    only. Building it here as well would flip an uploaded background twice in
    "flipped" mode, cancelling the flip.

    Args:
        input_fg: Foreground as PIL image or NumPy array.
        input_bg: Background image array (needed for the upload modes).
        prompt, a_prompt, n_prompt: Prompt parts forwarded to ``process_bg``.
        image_width, image_height, num_samples, seed, steps: Converted to int.
        cfg, highres_scale, highres_denoise: Converted to float.
        bg_source: ``BGSource`` member or its string value.

    Returns:
        List of uint8 result images.

    Raises:
        ValueError: If ``bg_source`` is not a valid ``BGSource``.
    """
    # Validates the value; an unknown source raises ValueError here.
    bg_source = BGSource(bg_source)

    # Convert numerical inputs to appropriate types
    image_width = int(image_width)
    image_height = int(image_height)
    num_samples = int(num_samples)
    seed = int(seed)
    steps = int(steps)
    cfg = float(cfg)
    highres_scale = float(highres_scale)
    highres_denoise = float(highres_denoise)

    # run_rmbg works on a PIL image and returns a single image carrying the
    # predicted matte as alpha channel (it does not return a tuple).
    # convert() hands it a fresh RGB copy, so the caller's image is untouched.
    if isinstance(input_fg, Image.Image):
        fg_image = input_fg.convert('RGB')
    else:
        fg_image = Image.fromarray(np.asarray(input_fg)).convert('RGB')
    rgba = np.array(run_rmbg(fg_image)).astype(np.float32)

    # Blend the cut-out over mid grey (127) so that process_bg receives a
    # three-channel foreground.
    alpha = rgba[..., 3:4] / 255.0
    input_fg = (127 + (rgba[..., :3] - 127) * alpha).clip(0, 255).astype(np.uint8)

    results, _extra_images = process_bg(input_fg, input_bg, prompt, image_width, image_height, num_samples, seed, steps, a_prompt, n_prompt, cfg, highres_scale, highres_denoise, bg_source)
    # process_bg returns float images in [0, 1]; convert to uint8.
    results = [(x * 255.0).clip(0, 255).astype(np.uint8) for x in results]

    # Save the generated images
    save_images(results, prefix="relight")

    return results


# Example lighting prompts, stored as single-item rows.
quick_prompts = [
    ['sunshine from window'],
    ['golden time'],
    ['natural lighting'],
    ['warm atmosphere, at home, bedroom'],
    ['shadow from window'],
    ['soft studio lighting'],
    ['home atmosphere, cozy bedroom illumination'],
]


# Example subject descriptions, stored as single-item rows.
quick_subjects = [
    ['modern sofa, high quality leather'],
    ['elegant dining table, polished wood'],
    ['luxurious bed, premium mattress'],
    ['minimalist office desk, clean design'],
    ['vintage wooden cabinet, antique finish'],
]


class BGSource(Enum):
    """Background / initial-lighting choices; each value is the UI label.

    The processing functions accept either a member or its string value and
    normalise it with ``BGSource(value)``.
    """
    # Use the uploaded background as is / mirrored left-to-right.
    UPLOAD = "Use Background Image"
    UPLOAD_FLIP = "Use Flipped Background Image"
    # Synthetic gradient backgrounds, bright on the named side.
    LEFT = "Left Light"
    RIGHT = "Right Light"
    TOP = "Top Light"
    BOTTOM = "Bottom Light"
    # Uniform dark grey background (pixel value 64).
    GREY = "Ambient"

# Add save function
def save_images(images, prefix="relight"):
    """Write images to the ``outputs`` directory as timestamped PNG files.

    Files are named ``{prefix}_{YYYYmmdd_HHMMSS}_{index}.png`` with a 1-based
    index. The directory is created when missing.

    Args:
        images: Iterable of NumPy arrays (converted with ``Image.fromarray``)
            or objects exposing ``save(path)``, such as PIL images.
        prefix: File name prefix.

    Returns:
        List with the path of every file written, in input order.
    """
    # Create output directory if it doesn't exist
    output_dir = Path("outputs")
    output_dir.mkdir(exist_ok=True)

    # One timestamp per call; the running index keeps names unique within it.
    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")

    saved_paths = []
    for index, img in enumerate(images, start=1):
        if isinstance(img, np.ndarray):
            # Convert to PIL Image if numpy array
            img = Image.fromarray(img)

        filepath = output_dir / f"{prefix}_{timestamp}_{index}.png"
        img.save(filepath)
        # Record the path so the returned list reflects what was written.
        saved_paths.append(filepath)

    return saved_paths


class MaskMover:
    """Keeps an extracted foreground and pastes it onto background images."""

    def __init__(self):
        # Both slots stay empty until set_extracted_fg() is called.
        self.extracted_fg = None
        self.original_fg = None

    def set_extracted_fg(self, fg_image):
        """Remember the foreground (expected to carry an alpha channel).

        Two independent array copies are stored; the ``extracted_fg`` copy is
        returned.
        """
        if isinstance(fg_image, np.ndarray):
            duplicate = fg_image.copy
        else:
            def duplicate():
                return np.array(fg_image)

        self.extracted_fg = duplicate()
        self.original_fg = duplicate()
        return self.extracted_fg

    def create_composite(self, background, x_pos, y_pos, scale=1.0):
        """Paste the stored foreground, centred on (x_pos, y_pos), onto background.

        Returns ``background`` untouched when either it or the stored
        foreground is missing; otherwise returns an RGB NumPy array.
        """
        if background is None or self.original_fg is None:
            return background

        def as_rgba(picture):
            # Arrays go through PIL first; PIL images are converted directly.
            if isinstance(picture, np.ndarray):
                picture = Image.fromarray(picture)
            return picture.convert('RGBA')

        canvas = as_rgba(background)
        sprite = as_rgba(self.original_fg)

        # Rescale the foreground before placing it.
        target_w = int(sprite.width * scale)
        target_h = int(sprite.height * scale)
        sprite = sprite.resize((target_w, target_h), Image.LANCZOS)

        # Top-left corner that puts the sprite's centre at the requested point.
        left = int(x_pos - target_w / 2)
        top = int(y_pos - target_h / 2)

        composite = canvas.copy()
        # The sprite doubles as its own paste mask (uses its alpha channel).
        composite.paste(sprite, (left, top), sprite)

        return np.array(composite.convert('RGB'))
        
def get_depth(image):
    """Return a colour-mapped depth visualisation of ``image``.

    Args:
        image: RGB image (PIL image or array-like) or ``None``.

    Returns:
        A PIL image with the INFERNO colour map applied to the normalised
        depth, or ``None`` when ``image`` is ``None``.

    Depth comes from the module-level ``model.infer_image`` (defined outside
    this block; presumably the Depth-Anything model — confirm).
    """
    if image is None:
        return None

    # The depth model is fed an OpenCV-style BGR array.
    raw_img = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
    depth = model.infer_image(raw_img)  # HxW raw depth map

    # Normalise to 0..255 for display. A constant depth map has zero range,
    # which previously divided by zero and cast NaN to uint8.
    depth_min = depth.min()
    depth_range = depth.max() - depth_min
    if depth_range > 0:
        depth = ((depth - depth_min) / depth_range * 255).astype(np.uint8)
    else:
        depth = np.zeros(depth.shape, dtype=np.uint8)

    # applyColorMap yields BGR; convert back to RGB for PIL / Gradio.
    depth_colored = cv2.applyColorMap(depth, cv2.COLORMAP_INFERNO)
    depth_colored = cv2.cvtColor(depth_colored, cv2.COLOR_BGR2RGB)
    return Image.fromarray(depth_colored)


from PIL import Image

def compress_image(image):
    """Downscale and JPEG-compress an image, aiming for at most ~100 KB.

    Args:
        image: NumPy image array as delivered by Gradio.

    Returns:
        NumPy array decoded from the compressed JPEG. The longest side is
        capped at 1024 px. Quality starts at 95 and drops in steps of 5 until
        the encoded size fits or quality falls below 20, so very detailed
        images may still exceed the limit.

    Encoding happens in memory. The previous version wrote a fixed
    ``compressed_image.jpg`` in the working directory, which concurrent
    requests could overwrite and which was never cleaned up.
    """
    img = Image.fromarray(image)

    # JPEG cannot store alpha or palette data; normalise those modes first.
    if img.mode not in ("RGB", "L"):
        img = img.convert("RGB")

    max_size = 1024  # Maximum dimension size
    if img.width > max_size or img.height > max_size:
        ratio = min(max_size / img.width, max_size / img.height)
        new_size = (max(1, int(img.width * ratio)), max(1, int(img.height * ratio)))
        img = img.resize(new_size, Image.Resampling.LANCZOS)

    def encode(quality):
        buffer = BytesIO()
        img.save(buffer, "JPEG", quality=quality)
        return buffer

    size_limit = 100 * 1024  # 100KB limit
    quality = 95
    encoded = encode(quality)
    while encoded.getbuffer().nbytes > size_limit:
        quality -= 5
        encoded = encode(quality)
        if quality < 20:  # Prevent quality from going too low
            break

    encoded.seek(0)
    return np.array(Image.open(encoded))

def use_orientation(selected_image:gr.SelectData):
    """Return the file path of the gallery item carried by the select event."""
    image_entry = selected_image.value['image']
    return image_entry['path']
    

@spaces.GPU(duration=60)
@torch.inference_mode
def process_image(input_image, input_text):
    """Detect objects with the DINO-X cloud API and cut out the first detection.

    NOTE(review): a second ``process_image`` with the same signature is defined
    later in this file. That later ``def`` rebinds the module-level name, and
    the Gradio wiring that references ``process_image`` comes after both, so
    this earlier definition appears to be dead code.

    Args:
        input_image: PIL image or array; PIL images are converted with
            ``np.array``.
        input_text: Period-separated class names. An empty string selects the
            ``<prompt_free>`` detection mode.

    Returns:
        ``(annotated_frame, image, gr.update(visible=False),
        gr.update(visible=False))`` where ``image`` is the centred cut-out of
        the first detection. In the text-prompt branch ``image`` is ``None``
        when nothing was detected; in the prompt-free branch the function
        falls off the end (implicit ``None``) in that case.
    """



    if isinstance(input_image, Image.Image):
        input_image = np.array(input_image)

    # Initialize configs
    # NOTE(review): credential is hard-coded in source.
    API_TOKEN = "9c8c865e10ec1821bea79d9fa9dc8720"
    # The SAM2 settings and DEVICE are only referenced by code that is
    # commented out in this function.
    SAM2_CHECKPOINT = "./checkpoints/sam2_hiera_large.pt"
    SAM2_MODEL_CONFIG = os.path.join(os.path.dirname(os.path.abspath(__file__)), "configs/sam2_hiera_l.yaml")
    DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
    # The directory is created, but nothing in this function writes to it.
    OUTPUT_DIR = Path("outputs/grounded_sam2_dinox_demo")
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    # Size of the square output canvas (shadows the module-level constants).
    HEIGHT = 768
    WIDTH = 768


    # Initialize DDS client
    # NOTE: this local ``client`` shadows the module-level httpx client.
    config = Config(API_TOKEN)
    client = Client(config)

    # Process classes from text prompt
    classes = [x.strip().lower() for x in input_text.split('.') if x]
    class_name_to_id = {name: id for id, name in enumerate(classes)}
    class_id_to_name = {id: name for name, id in class_name_to_id.items()}



    # Save input image to temp file and get URL
    # NOTE(review): cv2.imwrite expects BGR channel order; if input_image is
    # RGB the uploaded file has red and blue swapped — confirm.
    with tempfile.NamedTemporaryFile(suffix='.jpg', delete=False) as tmpfile:
        cv2.imwrite(tmpfile.name, input_image)
        image_url = client.upload_file(tmpfile.name)
    os.remove(tmpfile.name)

    # Process detection results
    input_boxes = []
    masks = []
    confidences = []
    class_names = []
    class_ids = []

    if len(input_text) == 0:
        # Prompt-free mode: let DINO-X propose the categories itself.
        task = DinoxTask(
        image_url=image_url,
        prompts=[TextPrompt(text="<prompt_free>")],
        # targets=[DetectionTarget.BBox, DetectionTarget.Mask]
        )

        client.run_task(task)
        predictions = task.result.objects
        # Rebuild the class map from the categories the API returned.
        # NOTE(review): keys here are the raw categories while the lookup
        # below uses the lower-cased/stripped name — a KeyError is possible
        # if the two differ.
        classes = [pred.category for pred in predictions]
        classes = list(set(classes))
        class_name_to_id = {name: id for id, name in enumerate(classes)}
        class_id_to_name = {id: name for name, id in class_name_to_id.items()}

        for idx, obj in enumerate(predictions):
            input_boxes.append(obj.bbox)
            masks.append(DetectionTask.rle2mask(DetectionTask.string2rle(obj.mask.counts), obj.mask.size))  # convert mask to np.array using DDS API
            confidences.append(obj.score)
            cls_name = obj.category.lower().strip()
            class_names.append(cls_name)
            class_ids.append(class_name_to_id[cls_name])

        boxes = np.array(input_boxes)
        masks = np.array(masks)
        class_ids = np.array(class_ids)
        labels = [
            f"{class_name} {confidence:.2f}"
            for class_name, confidence
            in zip(class_names, confidences)
        ]
        detections = sv.Detections(
            xyxy=boxes,
            mask=masks.astype(bool),
            class_id=class_ids
        )

        box_annotator = sv.BoxAnnotator()
        label_annotator = sv.LabelAnnotator()
        mask_annotator = sv.MaskAnnotator()

        # Draw boxes, labels and masks on a copy of the input.
        annotated_frame = input_image.copy()
        annotated_frame = box_annotator.annotate(scene=annotated_frame, detections=detections)
        annotated_frame = label_annotator.annotate(scene=annotated_frame, detections=detections, labels=labels)
        annotated_frame = mask_annotator.annotate(scene=annotated_frame, detections=detections)

        # Create transparent mask for first detected object
        if len(detections) > 0:
            # Get first mask
            first_mask = detections.mask[0]

            # Get original RGB image
            img = input_image.copy()

            H, W, C = img.shape

            # Create RGBA image
            alpha = np.zeros((H, W, 1), dtype=np.uint8)

            alpha[first_mask] = 255

            # (An older RGBA crop / mask_mover hand-off was commented out here.)

            # NOTE(review): alpha was allocated with shape (H, W, 1) above, so
            # unpacking its shape into two names raises ValueError.
            H, W = alpha.shape
            # get the bounding box of alpha
            y, x = np.where(alpha > 0)
            y0, y1 = max(y.min() - 1, 0), min(y.max() + 1, H)
            x0, x1 = max(x.min() - 1, 0), min(x.max() + 1, W)

            image_center = img[y0:y1, x0:x1]
            # resize the longer side to H * 0.9
            H, W, _ = image_center.shape
            if H > W:
                W = int(W * (HEIGHT * 0.9) / H)
                H = int(HEIGHT * 0.9)
            else:
                H = int(H * (WIDTH * 0.9) / W)
                W = int(WIDTH * 0.9)

            image_center = np.array(Image.fromarray(image_center).resize((W, H)))
            # pad to H, W
            start_h = (HEIGHT - H) // 2
            start_w = (WIDTH - W) // 2
            # NOTE(review): the canvas has 4 channels but image_center is cut
            # from img, not from an RGBA array — assumes img already carries
            # an alpha channel; confirm.
            image = np.zeros((HEIGHT, WIDTH, 4), dtype=np.uint8)
            image[start_h : start_h + H, start_w : start_w + W] = image_center
            image = image.astype(np.float32) / 255.0
            # Alpha-composite onto a mid-grey (0.5) background.
            image = image[:, :, :3] * image[:, :, 3:4] + (1 - image[:, :, 3:4]) * 0.5
            image = (image * 255).clip(0, 255).astype(np.uint8)
            image = Image.fromarray(image)

            return annotated_frame, image, gr.update(visible=False), gr.update(visible=False)
        # No return for the zero-detection case: control falls off the end of
        # the function and None is returned.


    else:
        # Run DINO-X detection
        task = DinoxTask(
            image_url=image_url,
            prompts=[TextPrompt(text=input_text)],
            targets=[DetectionTarget.BBox, DetectionTarget.Mask]
        )

        client.run_task(task)
        result = task.result
        objects = result.objects

        # (An older loop over ``objects`` that filled input_boxes was
        # commented out here; ``objects`` is otherwise unused.)

        predictions = task.result.objects
        classes = [x.strip().lower() for x in input_text.split('.') if x]
        class_name_to_id = {name: id for id, name in enumerate(classes)}
        class_id_to_name = {id: name for name, id in class_name_to_id.items()}

        boxes = []
        masks = []
        confidences = []
        class_names = []
        class_ids = []

        for idx, obj in enumerate(predictions):
            boxes.append(obj.bbox)
            masks.append(DetectionTask.rle2mask(DetectionTask.string2rle(obj.mask.counts), obj.mask.size))  # convert mask to np.array using DDS API
            confidences.append(obj.score)
            cls_name = obj.category.lower().strip()
            class_names.append(cls_name)
            # NOTE(review): raises KeyError if the API returns a category that
            # is not one of the prompt classes.
            class_ids.append(class_name_to_id[cls_name])

        boxes = np.array(boxes)
        masks = np.array(masks)
        class_ids = np.array(class_ids)
        labels = [
            f"{class_name} {confidence:.2f}"
            for class_name, confidence
            in zip(class_names, confidences)
        ]

        # (SAM2-based mask prediction was commented out here; the masks used
        # below are the ones decoded from the DINO-X response.)

        if masks.ndim == 4:
            masks = masks.squeeze(1)

        detections = sv.Detections(
        xyxy = boxes,
        mask = masks.astype(bool),
        class_id = class_ids,
    )

        box_annotator = sv.BoxAnnotator()
        label_annotator = sv.LabelAnnotator()
        mask_annotator = sv.MaskAnnotator()

        # Draw boxes, labels and masks on a copy of the input.
        annotated_frame = input_image.copy()
        annotated_frame = box_annotator.annotate(scene=annotated_frame, detections=detections)
        annotated_frame = label_annotator.annotate(scene=annotated_frame, detections=detections, labels=labels)
        annotated_frame = mask_annotator.annotate(scene=annotated_frame, detections=detections)

        # Create transparent mask for first detected object
        if len(detections) > 0:
            # Get first mask
            first_mask = detections.mask[0]

            # Get original RGB image
            img = input_image.copy()
            H, W, C = img.shape

            first_mask = detections.mask[0]



            # Create RGBA image
            alpha = np.zeros((H, W, 1), dtype=np.uint8)

            alpha[first_mask] = 255

            # (An older RGBA crop / mask_mover hand-off was commented out here.)

            # NOTE(review): alpha was allocated with shape (H, W, 1) above, so
            # unpacking its shape into two names raises ValueError.
            H, W = alpha.shape
            # get the bounding box of alpha
            y, x = np.where(alpha > 0)
            y0, y1 = max(y.min() - 1, 0), min(y.max() + 1, H)
            x0, x1 = max(x.min() - 1, 0), min(x.max() + 1, W)

            image_center = img[y0:y1, x0:x1]
            # resize the longer side to H * 0.9
            H, W, _ = image_center.shape
            if H > W:
                W = int(W * (HEIGHT * 0.9) / H)
                H = int(HEIGHT * 0.9)
            else:
                H = int(H * (WIDTH * 0.9) / W)
                W = int(WIDTH * 0.9)

            image_center = np.array(Image.fromarray(image_center).resize((W, H)))
            # pad to H, W
            start_h = (HEIGHT - H) // 2
            start_w = (WIDTH - W) // 2
            # NOTE(review): the canvas has 4 channels but image_center is cut
            # from img, not from an RGBA array — assumes img already carries
            # an alpha channel; confirm.
            image = np.zeros((HEIGHT, WIDTH, 4), dtype=np.uint8)
            image[start_h : start_h + H, start_w : start_w + W] = image_center
            image = image.astype(np.float32) / 255.0
            # Alpha-composite onto a mid-grey (0.5) background.
            image = image[:, :, :3] * image[:, :, 3:4] + (1 - image[:, :, 3:4]) * 0.5
            image = (image * 255).clip(0, 255).astype(np.uint8)
            image = Image.fromarray(image)

            return annotated_frame, image, gr.update(visible=False), gr.update(visible=False)
        return annotated_frame, None, gr.update(visible=False), gr.update(visible=False)


# Side length (px) of the square canvas the extracted foreground is centred on.
_FG_CANVAS_SIZE = 768

# NOTE(security): the DDS API token used to live only in source. It can now be
# supplied through the DDS_API_TOKEN environment variable; the literal remains
# as a fallback so existing deployments keep working. It should be rotated and
# removed from the repository.
_DDS_FALLBACK_TOKEN = "9c8c865e10ec1821bea79d9fa9dc8720"


def _parse_prompt_classes(prompt_text):
    """Split a period-separated prompt into lower-cased, non-empty class names."""
    names = (part.strip().lower() for part in prompt_text.split('.'))
    return [name for name in names if name]


def _upload_image_for_detection(dds_client, image_rgb):
    """Write ``image_rgb`` to a temporary JPEG, upload it and return its URL."""
    with tempfile.NamedTemporaryFile(suffix='.jpg', delete=False) as tmpfile:
        tmp_path = tmpfile.name
    try:
        # cv2.imwrite expects BGR channel order, while the app works in RGB.
        cv2.imwrite(tmp_path, cv2.cvtColor(image_rgb, cv2.COLOR_RGB2BGR))
        return dds_client.upload_file(tmp_path)
    finally:
        # Remove the temp file even when the upload fails.
        os.remove(tmp_path)


def _center_foreground_on_grey(image_rgb, mask, height=_FG_CANVAS_SIZE, width=_FG_CANVAS_SIZE):
    """Cut ``mask`` out of ``image_rgb`` and centre it on a mid-grey canvas.

    The masked region is cropped to its bounding box, resized so that its
    longer side spans 90% of the canvas, and alpha-composited over grey (0.5).
    Returns a PIL RGB image, or ``None`` when the mask is empty.
    """
    mask = np.asarray(mask, dtype=bool)
    rows, cols = np.nonzero(mask)
    if rows.size == 0:
        return None

    src_h, src_w = image_rgb.shape[:2]
    alpha = np.where(mask, 255, 0).astype(np.uint8)
    rgba = np.dstack((image_rgb, alpha)).astype(np.uint8)

    # Bounding box of the opaque region.
    y0, y1 = max(rows.min() - 1, 0), min(rows.max() + 1, src_h)
    x0, x1 = max(cols.min() - 1, 0), min(cols.max() + 1, src_w)
    crop = rgba[y0:y1, x0:x1]

    # Scale the longer side to 90% of the canvas; clamp the shorter side to
    # at least one pixel so extremely thin masks do not produce a zero size.
    crop_h, crop_w = crop.shape[:2]
    if crop_h > crop_w:
        new_w = max(1, int(crop_w * (height * 0.9) / crop_h))
        new_h = int(height * 0.9)
    else:
        new_h = max(1, int(crop_h * (width * 0.9) / crop_w))
        new_w = int(width * 0.9)
    resized = np.array(Image.fromarray(crop).resize((new_w, new_h), Image.LANCZOS))

    # Pad to the canvas size, keeping the object centred.
    top = (height - new_h) // 2
    left = (width - new_w) // 2
    canvas = np.zeros((height, width, 4), dtype=np.uint8)
    canvas[top:top + new_h, left:left + new_w] = resized

    # Alpha-composite over mid-grey.
    canvas = canvas.astype(np.float32) / 255.0
    blended = canvas[:, :, :3] * canvas[:, :, 3:4] + (1 - canvas[:, :, 3:4]) * 0.5
    return Image.fromarray((blended * 255).clip(0, 255).astype(np.uint8))


@spaces.GPU(duration=60)
@torch.inference_mode
def process_image(input_image, input_text):
    """Detect objects with the DINO-X cloud API and extract the first one.

    Args:
        input_image: RGB(A) PIL image or NumPy array from the Gradio image
            component. ``None`` yields an empty result.
        input_text: Period-separated class names (e.g. ``"car . person ."``).
            An empty or whitespace-only prompt selects the ``<prompt_free>``
            mode, in which DINO-X proposes the categories itself.

    Returns:
        A 4-tuple ``(annotated_frame, foreground, update, update)``:
        ``annotated_frame`` is the input with boxes, labels and masks drawn
        (the plain input when nothing was detected), ``foreground`` is a PIL
        image of the first detection centred on grey (``None`` when there is
        nothing to extract), and the two trailing items are
        ``gr.update(visible=False)``.
    """
    if input_image is None:
        return None, None, gr.update(visible=False), gr.update(visible=False)

    # Work on a plain 3-channel RGB array; an alpha channel would break the
    # RGBA stacking done for the cut-out.
    if isinstance(input_image, Image.Image):
        input_image = np.array(input_image.convert("RGB"))
    else:
        input_image = np.asarray(input_image)
        if input_image.ndim == 3 and input_image.shape[2] == 4:
            input_image = np.ascontiguousarray(input_image[:, :, :3])

    prompt = (input_text or "").strip()

    # Named dds_client so it does not shadow the module-level httpx client.
    api_token = os.environ.get("DDS_API_TOKEN", _DDS_FALLBACK_TOKEN)
    dds_client = Client(Config(api_token))

    image_url = _upload_image_for_detection(dds_client, input_image)

    class_name_to_id = {}
    if prompt:
        for name in _parse_prompt_classes(prompt):
            class_name_to_id.setdefault(name, len(class_name_to_id))
        task = DinoxTask(
            image_url=image_url,
            prompts=[TextPrompt(text=input_text)],
            targets=[DetectionTarget.BBox, DetectionTarget.Mask],
        )
    else:
        task = DinoxTask(
            image_url=image_url,
            prompts=[TextPrompt(text="<prompt_free>")],
        )

    dds_client.run_task(task)
    predictions = task.result.objects

    # sv.Detections cannot be built from empty box/mask lists, so stop here
    # when the API found nothing.
    if not predictions:
        return input_image.copy(), None, gr.update(visible=False), gr.update(visible=False)

    boxes = []
    masks = []
    labels = []
    class_ids = []
    for obj in predictions:
        boxes.append(obj.bbox)
        # Convert the RLE-encoded mask to an array using the DDS helpers.
        masks.append(DetectionTask.rle2mask(DetectionTask.string2rle(obj.mask.counts), obj.mask.size))
        cls_name = obj.category.lower().strip()
        labels.append(f"{cls_name} {obj.score:.2f}")
        # Unknown categories get a fresh id instead of raising KeyError.
        class_ids.append(class_name_to_id.setdefault(cls_name, len(class_name_to_id)))

    detections = sv.Detections(
        xyxy=np.array(boxes),
        mask=np.array(masks).astype(bool),
        class_id=np.array(class_ids),
    )

    annotated_frame = input_image.copy()
    annotated_frame = sv.BoxAnnotator().annotate(scene=annotated_frame, detections=detections)
    annotated_frame = sv.LabelAnnotator().annotate(scene=annotated_frame, detections=detections, labels=labels)
    annotated_frame = sv.MaskAnnotator().annotate(scene=annotated_frame, detections=detections)

    # Extract the first detected object onto a square grey canvas.
    foreground = _center_foreground_on_grey(input_image, detections.mask[0])

    return annotated_frame, foreground, gr.update(visible=False), gr.update(visible=False)
        

# ---------------------------------------------------------------------------
# Gradio UI: layout, event wiring and launch.
# ---------------------------------------------------------------------------
block = gr.Blocks().queue()
with block:
    with gr.Tab("Text"):
        with gr.Row():
            gr.Markdown("## Product Placement from Text")
        with gr.Row():
            # Left column: input image, segmentation controls, alternative angles.
            with gr.Column():
                with gr.Row():
                    input_fg = gr.Image(type="pil", label="Image", height=480)
                with gr.Row():
                    with gr.Group():
                        find_objects_button = gr.Button(value="(Option 1) Segment Object from text")
                        text_prompt = gr.Textbox(
                                label="Text Prompt", 
                                placeholder="Enter object classes separated by periods (e.g. 'car . person .'), leave empty to get all objects",
                                value=""
                            )
                    extract_button = gr.Button(value="Remove Background")
                with gr.Row():
                    extracted_objects = gr.Image(type="numpy", label="Extracted Foreground", height=480)
                    extracted_fg = gr.Image(type="pil", label="Extracted Foreground", height=480)
                    angles_fg = gr.Image(type="pil", label="Converted Foreground", height=480, visible=False)

                with gr.Group():
                    run_button = gr.Button("Generate alternative angles")
                    orientation_result = gr.Gallery(
                                label="Result",
                                show_label=False,
                                columns=[3],
                                rows=[2],
                                object_fit="fill",
                                height="auto",
                                allow_preview=False,
                            )

                # Selecting a generated view makes it the current foreground.
                # The former ``if orientation_result:`` guard was dropped
                # because the gallery component always exists at this point.
                orientation_result.select(use_orientation, inputs=None, outputs=extracted_fg)

                dummy_image_for_outputs = gr.Image(visible=False, label='Result')

            # Right column: relighting prompt, options and results.
            with gr.Column():
                result_gallery = gr.Gallery(height=832, object_fit='contain', label='Outputs')

                with gr.Row():
                    with gr.Group():
                        prompt = gr.Textbox(label="Prompt")
                        # The first two BGSource members (upload options) are not offered here.
                        bg_source = gr.Radio(choices=[e.value for e in list(BGSource)[2:]],
                                            value=BGSource.LEFT.value,
                                            label="Lighting Preference (Initial Latent)", type='value')

                        example_quick_subjects = gr.Dataset(samples=quick_subjects, label='Subject Quick List', samples_per_page=1000, components=[prompt])
                        example_quick_prompts = gr.Dataset(samples=quick_prompts, label='Lighting Quick List', samples_per_page=1000, components=[prompt])
                with gr.Row():
                    relight_button = gr.Button(value="Relight")

                # Hidden generation settings; they still feed the relight call.
                with gr.Group(visible=False):
                    with gr.Row():
                        num_samples = gr.Slider(label="Images", minimum=1, maximum=12, value=1, step=1)
                        seed = gr.Number(label="Seed", value=12345, precision=0)

                    with gr.Row():
                        image_width = gr.Slider(label="Image Width", minimum=256, maximum=1024, value=512, step=64)
                        image_height = gr.Slider(label="Image Height", minimum=256, maximum=1024, value=640, step=64)

                        with gr.Accordion("Advanced options", open=False):
                            steps = gr.Slider(label="Steps", minimum=1, maximum=100, value=15, step=1)
                            cfg = gr.Slider(label="CFG Scale", minimum=1.0, maximum=32.0, value=2, step=0.01, visible=False)
                            lowres_denoise = gr.Slider(label="Lowres Denoise (for initial latent)", minimum=0.1, maximum=1.0, value=0.9, step=0.01)
                            highres_scale = gr.Slider(label="Highres Scale", minimum=1.0, maximum=3.0, value=1.5, step=0.01)
                            highres_denoise = gr.Slider(label="Highres Denoise", minimum=0.1, maximum=1.0, value=0.5, step=0.01)
                            a_prompt = gr.Textbox(label="Added Prompt", value='best quality', visible=False)
                            n_prompt = gr.Textbox(label="Negative Prompt", value='lowres, bad anatomy, bad hands, cropped, worst quality', visible=False)
                            x_slider = gr.Slider(
                                minimum=0,
                                maximum=1000,
                                label="X Position",
                                value=500,
                                visible=False
                            )
                            y_slider = gr.Slider(
                                minimum=0,
                                maximum=1000,
                                label="Y Position",
                                value=500,
                                visible=False
                            )

        # Relighting: all generation settings are forwarded to process_relight.
        ips = [extracted_fg, prompt, image_width, image_height, num_samples, seed, steps, a_prompt, n_prompt, cfg, highres_scale, highres_denoise, lowres_denoise, bg_source]
        relight_button.click(fn=process_relight, inputs=ips, outputs=[result_gallery])
        # A lighting quick-pick keeps the first two comma-separated parts of
        # the current prompt and appends the picked phrase.
        example_quick_prompts.click(lambda x, y: ', '.join(y.split(', ')[:2] + [x[0]]), inputs=[example_quick_prompts, prompt], outputs=prompt, show_progress=False, queue=False)
        # A subject quick-pick replaces the prompt.
        example_quick_subjects.click(lambda x: x[0], inputs=example_quick_subjects, outputs=prompt, show_progress=False, queue=False)


        def convert_to_pil(image):
            """Pass the extracted foreground on to the hidden ``angles_fg`` image.

            ``extracted_fg`` delivers a PIL image (or ``None``). The previous
            version called ``.astype`` on it unconditionally, which raised for
            PIL input, logged an error on every click and returned the input
            unchanged. PIL images and ``None`` are now returned as-is; only
            array-like input is cast to ``uint8``.
            """
            if image is None or isinstance(image, Image.Image):
                return image
            try:
                image = np.asarray(image).astype(np.uint8)
                logging.info(f"Converted image shape: {image.shape}, dtype: {image.dtype}")
            except (TypeError, ValueError) as e:
                logging.error(f"Error converting image: {e}")
            return image

        # Alternative angles: stage the foreground, then run the multi-view model.
        run_button.click(
            fn=convert_to_pil, 
            inputs=extracted_fg,  # This is already RGBA with removed background
            outputs=angles_fg
        ).then(
            fn=infer,
            inputs=[
                text_prompt,
                extracted_fg,  # Already processed RGBA image
            ],
            outputs=[orientation_result],
        )

        # process_image returns four values. The two trailing
        # gr.update(visible=False) items are presumably meant for the position
        # sliders (as with extract_button below), so they are listed here to
        # match the handler's return arity.
        find_objects_button.click(
            fn=process_image,
            inputs=[input_fg, text_prompt],
            outputs=[extracted_objects, extracted_fg, x_slider, y_slider]
            )

        extract_button.click(
            fn=extract_foreground,
            inputs=[input_fg],
            outputs=[extracted_fg, x_slider, y_slider]
        )

block.launch(server_name='0.0.0.0', share=False)