Spaces:

NikhilJoson
/

Add-it

Running on Zero

App Files Files Community

Add-it / app.py

NikhilJoson

Update app.py

5eea90e verified 14 days ago

raw

history blame

13.9 kB

	#Importing required libraries
	import spaces
	import gradio as gr

	import os
	import random
	import numpy as np
	import cv2
	from PIL import Image
	from dataclasses import dataclass
	from typing import Any, List, Dict, Optional, Union, Tuple

	import torch
	import google.generativeai as genai
	from transformers import AutoModelForMaskGeneration, AutoProcessor, pipeline, T5EncoderModel, CLIPTextModel
	from diffusers import FluxTransformer2DModel, FluxInpaintPipeline


	# Introductory Markdown shown in the UI; it is passed to gr.Markdown(MARKDOWN) when the Blocks layout is built.
	MARKDOWN = """
	# Add-It🎨
	Add or Replace anything to any image by using a single Prompt and an Image.
	Made using [Flux (Schnell)](https://huggingface.co/black-forest-labs/FLUX.1-schnell), [Grounding-DINO](https://huggingface.co/docs/transformers/main/en/model_doc/grounding-dino) and [SAM](https://huggingface.co/docs/transformers/en/model_doc/sam).
	"""


	#Gemini Setup
	# The API key is read from the 'Gemini_API' environment variable; if the variable
	# is missing, os.environ[...] raises KeyError while the module is being imported.
	genai.configure(api_key = os.environ['Gemini_API'])
	# Model handle used by gemini_predict() to turn the user's prompt into an object label.
	gemini_flash = genai.GenerativeModel(model_name='gemini-1.5-flash-002')

	def gemini_predict(prompt):
	    """Ask Gemini which object/region of the image the user's query targets.

	    Args:
	        prompt: Free-text edit request typed by the user.

	    Returns:
	        The model's answer as a string with surrounding whitespace removed
	        (the few-shot examples ask for a 1-3 word label such as "Lips").
	    """
	    system_message = f"""You are the best text analyser.
	You have to analyse a user query and identify what the user wants to change, from a given user query.

	Examples:
	Query: Change Lipstick colour to blue
	Response: Lips

	Query: Add a nose stud
	Response: Nose

	Query: Add a wallpaper to the right wall
	Response: Right wall

	Query: Change the Sofa's colour to Purple
	Response: Sofa

	Your response should be in 1 or 2-3 words
	Query : {prompt}
	"""
	    response = gemini_flash.generate_content(system_message)
	    # strip() instead of [:-1]: slicing always removed the last character, which
	    # truncated the label whenever the reply did not end with a newline.
	    return str(response.text).strip()



	# Upper bound for randomly drawn seeds: the largest value of a signed 32-bit integer.
	MAX_SEED = np.iinfo(np.int32).max
	# Device used for the Grounding DINO detector and the SAM segmenter.
	SAM_device = "cuda" # or "cpu"
	# Device used for the Flux inpainting pipeline and its random generator.
	DEVICE = "cuda"


	###GroundingDINO & SAM Setup

	#To store DINO results
	@dataclass
	class BoundingBox:
	    """Rectangle described by its minimum and maximum x/y coordinates."""

	    xmin: int
	    ymin: int
	    xmax: int
	    ymax: int

	    @property
	    def xyxy(self) -> List[float]:
	        """Return the coordinates as ``[xmin, ymin, xmax, ymax]``."""
	        corners = (self.xmin, self.ymin, self.xmax, self.ymax)
	        return list(corners)

	@dataclass
	class DetectionResult:
	    """One detected object: confidence score, text label, box and optional mask."""

	    score: float
	    label: str
	    box: BoundingBox
	    # np.ndarray is the array type; np.array is only the factory function.
	    mask: Optional[np.ndarray] = None

	    @classmethod
	    def from_dict(cls, detection_dict: Dict) -> 'DetectionResult':
	        """Build a result from a dict with 'score', 'label' and a nested 'box' dict.

	        The nested 'box' dict must provide 'xmin', 'ymin', 'xmax' and 'ymax';
	        a missing key raises KeyError. The mask is left as None.
	        """
	        box = detection_dict['box']
	        return cls(
	            score=detection_dict['score'],
	            label=detection_dict['label'],
	            box=BoundingBox(
	                xmin=box['xmin'],
	                ymin=box['ymin'],
	                xmax=box['xmax'],
	                ymax=box['ymax'],
	            ),
	        )

	#Utility Functions for Mask Generation
	def mask_to_polygon(mask: np.ndarray) -> List[List[int]]:
	    """Return the vertices of the largest external contour of a binary mask.

	    Args:
	        mask: 2-D array; it is cast to uint8 before contour extraction.

	    Returns:
	        A list of ``[x, y]`` vertices, or an empty list when the mask has no
	        foreground (previously ``max()`` raised ValueError on the empty
	        contour list in that case).
	    """
	    # Cheap early exit: an all-zero mask cannot produce any contour.
	    if not np.asarray(mask).any():
	        return []

	    # Find contours in the binary mask
	    contours, _ = cv2.findContours(mask.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
	    # The uint8 cast can zero out fractional values, so re-check after extraction.
	    if not contours:
	        return []

	    # Keep only the contour enclosing the largest area and flatten it to [x, y] pairs
	    largest_contour = max(contours, key=cv2.contourArea)
	    return largest_contour.reshape(-1, 2).tolist()

	def polygon_to_mask(polygon: List[Tuple[int, int]], image_shape: Tuple[int, int]) -> np.ndarray:
	    """
	    Convert a polygon to a segmentation mask.

	    Args:
	    - polygon (list): List of (x, y) coordinates representing the vertices of the polygon.
	    - image_shape (tuple): Shape of the image (height, width) for the mask.

	    Returns:
	    - np.ndarray: uint8 mask with the polygon filled with 255 and 0 elsewhere.
	      An empty polygon yields an all-zero mask.
	    """
	    # Create an empty mask
	    mask = np.zeros(image_shape, dtype=np.uint8)

	    # Nothing to draw: return the blank mask instead of handing OpenCV an empty point set.
	    if len(polygon) == 0:
	        return mask

	    # Convert polygon to an array of points
	    pts = np.array(polygon, dtype=np.int32)

	    # Fill the polygon with white color (255)
	    cv2.fillPoly(mask, [pts], color=(255,))

	    return mask

	def get_boxes(results: List["DetectionResult"]) -> List[List[List[float]]]:
	    """Collect the ``xyxy`` box of every detection, wrapped in one outer list.

	    Args:
	        results: Detections, each exposing ``.box.xyxy``. The parameter was
	            previously annotated as a single DetectionResult although it is iterated.

	    Returns:
	        ``[[box_0, box_1, ...]]``: all boxes nested inside a single-element
	        list, which is the form segment_SAM passes as ``input_boxes``.
	    """
	    return [[result.box.xyxy for result in results]]

	def refine_masks(masks: torch.BoolTensor, polygon_refinement: bool = False) -> List[np.ndarray]:
	    """Collapse per-box mask tensors into a list of 2-D uint8 masks.

	    Args:
	        masks: 4-D tensor; axis 1 is moved last and averaged away, so a pixel
	            is foreground when any entry along that axis is set.
	        polygon_refinement: When True, each non-empty mask is replaced by the
	            filled polygon of its largest contour.

	    Returns:
	        One uint8 array per entry along axis 0. Unrefined masks hold 0/1;
	        refined masks hold whatever polygon_to_mask paints (0/255).
	    """
	    masks = masks.cpu().float()
	    masks = masks.permute(0, 2, 3, 1)
	    masks = masks.mean(dim=-1)
	    masks = (masks > 0).int()
	    masks = list(masks.numpy().astype(np.uint8))

	    if polygon_refinement:
	        for idx, mask in enumerate(masks):
	            # An empty mask has no contour to trace; refining it would crash,
	            # so it is kept as the all-zero array it already is.
	            if not mask.any():
	                continue
	            polygon = mask_to_polygon(mask)
	            masks[idx] = polygon_to_mask(polygon, mask.shape)

	    return masks

	def get_alphacomp_mask(mask, image, random_color=True):
	    """Alpha-composite ``mask`` over ``image`` and return the result as an array.

	    Both inputs are arrays that are turned into RGBA PIL images first.
	    ``random_color`` is accepted but not used by this function.
	    """
	    base_rgba = Image.fromarray(image).convert("RGBA")
	    overlay_rgba = Image.fromarray(mask).convert("RGBA")
	    composited = Image.alpha_composite(base_rgba, overlay_rgba)
	    return np.array(composited)


	# Use Grounding DINO to detect a set of labels in an image in a zero-shot fashion.
	# The pipeline is created once while the module is imported and reused by detect().
	detector_id = "IDEA-Research/grounding-dino-tiny"
	object_detector = pipeline(model=detector_id, task="zero-shot-object-detection", device=SAM_device)

	#Use Segment Anything (SAM) to generate masks given an image + a set of bounding boxes.
	# `processor` prepares model inputs and post-processes predicted masks in segment_SAM();
	# `segmentator` is the mask-generation model, moved to SAM_device.
	segmenter_id = "facebook/sam-vit-base"
	processor = AutoProcessor.from_pretrained(segmenter_id)
	segmentator = AutoModelForMaskGeneration.from_pretrained(segmenter_id).to(SAM_device)

	def detect(image: Image.Image, labels: List[str], threshold: float = 0.3) -> List[DetectionResult]:
	    """Run the zero-shot object detector for ``labels`` on ``image``.

	    Args:
	        image: Image to search.
	        labels: Candidate labels; a trailing "." is appended to each label
	            that does not already end with one.
	        threshold: Score threshold forwarded to the detector pipeline.

	    Returns:
	        The detections converted to DetectionResult objects (the previous
	        annotation claimed raw dicts). The list may be empty.
	    """
	    labels = [label if label.endswith(".") else label + "." for label in labels]

	    with torch.no_grad():
	        results = object_detector(image, candidate_labels=labels, threshold=threshold)
	    torch.cuda.empty_cache()

	    return [DetectionResult.from_dict(result) for result in results]

	def segment_SAM(image: Image.Image, detection_results: List[DetectionResult], polygon_refinement: bool = False) -> List[DetectionResult]:
	    """Attach a SAM mask to every detection, using its box as the prompt.

	    Args:
	        image: Image the detections were computed on.
	        detection_results: Detections whose ``box`` is used as a SAM prompt;
	            each one gets its ``mask`` attribute filled in place.
	        polygon_refinement: Forwarded to refine_masks().

	    Returns:
	        The same ``detection_results`` list, now carrying masks. When the
	        list is empty it is returned untouched and the model is not run.
	    """
	    # No detections means no box prompts; skip the processor/model call
	    # rather than feeding them an empty box list.
	    if not detection_results:
	        return detection_results

	    boxes = get_boxes(detection_results)
	    inputs = processor(images=image, input_boxes=boxes, return_tensors="pt").to(SAM_device)

	    with torch.no_grad():
	        outputs = segmentator(**inputs)
	    torch.cuda.empty_cache()

	    # [0]: only one image was passed, so take the first entry of the result.
	    masks = processor.post_process_masks(
	        masks=outputs.pred_masks,
	        original_sizes=inputs.original_sizes,
	        reshaped_input_sizes=inputs.reshaped_input_sizes,
	    )[0]
	    masks = refine_masks(masks, polygon_refinement)

	    for detection_result, mask in zip(detection_results, masks):
	        detection_result.mask = mask

	    return detection_results

	def grounded_segmentation(image: Union[Image.Image, str], labels: List[str], threshold: float = 0.3,
	                          polygon_refinement: bool = False) -> Tuple[np.ndarray, List[DetectionResult]]:
	    """Detect ``labels`` in ``image`` and segment every detection.

	    Args:
	        image: A PIL image, or a path string that is opened with PIL.
	        labels: Candidate labels forwarded to detect().
	        threshold: Detection score threshold forwarded to detect().
	        polygon_refinement: Forwarded to segment_SAM().

	    Returns:
	        ``(image_as_array, detections)`` where each detection carries its mask.
	    """
	    if isinstance(image, str):
	        # The original called `load_image`, which is neither defined nor imported
	        # in the visible module and therefore raised NameError for string input.
	        image = Image.open(image).convert("RGB")

	    detections = detect(image, labels, threshold)
	    segmented = segment_SAM(image, detections, polygon_refinement)

	    return np.array(image), segmented

	def get_finalmask(image_array, detections):
	    """Merge the masks of all detections into a single composited array.

	    Args:
	        image_array: Image as an array; used as the compositing background.
	        detections: Objects exposing a ``mask`` attribute.

	    Returns:
	        The element-wise maximum of the per-detection composites produced by
	        get_alphacomp_mask().

	    Raises:
	        ValueError: If ``detections`` is empty (previously this surfaced as an
	            UnboundLocalError on the return statement).
	    """
	    if not detections:
	        raise ValueError("No matching object was detected in the image; try a lower threshold or a different prompt.")

	    combined = None
	    for detection in detections:
	        layer = get_alphacomp_mask(detection.mask, image_array)
	        # Maximum rather than '+=': adding uint8 arrays wraps around (255 + 255 -> 254),
	        # which dims overlapping regions and the alpha channel.
	        combined = layer if combined is None else np.maximum(combined, layer)

	    return combined

	#Preprocessing Mask
	kernel = np.ones((3, 3), np.uint8) # 3x3 all-ones kernel used for dilation
	def preprocess_mask(pipe, inp_mask, expan_lvl, blur_lvl):
	    """Optionally grow and/or blur a mask before inpainting.

	    Args:
	        pipe: Pipeline whose ``mask_processor.blur`` performs the blurring.
	        inp_mask: Mask as a NumPy array or PIL image.
	        expan_lvl: Number of dilation iterations; 0 disables expansion.
	        blur_lvl: Blur factor; 0 disables blurring.

	    Returns:
	        The input unchanged when both levels are 0, otherwise a PIL image.
	    """
	    if expan_lvl > 0:
	        # int(): UI sliders may deliver the value as a float, while `iterations` must be an integer.
	        dilated = cv2.dilate(np.array(inp_mask), kernel, iterations=int(expan_lvl))
	        inp_mask = Image.fromarray(dilated)

	    if blur_lvl > 0:
	        # The blur helper operates on PIL images; without expansion the mask is
	        # still a NumPy array here and used to fail inside blur().
	        if isinstance(inp_mask, np.ndarray):
	            inp_mask = Image.fromarray(inp_mask)
	        inp_mask = pipe.mask_processor.blur(inp_mask, blur_factor=blur_lvl)

	    return inp_mask


	def generate_mask(inp_image, label, threshold):
	    """Produce the inpainting mask for ``label`` in ``inp_image``.

	    Runs grounded segmentation with polygon refinement switched on and
	    merges the resulting per-detection masks with get_finalmask().
	    """
	    frame, found = grounded_segmentation(
	        image=inp_image,
	        labels=label,
	        threshold=threshold,
	        polygon_refinement=True,
	    )
	    return get_finalmask(frame, found)


	#Setting up Flux (Schnell) Inpainting
	# All components are loaded in bfloat16 while the module is imported.
	# NOTE(review): the transformer comes from "ashen0209/Flux-Dev2Pro" while the rest of the
	# pipeline is loaded from FLUX.1-schnell — confirm this combination is intended.
	transformer_ = FluxTransformer2DModel.from_pretrained("ashen0209/Flux-Dev2Pro", torch_dtype=torch.bfloat16)
	text_encoder_ = CLIPTextModel.from_pretrained("openai/clip-vit-large-patch14", torch_dtype=torch.bfloat16)
	text_encoder_2_ = T5EncoderModel.from_pretrained("xlabs-ai/xflux_text_encoders", torch_dtype=torch.bfloat16)

	# Assemble the inpainting pipeline with the components above and move it to DEVICE.
	inpaint_pipe = FluxInpaintPipeline.from_pretrained("black-forest-labs/FLUX.1-schnell",transformer=transformer_,text_encoder=text_encoder_,text_encoder_2=text_encoder_2_, torch_dtype=torch.bfloat16).to(DEVICE)
	#inpaint_pipe.load_lora_weights("XLabs-AI/flux-RealismLora")


	#Uncomment the following 4 lines, if you want LoRA Realism weights added to the pipeline
	# inpaint_pipe.load_lora_weights('hugovntr/flux-schnell-realism', weight_name='schnell-realism_v2.3.safetensors', adapter_name="better")
	# inpaint_pipe.set_adapters(["better"], adapter_weights=[2.6])
	# inpaint_pipe.fuse_lora(adapter_name=["better"], lora_scale=1.0)
	# inpaint_pipe.unload_lora_weights()

	#torch.cuda.empty_cache()

	@spaces.GPU()
	def process(input_image_editor, input_text, strength, seed, randomize_seed, num_inference_steps, guidance_scale, threshold, expan_lvl, blur_lvl, progress=gr.Progress(track_tqdm=True)):
	    """Gradio handler: locate the object named in the prompt and inpaint it.

	    Steps: validate inputs, ask Gemini which object the prompt refers to,
	    shrink the image to at most 1024x1024, build a mask for that object,
	    optionally expand/blur the mask, then run the inpainting pipeline.

	    Args:
	        input_image_editor: ImageEditor payload; its 'background' entry is the image.
	        input_text: User prompt, also used as the inpainting prompt.
	        strength, num_inference_steps, guidance_scale: Forwarded to the pipeline.
	        seed: Seed for the generator; replaced by a random one when
	            ``randomize_seed`` is set or no seed is given.
	        randomize_seed: Draw a fresh seed in ``[0, MAX_SEED]``.
	        threshold: Detection threshold forwarded to generate_mask().
	        expan_lvl, blur_lvl: Mask post-processing levels; 0 disables each.
	        progress: Gradio progress tracker.

	    Returns:
	        ``(result_image, mask, seed_used, detected_item)``.

	    Raises:
	        gr.Error: If the prompt is empty or no image was provided.
	    """
	    # Validate both inputs first so no Gemini request is spent on an invalid submission.
	    if not input_text:
	        raise gr.Error("Please enter a text prompt.")
	    image = input_image_editor['background'] if input_image_editor else None
	    if image is None:
	        raise gr.Error("Please upload an image.")

	    #Object identification
	    item = gemini_predict(input_text)

	    # thumbnail() resizes in place, so the size is read only afterwards; reading it
	    # before (as the original did) handed the pre-resize dimensions to the pipeline.
	    if image.width > 1024 or image.height > 1024:
	        image.thumbnail((1024, 1024))
	    width, height = image.size

	    if randomize_seed or seed is None:
	        seed = random.randint(0, MAX_SEED)
	    # manual_seed() needs an integer; the Number widget may deliver a float.
	    seed = int(seed)

	    #Generating Mask
	    label = [item]
	    gen_mask = generate_mask(image, label, threshold)
	    #Pre-processing Mask, optional
	    if expan_lvl > 0 or blur_lvl > 0:
	        gen_mask = preprocess_mask(inpaint_pipe, gen_mask, expan_lvl, blur_lvl)

	    #Inpainting
	    generator = torch.Generator(device=DEVICE).manual_seed(seed)
	    result = inpaint_pipe(
	        prompt=input_text,
	        image=image,
	        mask_image=gen_mask,
	        width=width,
	        height=height,
	        strength=strength,
	        num_inference_steps=num_inference_steps,
	        generator=generator,
	        guidance_scale=guidance_scale,
	    ).images[0]

	    return result, gen_mask, seed, item

	# Gradio UI: inputs and settings in the left column, results in the right column.
	with gr.Blocks(theme=gr.themes.Ocean()) as demo:
	    gr.Markdown(MARKDOWN)
	    with gr.Row():
	        # Left column: image, prompt, tunables and the submit button
	        with gr.Column(scale=1):
	            input_image_component = gr.ImageEditor(
	                label='Image', type='pil', sources=["upload", "webcam"], image_mode='RGB', layers=False,
	            )
	            input_text_component = gr.Text(
	                label="Prompt", show_label=False, max_lines=1, placeholder="Enter your prompt", container=False,
	            )
	            with gr.Accordion("Advanced Settings", open=False):
	                strength_slider = gr.Slider(label="Strength", minimum=0.0, maximum=1.0, step=0.01, value=0.96)
	                num_inference_steps = gr.Slider(label="Number of inference steps", minimum=1, maximum=100, step=1, value=16)
	                guidance_scale = gr.Slider(label="Guidance Scale", minimum=1, maximum=15, step=0.1, value=5)
	                seed_number = gr.Number(label="Seed", value=26, precision=0)
	                randomize_seed = gr.Checkbox(label="Randomize seed", value=False)
	            with gr.Accordion("Mask Settings", open=False):
	                SAM_threshold = gr.Slider(label="Threshold", minimum=0.0, maximum=1.0, step=0.01, value=0.4)
	                expansion_level = gr.Slider(label="Mask Expansion level", minimum=0, maximum=10, step=1, value=2)
	                blur_level = gr.Slider(label="Mask Blur level", minimum=0, maximum=5, step=1, value=0)

	            submit_button_component = gr.Button(value='Inpaint', variant='primary')

	        # Right column: generated image, generated mask and debug values
	        with gr.Column(scale=1):
	            output_image_component = gr.Image(type='pil', image_mode='RGB', label='Generated Image')
	            output_mask_component = gr.Image(type='pil', image_mode='RGB', label='Generated Mask')
	            with gr.Accordion("Debug Info", open=False):
	                output_seed = gr.Number(label="Used Seed")
	                identified_item = gr.Textbox(label="Gemini predicted item")

	    # Input order must match the positional parameters of process();
	    # output order must match its returned tuple.
	    process_inputs = [
	        input_image_component,
	        input_text_component,
	        strength_slider,
	        seed_number,
	        randomize_seed,
	        num_inference_steps,
	        guidance_scale,
	        SAM_threshold,
	        expansion_level,
	        blur_level,
	    ]
	    process_outputs = [output_image_component, output_mask_component, output_seed, identified_item]
	    submit_button_component.click(fn=process, inputs=process_inputs, outputs=process_outputs)

	demo.launch(debug=False, show_error=True)