TextDiffuser / util.py
imjunaidafzal's picture
Duplicate from JingyeChen22/TextDiffuser
bb90efe
raw
history blame
12 kB
# ------------------------------------------
# TextDiffuser: Diffusion Models as Text Painters
# Paper Link: https://arxiv.org/abs/2305.10855
# Code Link: https://github.com/microsoft/unilm/tree/master/textdiffuser
# Copyright (c) Microsoft Corporation.
# This file defines a set of commonly used utility functions.
# ------------------------------------------
import os
import re
import cv2
import math
import shutil
import string
import textwrap
import numpy as np
from PIL import Image, ImageFont, ImageDraw, ImageOps
from typing import *
# Define the supported character set and the character -> label lookup table.
# len(alphabet) == 95: 10 digits + 26 lowercase + 26 uppercase + 32 punctuation + 1 space.
alphabet = string.digits + string.ascii_lowercase + string.ascii_uppercase + string.punctuation + ' '
# Labels start at 1 because the index 0 stands for non-character.
# A comprehension keeps helper loop variables out of the module namespace.
alphabet_dic = {character: position + 1 for position, character in enumerate(alphabet)}
def transform_mask_pil(mask_root, size):
    """
    This function turns an in-memory mask image into a binary mask array.
    Args:
        mask_root: The mask image itself (not a path); it is handed directly to
            np.array. Presumably a 3-channel PIL image -- confirm with the caller.
        size (int): The side length of the square the mask is resized to.
    Returns:
        A float32 array holding 0 where the grayscale value is above 250
        (white pixels) and 1 everywhere else.
    """
    pixels = cv2.resize(np.array(mask_root), (size, size), interpolation=cv2.INTER_NEAREST)
    # NOTE(review): an array built from a PIL image is in RGB order, while
    # COLOR_BGR2GRAY assumes BGR. Pure gray/white masks are unaffected -- confirm.
    grayscale = cv2.cvtColor(pixels, cv2.COLOR_BGR2GRAY)
    # Values above 250 become 255, everything else becomes 0.
    white_region = cv2.threshold(grayscale, 250, 255, cv2.THRESH_BINARY)[1]
    return 1 - (white_region.astype(np.float32) / 255)
def transform_mask(mask_root, size):
    """
    This function loads a mask image from disk and turns it into a binary mask array.
    Args:
        mask_root (str): The path of mask image.
        size (int): The side length of the square the mask is resized to.
    Returns:
        A float32 array holding 0 where the grayscale value is above 250
        (white pixels) and 1 everywhere else.
    Raises:
        FileNotFoundError: If the file is missing or cannot be decoded.
    """
    img = cv2.imread(mask_root)
    if img is None:
        # cv2.imread reports a missing/undecodable file by returning None instead
        # of raising; fail here with a clear message rather than inside cv2.resize.
        raise FileNotFoundError(f"Cannot read mask image: {mask_root!r}")
    img = cv2.resize(img, (size, size), interpolation=cv2.INTER_NEAREST)
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    # Values above 250 become 255, everything else becomes 0.
    _, binary = cv2.threshold(gray, 250, 255, cv2.THRESH_BINARY)
    return 1 - (binary.astype(np.float32) / 255)
def segmentation_mask_visualization(font_path: str, segmentation_mask: np.array):
    """
    This function visualizes the segmentation masks with characters.
    Args:
        font_path (str): The path of font. We recommend to use Arial.ttf
        segmentation_mask (np.array): The character-level segmentation mask.
    Returns:
        A 512x512 black RGB image with one green character drawn per labelled
        cell of the 64x64 down-sampled mask (each cell covers 8x8 pixels).
    """
    grid = cv2.resize(segmentation_mask, (64, 64), interpolation=cv2.INTER_NEAREST)
    glyph_font = ImageFont.truetype(font_path, 8)
    canvas = Image.new('RGB', (512, 512), (0, 0, 0))
    painter = ImageDraw.Draw(canvas)
    for row, cells in enumerate(grid):
        for col, value in enumerate(cells):
            label = int(value)
            # Label 0 is "no character"; labels past the alphabet are ignored too.
            if label != 0 and label - 1 < len(alphabet):
                painter.text((col * 8, row * 8), alphabet[label - 1], font=glyph_font, fill=(0, 255, 0))
    return canvas
def make_caption_pil(font_path: str, captions: List[str]):
    """
    This function converts captions into pil images.
    Args:
        font_path (str): The path of font. We recommend to use Arial.ttf
        captions (List[str]): List of captions.
    Returns:
        A list with one 512x48 RGB image per caption: white background, a
        2-pixel gray border, and the caption wrapped at 40 characters per line.
    """
    font = ImageFont.truetype(font_path, 18)
    # FreeTypeFont.getsize was removed in Pillow 10. The bottom edge returned by
    # getbbox equals the height getsize used to report, so prefer it when present.
    if hasattr(font, 'getbbox'):
        line_height = font.getbbox('A')[3] + 4
    else:
        line_height = font.getsize('A')[1] + 4

    border_size = 2
    caption_pil_list = []
    for caption in captions:
        # Inner canvas is shrunk by the border so the final image is 512x48.
        img = Image.new('RGB', (512 - 2 * border_size, 48 - 2 * border_size), (255, 255, 255))
        img = ImageOps.expand(img, border=border_size, fill=(127, 127, 127))
        draw = ImageDraw.Draw(img)
        x, y = 4, 4
        for line in textwrap.wrap(caption, width=40):
            draw.text((x, y), line, font=font, fill=(200, 127, 0))
            y += line_height
        caption_pil_list.append(img)
    return caption_pil_list
def filter_segmentation_mask(segmentation_mask: np.array):
    """
    This function removes some noisy predictions of segmentation masks.
    The cells labelled as '-' or ' ' are reset to 0 (non-character). The mask
    is modified in place and also returned.
    Args:
        segmentation_mask (np.array): The character-level segmentation mask.
    """
    dash_label = alphabet_dic['-']
    space_label = alphabet_dic[' ']
    noisy_cells = (segmentation_mask == dash_label) | (segmentation_mask == space_label)
    segmentation_mask[noisy_cells] = 0
    return segmentation_mask
def combine_image(args, resolution, sub_output_dir: str, pred_image_list: List, image_pil: Image, character_mask_pil: Image, character_mask_highlight_pil: Image, caption_pil_list: List):
    """
    This function tiles the predicted images into a single image.
    A single image is returned as-is. Two or three images are laid out in one
    row, four images in a 2x2 grid, and larger batches in a near-square grid
    filled row by row on a black background.
    Args:
        args (argparse.ArgumentParser): The arguments (unused here).
        resolution (int): The side length of one tile in pixels.
        sub_output_dir (str): Unused here.
        pred_image_list (List): List of predicted images.
        image_pil (Image): The original image (unused here).
        character_mask_pil (Image): The character-level segmentation mask (unused here).
        character_mask_highlight_pil (Image): The character-level segmentation mask highlighting character regions with green color (unused here).
        caption_pil_list (List): List of captions (unused here).
    Raises:
        ValueError: If pred_image_list is empty.
    """
    count = len(pred_image_list)
    if count == 0:
        raise ValueError('pred_image_list must contain at least one image')
    if count == 1:
        return pred_image_list[0]

    # Up to three images share one row; beyond that use a near-square grid
    # (four images -> 2x2, matching the historical layout).
    columns = count if count <= 3 else math.ceil(math.sqrt(count))
    rows = math.ceil(count / columns)
    blank = Image.new('RGB', (resolution * columns, resolution * rows), (0, 0, 0))
    for position, pred_image in enumerate(pred_image_list):
        row, column = divmod(position, columns)
        blank.paste(pred_image, (resolution * column, resolution * row))
    return blank
def combine_image_gradio(args, size, sub_output_dir: str, pred_image_list: List, image_pil: Image, character_mask_pil: Image, character_mask_highlight_pil: Image, caption_pil_list: List):
    """
    This function tiles the predicted images into a single image.
    A single image is returned as-is. Two or three images are laid out in one
    row, four images in a 2x2 grid, and larger batches in a near-square grid
    filled row by row on a black background.
    Args:
        args (argparse.ArgumentParser): The arguments (unused here).
        size (int): The side length of one tile in pixels.
        sub_output_dir (str): Unused here.
        pred_image_list (List): List of predicted images.
        image_pil (Image): The original image (unused here).
        character_mask_pil (Image): The character-level segmentation mask (unused here).
        character_mask_highlight_pil (Image): The character-level segmentation mask highlighting character regions with green color (unused here).
        caption_pil_list (List): List of captions (unused here).
    Raises:
        ValueError: If pred_image_list is empty.
    """
    # Keep the image count in its own variable: reusing `size` for it made the
    # canvas and the paste offsets only a few pixels large.
    count = len(pred_image_list)
    if count == 0:
        raise ValueError('pred_image_list must contain at least one image')
    if count == 1:
        return pred_image_list[0]

    # Up to three images share one row; beyond that use a near-square grid
    # (four images -> 2x2).
    columns = count if count <= 3 else math.ceil(math.sqrt(count))
    rows = math.ceil(count / columns)
    blank = Image.new('RGB', (size * columns, size * rows), (0, 0, 0))
    for position, pred_image in enumerate(pred_image_list):
        row, column = divmod(position, columns)
        blank.paste(pred_image, (size * column, size * row))
    return blank
def get_width(font_path, text):
    """
    This function calculates the width of the text rendered at font size 24.
    Args:
        font_path (str): The path of font.
        text (str): The text to measure.
    Returns:
        The width of the rendered text in pixels.
    """
    font = ImageFont.truetype(font_path, 24)
    # FreeTypeFont.getsize was removed in Pillow 10. The right edge returned by
    # getbbox equals the width getsize used to report.
    if hasattr(font, 'getbbox'):
        return font.getbbox(text)[2]
    width, _ = font.getsize(text)
    return width
def get_key_words(text: str):
    """
    This function detects keywords (enclosed by single quotes) from user prompts.
    The keywords are used to guide the layout generation.
    Args:
        text (str): user prompt.
    Returns:
        The whitespace-separated words found inside the quoted spans, in order
        of appearance, or an empty list when there are eight or more of them.
    """
    # find the keywords enclosed by ''
    quoted_spans = re.findall(r"'(.*?)'", text)
    keywords = [word for span in quoted_spans for word in span.split()]
    return [] if len(keywords) >= 8 else keywords
def adjust_overlap_box(box_output, current_index):
    """
    This function moves the current box away from the previous one when they overlap.
    Boxes are read as (xmin, ymin, xmax, ymax) from box_output[0, index, :].
    The current box keeps its width and height; it is modified in place.
    Args:
        box_output (List): List of predicted boxes.
        current_index (int): the index of current box.
    Returns:
        The same box_output object.
    """
    # The first box has no predecessor to collide with.
    if current_index == 0:
        return box_output

    prev_x0, prev_y0, prev_x1, prev_y1 = box_output[0, current_index - 1, :]
    cur_x0, cur_y0, cur_x1, cur_y1 = box_output[0, current_index, :]

    def _relocate(low_slot, high_slot, low, high, prev_high, gap):
        # Place the box right after the previous one, preserving its extent.
        # Both values are computed before anything is written back.
        new_low = prev_high + gap
        new_high = high - low + prev_high + gap
        box_output[0, current_index, low_slot] = new_low
        box_output[0, current_index, high_slot] = new_high

    if prev_x0 <= cur_x0 <= prev_x1 and prev_y0 <= cur_y0 <= prev_y1:
        # The top-left corner of the current box lies inside the previous box.
        print('adjust overlapping')
        overlap_x = prev_x1 - cur_x0
        overlap_y = prev_y1 - cur_y0
        if overlap_x <= overlap_y:
            # Shift along the axis with the smaller overlap: x here ...
            _relocate(0, 2, cur_x0, cur_x1, prev_x1, 0.025)
        else:
            # ... otherwise y.
            _relocate(1, 3, cur_y0, cur_y1, prev_y1, 0.025)
    elif prev_x0 <= cur_x0 <= prev_x1 and prev_y0 <= cur_y1 <= prev_y1:
        # The bottom-left corner lies inside the previous box: always shift along x.
        print('adjust overlapping')
        _relocate(0, 2, cur_x0, cur_x1, prev_x1, 0.05)

    return box_output
def shrink_box(box, scale_factor = 0.9):
    """
    This function shrinks the box around its center.
    Args:
        box (List): The box as (x1, y1, x2, y2).
        scale_factor (float): The scale factor of shrinking.
    Returns:
        A tuple (x1, y1, x2, y2) whose width and height are scaled by scale_factor.
    """
    left, top, right, bottom = box
    # Each side moves inwards by half of the removed width / height.
    inset_x = (right - left) * (1 - scale_factor) / 2
    inset_y = (bottom - top) * (1 - scale_factor) / 2
    return (left + inset_x, top + inset_y, right - inset_x, bottom - inset_y)
def adjust_font_size(args, width, height, draw, text):
    """
    This function searches for the largest font size at which the text fits the width.
    The search starts at `height` and decreases the size by one until the
    rendered text is narrower than `width`, never going below 1.
    Args:
        args (argparse.ArgumentParser): The arguments; args.font_path is the font file.
        width (int): The width available for the text.
        height (int): The height of the text, used as the initial font size.
        draw (ImageDraw): The ImageDraw object.
        text (str): The text.
    Returns:
        The selected font size.
    """
    # ImageDraw.textsize was removed in Pillow 10; textbbox is its replacement.
    has_textbbox = hasattr(draw, 'textbbox')
    size_start = height
    while True:
        font = ImageFont.truetype(args.font_path, size_start)
        if has_textbbox:
            # With the origin at (0, 0) the right edge is the rendered width.
            text_width = draw.textbbox((0, 0), text, font=font)[2]
        else:
            text_width, _ = draw.textsize(text, font=font)
        # Stop at size 1 even if the text still does not fit: a font cannot be
        # loaded at size 0, so decrementing further would only raise.
        if text_width < width or size_start <= 1:
            return size_start
        size_start = size_start - 1
def inpainting_merge_image(original_image, mask_image, inpainting_image):
    """
    This function merges the original image, mask image and inpainting image.
    All three images are resized to 512x512. The inpainting result is used
    where the grayscale mask value is below 250, and the original image is kept
    where the mask is (near-)white.
    Args:
        original_image (PIL.Image): The original image.
        mask_image (PIL.Image): The mask images.
        inpainting_image (PIL.Image): The inpainting images.
    Returns:
        The merged 512x512 image.
    """
    original_image = original_image.resize((512, 512))
    inpainting_image = inpainting_image.resize((512, 512))
    # Image.convert returns a new image, so the result must be kept; the
    # 256-entry lookup table below is only valid for a single-band image.
    mask_image = mask_image.resize((512, 512)).convert('L')

    threshold = 250
    # Map every gray level to 1 (take the inpainting) or 0 (keep the original).
    table = [1 if level < threshold else 0 for level in range(256)]
    mask_image = mask_image.point(table, "1")

    merged_image = Image.composite(inpainting_image, original_image, mask_image)
    return merged_image