# VISOR-GPT / utils
# (file-viewer residue: "szukevin's picture", "history blame", "13.5 kB")
# Decode sequential output to visual locations.
import random
from tqdm import tqdm
import json
import numpy as np
import re
import argparse
import cv2
import math
import os
# Half-width (in pixels) of the ellipse used to draw one skeleton limb.
stickwidth = 4

# COCO keypoints: limbs as pairs of 1-based keypoint indices.
limbSeq_coco = [
    [2, 3], [2, 6], [3, 4], [4, 5], [6, 7], [7, 8], [2, 9], [9, 10],
    [10, 11], [2, 12], [12, 13], [13, 14], [2, 1], [1, 15], [15, 17],
    [1, 16], [16, 18], [3, 17], [6, 18],
]

# CrowdPose: limbs as pairs of 1-based keypoint indices.
# {'0': 'left shoulder', '1': 'right shoulder', '2': 'left elbow', '3': 'right elbow', '4': 'left wrist', '5': 'right wrist', '6': 'left hip', '7': 'right hip', '8': 'left knee', '9': 'right knee', '10': 'left ankle', '11': 'right ankle', '12': 'head', '13': 'neck'}
limbSeq_cp = [
    [14, 2], [14, 1], [2, 4], [4, 6], [1, 3], [3, 5], [14, 8],
    [8, 10], [10, 12], [14, 7], [7, 9], [9, 11], [13, 14],
]

# Palette for human pose visualization (one colour per keypoint / limb index).
colors = [
    [255, 0, 0], [255, 85, 0], [255, 170, 0], [255, 255, 0], [170, 255, 0], [85, 255, 0],
    [0, 255, 0], [0, 255, 85], [0, 255, 170], [0, 255, 255], [0, 170, 255], [0, 85, 255],
    [0, 0, 255], [85, 0, 255], [170, 0, 255], [255, 0, 255], [255, 0, 170], [255, 0, 85],
]

# Palette for box visualization (indexed by box position).
colors_box = [
    [217, 221, 116], [137, 165, 171], [230, 126, 175], [63, 157, 5], [107, 51, 75],
    [217, 147, 152], [129, 132, 8], [232, 85, 249], [254, 98, 33], [89, 108, 230],
    [253, 34, 161], [91, 150, 30], [255, 147, 26], [209, 154, 205], [134, 57, 11],
    [143, 181, 122], [241, 176, 87], [104, 73, 26], [122, 147, 59], [235, 230, 229],
    [119, 18, 125], [185, 61, 138], [237, 115, 90], [13, 209, 111], [219, 172, 212],
]
# Plots one bounding box on image
def plot_one_box(x, img, color=None, label=None, line_thickness=None, idx=0):
    """Draw one bounding box, and optionally its label, on ``img``.

    Args:
        x: Box corners ``(xmin, ymin, xmax, ymax)``; each value is cast to int.
        img: Image array to draw on; OpenCV modifies it in place.
        color: Optional colour triple. When omitted, the colour is taken from
            the ``colors_box`` palette at position ``idx``.
        label: Optional text drawn on a filled background anchored at the
            top-left corner of the box.
        line_thickness: Optional outline thickness; derived from the image
            height and width when omitted.
        idx: Palette position used when ``color`` is not given. It wraps
            around, so any number of boxes can be drawn.

    Returns:
        The same ``img`` object that was passed in.
    """
    tl = line_thickness or round(0.002 * (img.shape[0] + img.shape[1]) / 2) + 1  # line thickness
    # Honour an explicit colour; previously it was always overwritten by the
    # palette entry, and idx >= len(colors_box) raised IndexError.
    if color is None:
        color = colors_box[idx % len(colors_box)]
    c1, c2 = (int(x[0]), int(x[1])), (int(x[2]), int(x[3]))
    cv2.rectangle(img, c1, c2, color, thickness=tl)
    if label:
        tf = max(tl - 1, 1)  # font thickness
        t_size = cv2.getTextSize(label, 0, fontScale=tl / 3, thickness=tf)[0]
        c2 = c1[0] + t_size[0], c1[1] - t_size[1] - 3
        cv2.rectangle(img, c1, c2, color, -1)  # filled
        cv2.putText(img, label, c1, 0, tl / 3, [0, 0, 0], thickness=tf, lineType=cv2.LINE_AA)
    return img
def _append_visibility(keypoints):
    """Return ``keypoints`` with a visibility flag appended to every point.

    ``keypoints`` is indexed as ``[instance, keypoint, (x, y)]``. A point gets
    flag 0 when the product of its two coordinates is 0 (either one is 0) and
    flag 1 otherwise.
    """
    visible = (keypoints[:, :, 0] * keypoints[:, :, 1] != 0).astype(np.float64)
    return np.concatenate([keypoints, visible[:, :, np.newaxis]], axis=-1)


# decode one sequence to visual locations
def decode(coordinate_str, type='box'):
    """Decode the numbers in one coordinate string into location arrays.

    Args:
        coordinate_str: Text containing the coordinates as decimal integers.
        type: One of ``'box'``, ``'cocokeypoint'``, ``'crowdpose'`` or
            ``'mask'``.

    Returns:
        * ``'box'``: array of shape ``(N, 4)``.
        * ``'cocokeypoint'``: array of shape ``(N, 18, 3)`` holding
          ``(x, y, visible)`` per keypoint.
        * ``'crowdpose'``: array of shape ``(N, 14, 3)`` with the same layout.
        * ``'mask'``: list with one ``(P, 1, 2)`` array per ``'m0'``-delimited
          segment that contains numbers.

    Raises:
        NotImplementedError: If ``type`` is not one of the supported values.
        ValueError: If the number count does not fit the requested layout
            (raised by ``reshape``).
    """
    if type == 'mask':
        locations = []
        # Each instance starts at its 'm0' marker; the remaining 'm<k>' point
        # markers are removed so only the coordinates are left.
        for c_str in coordinate_str.split('m0'):
            c_str = ''.join(re.split(r'm\d+', c_str))
            mask_coord = np.array([int(i) for i in re.findall(r"\d+ ", c_str)])
            if len(mask_coord) != 0:
                locations.append(mask_coord.reshape(-1, 1, 2))
        return locations

    if type not in ('box', 'cocokeypoint', 'crowdpose'):
        raise NotImplementedError(f'unsupported location type: {type!r}')

    # find numbers
    locations = np.array([int(i) for i in re.findall(r"\d+", coordinate_str)])
    if type == 'box':
        return locations.reshape(-1, 4)

    if type == 'cocokeypoint':
        locations = _append_visibility(locations.reshape(-1, 18, 2))
        # Keypoint 1 is hidden whenever keypoint 2 or 5 is hidden
        # (presumably the neck derived from both shoulders - TODO confirm).
        hidden = (locations[:, 2, -1] == 0) | (locations[:, 5, -1] == 0)
        locations[hidden, 1, -1] = 0
        return locations

    # type == 'crowdpose'
    return _append_visibility(locations.reshape(-1, 14, 2))
# process raw sequences inferred by VisorGPT
def to_coordinate(file_path, ctn=True):
    """Parse raw sequences into locations, class names and annotation types.

    Args:
        file_path: Either a list of sequence strings, or the path of a text
            file holding one sequence per line.
        ctn: Selects the sequence layout. When true, the sixth ``' ; '``
            separated field holds class names interleaved with coordinates.
            Otherwise the sixth field holds ``' , '``-separated class names
            and the seventh field holds the coordinates.

    Returns:
        ``(location_list, classname_list, type_list, valid_sequences)``: four
        parallel lists with one entry per successfully parsed sequence.

    Raises:
        NotImplementedError: If a sequence mentions none of ``'box'``,
            ``'key point'`` or ``'mask'``.

    Side effects:
        Writes the valid sequences, without their ``'[CLS] '`` prefix, to
        ``valid_sequences.txt`` in the current working directory.
    """
    if isinstance(file_path, list):
        lines = file_path
    else:
        with open(file_path, 'r') as file:
            lines = file.readlines()
    texts = [line.strip().replace(' ##', '') for line in lines]

    location_list = []
    classname_list = []
    type_list = []
    valid_sequences = []
    print('to coordinate ...')
    for ste in tqdm(texts):
        if 'box' in ste:
            seq_type = 'box'
        elif 'key point' in ste:
            seq_type = 'cocokeypoint' if '; 18 ;' in ste else 'crowdpose'
        elif 'mask' in ste:
            seq_type = 'mask'
        else:
            raise NotImplementedError(f'cannot infer annotation type from: {ste!r}')

        # Only the text before '[SEP]' is parsed; sequences without the
        # marker are skipped.
        if '[SEP]' not in ste:
            continue
        head = ste[:ste.index('[SEP]')]
        fields = head.split(' ; ')

        try:
            if ctn:
                classnames = [
                    chunk.split(' xmin ')[0].split(' m0')[0][2:]
                    for chunk in fields[5].split('] ')
                ][:-1]  # the piece after the last '] ' is not an instance
                locations = decode(fields[5], type=seq_type)
            else:
                classnames = fields[5].split(' , ')
                locations = decode(fields[6], type=seq_type)
        except (IndexError, ValueError):
            # Malformed sequence (missing field, or a number count that does
            # not fit the layout): skip it instead of aborting the batch.
            continue

        location_list.append(locations)
        classname_list.append(classnames)
        type_list.append(seq_type)
        # NOTE(review): stores the text before '[SEP]' - confirm that
        # downstream consumers do not expect the full raw line.
        valid_sequences.append(head)

    with open('valid_sequences.txt', 'w') as file:
        file.writelines(seq.split('[CLS] ')[-1] + '\n' for seq in valid_sequences)
    return location_list, classname_list, type_list, valid_sequences
def _blank_canvas():
    """Return a fresh 512x512 three-channel uint8 canvas filled with value 50."""
    return np.full((512, 512, 3), 50, dtype=np.uint8)


def _draw_limb(canvas, start, end, color):
    """Blend one limb, drawn as a filled ellipse between two points, onto ``canvas``."""
    x0, y0 = start[0], start[1]
    x1, y1 = end[0], end[1]
    center = (int(np.mean([x0, x1])), int(np.mean([y0, y1])))
    length = ((x0 - x1) ** 2 + (y0 - y1) ** 2) ** 0.5
    angle = math.degrees(math.atan2(y0 - y1, x0 - x1))
    polygon = cv2.ellipse2Poly(center, (int(length / 2), stickwidth), int(angle), 0, 360, 1)
    overlay = canvas.copy()
    cv2.fillConvexPoly(overlay, polygon, color)
    return cv2.addWeighted(canvas, 0.4, overlay, 0.6, 0)


# visualize object locations on a canvas
def visualization(location_list, classname_list, type_list, save_dir='debug/', save_fig=False):
    """Render each decoded sequence on its own canvas.

    Args:
        location_list: Decoded locations, one entry per sequence.
        classname_list: Class names, one list per sequence.
        type_list: Per-sequence type: ``'box'``, ``'cocokeypoint'``,
            ``'crowdpose'`` or ``'mask'``.
        save_dir: Directory for the rendered images; only used when
            ``save_fig`` is true.
        save_fig: When true, write every canvas to
            ``{save_dir}/test_{index}.png``.

    Returns:
        The canvas of the last sequence with its channel order reversed. A
        blank canvas is returned when the input lists are empty.

    Raises:
        NotImplementedError: If an unknown type is encountered.
    """
    if save_fig:
        os.makedirs(save_dir, exist_ok=True)
    print('visualizing ...')
    # Defined up front so an empty input still returns an image.
    canvas = _blank_canvas()
    for b, (loc, classnames, loc_type) in tqdm(enumerate(zip(location_list, classname_list, type_list))):
        canvas = _blank_canvas()
        # Instances and labels must pair up one-to-one; otherwise skip.
        if len(loc) != len(classnames):
            continue
        if loc_type == 'box':
            for i in range(loc.shape[0]):
                canvas = plot_one_box(loc[i], canvas, label=classnames[i], idx=i % len(colors_box))
        elif loc_type == 'cocokeypoint':
            for person in loc:
                for j, (x, y, v) in enumerate(person):
                    if v != 0:
                        cv2.circle(canvas, (int(x), int(y)), 4, colors[j], thickness=-1)
                # Only the first 17 entries of limbSeq_coco are drawn.
                for j, (first, second) in enumerate(limbSeq_coco[:17]):
                    start, end = person[first - 1], person[second - 1]
                    if start[-1] == 0 or end[-1] == 0:
                        continue  # an endpoint is not visible
                    canvas = _draw_limb(canvas, start, end, colors[j])
        elif loc_type == 'crowdpose':
            for person in loc:
                for j, (x, y, _) in enumerate(person):
                    if x != 0 and y != 0:
                        cv2.circle(canvas, (int(x), int(y)), 4, colors[j], thickness=-1)
                for j, (first, second) in enumerate(limbSeq_cp[:13]):
                    start, end = person[first - 1], person[second - 1]
                    if (start[0] == 0 and start[1] == 0) or (end[0] == 0 and end[1] == 0):
                        continue  # an endpoint sits at the origin, i.e. is missing
                    canvas = _draw_limb(canvas, start, end, colors[j])
        elif loc_type == 'mask':
            for i in range(len(loc)):
                color = [random.randint(0, 255) for _ in range(3)]
                # cv2.fillPoly only accepts 32-bit integer points.
                polygon = np.asarray(loc[i]).astype(np.int32)
                xmin, ymin = polygon[:, :, 0].min(), polygon[:, :, 1].min()
                xmax, ymax = polygon[:, :, 0].max(), polygon[:, :, 1].max()
                cur_canvas = canvas.copy()
                cv2.fillPoly(cur_canvas, [polygon], color)
                cur_canvas = plot_one_box((xmin, ymin, xmax, ymax), cur_canvas, color=color, label=classnames[i])
                canvas = cv2.addWeighted(canvas, 0.5, cur_canvas, 0.5, 0)
        else:
            raise NotImplementedError(f'unsupported location type: {loc_type!r}')
        if save_fig:
            cv2.imwrite(f'{save_dir}/test_{b}.png', canvas[..., ::-1])
    return canvas[..., ::-1]
# to json output
def to_json(location_list, classname_list, type_list, valid_sequences):
    """Group decoded sequences into JSON-serialisable dictionaries by type.

    Args:
        location_list: Decoded locations, one entry per sequence.
        classname_list: Class names, one list per sequence.
        type_list: Per-sequence type: ``'box'``, ``'cocokeypoint'``,
            ``'crowdpose'`` or ``'mask'``.
        valid_sequences: The source text of each sequence.

    Returns:
        ``[ret_json_box, ret_json_mask, ret_json_keypoint]``. Each dictionary
        holds, per kept sequence, a list of ``{classname: coordinates}``
        instances under ``'bboxes'`` / ``'masks'`` / ``'keypoints'`` and a
        matching entry under ``'sequences'``.

    Raises:
        NotImplementedError: If an unknown type is encountered.
    """
    ret_json_box = {'bboxes': [], 'sequences': []}
    ret_json_mask = {'masks': [], 'sequences': []}
    ret_json_keypoint = {'keypoints': [], 'sequences': []}
    print('to json ...')
    for loc, classnames, loc_type, seq in tqdm(zip(location_list, classname_list, type_list, valid_sequences)):
        # Instances and labels must pair up one-to-one; otherwise skip.
        if len(loc) != len(classnames):
            continue

        instances = [{classnames[i]: loc[i].tolist()} for i in range(len(loc))]
        if loc_type == 'box':
            target, key = ret_json_box, 'bboxes'
        elif loc_type == 'cocokeypoint' or loc_type == 'crowdpose':
            target, key = ret_json_keypoint, 'keypoints'
        elif loc_type == 'mask':
            target, key = ret_json_mask, 'masks'
        else:
            raise NotImplementedError(f'unsupported location type: {loc_type!r}')

        # Sequences that produced no instance leave no trace in the output.
        if instances:
            target[key].append(instances)
            # NOTE(review): the sequence is stored as a one-element list,
            # mirroring the per-sequence `seq_list` of the original code -
            # confirm the consumer expects this nesting.
            target['sequences'].append([seq])
    return [ret_json_box, ret_json_mask, ret_json_keypoint]
def gen_cond_mask(texts, ctn):
    """Convert raw sequences into a rendered canvas and JSON dictionaries.

    Args:
        texts: Passed to ``to_coordinate`` as its ``file_path`` argument.
        ctn: Passed to ``to_coordinate`` as its ``ctn`` argument.

    Returns:
        ``(canvas, json_parts)``: the image returned by ``visualization``
        (called with no save directory and saving disabled) and the list
        returned by ``to_json``.
    """
    locations, classnames, types, sequences = to_coordinate(texts, ctn)
    canvas = visualization(locations, classnames, types, None, False)
    json_parts = to_json(locations, classnames, types, sequences)
    return canvas, json_parts
if __name__ == '__main__':
    def _str2bool(value):
        """Parse a command-line boolean.

        ``type=bool`` would turn every non-empty string, including
        ``'False'``, into ``True``.
        """
        return str(value).strip().lower() in ('1', 'true', 't', 'yes', 'y')

    parser = argparse.ArgumentParser()
    parser.add_argument('--file_path', type=str, required=True)
    parser.add_argument('--save_dir', type=str, default='debug')
    # Accepts `--visualize`, `--visualize True` and `--visualize False`.
    parser.add_argument('--visualize', type=_str2bool, nargs='?', const=True, default=False)
    args = parser.parse_args()

    location_list, classname_list, type_list, valid_sequences = to_coordinate(args.file_path)
    os.makedirs(args.save_dir, exist_ok=True)

    # visualization
    if args.visualize:
        # save_fig must be enabled, otherwise nothing is written to save_dir.
        visualization(location_list, classname_list, type_list, args.save_dir, save_fig=True)

    # to json data
    rets = to_json(location_list, classname_list, type_list, valid_sequences)
    os.makedirs('files', exist_ok=True)  # output directory for the JSON dumps
    for ret, flag in zip(rets, ['box', 'mask', 'keypoint']):
        save_path = args.file_path.split('/')[-1].split('.')[0] + f'_{flag}.json'
        with open('files/' + save_path, 'w') as file:
            json.dump(ret, file, indent=2)