Spaces:

szukevin
/

VISOR-GPT

Runtime error

App Files Files Community

VISOR-GPT / app.py

szukevin

update

4168477 over 1 year ago

raw

history blame contribute delete

19.1 kB

	import gradio as gr
	import numpy as np
	import torch
	import re
	from PIL import Image
	from tqdm import tqdm
	from train.scripts.generate_lm_multiple import gen_sequence, build_visorgpt
	from utils.seq2coord import gen_cond_mask
	from visor_gligen.gligen_inference_box import gligen_infer, build_gligen_model
	from visor_controlnet.gradio_pose2image_v2 import control_infer, build_control_model, build_controlv11_model

	# ---------------------------------------------------------------------------
	# Checkpoint / config locations and one-time model construction.
	# Everything below runs at import time and is shared by all callbacks.
	# ---------------------------------------------------------------------------

	# VisorGPT: model config, weights and vocabulary
	visorgpt_config_path = 'train/models/gpt2/config.json'
	visorgpt_model_path = 'demo/ckpts/visorgpt/visorgpt_dagger_ta_tb.pt'
	visorgpt_vocab_path = 'train/models/google_uncased_en_coord_vocab.txt'

	# ControlNet (openpose). The older checkpoint path is kept for reference:
	# control_model_path = 'demo/ckpts/controlnet/control_sd15_openpose.pth'
	control_model_path = 'demo/ckpts/controlnet/control_v11p_sd15_openpose.pth'  # v1.1
	control_sd_path = 'demo/ckpts/controlnet/v1-5-pruned-emaonly.safetensors'
	control_model_config = 'demo/ckpts/controlnet/cldm_v15.yaml'

	# GLIGEN (box-conditioned) weights
	gligen_model_path = 'demo/ckpts/gligen/diffusion_pytorch_model_box.bin'

	# Build VisorGPT; the returned args object is passed back to gen_sequence().
	visorgpt_args, visorgpt_model = build_visorgpt(
	    model_config=visorgpt_config_path,
	    model_path=visorgpt_model_path,
	    vocab_path=visorgpt_vocab_path,
	)

	# Build the ControlNet model together with its DDIM sampler.
	control_model, ddim_sampler = build_controlv11_model(
	    model_path=control_model_path,
	    sd_path=control_sd_path,
	    config_path=control_model_config,
	)

	# Build GLIGEN; all six returned objects are forwarded to gligen_infer().
	(
	    g_model,
	    g_autoencoder,
	    g_text_encoder,
	    g_diffusion,
	    g_config,
	    g_grounding_tokenizer_input,
	) = build_gligen_model(ckpt=gligen_model_path)

	# Upper bounds on the number of instances that may be requested per
	# annotation type (used to clamp the slider values in the callbacks).
	max_num_keypoint = 16
	max_num_bbox = 16
	max_num_mask = 8

	def generate_sequence(gen_type,
	                      data_type,
	                      instance_size,
	                      num_instance,
	                      object_name_inbox):
	    """Step 1 callback: sample an annotation sequence and render a preview.

	    Builds a text prompt from the UI settings, asks VisorGPT for a sequence,
	    converts it to a condition image via ``gen_cond_mask`` and returns the
	    updates for the Gradio outputs.

	    Args:
	        gen_type: Annotation type: 'key point', 'box' or 'mask'.
	        data_type: 'multiple instances' or 'object centric'.
	        instance_size: Size token inserted verbatim into the prompt.
	        num_instance: Requested number of instances; clamped to the per-type
	            maximum and forced to 1 for 'object centric' box/mask prompts.
	        object_name_inbox: Semicolon-separated category names (box/mask only).
	            A blank value falls back to a built-in default.

	    Returns:
	        A dict mapping Gradio output components to their new values. When
	        more category names than instances are given, only ``raw_sequence``
	        is updated (with an explanatory message).

	    Raises:
	        ValueError: If ``gen_type`` is not one of the supported types.
	        TypeError: If ``gen_sequence`` does not return a list.
	    """
	    if gen_type == 'key point':
	        num_keypoint = 18
	        num_instance = min(num_instance, max_num_keypoint)
	        seq_prompt = '; '.join([gen_type, data_type, instance_size,
	                                str(num_instance), str(num_keypoint)]) + ' ; [person'

	    elif gen_type in ('box', 'mask'):
	        # Fall back to a default category list when the textbox is blank.
	        if not object_name_inbox.strip():
	            if gen_type == 'mask':
	                object_name_inbox = "bottle; cup"
	            elif data_type == 'object centric':
	                object_name_inbox = "great white shark"
	            else:
	                object_name_inbox = "person; frisbee"

	        num_keypoint = 0
	        limit = max_num_mask if gen_type == 'mask' else max_num_bbox
	        num_instance = min(num_instance, limit)
	        if data_type == 'object centric':
	            num_instance = 1

	        # Every category needs at least one instance slot.
	        if len(object_name_inbox.split(';')) > num_instance:
	            return {
	                raw_sequence: gr.update(
	                    value="The number of category names should not exceed the number of instances, please try again :)",
	                    visible=True)
	            }

	        objects = ', '.join(object_name_inbox.strip().split(";"))
	        seq_prompt = '; '.join([gen_type, data_type, instance_size,
	                                str(num_instance), str(num_keypoint)]) + '; ' + objects

	    else:
	        # Previously this surfaced later as a NameError on seq_prompt.
	        raise ValueError(f"unsupported annotation type: {gen_type!r}")

	    print("input prompt: \n", seq_prompt)
	    # Inference only: no autograd state is needed (same as the other callbacks).
	    with torch.no_grad():
	        sequence = gen_sequence(visorgpt_args, visorgpt_model, seq_prompt)
	    torch.cuda.empty_cache()

	    if not isinstance(sequence, list):
	        raise TypeError(f"gen_sequence returned {type(sequence).__name__}, expected list")

	    # Position of each annotation type's entry inside cond_json.
	    json_index = {'box': 0, 'mask': 1, 'key point': 2}[gen_type]

	    try:
	        cond_mask, cond_json = gen_cond_mask(sequence, True)
	        ori_sequence = cond_json[json_index]['sequences'][0][0] + '[SEP]'
	    except Exception:
	        # Retry with the second gen_cond_mask flag flipped.
	        # NOTE(review): the flag's exact meaning lives in utils.seq2coord - confirm.
	        cond_mask, cond_json = gen_cond_mask(sequence, False)
	        ori_sequence = cond_json[json_index]['sequences'][0][0] + '[SEP]'

	    ret_img = Image.fromarray(cond_mask)

	    return {
	        result_gallery: [ret_img],
	        raw_sequence: gr.update(value=ori_sequence, visible=True),
	        # Image synthesis is not offered for masks, so hide the button there.
	        images_button: gr.update(visible=gen_type != 'mask'),
	        text_container: cond_json,
	        sequence_container: ori_sequence,
	    }

	def add_contents(gen_type,
	                 data_type,
	                 instance_size,
	                 num_instance,
	                 object_name_inbox,
	                 num_continuous_gen,
	                 global_seq):
	    """Step 2 callback: extend the current scene with additional instances.

	    Re-prompts VisorGPT with the previously generated sequence and a larger
	    instance count, then renders the result like ``generate_sequence``.

	    Args:
	        gen_type: Annotation type: 'key point', 'box' or 'mask'.
	        data_type: 'multiple instances' or 'object centric'.
	        instance_size: Size token inserted verbatim into the prompt.
	        num_instance: Instance count used when no continuation is requested.
	        object_name_inbox: Semicolon-separated category names (box/mask only).
	        num_continuous_gen: Number of instances to add; falsy means
	            "generate from scratch".
	        global_seq: Sequence stored by the previous step (``';'``-separated,
	            field 3 holds the instance count), or None if nothing was
	            generated yet.

	    Returns:
	        A dict mapping Gradio output components to their new values. On a
	        validation problem only ``raw_sequence`` is updated with a message.

	    Raises:
	        ValueError: If ``gen_type`` is not one of the supported types.
	        TypeError: If ``gen_sequence`` does not return a list.
	    """
	    # Continuing requires a stored sequence; previously None crashed on .split().
	    if num_continuous_gen and not global_seq:
	        return {
	            raw_sequence: gr.update(
	                value="Please run Step 1 to generate a sequence before adding instances :)",
	                visible=True)
	        }

	    bracket_pattern = re.compile(r'[\[](.*?)[]]', re.S)

	    if gen_type == 'key point':
	        num_keypoint = 18
	        seq_prompt = '; '.join([gen_type, data_type, instance_size,
	                                str(num_instance), str(num_keypoint)]) + ' ; [person'

	        if num_continuous_gen:
	            cur_instance = int(global_seq.split(';')[3].strip())
	            new_number = min(cur_instance + num_continuous_gen, max_num_keypoint)

	            # "prompt type a": field 5 of the stored sequence has no '['.
	            is_type_a = global_seq.split(';')[5].find('[') == -1
	            global_seq = global_seq.replace('[CLS]', '').replace('[SEP]', '')
	            if is_type_a:
	                # Re-wrap every bracketed group as an explicit person entry.
	                groups = re.findall(bracket_pattern, global_seq)
	                objects = ' '.join(['[ person' + x + ']' for x in groups])
	                seq_prompt = '; '.join([gen_type, data_type, instance_size,
	                                        str(new_number), str(num_keypoint), objects])
	            else:
	                # "prompt type b": reuse the sequence, only bump the count.
	                seq_list = global_seq.split(';')
	                seq_list[3] = str(new_number)
	                seq_prompt = ';'.join(seq_list)

	    elif gen_type in ('box', 'mask'):
	        num_keypoint = 0
	        if data_type == 'object centric':
	            num_instance = 1
	        objects = ', '.join(object_name_inbox.strip().split(";"))
	        seq_prompt = '; '.join([gen_type, data_type, instance_size,
	                                str(num_instance), str(num_keypoint)]) + '; ' + objects

	        # Every category needs at least one instance slot.
	        if len(object_name_inbox.split(';')) > num_instance:
	            return {
	                raw_sequence: gr.update(
	                    value="The number of category names should not exceed the number of instances, please try again :)",
	                    visible=True)
	            }

	        if num_continuous_gen:
	            cur_instance = int(global_seq.split(';')[3].strip())
	            limit = max_num_mask if gen_type == 'mask' else max_num_bbox
	            new_number = min(cur_instance + num_continuous_gen, limit)

	            # "prompt type a": field 5 of the stored sequence has no '['.
	            is_type_a = global_seq.split(';')[5].find('[') == -1
	            global_seq = global_seq.replace('[CLS]', '').replace('[SEP]', '')
	            if is_type_a:
	                coords = re.findall(bracket_pattern, global_seq)
	                names = global_seq.split(';')[5].split(',')
	                # Pair the i-th category name with the i-th bracketed group.
	                objects = ' '.join(['[ ' + names[i] + coord + ']'
	                                    for i, coord in enumerate(coords)])
	                seq_prompt = '; '.join([gen_type, data_type, instance_size,
	                                        str(new_number), str(num_keypoint), objects])
	            else:
	                # "prompt type b": reuse the sequence, only bump the count.
	                seq_list = global_seq.split(';')
	                seq_list[3] = str(new_number)
	                seq_prompt = ';'.join(seq_list)

	    else:
	        # Previously this surfaced later as a NameError on seq_prompt.
	        raise ValueError(f"unsupported annotation type: {gen_type!r}")

	    print("input prompt: \n", seq_prompt)
	    with torch.no_grad():
	        sequence = gen_sequence(visorgpt_args, visorgpt_model, seq_prompt)
	    torch.cuda.empty_cache()

	    if not isinstance(sequence, list):
	        raise TypeError(f"gen_sequence returned {type(sequence).__name__}, expected list")

	    # Position of each annotation type's entry inside cond_json.
	    json_index = {'box': 0, 'mask': 1, 'key point': 2}[gen_type]

	    try:
	        cond_mask, cond_json = gen_cond_mask(sequence, True)
	        ori_sequence = cond_json[json_index]['sequences'][0][0] + '[SEP]'
	    except Exception:
	        # Retry with the second gen_cond_mask flag flipped.
	        # NOTE(review): the flag's exact meaning lives in utils.seq2coord - confirm.
	        cond_mask, cond_json = gen_cond_mask(sequence, False)
	        ori_sequence = cond_json[json_index]['sequences'][0][0] + '[SEP]'

	    ret_img = Image.fromarray(cond_mask)

	    return {
	        result_gallery: [ret_img],
	        raw_sequence: gr.update(value=ori_sequence, visible=True),
	        # Image synthesis is not offered for masks, so hide the button there.
	        images_button: gr.update(visible=gen_type != 'mask'),
	        text_container: cond_json,
	        sequence_container: ori_sequence,
	    }

	def generate_images(gen_type,
	                    num_samples,
	                    ddim_steps,
	                    object_prompt,
	                    seed,
	                    global_text,
	                    global_seq):
	    """Step 3 callback: synthesize images from the generated annotations.

	    Key points are rendered with ControlNet (``control_infer``) and boxes
	    with GLIGEN (``gligen_infer``). Masks are not supported and only produce
	    a message.

	    Args:
	        gen_type: Annotation type: 'key point', 'box' or 'mask'.
	        num_samples: Batch size passed to the image model.
	        ddim_steps: Number of sampling steps passed to the image model.
	        object_prompt: Context prompt; surrounding whitespace is stripped.
	        seed: Seed forwarded to the image model.
	        global_text: ``cond_json`` stored by the sequence step; index 0 is
	            read for boxes and index 2 for key points.
	        global_seq: Stored raw sequence, passed through unchanged.

	    Returns:
	        A dict mapping Gradio output components to their new values.
	    """
	    if gen_type == 'mask':
	        return {
	            raw_sequence: "sequence to mask is not supported yet :)",
	            text_container: global_text,
	            sequence_container: global_seq
	        }

	    # Nothing to render yet; previously this crashed when indexing None.
	    if not global_text:
	        return {
	            raw_sequence: gr.update(
	                value="Please run Step 1 to generate a sequence before synthesizing images :)",
	                visible=True)
	        }

	    # Default keeps the return well-defined if there is nothing to draw
	    # (previously an unbound-variable error).
	    ret_img = []

	    if gen_type == 'key point':
	        data = global_text[2]['keypoints']
	        # Index-based access mirrors the original; only the last item's
	        # images are kept, since ret_img is overwritten every iteration.
	        for item_idx in tqdm(range(len(data))):
	            item = data[item_idx]
	            # Each instance is a single-entry mapping; take its first value.
	            keypoint_list = [np.array(list(ins.items())[0][1]).tolist()
	                             for ins in item]

	            with torch.no_grad():
	                ret_img = control_infer(model=control_model,
	                                        ddim_sampler=ddim_sampler,
	                                        keypoint_list=keypoint_list,
	                                        prompt=object_prompt.strip(),
	                                        num_samples=num_samples,
	                                        ddim_steps=ddim_steps,
	                                        seed=seed)
	            torch.cuda.empty_cache()

	    elif gen_type == 'box':
	        data = global_text[0]['bboxes']

	        with torch.no_grad():
	            ret_img = gligen_infer(model=g_model,
	                                   autoencoder=g_autoencoder,
	                                   text_encoder=g_text_encoder,
	                                   diffusion=g_diffusion,
	                                   config=g_config,
	                                   grounding_tokenizer_input=g_grounding_tokenizer_input,
	                                   context_prompt=object_prompt.strip(),
	                                   bbox_lists=data,
	                                   ddim_steps=ddim_steps,
	                                   batch_size=num_samples,
	                                   seed=seed)
	        torch.cuda.empty_cache()

	    return {
	        result_gallery: ret_img,
	        text_container: global_text,
	        sequence_container: global_seq
	    }


	def object_name_inbox_fn(gen_type):
	    """Reconfigure the form when the annotation type dropdown changes.

	    Returns a dict of Gradio component updates for the selected type: the
	    category textbox, the data-type choices, the synthesis button caption,
	    the default step count, the context-prompt placeholder and the instance
	    slider. The stored sequence is cleared. Returns None for any other value.
	    """
	    # Per-type settings; only the values that differ between types live here.
	    settings = {
	        'key point': {
	            'name_box': dict(visible=False),
	            'choices': ['multiple instances'],
	            'button': 'Synthesize images using ControlNet',
	            'steps': 20,
	            'placeholder': 'in suit',
	            'max_instances': 16,
	        },
	        'box': {
	            'name_box': dict(visible=True, value='person; frisbee'),
	            'choices': ['multiple instances', 'object centric'],
	            'button': 'Synthesize images using GLIGEN',
	            'steps': 50,
	            'placeholder': 'man and frisbee',
	            'max_instances': 16,
	        },
	        'mask': {
	            'name_box': dict(visible=True,
	                             label="MS COCO categories to be generated (separated by semicolon)",
	                             value='bottle; cup'),
	            'choices': ['multiple instances'],
	            'button': 'Synthesize images using GLIGEN',
	            'steps': 50,
	            'placeholder': 'bottle and cup',
	            'max_instances': 8,
	        },
	    }

	    chosen = settings.get(gen_type)
	    if chosen is None:
	        return None

	    return {
	        object_name_inbox: gr.update(**chosen['name_box']),
	        data_type: gr.update(choices=chosen['choices']),
	        images_button: gr.update(value=chosen['button']),
	        ddim_steps: gr.update(value=chosen['steps']),
	        object_prompt: gr.update(placeholder=chosen['placeholder']),
	        num_instance: gr.update(visible=True, minimum=1,
	                                maximum=chosen['max_instances'], value=2, step=1),
	        # Drop the stored sequence: it belongs to the previous type.
	        sequence_container: None
	    }


	def instance_type_change_fn(data_type):
	    """Reconfigure the form when the data type dropdown changes.

	    For 'multiple instances' the continuous-generation controls and the
	    instance slider are shown; for 'object centric' they are hidden and the
	    instance count is set to 1. The category textbox label/value and the
	    context-prompt placeholder are switched accordingly. Returns None for
	    any other value.
	    """
	    if data_type not in ('multiple instances', 'object centric'):
	        return None

	    multi = data_type == 'multiple instances'

	    if multi:
	        name_box_update = gr.update(
	            label="MS COCO categories to be generated (separated by semicolon)",
	            value='person; frisbee')
	        prompt_hint = 'man and frisbee'
	        slider_update = gr.update(visible=True, minimum=1, maximum=16, value=2, step=1)
	    else:
	        name_box_update = gr.update(
	            label="ImageNet-1K categories to be generated",
	            value='great white shark')
	        prompt_hint = 'great white shark'
	        slider_update = gr.update(visible=False, value=1)

	    return {
	        # The three continuous-generation widgets share one visibility flag.
	        md_title: gr.update(visible=multi),
	        num_continuous_gen: gr.update(visible=multi),
	        continuous_btn: gr.update(visible=multi),
	        object_name_inbox: name_box_update,
	        object_prompt: gr.update(placeholder=prompt_hint),
	        num_instance: slider_update,
	    }

	# ---------------------------------------------------------------------------
	# UI definition: components, layout and event wiring.
	# Dropdowns use the top-level gr.Dropdown(value=...) API instead of the
	# deprecated gr.inputs namespace, matching the other components in this file.
	# ---------------------------------------------------------------------------
	block = gr.Blocks()
	with block:

	    # Per-session state: parsed annotations and the raw generated sequence.
	    text_container = gr.State()
	    sequence_container = gr.State()

	    #gr.Markdown('<div align=center> <img src="file/visorgpt_title_all.jpg" width = "100%" height = "100%" /> </div>')
	    description = """<p style="text-align: center; font-weight: bold;">
	<span style="font-size: 28px">VisorGPT: Learning Visual Prior via Generative Pre-Training</span>
	<br>
	<span style="font-size: 18px" id="paper-info">
	[<a href="https://sierkinhane.github.io/visor-gpt/" target="_blank">Project Page</a>]
	[<a href="https://arxiv.org/abs/2305.13777" target="_blank">Paper</a>]
	[<a href="https://github.com/Sierkinhane/VisorGPT" target="_blank">GitHub</a>]
	</span>
	</p>"""
	    gr.HTML(description)

	    with gr.Row():
	        # Left column: all input controls.
	        with gr.Column():

	            gr.Markdown("### Params to generate sequences")
	            gen_type = gr.Dropdown(choices=['key point', 'box', 'mask'], type='value', value='key point', label='Annotation Type')
	            data_type = gr.Dropdown(choices=['multiple instances'], type='value', value='multiple instances', label='Data Type')
	            instance_size = gr.Dropdown(choices=['small', 'medium', 'large'], type='value', value='large', label='Instance Size')
	            num_instance = gr.Slider(label="Number of instances per image", minimum=1, maximum=16, value=2, step=1)
	            object_name_inbox = gr.Textbox(label="MS COCO categories to be generated (separated by semicolon)", placeholder="person; frisbee", visible=False)
	            sequence_button = gr.Button(value="Step 1 - Customize sequential output")

	            md_title = gr.Markdown("### Continuous generation (Optional)")
	            num_continuous_gen = gr.Slider(label="Number of instances to be added", minimum=1, maximum=16, value=1, step=1)

	            continuous_btn = gr.Button(value="(Optional) Step 2 - Add instances to the current scene")

	            gr.Markdown("### Params to synthesize images")
	            object_prompt = gr.Textbox(label="Context Prompt", placeholder="in suit", visible=True)
	            num_samples = gr.Slider(label="Batch Size", minimum=1, maximum=2, value=1, step=1)
	            ddim_steps = gr.Slider(label="Steps", minimum=1, maximum=100, value=20, step=1)
	            seed = gr.Slider(label="Seed", minimum=-1, maximum=2147483647, step=1, randomize=True)
	            # Hidden until a key point / box sequence has been generated.
	            images_button = gr.Button(value="Step 3 - Synthesize images using ControlNet", visible=False)

	        # Right column: generated sequence text and image output.
	        with gr.Column():
	            raw_sequence = gr.Textbox(label="Raw Sequence", visible=False)
	            result_gallery = gr.Gallery(label='Output', show_label=False, elem_id="gallery").style(grid=2, height='auto', preview=True)

	    # Form reconfiguration when the annotation type or data type changes.
	    gen_type.change(object_name_inbox_fn, inputs=[gen_type],
	                    outputs=[object_name_inbox, data_type, images_button, ddim_steps, object_prompt, num_instance, sequence_container])

	    data_type.change(instance_type_change_fn, inputs=[data_type],
	                     outputs=[md_title, num_continuous_gen, continuous_btn, object_name_inbox, object_prompt, num_instance])

	    # Steps 1 and 2 update the same set of outputs.
	    generation_outputs = [result_gallery, raw_sequence, images_button, text_container, sequence_container]

	    sequence_button.click(fn=generate_sequence,
	                          inputs=[gen_type, data_type, instance_size, num_instance, object_name_inbox],
	                          outputs=generation_outputs)

	    continuous_btn.click(fn=add_contents,
	                         inputs=[gen_type, data_type, instance_size, num_instance, object_name_inbox,
	                                 num_continuous_gen, sequence_container],
	                         outputs=generation_outputs)

	    images_button.click(fn=generate_images,
	                        inputs=[gen_type, num_samples, ddim_steps, object_prompt, seed,
	                                text_container, sequence_container],
	                        outputs=[result_gallery, raw_sequence, text_container, sequence_container])


	# Handle one request at a time.
	block.queue(concurrency_count=1)
	block.launch()