# NOTE: captured from a Hugging Face Spaces page; at capture time the page
# header reported the Space status as "Runtime error".
import gradio as gr | |
import numpy as np | |
import torch | |
import re | |
from PIL import Image | |
from tqdm import tqdm | |
from train.scripts.generate_lm_multiple import gen_sequence, build_visorgpt | |
from utils.seq2coord import gen_cond_mask | |
from visor_gligen.gligen_inference_box import gligen_infer, build_gligen_model | |
from visor_controlnet.gradio_pose2image_v2 import control_infer, build_control_model, build_controlv11_model | |
# ---------------------------------------------------------------------------
# Model initialisation: every statement below runs once, at import time.
# ---------------------------------------------------------------------------

# VisorGPT: architecture config, checkpoint and vocabulary.
visorgpt_config_path = 'train/models/gpt2/config.json'
visorgpt_model_path = 'demo/ckpts/visorgpt/visorgpt_dagger_ta_tb.pt'
visorgpt_vocab_path = 'train/models/google_uncased_en_coord_vocab.txt'

# ControlNet (OpenPose) files. The previous checkpoint path is kept for reference:
# control_model_path = 'demo/ckpts/controlnet/control_sd15_openpose.pth'
control_model_path = 'demo/ckpts/controlnet/control_v11p_sd15_openpose.pth'  # v1.1
control_sd_path = 'demo/ckpts/controlnet/v1-5-pruned-emaonly.safetensors'
control_model_config = 'demo/ckpts/controlnet/cldm_v15.yaml'

# GLIGEN box-conditioned checkpoint.
gligen_model_path = 'demo/ckpts/gligen/diffusion_pytorch_model_box.bin'

visorgpt_args, visorgpt_model = build_visorgpt(
    model_config=visorgpt_config_path,
    model_path=visorgpt_model_path,
    vocab_path=visorgpt_vocab_path,
)

control_model, ddim_sampler = build_controlv11_model(
    model_path=control_model_path,
    sd_path=control_sd_path,
    config_path=control_model_config,
)

# build gligen model
(g_model, g_autoencoder, g_text_encoder, g_diffusion,
 g_config, g_grounding_tokenizer_input) = build_gligen_model(ckpt=gligen_model_path)

# Upper bounds on the number of instances requested per annotation type.
max_num_keypoint = 16
max_num_bbox = 16
max_num_mask = 8
def generate_sequence(gen_type,
                      data_type,
                      instance_size,
                      num_instance,
                      object_name_inbox):
    """Build a VisorGPT prompt from the UI settings and render the result.

    Args:
        gen_type: Annotation type: 'key point', 'box' or 'mask'.
        data_type: 'multiple instances' or 'object centric'.
        instance_size: Size keyword inserted verbatim into the prompt.
        num_instance: Requested number of instances; capped at
            ``max_num_keypoint`` / ``max_num_bbox`` / ``max_num_mask`` and
            forced to 1 for 'object centric'.
        object_name_inbox: Semicolon-separated category names (only used for
            'box' and 'mask'). A blank value falls back to a built-in example.

    Returns:
        A dict keyed by Gradio components. On success it carries the rendered
        condition image, the raw sequence text, the visibility of the image
        synthesis button (hidden for 'mask') and the two state values. If more
        category names than instances were given, only ``raw_sequence`` is
        updated with an error message.
    """
    ctn = True

    if gen_type == 'key point':
        num_keypoint = 18
        num_instance = min(num_instance, max_num_keypoint)
        seq_prompt = '; '.join([gen_type, data_type, instance_size,
                                str(num_instance), str(num_keypoint)]) + ' ; [person'
    elif gen_type in ('box', 'mask'):
        # Fall back to an example category list when the textbox is blank.
        if not object_name_inbox.strip():
            if gen_type == 'mask':
                object_name_inbox = "bottle; cup"
            elif data_type == 'object centric':
                object_name_inbox = "great white shark"
            else:
                object_name_inbox = "person; frisbee"

        num_keypoint = 0
        limit = max_num_mask if gen_type == 'mask' else max_num_bbox
        num_instance = min(num_instance, limit)
        if data_type == 'object centric':
            num_instance = 1

        objects = ', '.join(object_name_inbox.strip().split(";"))
        seq_prompt = '; '.join([gen_type, data_type, instance_size,
                                str(num_instance), str(num_keypoint)]) + '; ' + objects

        if len(object_name_inbox.split(';')) > num_instance:
            return {
                raw_sequence: gr.update(
                    value="The number of category names should not exceed the number of instances, please try again :)",
                    visible=True)
            }

    print("input prompt: \n", seq_prompt)
    sequence = gen_sequence(visorgpt_args, visorgpt_model, seq_prompt)
    assert isinstance(sequence, list)

    # Position of each annotation type inside the list returned by gen_cond_mask.
    slot = {'box': 0, 'mask': 1, 'key point': 2}[gen_type]

    def _decode(flag):
        mask, cond = gen_cond_mask(sequence, flag)
        return mask, cond, cond[slot]['sequences'][0][0] + '[SEP]'

    try:
        cond_mask, cond_json, ori_sequence = _decode(ctn)
    except Exception:
        # Retry once with the flag negated; a second failure propagates.
        cond_mask, cond_json, ori_sequence = _decode(not ctn)

    ret_img = Image.fromarray(cond_mask)
    return {
        result_gallery: [ret_img],
        raw_sequence: gr.update(value=ori_sequence, visible=True),
        # Image synthesis is not offered for masks.
        images_button: gr.update(visible=gen_type != 'mask'),
        text_container: cond_json,
        sequence_container: ori_sequence
    }
def add_contents(gen_type,
                 data_type,
                 instance_size,
                 num_instance,
                 object_name_inbox,
                 num_continuous_gen,
                 global_seq):
    """Extend the previously generated sequence with additional instances.

    Args:
        gen_type: Annotation type: 'key point', 'box' or 'mask'.
        data_type: 'multiple instances' or 'object centric'.
        instance_size: Size keyword inserted verbatim into the prompt.
        num_instance: Requested number of instances (forced to 1 for
            'object centric' with 'box'/'mask').
        object_name_inbox: Semicolon-separated category names ('box'/'mask').
        num_continuous_gen: Number of instances to add to the current scene.
        global_seq: Raw sequence stored by the previous generation step, or
            None when nothing has been generated yet.

    Returns:
        A dict keyed by Gradio components. On success it carries the rendered
        condition image, the raw sequence text, the visibility of the image
        synthesis button (hidden for 'mask') and the two state values. On a
        validation failure only ``raw_sequence`` is updated with a message.
    """
    ctn = True

    if gen_type == 'key point':
        num_keypoint = 18
        seq_prompt = '; '.join([gen_type, data_type, instance_size,
                                str(num_instance), str(num_keypoint)]) + ' ; [person'
    elif gen_type in ('box', 'mask'):
        num_keypoint = 0
        if data_type == 'object centric':
            num_instance = 1
        objects = ', '.join(object_name_inbox.strip().split(";"))
        seq_prompt = '; '.join([gen_type, data_type, instance_size,
                                str(num_instance), str(num_keypoint)]) + '; ' + objects
        if len(object_name_inbox.split(';')) > num_instance:
            return {
                raw_sequence: gr.update(
                    value="The number of category names should not exceed the number of instances, please try again :)",
                    visible=True)
            }

    if num_continuous_gen:
        # The stored sequence is empty until a first generation has run and is
        # reset to None whenever the annotation type changes.
        if not global_seq:
            return {
                raw_sequence: gr.update(
                    value="Please generate a sequence first (Step 1) before adding instances :)",
                    visible=True)
            }
        seq_prompt = _continuation_prompt(gen_type, data_type, instance_size,
                                          num_keypoint, num_continuous_gen, global_seq)

    print("input prompt: \n", seq_prompt)
    with torch.no_grad():
        sequence = gen_sequence(visorgpt_args, visorgpt_model, seq_prompt)
    torch.cuda.empty_cache()
    assert isinstance(sequence, list)

    # Position of each annotation type inside the list returned by gen_cond_mask.
    slot = {'box': 0, 'mask': 1, 'key point': 2}[gen_type]

    def _decode(flag):
        mask, cond = gen_cond_mask(sequence, flag)
        return mask, cond, cond[slot]['sequences'][0][0] + '[SEP]'

    try:
        cond_mask, cond_json, ori_sequence = _decode(ctn)
    except Exception:
        # Retry once with the flag negated; a second failure propagates.
        cond_mask, cond_json, ori_sequence = _decode(not ctn)

    ret_img = Image.fromarray(cond_mask)
    return {
        result_gallery: [ret_img],
        raw_sequence: gr.update(value=ori_sequence, visible=True),
        # Image synthesis is not offered for masks.
        images_button: gr.update(visible=gen_type != 'mask'),
        text_container: cond_json,
        sequence_container: ori_sequence
    }


def _continuation_prompt(gen_type, data_type, instance_size, num_keypoint,
                         num_continuous_gen, global_seq):
    """Return the prompt that asks the model to continue ``global_seq``.

    The instance count (field 3 of the ';'-separated sequence) is raised by
    ``num_continuous_gen`` and capped at the limit for ``gen_type``.
    """
    fields = global_seq.split(';')
    limit = {'key point': max_num_keypoint,
             'box': max_num_bbox,
             'mask': max_num_mask}[gen_type]
    new_number = min(int(fields[3].strip()) + num_continuous_gen, limit)

    # "prompt type a": field 5 holds plain category names (no '[');
    # "prompt type b": field 5 already starts the bracketed instances.
    is_type_a = fields[5].find('[') == -1
    cleaned = global_seq.replace('[CLS]', '').replace('[SEP]', '')

    if not is_type_a:
        seq_list = cleaned.split(';')
        seq_list[3] = str(new_number)
        return ';'.join(seq_list)

    bracketed = re.findall(re.compile(r'[\[](.*?)[]]', re.S), cleaned)
    if gen_type == 'key point':
        objects = ' '.join(['[ person' + body + ']' for body in bracketed])
    else:
        # Pair each bracketed coordinate group with its category name.
        names = cleaned.split(';')[5].split(',')
        objects = ' '.join(['[ ' + names[i] + coords + ']'
                            for i, coords in enumerate(bracketed)])
    return '; '.join([gen_type, data_type, instance_size,
                      str(new_number), str(num_keypoint), objects])
def generate_images(gen_type,
                    num_samples,
                    ddim_steps,
                    object_prompt,
                    seed,
                    global_text,
                    global_seq):
    """Synthesize images from the stored layout with ControlNet or GLIGEN.

    Args:
        gen_type: Annotation type: 'key point' (ControlNet), 'box' (GLIGEN)
            or 'mask' (not supported).
        num_samples: Number of images requested per inference call.
        ddim_steps: Number of sampling steps passed to the backend.
        object_prompt: Context prompt; surrounding whitespace is stripped.
        seed: Seed passed to the backend.
        global_text: Stored layout data; read as ``[2]['keypoints']`` for
            'key point' and ``[0]['bboxes']`` for 'box'.
        global_seq: Stored raw sequence, passed through unchanged.

    Returns:
        A dict keyed by Gradio components: the gallery images plus the
        unchanged state values, or a message in ``raw_sequence`` for 'mask'.
    """
    if gen_type == 'mask':
        return {
            raw_sequence: "sequence to mask is not supported yet :)",
            text_container: global_text,
            sequence_container: global_seq
        }

    # Default to an empty gallery so an empty layout does not leave the
    # result unbound.
    ret_img = []

    if gen_type == 'key point':
        data = global_text[2]['keypoints']
        for idx in tqdm(range(len(data))):
            # Each instance is a single-entry mapping; its value holds the key points.
            keypoint_list = [np.array(list(ins.items())[0][1]).tolist()
                             for ins in data[idx]]
            with torch.no_grad():
                ret_img = control_infer(model=control_model,
                                        ddim_sampler=ddim_sampler,
                                        keypoint_list=keypoint_list,
                                        prompt=object_prompt.strip(),
                                        num_samples=num_samples,
                                        ddim_steps=ddim_steps,
                                        seed=seed)
            torch.cuda.empty_cache()
    elif gen_type == 'box':
        data = global_text[0]['bboxes']
        with torch.no_grad():
            ret_img = gligen_infer(model=g_model,
                                   autoencoder=g_autoencoder,
                                   text_encoder=g_text_encoder,
                                   diffusion=g_diffusion,
                                   config=g_config,
                                   grounding_tokenizer_input=g_grounding_tokenizer_input,
                                   context_prompt=object_prompt.strip(),
                                   bbox_lists=data,
                                   ddim_steps=ddim_steps,
                                   batch_size=num_samples,
                                   seed=seed)
        torch.cuda.empty_cache()

    return {
        result_gallery: ret_img,
        text_container: global_text,
        sequence_container: global_seq
    }
def object_name_inbox_fn(gen_type):
    """Reconfigure the form when the annotation type dropdown changes.

    Returns a dict of component updates for 'key point', 'box' and 'mask',
    and None for any other value. The stored sequence is always cleared.
    """
    coco_label = "MS COCO categories to be generated (separated by semicolon)"

    if gen_type == 'key point':
        inbox_update = gr.update(visible=False)
        type_choices = ['multiple instances']
        button_text = 'Synthesize images using ControlNet'
        steps = 20
        prompt_hint = 'in suit'
        instance_cap = 16
    elif gen_type == 'box':
        inbox_update = gr.update(visible=True, value='person; frisbee')
        type_choices = ['multiple instances', 'object centric']
        button_text = 'Synthesize images using GLIGEN'
        steps = 50
        prompt_hint = 'man and frisbee'
        instance_cap = 16
    elif gen_type == 'mask':
        inbox_update = gr.update(visible=True, label=coco_label, value='bottle; cup')
        type_choices = ['multiple instances']
        button_text = 'Synthesize images using GLIGEN'
        steps = 50
        prompt_hint = 'bottle and cup'
        instance_cap = 8
    else:
        return None

    return {
        object_name_inbox: inbox_update,
        data_type: gr.update(choices=type_choices),
        images_button: gr.update(value=button_text),
        ddim_steps: gr.update(value=steps),
        object_prompt: gr.update(placeholder=prompt_hint),
        num_instance: gr.update(visible=True, minimum=1, maximum=instance_cap, value=2, step=1),
        sequence_container: None,
    }
def instance_type_change_fn(data_type):
    """Show or hide the continuous-generation controls for the data type.

    'multiple instances' shows the controls and restores the instance slider;
    'object centric' hides them and pins the slider to 1. Any other value
    returns None.
    """
    multi = data_type == 'multiple instances'
    if not multi and data_type != 'object centric':
        return None

    if multi:
        inbox_update = gr.update(
            label="MS COCO categories to be generated (separated by semicolon)",
            value='person; frisbee')
        prompt_hint = 'man and frisbee'
        slider_update = gr.update(visible=True, minimum=1, maximum=16, value=2, step=1)
    else:
        inbox_update = gr.update(label="ImageNet-1K categories to be generated",
                                 value='great white shark')
        prompt_hint = 'great white shark'
        slider_update = gr.update(visible=False, value=1)

    return {
        md_title: gr.update(visible=multi),
        num_continuous_gen: gr.update(visible=multi),
        continuous_btn: gr.update(visible=multi),
        object_name_inbox: inbox_update,
        object_prompt: gr.update(placeholder=prompt_hint),
        num_instance: slider_update,
    }
# ---------------------------------------------------------------------------
# UI layout and event wiring.
# NOTE(review): gr.inputs.Dropdown, Gallery.style(...) and
# queue(concurrency_count=...) look like Gradio 3.x-era APIs - confirm the
# pinned Gradio version still provides them.
# ---------------------------------------------------------------------------
block = gr.Blocks()
with block:
    # Per-session state shared between the three steps.
    text_container = gr.State()
    sequence_container = gr.State()

    #gr.Markdown('<div align=center> <img src="file/visorgpt_title_all.jpg" width = "100%" height = "100%" /> </div>')
    description = """<p style="text-align: center; font-weight: bold;">
<span style="font-size: 28px">VisorGPT: Learning Visual Prior via Generative Pre-Training</span>
<br>
<span style="font-size: 18px" id="paper-info">
[<a href="https://sierkinhane.github.io/visor-gpt/" target="_blank">Project Page</a>]
[<a href="https://arxiv.org/abs/2305.13777" target="_blank">Paper</a>]
[<a href="https://github.com/Sierkinhane/VisorGPT" target="_blank">GitHub</a>]
</span>
</p>"""
    gr.HTML(description)

    with gr.Row():
        # Left column: all input controls.
        with gr.Column():
            gr.Markdown("### Params to generate sequences")
            gen_type = gr.inputs.Dropdown(
                choices=['key point', 'box', 'mask'],
                type='value', default='key point', label='Annotation Type')
            data_type = gr.inputs.Dropdown(
                choices=['multiple instances'],
                type='value', default='multiple instances', label='Data Type')
            instance_size = gr.inputs.Dropdown(
                choices=['small', 'medium', 'large'],
                type='value', default='large', label='Instance Size')
            num_instance = gr.Slider(
                label="Number of instances per image",
                minimum=1, maximum=16, value=2, step=1)
            object_name_inbox = gr.Textbox(
                label="MS COCO categories to be generated (separated by semicolon)",
                placeholder="person; frisbee", visible=False)
            sequence_button = gr.Button(value="Step 1 - Customize sequential output")

            md_title = gr.Markdown("### Continuous generation (Optional)")
            num_continuous_gen = gr.Slider(
                label="Number of instances to be added",
                minimum=1, maximum=16, value=1, step=1)
            continuous_btn = gr.Button(
                value="(Optional) Step 2 - Add instances to the current scene")

            gr.Markdown("### Params to synthesize images")
            object_prompt = gr.Textbox(
                label="Context Prompt", placeholder="in suit", visible=True)
            num_samples = gr.Slider(
                label="Batch Size", minimum=1, maximum=2, value=1, step=1)
            ddim_steps = gr.Slider(
                label="Steps", minimum=1, maximum=100, value=20, step=1)
            seed = gr.Slider(
                label="Seed", minimum=-1, maximum=2147483647, step=1, randomize=True)
            images_button = gr.Button(
                value="Step 3 - Synthesize images using ControlNet", visible=False)

        # Right column: generated sequence text and images.
        with gr.Column():
            raw_sequence = gr.Textbox(label="Raw Sequence", visible=False)
            result_gallery = gr.Gallery(
                label='Output', show_label=False, elem_id="gallery",
            ).style(grid=2, height='auto', preview=True)

    # Form reconfiguration when the dropdowns change.
    gen_type.change(
        object_name_inbox_fn,
        inputs=[gen_type],
        outputs=[object_name_inbox, data_type, images_button, ddim_steps,
                 object_prompt, num_instance, sequence_container])
    data_type.change(
        instance_type_change_fn,
        inputs=[data_type],
        outputs=[md_title, num_continuous_gen, continuous_btn,
                 object_name_inbox, object_prompt, num_instance])

    # Step 1: generate a sequence from scratch.
    sequence_button.click(
        fn=generate_sequence,
        inputs=[gen_type, data_type, instance_size, num_instance, object_name_inbox],
        outputs=[result_gallery, raw_sequence, images_button,
                 text_container, sequence_container])

    # Step 2 (optional): add instances to the stored sequence.
    continuous_btn.click(
        fn=add_contents,
        inputs=[gen_type, data_type, instance_size, num_instance, object_name_inbox,
                num_continuous_gen, sequence_container],
        outputs=[result_gallery, raw_sequence, images_button,
                 text_container, sequence_container])

    # Step 3: synthesize images from the stored layout.
    images_button.click(
        fn=generate_images,
        inputs=[gen_type, num_samples, ddim_steps, object_prompt, seed,
                text_container, sequence_container],
        outputs=[result_gallery, raw_sequence, text_container, sequence_container])

block.queue(concurrency_count=1)
block.launch()