CogVLM / utils.py
lykeven's picture
add grounding
6accf0d
import seaborn as sns
from PIL import Image, ImageDraw, ImageFont
import matplotlib.font_manager
import spacy
import re
nlp = spacy.load("en_core_web_sm-3.6.0")
def draw_boxes(image, boxes, texts, output_fn='output.png'):
box_width = 5
color_palette = sns.color_palette("husl", len(boxes))
colors = [(int(r*255), int(g*255), int(b*255)) for r, g, b in color_palette]
width, height = image.size
absolute_boxes = [[(int(box[0] * width), int(box[1] * height), int(box[2] * width), int(box[3] * height)) for box in b] for b in boxes]
overlay = Image.new('RGBA', image.size, (255, 255, 255, 0))
draw = ImageDraw.Draw(overlay)
font_path = sorted(matplotlib.font_manager.findSystemFonts(fontpaths=None, fontext='ttf'))[0]
font = ImageFont.truetype(font_path, size=26)
for box, text, color in zip(absolute_boxes, texts, colors):
for b in box:
draw.rectangle(b, outline=color, width=box_width)
if not text:
continue
splited_text = text.split('\n')
num_lines = len(splited_text)
text_width, text_height = font.getbbox(splited_text[0])[-2:]
y_start = b[3] - text_height * num_lines - box_width
if b[2] - b[0] < 100 or b[3] - b[1] < 100:
y_start = b[3]
for i, line in enumerate(splited_text):
text_width, text_height = font.getbbox(line)[-2:]
x = b[0] + box_width
y = y_start + text_height * i
draw.rectangle([x, y, x+text_width, y+text_height], fill=(128, 128, 128, 160))
draw.text((x, y), line, font=font, fill=(255, 255, 255))
img_with_overlay = Image.alpha_composite(image.convert('RGBA'), overlay).convert('RGB')
img_with_overlay.save(output_fn)
def boxstr_to_boxes(box_str):
boxes = [[int(y)/1000 for y in x.split(',')] for x in box_str.split(';') if x.replace(',', '').isdigit()]
return boxes
def text_to_dict(text):
doc = nlp(text)
box_matches = list(re.finditer(r'\[\[([^\]]+)\]\]', text))
box_positions = [match.start() for match in box_matches]
noun_phrases = []
boxes = []
for match, box_position in zip(box_matches, box_positions):
nearest_np_start = max([0] + [chunk.start_char for chunk in doc.noun_chunks if chunk.end_char <= box_position])
noun_phrase = text[nearest_np_start:box_position].strip()
if noun_phrase and noun_phrase[-1] == '?':
noun_phrase = text[:box_position].strip()
box_string = match.group(1)
noun_phrases.append(noun_phrase)
boxes.append(boxstr_to_boxes(box_string))
pairs = []
for noun_phrase, box_string in zip(noun_phrases, boxes):
pairs.append((noun_phrase.lower(), box_string))
return dict(pairs)
def parse_response(img, response, output_fn='output.png'):
img = img.convert('RGB')
width, height = img.size
ratio = min(1920 / width, 1080 / height)
new_width = int(width * ratio)
new_height = int(height * ratio)
new_img = img.resize((new_width, new_height), Image.LANCZOS)
pattern = r"\[\[(.*?)\]\]"
positions = re.findall(pattern, response)
boxes = [[[int(y) for y in x.split(',')] for x in pos.split(';') if x.replace(',', '').isdigit()] for pos in positions]
dic = text_to_dict(response)
if not dic:
texts = []
boxes = []
else:
texts, boxes = zip(*dic.items())
draw_boxes(new_img, boxes, texts, output_fn=output_fn)