Spaces:
Running
Running
import seaborn as sns | |
from PIL import Image, ImageDraw, ImageFont | |
import matplotlib.font_manager | |
import spacy | |
import re | |
# Shared spaCy English pipeline, loaded once at import time; text_to_dict uses
# its noun chunks to locate the phrase each bounding box refers to.
nlp = spacy.load("en_core_web_sm-3.6.0")
def _load_label_font(size):
    """Return the first loadable system TrueType font, else Pillow's default."""
    font_paths = sorted(matplotlib.font_manager.findSystemFonts(fontpaths=None, fontext='ttf'))
    for font_path in font_paths:
        try:
            return ImageFont.truetype(font_path, size=size)
        except OSError:
            # Unreadable or corrupt font file: try the next candidate.
            continue
    # No usable TrueType font (e.g. a minimal container image): fall back to
    # the font bundled with Pillow instead of failing with IndexError.
    return ImageFont.load_default()


def _to_pixel_box(box, width, height):
    """Scale a fractional (x0, y0, x1, y1) box to pixels with ordered corners."""
    # ImageDraw.rectangle rejects x1 < x0 / y1 < y0 in recent Pillow releases,
    # so order each axis before drawing.
    x0, x1 = sorted((int(box[0] * width), int(box[2] * width)))
    y0, y1 = sorted((int(box[1] * height), int(box[3] * height)))
    return (x0, y0, x1, y1)


def draw_boxes(image, boxes, texts, output_fn='output.png'):
    """Draw labelled bounding boxes over ``image`` and save the result.

    Args:
        image: PIL image to annotate. It is not modified; drawing happens on a
            transparent overlay that is composited onto a copy.
        boxes: One entry per label. Each entry is a list of boxes, and each
            box holds ``x0, y0, x1, y1`` as fractions of the image size.
        texts: Label for each entry of ``boxes``. A falsy label draws the
            outline only; newlines split a label over several lines.
        output_fn: Path the annotated RGB image is written to.
    """
    box_width = 5
    palette = sns.color_palette("husl", len(boxes))
    colors = [(int(r * 255), int(g * 255), int(b * 255)) for r, g, b in palette]

    width, height = image.size
    absolute_boxes = [
        [_to_pixel_box(box, width, height) for box in group] for group in boxes
    ]

    overlay = Image.new('RGBA', image.size, (255, 255, 255, 0))
    draw = ImageDraw.Draw(overlay)
    font = _load_label_font(26)

    for group, text, color in zip(absolute_boxes, texts, colors):
        # The label is the same for every box of the group: split it once.
        label_lines = text.split('\n') if text else []
        for b in group:
            draw.rectangle(b, outline=color, width=box_width)
            if not label_lines:
                continue
            # getbbox returns (left, top, right, bottom); the last two values
            # are used as the rendered width and height of the line.
            _, text_height = font.getbbox(label_lines[0])[-2:]
            # Default: stack the label inside the box, above its bottom edge.
            y_start = b[3] - text_height * len(label_lines) - box_width
            # Small boxes cannot hold the label, so start it at the bottom edge.
            if b[2] - b[0] < 100 or b[3] - b[1] < 100:
                y_start = b[3]
            for i, line in enumerate(label_lines):
                text_width, text_height = font.getbbox(line)[-2:]
                x = b[0] + box_width
                y = y_start + text_height * i
                # Semi-transparent grey backdrop keeps white text readable.
                draw.rectangle([x, y, x + text_width, y + text_height], fill=(128, 128, 128, 160))
                draw.text((x, y), line, font=font, fill=(255, 255, 255))

    img_with_overlay = Image.alpha_composite(image.convert('RGBA'), overlay).convert('RGB')
    img_with_overlay.save(output_fn)
def boxstr_to_boxes(box_str):
    """Parse a ``;``-separated list of boxes into fractional coordinates.

    Each box is a ``,``-separated list of non-negative decimal integers on a
    0-1000 scale (e.g. ``"100,200,300,400;0,0,500,500"``). Every value is
    divided by 1000. Whitespace around a value is ignored.

    Segments containing anything other than decimal integers (including empty
    components such as ``"1,,2"``) are skipped rather than raising.

    Args:
        box_str: Text of one box group, without the surrounding brackets.

    Returns:
        A list of boxes, each a list of floats in the order given.
    """
    boxes = []
    for segment in box_str.split(';'):
        parts = [part.strip() for part in segment.split(',')]
        # Validate each component on its own: testing the comma-stripped
        # segment as a whole lets empty components and non-ASCII digits
        # through, and int() then raises ValueError on them.
        if all(part.isascii() and part.isdigit() for part in parts):
            boxes.append([int(part) / 1000 for part in parts])
    return boxes
def text_to_dict(text):
    """Map each grounded phrase in ``text`` to the boxes that follow it.

    A grounding is a ``[[...]]`` span. Its phrase is the text running from the
    start of the right-most noun chunk that ends at or before the span (or
    from the start of ``text`` when there is none) up to the span. If that
    phrase ends with ``?``, everything before the span is used instead.

    Args:
        text: Response text containing zero or more ``[[...]]`` spans.

    Returns:
        A dict from lower-cased phrase to the list of boxes parsed by
        ``boxstr_to_boxes``. A phrase grounded several times accumulates the
        boxes of every occurrence instead of keeping only the last one.
    """
    doc = nlp(text)
    # Noun chunks do not depend on the match, so collect their spans once
    # rather than re-scanning the document for every box.
    chunk_spans = [(chunk.start_char, chunk.end_char) for chunk in doc.noun_chunks]

    phrase_to_boxes = {}
    for match in re.finditer(r'\[\[([^\]]+)\]\]', text):
        box_position = match.start()
        nearest_np_start = max(
            (start for start, end in chunk_spans if end <= box_position),
            default=0,
        )
        noun_phrase = text[nearest_np_start:box_position].strip()
        if noun_phrase.endswith('?'):
            noun_phrase = text[:box_position].strip()
        # Merge repeated phrases: overwriting would silently drop the boxes
        # of earlier occurrences.
        phrase_to_boxes.setdefault(noun_phrase.lower(), []).extend(
            boxstr_to_boxes(match.group(1))
        )
    return phrase_to_boxes
def parse_response(img, response, output_fn='output.png'):
    """Draw the boxes grounded in ``response`` onto ``img`` and save the result.

    The image is converted to RGB and rescaled, preserving aspect ratio, to
    the largest size that fits in 1920x1080 (smaller images are enlarged).
    Phrases and boxes are extracted with ``text_to_dict`` and rendered with
    ``draw_boxes``. With no groundings the rescaled image is saved as is.

    Args:
        img: Source PIL image.
        response: Text containing ``[[...]]`` box annotations.
        output_fn: Path the annotated image is written to.
    """
    img = img.convert('RGB')
    width, height = img.size
    ratio = min(1920 / width, 1080 / height)
    # Clamp to 1 px so extreme aspect ratios do not round a side down to 0.
    new_width = max(1, int(width * ratio))
    new_height = max(1, int(height * ratio))
    new_img = img.resize((new_width, new_height), Image.LANCZOS)

    dic = text_to_dict(response)
    if dic:
        texts, boxes = zip(*dic.items())
    else:
        texts, boxes = [], []
    draw_boxes(new_img, boxes, texts, output_fn=output_fn)