import logging
import numpy as np
import os
from collections import OrderedDict
from detectron2.config import global_cfg as cfg
import torch
from fvcore.common.file_io import PathManager
from detectron2.structures.boxes import pairwise_iou
from detectron2.utils.comm import all_gather, is_main_process, synchronize
import pickle
from .evaluator import DatasetEvaluator
import json
from detectron2.structures import Boxes
import html
import ftfy
import regex as re
PATTN = re.compile(r"""<\|startoftext\|>|<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+""", re.IGNORECASE)
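# Assumption: PATTN and the two cleaning helpers below mirror the pre-processing of
# CLIP's BPE tokenizer, so that phrase words from the Flickr30k annotations can be
# aligned with the token stream the model was fed.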
def basic_clean(text):
text = ftfy.fix_text(text)
text = html.unescape(html.unescape(text))
return text.strip()
def whitespace_clean(text):
text = re.sub(r'\s+', ' ', text)
text = text.strip()
return text
class FLICKR30KEvaluator(DatasetEvaluator):
"""
Evaluate semantic segmentation
"""
def __init__(self, dataset_name, distributed=True, output_dir=None):
"""
Args:
dataset_name (str): name of the dataset to be evaluated.
distributed (True): if True, will collect results from all ranks for evaluation.
Otherwise, will evaluate the results in the current process.
num_classes (int): number of classes
ignore_label (int): value in semantic segmentation ground truth. Predictions for the
corresponding pixels should be ignored.
output_dir (str): an output directory to dump results.
"""
self._dataset_name = dataset_name
self._distributed = distributed
self._output_dir = output_dir
self._cpu_device = torch.device("cpu")
self._logger = logging.getLogger(__name__)
self.gt_boxes = json.load(open("/home/v-yiwuzhong/projects/azureblobs/vyiwuzhong_phillytools/flickr30k_processed/bounding_boxes_test.json"))
self.gt_sents = json.load(open("/home/v-yiwuzhong/projects/azureblobs/vyiwuzhong_phillytools/flickr30k_processed/sentences_test.json"))
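        # Expected annotation format (inferred from how the fields are used below):
        #   self.gt_boxes[img_id] -> {'height': int, 'width': int,
        #                             'scene': [phrase_id, ...],
        #                             'boxes': {phrase_id: [[x1, y1, x2, y2], ...]}}
        #   self.gt_sents[img_id] -> [{'phrases': [{'phrase': str, 'phrase_id': str,
        #                             'phrase_type': [str, ...]}, ...]}, ...]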
def reset(self):
self._predictions = {}
def process(self, inputs, outputs):
"""
Args:
inputs: the inputs to a model.
It is a list of dicts. Each dict corresponds to an image and
contains keys like "height", "width", "file_name", "image_id".
outputs: the outputs of a model. It is either list of semantic segmentation predictions
(Tensor [H, W]) or list of dicts with key "sem_seg" that contains semantic
segmentation prediction in the same format.
"""
assert len(inputs) == 1 # batch = 1 during inference
dataset_name, img_id, (img_height, img_width), all_str2id_links = inputs[0][-1]
img_id = img_id.split('/')[-1]
match_scores, processed_results = outputs
match_scores = match_scores.to(self._cpu_device)
pred_boxes = processed_results[0]['instances'].proposal_boxes.to(self._cpu_device)
self._predictions.update({img_id: [img_height, img_width, all_str2id_links, match_scores, pred_boxes]})
def merge_gt_boxes(self, box_anno):
gt_boxes = []
phrase_ids = []
scene_box_ids = box_anno['scene']
for k, v in box_anno['boxes'].items():
            if k in scene_box_ids:  # important: skip scene boxes, otherwise the per-phrase-type counts do not match the paper
continue
phrase_ids.append(k)
if len(v) == 1:
gt_boxes.append(v[0])
else:
                # when a phrase corresponds to multiple regions, take the union of the regions as the ground-truth box (as in the paper)
v = np.array(v)
box = [v[:, 0].min(), v[:, 1].min(), v[:, 2].max(), v[:, 3].max()]
gt_boxes.append(box)
gt_boxes = np.array(gt_boxes)
return phrase_ids, gt_boxes
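    # Illustrative example for merge_gt_boxes: a phrase annotated with regions
    # [10, 20, 50, 60] and [30, 10, 80, 70] is reduced to their union [10, 10, 80, 70],
    # i.e. [min x1, min y1, max x2, max y2].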
def find_ground_box(self, match_scores, all_str2id_links, sentences, gt_phrase_ids):
""" Given matching matrix between region feats and token feats, find the box that grounds a phrase
"""
num_box = match_scores.size(0)
num_cap = int(match_scores.size(1) / 77)
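        # Assumption: 77 is CLIP's fixed context length per caption, so the columns of
        # match_scores are laid out as num_cap consecutive blocks of 77 token positions.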
all_phrase_score = []
all_phrase_ids = []
for i in range(num_cap): # per sentence
this_score = match_scores[:, i*77:(i+1)*77] # [#boxes, 77]
input_ids = [iitem for item in all_str2id_links[i] for iitem in item[1]]
input_tokens = [item[0] for item in all_str2id_links[i]]
phrases = sentences[i]['phrases']
for j, phrase in enumerate(phrases): # per phrase
if phrase['phrase_id'] not in gt_phrase_ids: # no gt box for this phrase, skip
continue
# locate the word
words = whitespace_clean(basic_clean(phrase['phrase'])).lower() # phrase['phrase'].lower().replace("-"," ").split()
words = re.findall(PATTN, words)
first_word_index = None # phrase['first_word_index']
for idx in range(len(input_tokens) - len(words) + 1): # search start word of this phrase
if input_tokens[idx : idx + len(words)] == words: # NOTE: key step for alignment btw model prediction and annotation
first_word_index = idx
break
if first_word_index is None:
print("Fail to find phrase [{}] in input tokens [{}]".format(words, input_tokens))
start_wd_ind = first_word_index
end_wd_ind = first_word_index + len(words)
if len(words) != len(phrase['phrase'].split()):
pass # print('tokens: {} <--> phrase: {}'.format(words, phrase['phrase']))
# locate the token
start_tk_ind = 0
for k_i, k in enumerate(range(0, start_wd_ind)):
start_tk_ind += len(all_str2id_links[i][k][1])
token_cnt = 0
for k_i, k in enumerate(range(start_wd_ind, end_wd_ind)):
if all_str2id_links[i][k][0] != words[k_i]:
print("Word not matched: {} in model output but {} in annotation".format(all_str2id_links[i][k][0], words[k_i]))
else:
token_cnt += len(all_str2id_links[i][k][1]) # ith sentence, kth word, and its tokens
end_tk_ind = start_tk_ind + token_cnt
# sanity check
phrase_ids1 = [iitem for item in all_str2id_links[i][start_wd_ind:end_wd_ind] for iitem in item[1]] # way 1: use word index to accumulate token ids in a phrase
phrase_ids2 = input_ids[start_tk_ind:end_tk_ind] # way 2: use token index to directly index token ids in a phrase
if phrase_ids1 != phrase_ids2:
print("Santity check: {} from word {} in token".format(phrase_ids1, phrase_ids2))
# index similarity score
phrase_score = this_score[:, start_tk_ind:end_tk_ind]
phrase_score = phrase_score.mean(dim=1) # phrase_score.max(dim=1)[0] #
all_phrase_score.append(phrase_score)
all_phrase_ids.append(phrase['phrase_id'])
phrase_score_tensor = torch.cat(all_phrase_score)
phrase_score_tensor = phrase_score_tensor.view(len(all_phrase_ids), num_box) # NOTE: this should be [#phrases, #object proposals]
return phrase_score_tensor, all_phrase_ids
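    # Worked example of the alignment above (illustrative values): for the phrase
    # "a red car", words = ['a', 'red', 'car'] is searched as a contiguous span in
    # input_tokens. If that span starts at word index 4 and the three words tokenize
    # into 1 + 1 + 2 BPE ids, the phrase covers the token columns starting at the sum
    # of the token counts of words 0..3 and spanning 4 positions, and its score per
    # proposal is the mean of match_scores over that column slice.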
def evaluate(self):
"""
Evaluates Referring Segmentation IoU:
"""
if self._distributed:
synchronize()
self._predictions = all_gather(self._predictions)
if not is_main_process():
return
all_prediction = {}
for p in self._predictions:
all_prediction.update(p)
else:
all_prediction = self._predictions
if len(all_prediction) < 30: # resume inference results
save_path = "/home/v-yiwuzhong/projects/azureblobs/vyiwuzhong_phillytools/flickr30k_processed/grounding_results/grounding_{}_imgs.npy".format(1000)
all_prediction = np.load(save_path, allow_pickle=True).tolist()
self._logger.info('Resume from {}'.format(save_path))
else: # new run
save_path = "/home/v-yiwuzhong/projects/azureblobs/vyiwuzhong_phillytools/flickr30k_processed/grounding_results/grounding_{}_imgs.npy".format(len(all_prediction))
np.save(save_path, all_prediction)
self._logger.info('Save results to {}'.format(save_path))
self._logger.info('Got {} images!'.format(len(all_prediction)))
image_unique_ids = list(all_prediction.keys())
image_evaled = []
total_num = 0
recall_num = 0
num_type = {}
recall_type = {}
acc_type = {}
recall_topk_num = {5:0, 10:0}
point_recall_num = 0
EVAL_THRESH = 0.5
type_cnts = {}
for img_sent_id in image_unique_ids:
if img_sent_id not in self.gt_boxes:
continue
else:
image_evaled.append(img_sent_id)
# results from model
result = all_prediction[img_sent_id]
phrase_ids = None
phrase_types = [] # phrase type: each phrase belongs to a coarse object concept
pred_boxes = None # an object proposal selected by model for each phrase
img_height, img_width, all_str2id_links = result[0], result[1], result[2] # all_str2id_links: each word and its tokenized ids
match_scores = result[3] # matching score [#object proposals, #tokens]
precomp_boxes = result[4] # object proposals from offline module
# annotation from dataset
sentences = self.gt_sents[img_sent_id]
box_anno = self.gt_boxes[img_sent_id]
# sanity check and box merging
            assert box_anno['height'] == img_height and box_anno['width'] == img_width
gt_phrase_ids, gt_boxes = self.merge_gt_boxes(box_anno) # merged if multiple boxes for the same phrase
if len(gt_phrase_ids) == 0: # no gt box for this image
continue
for sent_item in sentences:
for phrase_item in sent_item['phrases']:
if phrase_item['phrase_id'] in gt_phrase_ids:
phrase_types.append(phrase_item['phrase_type'])
# merge similarity scores from token level to phrase level, and find the box that grounds the phrase
phrase_score_tensor, all_phrase_ids = self.find_ground_box(match_scores, all_str2id_links, sentences, gt_phrase_ids)
pred_boxes_ind = torch.argmax(phrase_score_tensor, dim=1)
pred_boxes = precomp_boxes[pred_boxes_ind]
pred_similarity = phrase_score_tensor # .t() # pred_similarity: matching score [#phrases, #object proposals]
# get single target/gt box for each phrase
# 1. any gt box that can be matched as target
# refer to (https://github.com/BigRedT/info-ground/blob/22ae6d6ec8b38df473e73034fc895ebf97d39897/exp/ground/eval_flickr_phrase_loc.py#L90)
phrase_boxes = [box_anno['boxes'][p_id] for p_id in all_phrase_ids]
targets = []
for pr_b, pd_b in zip(phrase_boxes, pred_boxes):
matched = False
for single_b in pr_b:
this_iou = pairwise_iou(Boxes(torch.from_numpy(np.array([single_b])).float()), Boxes(pd_b.view(1,-1)))
if (this_iou >= EVAL_THRESH).sum() > 0:
targets.append(single_b)
matched = True
break
if not matched:
targets.append(single_b)
targets = Boxes(torch.from_numpy(np.array(targets)).float())
# 2. union box as target
# target_ind = np.array([gt_phrase_ids.index(p_id) for p_id in all_phrase_ids])
# targets = gt_boxes[target_ind] # ground-truth boxes for each phrase in each sentence
# targets = Boxes(torch.from_numpy(targets).float())
assert len(phrase_types) == len(targets)
# single predicted box for each phrase
            ious = pairwise_iou(targets, pred_boxes)  # note: this call may move the target boxes to the GPU
iou = ious.numpy().diagonal()
total_num += iou.shape[0]
recall_num += int((iou >= EVAL_THRESH).sum()) # 0.5
# metric of point (can be ignored)
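            # Pointing-game accuracy: a phrase counts as localized when the center of its
            # predicted box falls inside the target box, i.e. x1 <= x_c <= x2 and
            # y1 <= y_c <= y2 (all four inequalities must hold, hence fall_tensor == 4 below).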
pred_boxes_tensor = pred_boxes.tensor
pred_center = (pred_boxes_tensor[:, :2] + pred_boxes_tensor[:, 2:]) / 2.0
pred_center = pred_center.repeat(1, 2) ## x_c, y_c, x_c, y_c
targets_tensor = targets.tensor
fall_tensor = targets_tensor - pred_center
fall_tensor = (fall_tensor[:, :2] <= 0).float().sum(1) + (fall_tensor[:, 2:] >= 0).float().sum(1)
point_recall_num += (fall_tensor == 4).float().numpy().sum()
# detailed accuracy across different phrase types
for pid, p_type in enumerate(phrase_types):
p_type = p_type[0]
num_type[p_type] = num_type.setdefault(p_type, 0) + 1
recall_type[p_type] = recall_type.setdefault(p_type, 0) + (iou[pid] >= EVAL_THRESH)
# metric of recall when multiple predicted boxes for each phrase
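            # Recall@k: a phrase counts as recalled if any of its k highest-scoring
            # proposals overlaps the target box with IoU >= EVAL_THRESH;
            # pred_similarity_topk masks the IoU matrix down to those top-k proposals.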
ious_top = pairwise_iou(targets, precomp_boxes).cpu()
for k in [5, 10]:
top_k = torch.topk(pred_similarity, k=k, dim=1)[0][:, [-1]]
pred_similarity_topk = (pred_similarity >= top_k).float()
ious_top_k = (ious_top * pred_similarity_topk).numpy()
recall_topk_num[k] += int(((ious_top_k >= EVAL_THRESH).sum(1) > 0).sum())
acc = recall_num / total_num
acc_top5 = recall_topk_num[5] / total_num
acc_top10 = recall_topk_num[10] / total_num
point_acc = point_recall_num / total_num
# details about each coarse type of phrase
        for p_type, type_num in num_type.items():
            acc_type[p_type] = recall_type[p_type] / type_num
# if self._output_dir:
# PathManager.mkdirs(self._output_dir)
# file_path = os.path.join(self._output_dir, "prediction_{}.pkl".format(str(acc).replace('.', '_')[:6]))
# with PathManager.open(file_path, "wb") as f:
# pickle.dump(all_prediction, f)
del all_prediction
        self._logger.info('Evaluated {} images; per-phrase-type accuracy: {}'.format(len(image_evaled), acc_type))
self._logger.info('Evaluate Pointing Accuracy: PointAcc:{}'.format(point_acc))
results = OrderedDict({"acc": acc, "acc_top5": acc_top5, "acc_top10": acc_top10})
self._logger.info(results)
self._logger.info(num_type)
        return results
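
# Minimal smoke-test sketch (assumes detectron2 and ftfy are installed). It exercises
# only the standalone text-cleaning helpers and the IoU utility, so it does not require
# the Flickr30k annotation files that are loaded in __init__.
if __name__ == "__main__":
    sample = "A man  rides a horse&amp;cart ."
    cleaned = whitespace_clean(basic_clean(sample)).lower()
    # PATTN splits the cleaned text the same way the captions are split above
    print(re.findall(PATTN, cleaned))
    # pairwise_iou on two overlapping boxes; IoU should be 25 / 175 ~= 0.143
    b1 = Boxes(torch.tensor([[0.0, 0.0, 10.0, 10.0]]))
    b2 = Boxes(torch.tensor([[5.0, 5.0, 15.0, 15.0]]))
    print(pairwise_iou(b1, b2))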