import logging
import numpy as np
import os
from collections import OrderedDict
from detectron2.config import global_cfg as cfg
import torch
from fvcore.common.file_io import PathManager
from detectron2.structures.boxes import pairwise_iou

from detectron2.utils.comm import all_gather, is_main_process, synchronize
import pickle
from .evaluator import DatasetEvaluator
import json
from detectron2.structures import Boxes
import html
import ftfy
import regex as re

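# Word-splitting pattern applied to annotation phrases so that they can be aligned with the
# tokenized captions; it appears to mirror the pre-tokenization regex of CLIP's BPE tokenizer
# (an assumption based on the pattern itself, not stated in this file).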
PATTN = re.compile(r"""<\|startoftext\|>|<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+""", re.IGNORECASE)

def basic_clean(text):
    text = ftfy.fix_text(text)
    text = html.unescape(html.unescape(text))
    return text.strip()

def whitespace_clean(text):
    text = re.sub(r'\s+', ' ', text)
    text = text.strip()
    return text
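
# A minimal sketch of how the helpers above fit together (illustrative values, not taken
# from the dataset; the exact ftfy output may vary by version):
#
#   >>> phrase = "A  man&amp;#39;s  dog"   # hypothetical double-escaped annotation text
#   >>> cleaned = whitespace_clean(basic_clean(phrase)).lower()
#   >>> cleaned
#   "a man's dog"
#   >>> re.findall(PATTN, cleaned)
#   ['a', 'man', "'s", 'dog']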


class FLICKR30KEvaluator(DatasetEvaluator):

    """
    Evaluate semantic segmentation
    """

    def __init__(self, dataset_name, distributed=True, output_dir=None):
        """
        Args:
            dataset_name (str): name of the dataset to be evaluated.
            distributed (True): if True, will collect results from all ranks for evaluation.
                Otherwise, will evaluate the results in the current process.
            num_classes (int): number of classes
            ignore_label (int): value in semantic segmentation ground truth. Predictions for the
            corresponding pixels should be ignored.
            output_dir (str): an output directory to dump results.
        """
        self._dataset_name = dataset_name
        self._distributed = distributed
        self._output_dir = output_dir

        self._cpu_device = torch.device("cpu")
        self._logger = logging.getLogger(__name__)
        self.gt_boxes = json.load(open("/home/v-yiwuzhong/projects/azureblobs/vyiwuzhong_phillytools/flickr30k_processed/bounding_boxes_test.json"))
        self.gt_sents = json.load(open("/home/v-yiwuzhong/projects/azureblobs/vyiwuzhong_phillytools/flickr30k_processed/sentences_test.json"))
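
        # Annotation layout assumed by the code below (inferred from how these dicts are
        # indexed later in this file, not from a documented schema):
        #   self.gt_boxes[img_id]: {'height': ..., 'width': ..., 'scene': [phrase_id, ...],
        #                           'boxes': {phrase_id: [[x1, y1, x2, y2], ...]}}
        #   self.gt_sents[img_id]: list of sentences, each carrying a 'phrases' list whose
        #                          items include 'phrase', 'phrase_id', and 'phrase_type'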

    def reset(self):
        self._predictions = {}

    def process(self, inputs, outputs):
        """
        Args:
            inputs: the inputs to a model.
                It is a list of dicts. Each dict corresponds to an image and
                contains keys like "height", "width", "file_name", "image_id".
            outputs: the outputs of a model. It is either list of semantic segmentation predictions
                (Tensor [H, W]) or list of dicts with key "sem_seg" that contains semantic
                segmentation prediction in the same format.
        """
        assert len(inputs) == 1  # batch = 1 during inference
        dataset_name, img_id, (img_height, img_width), all_str2id_links = inputs[0][-1]
        img_id = img_id.split('/')[-1]
        match_scores, processed_results = outputs
        match_scores = match_scores.to(self._cpu_device)
        pred_boxes = processed_results[0]['instances'].proposal_boxes.to(self._cpu_device)

        self._predictions.update({img_id: [img_height, img_width, all_str2id_links, match_scores, pred_boxes]})

    def merge_gt_boxes(self, box_anno):
        gt_boxes = []
        phrase_ids = []
        scene_box_ids = box_anno['scene']
        for k, v in box_anno['boxes'].items():
            if k in scene_box_ids: # important: skip scene boxes, otherwise the per-phrase-type counts do not match the paper
                continue
            phrase_ids.append(k)
            if len(v) == 1:
                gt_boxes.append(v[0])
            else:
                # when a phrase corresponds to multiple regions, take the union of them as the paper does
                v = np.array(v)
                box = [v[:, 0].min(), v[:, 1].min(), v[:, 2].max(), v[:, 3].max()]
                gt_boxes.append(box)
        gt_boxes = np.array(gt_boxes)
        return phrase_ids, gt_boxes

    def find_ground_box(self, match_scores, all_str2id_links, sentences, gt_phrase_ids):
        """ Given matching matrix between region feats and token feats, find the box that grounds a phrase
        """
        num_box = match_scores.size(0)
        num_cap = match_scores.size(1) // 77  # 77 tokens per caption
        all_phrase_score = []
        all_phrase_ids = []
        for i in range(num_cap): # per sentence
            this_score = match_scores[:, i*77:(i+1)*77]  # [#boxes, 77]
            input_ids = [iitem for item in all_str2id_links[i] for iitem in item[1]]
            input_tokens = [item[0] for item in all_str2id_links[i]]
            phrases = sentences[i]['phrases']
            for j, phrase in enumerate(phrases):  # per phrase
                if phrase['phrase_id'] not in gt_phrase_ids:  #  no gt box for this phrase, skip
                    continue
                # locate the word
                words = whitespace_clean(basic_clean(phrase['phrase'])).lower() # phrase['phrase'].lower().replace("-"," ").split()
                words = re.findall(PATTN, words)
                first_word_index = None  #  phrase['first_word_index']
                for idx in range(len(input_tokens) - len(words) + 1):  # search start word of this phrase
                    if input_tokens[idx : idx + len(words)] == words:  # NOTE: key step for alignment btw model prediction and annotation
                        first_word_index = idx 
                        break
                if first_word_index is None:
                    print("Fail to find phrase [{}] in input tokens [{}]".format(words, input_tokens))
                start_wd_ind = first_word_index
                end_wd_ind = first_word_index + len(words)
                if len(words) != len(phrase['phrase'].split()):
                    pass # print('tokens: {} <--> phrase: {}'.format(words, phrase['phrase']))
                # locate the token
                start_tk_ind = 0
                for k_i, k in enumerate(range(0, start_wd_ind)):
                    start_tk_ind += len(all_str2id_links[i][k][1])
                token_cnt = 0
                for k_i, k in enumerate(range(start_wd_ind, end_wd_ind)):
                    if all_str2id_links[i][k][0] != words[k_i]:
                        print("Word not matched: {} in model output but {} in annotation".format(all_str2id_links[i][k][0], words[k_i]))
                    else:
                        token_cnt += len(all_str2id_links[i][k][1]) # ith sentence, kth word, and its tokens
                end_tk_ind = start_tk_ind + token_cnt
                # sanity check
                phrase_ids1 = [iitem for item in all_str2id_links[i][start_wd_ind:end_wd_ind] for iitem in item[1]]  # way 1: use word index to accumulate token ids in a phrase
                phrase_ids2 = input_ids[start_tk_ind:end_tk_ind] # way 2: use token index to directly index token ids in a phrase
                if phrase_ids1 != phrase_ids2:
                    print("Santity check: {} from word {} in token".format(phrase_ids1, phrase_ids2))
                # index similarity score
                phrase_score = this_score[:, start_tk_ind:end_tk_ind]
                phrase_score = phrase_score.mean(dim=1) # phrase_score.max(dim=1)[0] # 
                all_phrase_score.append(phrase_score)
                all_phrase_ids.append(phrase['phrase_id'])
        phrase_score_tensor = torch.cat(all_phrase_score)
        phrase_score_tensor = phrase_score_tensor.view(len(all_phrase_ids), num_box) # NOTE: this should be [#phrases, #object proposals]

        return phrase_score_tensor, all_phrase_ids
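
    # Shape bookkeeping for find_ground_box above, with hypothetical sizes (the fixed 77 is
    # the per-caption token length used in the slicing, i.e. a CLIP-style context length):
    #   match_scores:        [#proposals, #captions * 77]
    #   this_score:          [#proposals, 77]             one caption
    #   phrase_score:        [#proposals]                 one phrase (mean over its token columns)
    #   phrase_score_tensor: [#phrases, #proposals]       returned to evaluate()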

    def evaluate(self):
        """
        Evaluates Referring Segmentation IoU:
        """

        if self._distributed:
            synchronize()

            self._predictions = all_gather(self._predictions)

            if not is_main_process():
                return

            all_prediction = {}
            for p in self._predictions:
                all_prediction.update(p)
        else:
            all_prediction = self._predictions
        
        if len(all_prediction) < 30:  # resume inference results
            save_path = "/home/v-yiwuzhong/projects/azureblobs/vyiwuzhong_phillytools/flickr30k_processed/grounding_results/grounding_{}_imgs.npy".format(1000)
            all_prediction = np.load(save_path, allow_pickle=True).tolist()
            self._logger.info('Resume from {}'.format(save_path))
        else:  # new run
            save_path = "/home/v-yiwuzhong/projects/azureblobs/vyiwuzhong_phillytools/flickr30k_processed/grounding_results/grounding_{}_imgs.npy".format(len(all_prediction))
            np.save(save_path, all_prediction)
            self._logger.info('Save results to {}'.format(save_path))
        self._logger.info('Got {} images!'.format(len(all_prediction)))
        
        image_unique_ids = list(all_prediction.keys())
        image_evaled = []

        total_num = 0
        recall_num = 0
        num_type = {}
        recall_type = {}
        acc_type = {}
        recall_topk_num = {5:0, 10:0}
        point_recall_num = 0
        EVAL_THRESH = 0.5
        type_cnts = {}

        for img_sent_id in image_unique_ids:
            if img_sent_id not in self.gt_boxes:
                continue
            else:
                image_evaled.append(img_sent_id)
            # results from model
            result = all_prediction[img_sent_id]
            phrase_ids = None 
            phrase_types = []  #  phrase type: each phrase belongs to a coarse object concept
            pred_boxes = None  #  an object proposal selected by model for each phrase
            img_height, img_width, all_str2id_links = result[0], result[1], result[2]  # all_str2id_links: each word and its tokenized ids
            match_scores = result[3]  # matching score [#object proposals, #tokens]
            precomp_boxes = result[4]  # object proposals from offline module
            # annotation from dataset
            sentences = self.gt_sents[img_sent_id]
            box_anno = self.gt_boxes[img_sent_id]
            # sanity check and box merging
            assert box_anno['height'] == img_height and box_anno['width'] == img_width
            gt_phrase_ids, gt_boxes = self.merge_gt_boxes(box_anno)  # merged if multiple boxes for the same phrase
            if len(gt_phrase_ids) == 0: # no gt box for this image
                continue
            for sent_item in sentences:
                for phrase_item in sent_item['phrases']:
                    if phrase_item['phrase_id'] in gt_phrase_ids:
                        phrase_types.append(phrase_item['phrase_type']) 

            # merge similarity scores from token level to phrase level, and find the box that grounds the phrase
            phrase_score_tensor, all_phrase_ids = self.find_ground_box(match_scores, all_str2id_links, sentences, gt_phrase_ids)  
            pred_boxes_ind = torch.argmax(phrase_score_tensor, dim=1)
            pred_boxes = precomp_boxes[pred_boxes_ind]
            pred_similarity = phrase_score_tensor # .t() #  pred_similarity: matching score [#phrases, #object proposals]
            
            # get single target/gt box for each phrase
            # 1. any gt box that can be matched as target 
            # refer to (https://github.com/BigRedT/info-ground/blob/22ae6d6ec8b38df473e73034fc895ebf97d39897/exp/ground/eval_flickr_phrase_loc.py#L90)
            phrase_boxes = [box_anno['boxes'][p_id] for p_id in all_phrase_ids]
            targets = []
            for pr_b, pd_b in zip(phrase_boxes, pred_boxes):
                matched = False
                for single_b in pr_b:
                    this_iou = pairwise_iou(Boxes(torch.from_numpy(np.array([single_b])).float()), Boxes(pd_b.view(1,-1)))
                    if (this_iou >= EVAL_THRESH).sum() > 0:
                        targets.append(single_b)
                        matched = True
                        break
                if not matched:
                    targets.append(single_b)  # fall back to the last ground-truth box when none overlaps the prediction enough
            targets = Boxes(torch.from_numpy(np.array(targets)).float())
            # 2. union box as target
            # target_ind = np.array([gt_phrase_ids.index(p_id) for p_id in all_phrase_ids])
            # targets = gt_boxes[target_ind] # ground-truth boxes for each phrase in each sentence
            # targets = Boxes(torch.from_numpy(targets).float())
            assert len(phrase_types) == len(targets)

            # single predicted box for each phrase
            ious = pairwise_iou(targets, pred_boxes)  # IoU between every target box and every predicted box; the diagonal pairs each phrase with its own prediction
            iou = ious.numpy().diagonal()
            total_num += iou.shape[0]
            recall_num += int((iou >= EVAL_THRESH).sum())  # 0.5

            # pointing accuracy: count a hit when the predicted box center falls inside the target box
            pred_boxes_tensor = pred_boxes.tensor
            pred_center = (pred_boxes_tensor[:, :2] + pred_boxes_tensor[:, 2:]) / 2.0
            pred_center = pred_center.repeat(1, 2)  ## x_c, y_c, x_c, y_c
            targets_tensor = targets.tensor
            fall_tensor = targets_tensor - pred_center
            fall_tensor = (fall_tensor[:, :2] <= 0).float().sum(1) + (fall_tensor[:, 2:] >= 0).float().sum(1)
            point_recall_num += (fall_tensor == 4).float().numpy().sum()

            # detailed accuracy across different phrase types
            for pid, p_type in enumerate(phrase_types):
                p_type = p_type[0]
                num_type[p_type] = num_type.setdefault(p_type, 0) + 1
                recall_type[p_type] = recall_type.setdefault(p_type, 0) + (iou[pid] >= EVAL_THRESH)
            
            # top-k recall: a phrase counts as recalled if any of its k highest-scoring proposals overlaps the target
            ious_top = pairwise_iou(targets, precomp_boxes).cpu()
            for k in [5, 10]:
                top_k = torch.topk(pred_similarity, k=k, dim=1)[0][:, [-1]]
                pred_similarity_topk = (pred_similarity >= top_k).float()
                ious_top_k = (ious_top * pred_similarity_topk).numpy()
                recall_topk_num[k] += int(((ious_top_k >= EVAL_THRESH).sum(1) > 0).sum())

        acc = recall_num / total_num
        acc_top5 = recall_topk_num[5] / total_num
        acc_top10 = recall_topk_num[10] / total_num
        point_acc = point_recall_num / total_num
        
        # details about each coarse type of phrase
        for p_type, type_num in num_type.items():
            acc_type[p_type] = recall_type[p_type] / type_num

        # if self._output_dir:
        #     PathManager.mkdirs(self._output_dir)
        #     file_path = os.path.join(self._output_dir, "prediction_{}.pkl".format(str(acc).replace('.', '_')[:6]))
        #     with PathManager.open(file_path, "wb") as f:
        #         pickle.dump(all_prediction, f)

        del all_prediction
        self._logger.info('Evaluated {} images; accuracy per phrase type: {}'.format(len(image_evaled), acc_type))
        self._logger.info('Evaluate Pointing Accuracy: PointAcc:{}'.format(point_acc))
        results = OrderedDict({"acc": acc, "acc_top5": acc_top5, "acc_top10": acc_top10})
        self._logger.info(results)
        self._logger.info(num_type)
        return results
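

# A minimal usage sketch, assuming the usual DatasetEvaluator driving loop (the `model`,
# `data_loader`, and dataset-name strings below are hypothetical and not defined in this file):
#
#   evaluator = FLICKR30KEvaluator("flickr30k_test", distributed=True, output_dir="./output")
#   evaluator.reset()
#   for inputs in data_loader:          # batch size must be 1, see process()
#       outputs = model(inputs)         # expected to yield (match_scores, processed_results)
#       evaluator.process(inputs, outputs)
#   results = evaluator.evaluate()      # OrderedDict with "acc", "acc_top5", "acc_top10"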