import logging
import numpy as np
import os
from collections import OrderedDict
from detectron2.config import global_cfg as cfg
import torch
from fvcore.common.file_io import PathManager
from detectron2.structures.boxes import pairwise_iou
from detectron2.utils.comm import all_gather, is_main_process, synchronize
import pickle
from .evaluator import DatasetEvaluator
import json
from detectron2.structures import Boxes
import html
import ftfy
import regex as re
PATTN = re.compile(r"""<\|startoftext\|>|<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+""", re.IGNORECASE)
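# Assumption: PATTN and the two cleaning helpers below mirror the pre-processing of
# CLIP's BPE tokenizer, so that phrase words from the Flickr30k annotations can be
# aligned with the token stream the model was fed.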
def basic_clean(text):
text = ftfy.fix_text(text)
text = html.unescape(html.unescape(text))
return text.strip()
def whitespace_clean(text):
text = re.sub(r'\s+', ' ', text)
text = text.strip()
return text
class FLICKR30KEvaluator(DatasetEvaluator):
"""
Evaluate semantic segmentation
"""
def __init__(self, dataset_name, distributed=True, output_dir=None):
"""
Args:
dataset_name (str): name of the dataset to be evaluated.
distributed (True): if True, will collect results from all ranks for evaluation.
Otherwise, will evaluate the results in the current process.
num_classes (int): number of classes
ignore_label (int): value in semantic segmentation ground truth. Predictions for the
corresponding pixels should be ignored.
output_dir (str): an output directory to dump results.
"""
self._dataset_name = dataset_name
self._distributed = distributed
self._output_dir = output_dir
self._cpu_device = torch.device("cpu")
self._logger = logging.getLogger(__name__)
self.gt_boxes = json.load(open("/home/v-yiwuzhong/projects/azureblobs/vyiwuzhong_phillytools/flickr30k_processed/bounding_boxes_test.json"))
self.gt_sents = json.load(open("/home/v-yiwuzhong/projects/azureblobs/vyiwuzhong_phillytools/flickr30k_processed/sentences_test.json"))
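        # Expected annotation format (inferred from how the fields are used below):
        #   self.gt_boxes[img_id] -> {'height': int, 'width': int,
        #                             'scene': [phrase_id, ...],
        #                             'boxes': {phrase_id: [[x1, y1, x2, y2], ...]}}
        #   self.gt_sents[img_id] -> [{'phrases': [{'phrase': str, 'phrase_id': str,
        #                             'phrase_type': [str, ...]}, ...]}, ...]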
def reset(self):
self._predictions = {}
def process(self, inputs, outputs):
"""
Args:
inputs: the inputs to a model.
It is a list of dicts. Each dict corresponds to an image and
contains keys like "height", "width", "file_name", "image_id".
outputs: the outputs of a model. It is either list of semantic segmentation predictions
(Tensor [H, W]) or list of dicts with key "sem_seg" that contains semantic
segmentation prediction in the same format.
"""
assert len(inputs) == 1 # batch = 1 during inference
dataset_name, img_id, (img_height, img_width), all_str2id_links = inputs[0][-1]
img_id = img_id.split('/')[-1]
match_scores, processed_results = outputs
match_scores = match_scores.to(self._cpu_device)
pred_boxes = processed_results[0]['instances'].proposal_boxes.to(self._cpu_device)
self._predictions.update({img_id: [img_height, img_width, all_str2id_links, match_scores, pred_boxes]})
def merge_gt_boxes(self, box_anno):
gt_boxes = []
phrase_ids = []
scene_box_ids = box_anno['scene']
for k, v in box_anno['boxes'].items():
            if k in scene_box_ids:  # important: skip scene boxes, otherwise the per-phrase-type counts do not match the paper
continue
phrase_ids.append(k)
if len(v) == 1:
gt_boxes.append(v[0])
else:
                # when a phrase corresponds to multiple regions, take the union of the regions as the ground-truth box (as in the paper)
v = np.array(v)
box = [v[:, 0].min(), v[:, 1].min(), v[:, 2].max(), v[:, 3].max()]
gt_boxes.append(box)
gt_boxes = np.array(gt_boxes)
return phrase_ids, gt_boxes
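    # Illustrative example for merge_gt_boxes: a phrase annotated with regions
    # [10, 20, 50, 60] and [30, 10, 80, 70] is reduced to their union [10, 10, 80, 70],
    # i.e. [min x1, min y1, max x2, max y2].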
def find_ground_box(self, match_scores, all_str2id_links, sentences, gt_phrase_ids):
""" Given matching matrix between region feats and token feats, find the box that grounds a phrase
"""
num_box = match_scores.size(0)
num_cap = int(match_scores.size(1) / 77)
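        # Assumption: 77 is CLIP's fixed context length per caption, so the columns of
        # match_scores are laid out as num_cap consecutive blocks of 77 token positions.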
all_phrase_score = []
all_phrase_ids = []
for i in range(num_cap): # per sentence
this_score = match_scores[:, i*77:(i+1)*77] # [#boxes, 77]
input_ids = [iitem for item in all_str2id_links[i] for iitem in item[1]]
input_tokens = [item[0] for item in all_str2id_links[i]]
phrases = sentences[i]['phrases']
for j, phrase in enumerate(phrases): # per phrase
if phrase['phrase_id'] not in gt_phrase_ids: # no gt box for this phrase, skip
continue
# locate the word
words = whitespace_clean(basic_clean(phrase['phrase'])).lower() # phrase['phrase'].lower().replace("-"," ").split()
words = re.findall(PATTN, words)
first_word_index = None # phrase['first_word_index']
for idx in range(len(input_tokens) - len(words) + 1): # search start word of this phrase
if input_tokens[idx : idx + len(words)] == words: # NOTE: key step for alignment btw model prediction and annotation
first_word_index = idx
break
if first_word_index is None:
print("Fail to find phrase [{}] in input tokens [{}]".format(words, input_tokens))
start_wd_ind = first_word_index
end_wd_ind = first_word_index + len(words)
if len(words) != len(phrase['phrase'].split()):
pass # print('tokens: {} <--> phrase: {}'.format(words, phrase['phrase']))
# locate the token
start_tk_ind = 0
for k_i, k in enumerate(range(0, start_wd_ind)):
start_tk_ind += len(all_str2id_links[i][k][1])
token_cnt = 0
for k_i, k in enumerate(range(start_wd_ind, end_wd_ind)):
if all_str2id_links[i][k][0] != words[k_i]:
print("Word not matched: {} in model output but {} in annotation".format(all_str2id_links[i][k][0], words[k_i]))
else:
token_cnt += len(all_str2id_links[i][k][1]) # ith sentence, kth word, and its tokens
end_tk_ind = start_tk_ind + token_cnt
# sanity check
phrase_ids1 = [iitem for item in all_str2id_links[i][start_wd_ind:end_wd_ind] for iitem in item[1]] # way 1: use word index to accumulate token ids in a phrase
phrase_ids2 = input_ids[start_tk_ind:end_tk_ind] # way 2: use token index to directly index token ids in a phrase
if phrase_ids1 != phrase_ids2:
print("Santity check: {} from word {} in token".format(phrase_ids1, phrase_ids2))
# index similarity score
phrase_score = this_score[:, start_tk_ind:end_tk_ind]
phrase_score = phrase_score.mean(dim=1) # phrase_score.max(dim=1)[0] #
all_phrase_score.append(phrase_score)
all_phrase_ids.append(phrase['phrase_id'])
phrase_score_tensor = torch.cat(all_phrase_score)
phrase_score_tensor = phrase_score_tensor.view(len(all_phrase_ids), num_box) # NOTE: this should be [#phrases, #object proposals]
return phrase_score_tensor, all_phrase_ids
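    # Worked example of the alignment above (illustrative values): for the phrase
    # "a red car", words = ['a', 'red', 'car'] is searched as a contiguous span in
    # input_tokens. If that span starts at word index 4 and the three words tokenize
    # into 1 + 1 + 2 BPE ids, the phrase covers the token columns starting at the sum
    # of the token counts of words 0..3 and spanning 4 positions, and its score per
    # proposal is the mean of match_scores over that column slice.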
def evaluate(self):
"""
Evaluates Referring Segmentation IoU:
"""
if self._distributed:
synchronize()
self._predictions = all_gather(self._predictions)
if not is_main_process():
return
all_prediction = {}
for p in self._predictions:
all_prediction.update(p)
else:
all_prediction = self._predictions
if len(all_prediction) < 30: # resume inference results
save_path = "/home/v-yiwuzhong/projects/azureblobs/vyiwuzhong_phillytools/flickr30k_processed/grounding_results/grounding_{}_imgs.npy".format(1000)
all_prediction = np.load(save_path, allow_pickle=True).tolist()
self._logger.info('Resume from {}'.format(save_path))
else: # new run
save_path = "/home/v-yiwuzhong/projects/azureblobs/vyiwuzhong_phillytools/flickr30k_processed/grounding_results/grounding_{}_imgs.npy".format(len(all_prediction))
np.save(save_path, all_prediction)
self._logger.info('Save results to {}'.format(save_path))
self._logger.info('Got {} images!'.format(len(all_prediction)))
image_unique_ids = list(all_prediction.keys())
image_evaled = []
total_num = 0
recall_num = 0
num_type = {}
recall_type = {}
acc_type = {}
recall_topk_num = {5:0, 10:0}
point_recall_num = 0
EVAL_THRESH = 0.5
type_cnts = {}
for img_sent_id in image_unique_ids:
if img_sent_id not in self.gt_boxes:
continue
else:
image_evaled.append(img_sent_id)
# results from model
result = all_prediction[img_sent_id]
phrase_ids = None
phrase_types = [] # phrase type: each phrase belongs to a coarse object concept
pred_boxes = None # an object proposal selected by model for each phrase
img_height, img_width, all_str2id_links = result[0], result[1], result[2] # all_str2id_links: each word and its tokenized ids
match_scores = result[3] # matching score [#object proposals, #tokens]
precomp_boxes = result[4] # object proposals from offline module
# annotation from dataset
sentences = self.gt_sents[img_sent_id]
box_anno = self.gt_boxes[img_sent_id]
# sanity check and box merging
            assert box_anno['height'] == img_height and box_anno['width'] == img_width
gt_phrase_ids, gt_boxes = self.merge_gt_boxes(box_anno) # merged if multiple boxes for the same phrase
if len(gt_phrase_ids) == 0: # no gt box for this image
continue
for sent_item in sentences:
for phrase_item in sent_item['phrases']:
if phrase_item['phrase_id'] in gt_phrase_ids:
phrase_types.append(phrase_item['phrase_type'])
# merge similarity scores from token level to phrase level, and find the box that grounds the phrase
phrase_score_tensor, all_phrase_ids = self.find_ground_box(match_scores, all_str2id_links, sentences, gt_phrase_ids)
pred_boxes_ind = torch.argmax(phrase_score_tensor, dim=1)
pred_boxes = precomp_boxes[pred_boxes_ind]
pred_similarity = phrase_score_tensor # .t() # pred_similarity: matching score [#phrases, #object proposals]
# get single target/gt box for each phrase
# 1. any gt box that can be matched as target
# refer to (https://github.com/BigRedT/info-ground/blob/22ae6d6ec8b38df473e73034fc895ebf97d39897/exp/ground/eval_flickr_phrase_loc.py#L90)
phrase_boxes = [box_anno['boxes'][p_id] for p_id in all_phrase_ids]
targets = []
for pr_b, pd_b in zip(phrase_boxes, pred_boxes):
matched = False
for single_b in pr_b:
this_iou = pairwise_iou(Boxes(torch.from_numpy(np.array([single_b])).float()), Boxes(pd_b.view(1,-1)))
if (this_iou >= EVAL_THRESH).sum() > 0:
targets.append(single_b)
matched = True
break
if not matched:
targets.append(single_b)
targets = Boxes(torch.from_numpy(np.array(targets)).float())
# 2. union box as target
# target_ind = np.array([gt_phrase_ids.index(p_id) for p_id in all_phrase_ids])
# targets = gt_boxes[target_ind] # ground-truth boxes for each phrase in each sentence
# targets = Boxes(torch.from_numpy(targets).float())
assert len(phrase_types) == len(targets)
# single predicted box for each phrase
            ious = pairwise_iou(targets, pred_boxes)  # note: this call may move the target boxes to the GPU
iou = ious.numpy().diagonal()
total_num += iou.shape[0]
recall_num += int((iou >= EVAL_THRESH).sum()) # 0.5
# metric of point (can be ignored)
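            # Pointing-game accuracy: a phrase counts as localized when the center of its
            # predicted box falls inside the target box, i.e. x1 <= x_c <= x2 and
            # y1 <= y_c <= y2 (all four inequalities must hold, hence fall_tensor == 4 below).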
pred_boxes_tensor = pred_boxes.tensor
pred_center = (pred_boxes_tensor[:, :2] + pred_boxes_tensor[:, 2:]) / 2.0
pred_center = pred_center.repeat(1, 2) ## x_c, y_c, x_c, y_c
targets_tensor = targets.tensor
fall_tensor = targets_tensor - pred_center
fall_tensor = (fall_tensor[:, :2] <= 0).float().sum(1) + (fall_tensor[:, 2:] >= 0).float().sum(1)
point_recall_num += (fall_tensor == 4).float().numpy().sum()
# detailed accuracy across different phrase types
for pid, p_type in enumerate(phrase_types):
p_type = p_type[0]
num_type[p_type] = num_type.setdefault(p_type, 0) + 1
recall_type[p_type] = recall_type.setdefault(p_type, 0) + (iou[pid] >= EVAL_THRESH)
# metric of recall when multiple predicted boxes for each phrase
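            # Recall@k: a phrase counts as recalled if any of its k highest-scoring
            # proposals overlaps the target box with IoU >= EVAL_THRESH;
            # pred_similarity_topk masks the IoU matrix down to those top-k proposals.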
ious_top = pairwise_iou(targets, precomp_boxes).cpu()
for k in [5, 10]:
top_k = torch.topk(pred_similarity, k=k, dim=1)[0][:, [-1]]
pred_similarity_topk = (pred_similarity >= top_k).float()
ious_top_k = (ious_top * pred_similarity_topk).numpy()
recall_topk_num[k] += int(((ious_top_k >= EVAL_THRESH).sum(1) > 0).sum())
acc = recall_num / total_num
acc_top5 = recall_topk_num[5] / total_num
acc_top10 = recall_topk_num[10] / total_num
point_acc = point_recall_num / total_num
# details about each coarse type of phrase
        for p_type, type_num in num_type.items():
            acc_type[p_type] = recall_type[p_type] / type_num
# if self._output_dir:
# PathManager.mkdirs(self._output_dir)
# file_path = os.path.join(self._output_dir, "prediction_{}.pkl".format(str(acc).replace('.', '_')[:6]))
# with PathManager.open(file_path, "wb") as f:
# pickle.dump(all_prediction, f)
del all_prediction
        self._logger.info('Evaluated {} images; per-phrase-type accuracy: {}'.format(len(image_evaled), acc_type))
self._logger.info('Evaluate Pointing Accuracy: PointAcc:{}'.format(point_acc))
results = OrderedDict({"acc": acc, "acc_top5": acc_top5, "acc_top10": acc_top10})
self._logger.info(results)
self._logger.info(num_type)
        return results
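
# Minimal smoke-test sketch (assumes detectron2 and ftfy are installed). It exercises
# only the standalone text-cleaning helpers and the IoU utility, so it does not require
# the Flickr30k annotation files that are loaded in __init__.
if __name__ == "__main__":
    sample = "A man  rides a horse&amp;cart ."
    cleaned = whitespace_clean(basic_clean(sample)).lower()
    # PATTN splits the cleaned text the same way the captions are split above
    print(re.findall(PATTN, cleaned))
    # pairwise_iou on two overlapping boxes; IoU should be 25 / 175 ~= 0.143
    b1 = Boxes(torch.tensor([[0.0, 0.0, 10.0, 10.0]]))
    b2 = Boxes(torch.tensor([[5.0, 5.0, 15.0, 15.0]]))
    print(pairwise_iou(b1, b2))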