# Hugging Face Space: 3dlg-hcvc/opdmulti-demo (status: Running)
# File size: 3,651 Bytes
# Revision: 5ceacf4

import numpy as np
import pycocotools.mask as mask_util
from detectron2.structures import BoxMode


# MotionNet: based on instances_to_coco_json and the relevant code in DensePose
def prediction_to_json(instances, img_id: str):
    """Convert the predictions for one image into JSON-serializable dicts.

    Args:
        instances (Instances): the output of the model. The fields
            ``pred_boxes``, ``scores``, ``pred_classes``, ``pred_masks``,
            ``mtype``, ``morigin``, ``maxis``, ``mstate`` and ``mstatemax``
            are read unconditionally; ``pdim``, ``ptrans``, ``prot`` and
            ``mextrinsic`` are optional.
            NOTE(review): ``.numpy()`` is called directly on the box tensor,
            so the instances presumably have to live on the CPU already --
            confirm against the caller.
        img_id (str): the image id in COCO

    Returns:
        list[dict]: one dict per instance with the keys ``image_id``,
        ``category_id``, ``bbox`` (converted to XYWH_ABS), ``score``,
        ``segmentation`` (RLE with ``counts`` decoded to ``str``) and the
        motion fields ``mtype``, ``morigin``, ``maxis``, ``mstate``,
        ``mstatemax``. When ``pdim`` is present, the available pose fields
        (``pdim``, ``ptrans``, ``prot``) are added; otherwise ``mextrinsic``
        is added when present.
    """
    boxes = instances.pred_boxes.tensor.numpy()
    boxes = BoxMode.convert(boxes, BoxMode.XYXY_ABS, BoxMode.XYWH_ABS).tolist()
    scores = instances.scores.tolist()
    classes = instances.pred_classes.tolist()

    # Optional pose fields (tagged "2.0.3" in the original code). They are only
    # emitted when "pdim" is present, and each one only if it actually exists,
    # so a prediction with "pdim" but without "ptrans"/"prot" no longer fails
    # on an unbound local variable.
    has_pose = instances.has("pdim")
    pose_fields = {}
    if has_pose:
        pose_fields = {
            name: getattr(instances, name).tolist()
            for name in ("pdim", "ptrans", "prot")
            if instances.has(name)
        }

    # Motion predictions for MotionNet.
    mtype = instances.mtype.tolist()
    morigin = instances.morigin.tolist()
    maxis = instances.maxis.tolist()
    mstate = instances.mstate.tolist()
    mstatemax = instances.mstatemax.tolist()

    # "mextrinsic" is only reported when the pose fields are absent.
    mextrinsic = None
    if not has_pose and instances.has("mextrinsic"):
        mextrinsic = instances.mextrinsic.tolist()

    # MotionNet has masks in the annotation.
    # Use RLE to encode the masks, because they are too large and take memory
    # since this evaluator stores outputs of the entire dataset.
    rles = []
    for mask in instances.pred_masks:
        rle = mask_util.encode(
            np.array(mask[:, :, None], order="F", dtype="uint8")
        )[0]
        # "counts" is an array encoded by mask_util as a byte-stream. Python3's
        # json writer which always produces strings cannot serialize a
        # bytestream unless you decode it. Thankfully, utf-8 works out (which
        # is also what the pycocotools/_mask.pyx does).
        rle["counts"] = rle["counts"].decode("utf-8")
        rles.append(rle)

    results = []
    for k in range(len(instances)):
        # Keys are inserted in the same order as before so that the serialized
        # JSON stays identical for inputs that were already handled.
        result = {
            "image_id": img_id,
            "category_id": classes[k],
            "bbox": boxes[k],
            "score": scores[k],
            "segmentation": rles[k],
        }
        for name, values in pose_fields.items():
            result[name] = values[k]
        result["mtype"] = mtype[k]
        result["morigin"] = morigin[k]
        result["maxis"] = maxis[k]
        if mextrinsic is not None:
            result["mextrinsic"] = mextrinsic[k]
        result["mstate"] = mstate[k]
        result["mstatemax"] = mstatemax[k]
        results.append(result)
    return results