# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
# This script converts MOT labels into COCO style.
# Official website of the MOT dataset: https://motchallenge.net/
#
# Label format of the MOT dataset:
# GTs:
#   <frame_id>  # starts from 1, while COCO-style frame ids start from 0
#   <instance_id>, <x1>, <y1>, <w>, <h>,
#   <conf>  # conf is annotated as 0 if the object is ignored
#   <class_id>, <visibility>
#
# DETs and Results:
#   <frame_id>, <instance_id>, <x1>, <y1>, <w>, <h>, <conf>,
#   <x>, <y>, <z>  # world coordinates, only used for 3D objects
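#
# As a purely illustrative example (hypothetical values, not taken from any
# real sequence), a GT line could look like
#   1,1,912,484,97,109,0,7,1.0
# and a DET line could look like
#   1,-1,912.0,484.0,97.0,109.0,0.9,-1,-1,-1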
import argparse
import os
import os.path as osp
from collections import defaultdict

import mmengine
import numpy as np
from tqdm import tqdm

# Classes in MOT:
CLASSES = [
dict(id=1, name='pedestrian'),
dict(id=2, name='person_on_vehicle'),
dict(id=3, name='car'),
dict(id=4, name='bicycle'),
dict(id=5, name='motorbike'),
dict(id=6, name='non_mot_vehicle'),
dict(id=7, name='static_person'),
dict(id=8, name='distractor'),
dict(id=9, name='occluder'),
dict(id=10, name='occluder_on_ground'),
dict(id=11, name='occluder_full'),
dict(id=12, name='reflection'),
dict(id=13, name='crowd')
]


def parse_args():
parser = argparse.ArgumentParser(
description='Convert MOT label and detections to COCO-VID format.')
parser.add_argument('-i', '--input', help='path of MOT data')
parser.add_argument(
'-o', '--output', help='path to save coco formatted label file')
parser.add_argument(
'--convert-det',
action='store_true',
help='convert official detection results.')
parser.add_argument(
'--split-train',
action='store_true',
help='split the train set into half-train and half-validate.')
return parser.parse_args()


def parse_gts(gts, is_mot15):
    """Parse raw gt.txt lines into a dict mapping each MOT frame id to its
    list of annotation dicts.

    For MOT15 sequences, conf, class id and visibility are not parsed and
    default values are used instead.
    """
outputs = defaultdict(list)
for gt in gts:
gt = gt.strip().split(',')
frame_id, ins_id = map(int, gt[:2])
bbox = list(map(float, gt[2:6]))
if is_mot15:
conf = 1.
category_id = 1
visibility = 1.
else:
conf = float(gt[6])
category_id = int(gt[7])
visibility = float(gt[8])
anns = dict(
category_id=category_id,
bbox=bbox,
area=bbox[2] * bbox[3],
iscrowd=False,
visibility=visibility,
mot_instance_id=ins_id,
mot_conf=conf)
outputs[frame_id].append(anns)
return outputs
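

# As a hypothetical illustration of the structure parse_gts returns:
#   parse_gts(['1,1,10,20,30,40,1,1,1.0'], is_mot15=False)
# yields
#   {1: [dict(category_id=1, bbox=[10., 20., 30., 40.], area=1200.,
#             iscrowd=False, visibility=1.0, mot_instance_id=1,
#             mot_conf=1.0)]}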


def parse_dets(dets):
    """Parse raw det.txt lines into a dict mapping each MOT frame id to its
    list of [x1, y1, x2, y2, conf] boxes."""
outputs = defaultdict(list)
for det in dets:
det = det.strip().split(',')
frame_id, ins_id = map(int, det[:2])
        # instance ids are not annotated in detection files
        assert ins_id == -1
bbox = list(map(float, det[2:7]))
        # convert [x1, y1, w, h, conf] to [x1, y1, x2, y2, conf]
        # to be consistent with mmdet
bbox = [
bbox[0], bbox[1], bbox[0] + bbox[2], bbox[1] + bbox[3], bbox[4]
]
outputs[frame_id].append(bbox)
return outputs
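

# Again hypothetically:
#   parse_dets(['1,-1,10,20,30,40,0.9'])
# yields
#   {1: [[10., 20., 40., 60., 0.9]]}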


def main():
args = parse_args()
if not osp.isdir(args.output):
os.makedirs(args.output)
sets = ['train', 'test']
if args.split_train:
sets += ['half-train', 'half-val']
vid_id, img_id, ann_id = 1, 1, 1
for subset in sets:
ins_id = 0
print(f'Converting {subset} set to COCO format')
if 'half' in subset:
in_folder = osp.join(args.input, 'train')
else:
in_folder = osp.join(args.input, subset)
out_file = osp.join(args.output, f'{subset}_cocoformat.json')
outputs = defaultdict(list)
outputs['categories'] = CLASSES
if args.convert_det:
det_file = osp.join(args.output, f'{subset}_detections.pkl')
detections = dict(det_bboxes=dict())
        # sort for a deterministic video (and thus id) order across runs
        video_names = sorted(os.listdir(in_folder))
for video_name in tqdm(video_names):
# basic params
parse_gt = 'test' not in subset
ins_maps = dict()
# load video infos
video_folder = osp.join(in_folder, video_name)
infos = mmengine.list_from_file(f'{video_folder}/seqinfo.ini')
            # video-level infos: seqinfo.ini stores, line by line, the
            # sequence name, image dir, fps, number of frames, width and
            # height
            assert video_name == infos[1].strip().split('=')[1]
img_folder = infos[2].strip().split('=')[1]
            img_names = sorted(os.listdir(f'{video_folder}/{img_folder}'))
fps = int(infos[3].strip().split('=')[1])
num_imgs = int(infos[4].strip().split('=')[1])
assert num_imgs == len(img_names)
width = int(infos[5].strip().split('=')[1])
height = int(infos[6].strip().split('=')[1])
video = dict(
id=vid_id,
name=video_name,
fps=fps,
width=width,
height=height)
# parse annotations
if parse_gt:
gts = mmengine.list_from_file(f'{video_folder}/gt/gt.txt')
                is_mot15 = 'MOT15' in video_folder
                img2gts = parse_gts(gts, is_mot15)
if args.convert_det:
dets = mmengine.list_from_file(f'{video_folder}/det/det.txt')
img2dets = parse_dets(dets)
# make half sets
if 'half' in subset:
split_frame = num_imgs // 2 + 1
if 'train' in subset:
img_names = img_names[:split_frame]
elif 'val' in subset:
img_names = img_names[split_frame:]
else:
raise ValueError(
'subset must be named with `train` or `val`')
mot_frame_ids = [str(int(_.split('.')[0])) for _ in img_names]
with open(f'{video_folder}/gt/gt_{subset}.txt', 'wt') as f:
for gt in gts:
if gt.split(',')[0] in mot_frame_ids:
                            f.write(f'{gt}\n')
# image and box level infos
for frame_id, name in enumerate(img_names):
img_name = osp.join(video_name, img_folder, name)
mot_frame_id = int(name.split('.')[0])
image = dict(
id=img_id,
video_id=vid_id,
file_name=img_name,
height=height,
width=width,
frame_id=frame_id,
mot_frame_id=mot_frame_id)
if parse_gt:
gts = img2gts[mot_frame_id]
for gt in gts:
gt.update(id=ann_id, image_id=img_id)
mot_ins_id = gt['mot_instance_id']
if mot_ins_id in ins_maps:
gt['instance_id'] = ins_maps[mot_ins_id]
else:
gt['instance_id'] = ins_id
ins_maps[mot_ins_id] = ins_id
ins_id += 1
outputs['annotations'].append(gt)
ann_id += 1
if args.convert_det:
dets = np.array(img2dets[mot_frame_id])
                    # frames without detections produce an empty 1-D array;
                    # replace it with an empty (0, 5) array
                    if dets.ndim == 1:
                        assert len(dets) == 0
                        dets = np.zeros((0, 5))
detections['det_bboxes'][img_name] = [dets]
outputs['images'].append(image)
img_id += 1
outputs['videos'].append(video)
vid_id += 1
outputs['num_instances'] = ins_id
print(f'{subset} has {ins_id} instances.')
mmengine.dump(outputs, out_file)
if args.convert_det:
mmengine.dump(detections, det_file)
print(f'Done! Saved as {out_file} and {det_file}')
else:
print(f'Done! Saved as {out_file}')


if __name__ == '__main__':
main()
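
# Example usage (assuming this script is saved as mot2coco.py and the MOT
# data lives under ./data/MOT17):
#   python mot2coco.py -i ./data/MOT17 -o ./data/MOT17/annotations \
#       --split-train --convert-det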