# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

# This script converts MOT labels into COCO style.
# Official website of the MOT dataset: https://motchallenge.net/
#
# Label format of MOT dataset:
#   GTs:
#       <frame_id>  # starts from 1 but COCO style starts from 0,
#       <instance_id>, <x1>, <y1>, <w>, <h>,
#       <conf>  # conf is annotated as 0 if the object is ignored,
#       <class_id>, <visibility>
#
#   DETs and Results:
#       <frame_id>, <instance_id>, <x1>, <y1>, <w>, <h>, <conf>,
#       <x>, <y>, <z>  # for 3D objects

import argparse
import os
import os.path as osp
from collections import defaultdict

import mmengine
import numpy as np
from tqdm import tqdm

# Classes in MOT:
CLASSES = [
    dict(id=1, name='pedestrian'),
    dict(id=2, name='person_on_vehicle'),
    dict(id=3, name='car'),
    dict(id=4, name='bicycle'),
    dict(id=5, name='motorbike'),
    dict(id=6, name='non_mot_vehicle'),
    dict(id=7, name='static_person'),
    dict(id=8, name='distractor'),
    dict(id=9, name='occluder'),
    dict(id=10, name='occluder_on_ground'),
    dict(id=11, name='occluder_full'),
    dict(id=12, name='reflection'),
    dict(id=13, name='crowd')
]


def parse_args():
    parser = argparse.ArgumentParser(
        description='Convert MOT label and detections to COCO-VID format.')
    parser.add_argument('-i', '--input', help='path of MOT data')
    parser.add_argument(
        '-o', '--output', help='path to save coco formatted label file')
    parser.add_argument(
        '--convert-det',
        action='store_true',
        help='convert official detection results.')
    parser.add_argument(
        '--split-train',
        action='store_true',
        help='split the train set into half-train and half-validate.')
    return parser.parse_args()
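# Example invocation (the script filename and data paths below are
# illustrative, not mandated by the code; all flags are defined in
# parse_args above):
#   python mot2coco.py -i ./data/MOT17 -o ./data/MOT17/annotations \
#       --split-train --convert-det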
def parse_gts(gts, is_mot15):
    outputs = defaultdict(list)
    for gt in gts:
        gt = gt.strip().split(',')
        frame_id, ins_id = map(int, gt[:2])
        bbox = list(map(float, gt[2:6]))
        if is_mot15:
            # MOT15 GT lines carry no conf, class or visibility fields
            conf = 1.
            category_id = 1
            visibility = 1.
        else:
            conf = float(gt[6])
            category_id = int(gt[7])
            visibility = float(gt[8])
        anns = dict(
            category_id=category_id,
            bbox=bbox,
            area=bbox[2] * bbox[3],
            iscrowd=False,
            visibility=visibility,
            mot_instance_id=ins_id,
            mot_conf=conf)
        outputs[frame_id].append(anns)
    return outputs


def parse_dets(dets):
    outputs = defaultdict(list)
    for det in dets:
        det = det.strip().split(',')
        frame_id, ins_id = map(int, det[:2])
        assert ins_id == -1
        bbox = list(map(float, det[2:7]))
        # convert [x1, y1, w, h] to [x1, y1, x2, y2] (keeping conf as the
        # fifth element) to be consistent with mmdet
        bbox = [
            bbox[0], bbox[1], bbox[0] + bbox[2], bbox[1] + bbox[3], bbox[4]
        ]
        outputs[frame_id].append(bbox)
    return outputs
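# For illustration, with hypothetical values: a non-MOT15 gt.txt line such as
#   '1,1,912,484,97,109,0,7,1'
# is grouped by parse_gts under frame 1 as
#   dict(category_id=7, bbox=[912.0, 484.0, 97.0, 109.0], area=10573.0,
#        iscrowd=False, visibility=1.0, mot_instance_id=1, mot_conf=0.0)
# while a det.txt line such as '1,-1,912,484,97,109,0.9' becomes the
# [x1, y1, x2, y2, conf] entry [912.0, 484.0, 1009.0, 593.0, 0.9].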
def main():
    args = parse_args()
    if not osp.isdir(args.output):
        os.makedirs(args.output)

    sets = ['train', 'test']
    if args.split_train:
        sets += ['half-train', 'half-val']
    vid_id, img_id, ann_id = 1, 1, 1

    for subset in sets:
        ins_id = 0
        print(f'Converting {subset} set to COCO format')
        if 'half' in subset:
            in_folder = osp.join(args.input, 'train')
        else:
            in_folder = osp.join(args.input, subset)
        out_file = osp.join(args.output, f'{subset}_cocoformat.json')
        outputs = defaultdict(list)
        outputs['categories'] = CLASSES
        if args.convert_det:
            det_file = osp.join(args.output, f'{subset}_detections.pkl')
            detections = dict(det_bboxes=dict())
        video_names = os.listdir(in_folder)
        for video_name in tqdm(video_names):
            # basic params
            parse_gt = 'test' not in subset
            ins_maps = dict()
            # load video infos
            video_folder = osp.join(in_folder, video_name)
            infos = mmengine.list_from_file(f'{video_folder}/seqinfo.ini')
            # video-level infos
            assert video_name == infos[1].strip().split('=')[1]
            img_folder = infos[2].strip().split('=')[1]
            img_names = os.listdir(f'{video_folder}/{img_folder}')
            img_names = sorted(img_names)
            fps = int(infos[3].strip().split('=')[1])
            num_imgs = int(infos[4].strip().split('=')[1])
            assert num_imgs == len(img_names)
            width = int(infos[5].strip().split('=')[1])
            height = int(infos[6].strip().split('=')[1])
            video = dict(
                id=vid_id,
                name=video_name,
                fps=fps,
                width=width,
                height=height)
            # parse annotations
            if parse_gt:
                gts = mmengine.list_from_file(f'{video_folder}/gt/gt.txt')
                if 'MOT15' in video_folder:
                    img2gts = parse_gts(gts, True)
                else:
                    img2gts = parse_gts(gts, False)
            if args.convert_det:
                dets = mmengine.list_from_file(f'{video_folder}/det/det.txt')
                img2dets = parse_dets(dets)
            # make half sets
            if 'half' in subset:
                split_frame = num_imgs // 2 + 1
                if 'train' in subset:
                    img_names = img_names[:split_frame]
                elif 'val' in subset:
                    img_names = img_names[split_frame:]
                else:
                    raise ValueError(
                        'subset must be named with `train` or `val`')
                mot_frame_ids = [str(int(_.split('.')[0])) for _ in img_names]
                with open(f'{video_folder}/gt/gt_{subset}.txt', 'wt') as f:
                    for gt in gts:
                        if gt.split(',')[0] in mot_frame_ids:
                            f.writelines(f'{gt}\n')
            # image and box level infos
            for frame_id, name in enumerate(img_names):
                img_name = osp.join(video_name, img_folder, name)
                mot_frame_id = int(name.split('.')[0])
                image = dict(
                    id=img_id,
                    video_id=vid_id,
                    file_name=img_name,
                    height=height,
                    width=width,
                    frame_id=frame_id,
                    mot_frame_id=mot_frame_id)
                if parse_gt:
                    gts = img2gts[mot_frame_id]
                    for gt in gts:
                        gt.update(id=ann_id, image_id=img_id)
                        mot_ins_id = gt['mot_instance_id']
                        if mot_ins_id in ins_maps:
                            gt['instance_id'] = ins_maps[mot_ins_id]
                        else:
                            gt['instance_id'] = ins_id
                            ins_maps[mot_ins_id] = ins_id
                            ins_id += 1
                        outputs['annotations'].append(gt)
                        ann_id += 1
                if args.convert_det:
                    dets = np.array(img2dets[mot_frame_id])
                    if dets.ndim == 1:
                        assert len(dets) == 0
                        dets = np.zeros((0, 5))
                    detections['det_bboxes'][img_name] = [dets]
                outputs['images'].append(image)
                img_id += 1
            outputs['videos'].append(video)
            vid_id += 1
        outputs['num_instances'] = ins_id
        print(f'{subset} has {ins_id} instances.')
        mmengine.dump(outputs, out_file)
        if args.convert_det:
            mmengine.dump(detections, det_file)
            print(f'Done! Saved as {out_file} and {det_file}')
        else:
            print(f'Done! Saved as {out_file}')


if __name__ == '__main__':
    main()
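# A quick sanity check on the result (mmengine.load is the read counterpart
# of the mmengine.dump calls above; the path is illustrative):
#   ann = mmengine.load('./data/MOT17/annotations/train_cocoformat.json')
#   print(len(ann['videos']), len(ann['images']), ann['num_instances'])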