# coding=utf-8
# Copyright 2021 The Deeplab2 Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Provides data from segmentation datasets.

Currently, we support the following datasets:

1. Cityscapes dataset (https://www.cityscapes-dataset.com).

The Cityscapes dataset contains 19 semantic labels (such as road, person, car,
and so on) for urban street scenes.


2. KITTI-STEP (http://www.cvlibs.net/datasets/kitti/).

The KITTI-STEP dataset enriches the KITTI-MOTS data with additional `stuff'
annotations.

3. MOTChallenge-STEP (https://motchallenge.net/).

The MOTChallenge-STEP dataset enriches the MOTSChallenge data with additional
`stuff' annotations.

4. MSCOCO panoptic segmentation (http://cocodataset.org/#panoptic-2018).

Panoptic segmentation annotations for the MSCOCO dataset. Note that we convert
the provided MSCOCO panoptic segmentation format to the following one:
panoptic label = semantic label * 256 + instance id.
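
For example, a pixel with semantic label 3 and instance id 12 is stored as
3 * 256 + 12 = 780, and divmod(780, 256) recovers (3, 12); the specific values
here are purely illustrative.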

5. Cityscapes-DVPS (https://github.com/joe-siyuan-qiao/ViP-DeepLab).

The Cityscapes-DVPS dataset augments Cityscapes-VPS
(https://github.com/mcahny/vps) with depth annotations.


References:

- Marius Cordts, Mohamed Omran, Sebastian Ramos, Timo Rehfeld, Markus
  Enzweiler, Rodrigo Benenson, Uwe Franke, Stefan Roth, and Bernt Schiele, "The
  Cityscapes Dataset for Semantic Urban Scene Understanding." In CVPR, 2016.

- Andreas Geiger, Philip Lenz, and Raquel Urtasun, "Are we ready for
  Autonomous Driving? The KITTI Vision Benchmark Suite." In CVPR, 2012.

- Alexander Kirillov, Kaiming He, Ross Girshick, Carsten Rother, and Piotr
  Dollar, "Panoptic Segmentation." In CVPR, 2019.

- Tsung-Yi Lin, Michael Maire, Serge J. Belongie, Lubomir D. Bourdev, Ross B.
  Girshick, James Hays, Pietro Perona, Deva Ramanan, Piotr Dollar, and C.
  Lawrence Zitnick, "Microsoft COCO: Common Objects in Context." In ECCV, 2014.

- Anton Milan, Laura Leal-Taixe, Ian Reid, Stefan Roth, and Konrad Schindler,
  "Mot16: A benchmark for multi-object tracking." arXiv:1603.00831, 2016.

- Paul Voigtlaender, Michael Krause, Aljosa Osep, Jonathon Luiten, Berin
  Balachandar Gnana Sekar, Andreas Geiger, and Bastian Leibe. "MOTS:
  Multi-Object Tracking and Segmentation." In CVPR, 2019.

- Mark Weber, Jun Xie, Maxwell Collins, Yukun Zhu, Paul Voigtlaender, Hartwig
  Adam, Bradley Green, Andreas Geiger, Bastian Leibe, Daniel Cremers, Aljosa
  Osep, Laura Leal-Taixe, and Liang-Chieh Chen, "STEP: Segmenting and Tracking
  Every Pixel." arXiv:2102.11859, 2021.

- Dahun Kim, Sanghyun Woo, Joon-Young Lee, and In So Kweon. "Video Panoptic
  Segmentation." In CVPR, 2020.

- Siyuan Qiao, Yukun Zhu, Hartwig Adam, Alan Yuille, and Liang-Chieh Chen.
  "ViP-DeepLab: Learning Visual Perception with Depth-aware Video Panoptic
  Segmentation." In CVPR, 2021.
"""

import collections


# Dataset names.
_CITYSCAPES = 'cityscapes'
_CITYSCAPES_PANOPTIC = 'cityscapes_panoptic'
_KITTI_STEP = 'kitti_step'
_MOTCHALLENGE_STEP = 'motchallenge_step'
_CITYSCAPES_DVPS = 'cityscapes_dvps'
_COCO_PANOPTIC = 'coco_panoptic'

# Colormap names.
_CITYSCAPES_COLORMAP = 'cityscapes'
_MOTCHALLENGE_COLORMAP = 'motchallenge'
_COCO_COLORMAP = 'coco'


# Named tuple to describe dataset properties.
DatasetDescriptor = collections.namedtuple(
    'DatasetDescriptor', [
        'dataset_name',  # Dataset name.
        'splits_to_sizes',  # Splits of the dataset into training, val and test.
        'num_classes',   # Number of semantic classes.
        'ignore_label',  # Ignore label value used for semantic segmentation.

        # Fields below are used for panoptic segmentation and will be None for
        # semantic segmentation datasets.
        # Label divisor used only in panoptic segmentation annotations to
        # infer the semantic label and instance id.
        'panoptic_label_divisor',
        # A tuple of classes that contain instance annotations. For example,
        # the 'person' class has instance annotations while 'sky' does not.
        'class_has_instances_list',
        # A flag indicating whether the dataset is a video dataset that contains
        # sequence IDs and frame IDs.
        'is_video_dataset',
        # A string specifying the colormap that should be used for
        # visualization. E.g. 'cityscapes'.
        'colormap',
        # A flag indicating whether the dataset contains depth annotations.
        'is_depth_dataset',
    ]
)
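

# A minimal sketch (illustrative only, not part of the original DeepLab2 API):
# given a DatasetDescriptor, its `panoptic_label_divisor` can be used to split
# an encoded panoptic label back into a semantic label and an instance id,
# assuming labels follow the `semantic_label * divisor + instance_id`
# convention described in the module docstring.
def _split_panoptic_label_example(panoptic_label, dataset_descriptor):
  """Returns (semantic_label, instance_id) for an encoded panoptic label."""
  divisor = dataset_descriptor.panoptic_label_divisor
  # divmod is the inverse of `semantic_label * divisor + instance_id`.
  return divmod(panoptic_label, divisor)
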

CITYSCAPES_INFORMATION = DatasetDescriptor(
    dataset_name=_CITYSCAPES,
    splits_to_sizes={'train_fine': 2975,
                     'train_coarse': 22973,
                     'trainval_fine': 3475,
                     'trainval_coarse': 23473,
                     'val_fine': 500,
                     'test_fine': 1525},
    num_classes=19,
    ignore_label=255,
    panoptic_label_divisor=None,
    class_has_instances_list=None,
    is_video_dataset=False,
    colormap=_CITYSCAPES_COLORMAP,
    is_depth_dataset=False,
)

CITYSCAPES_PANOPTIC_INFORMATION = DatasetDescriptor(
    dataset_name=_CITYSCAPES_PANOPTIC,
    splits_to_sizes={'train_fine': 2975,
                     'val_fine': 500,
                     'trainval_fine': 3475,
                     'test_fine': 1525},
    num_classes=19,
    ignore_label=255,
    panoptic_label_divisor=1000,
    class_has_instances_list=tuple(range(11, 19)),
    is_video_dataset=False,
    colormap=_CITYSCAPES_COLORMAP,
    is_depth_dataset=False,
)

KITTI_STEP_INFORMATION = DatasetDescriptor(
    dataset_name=_KITTI_STEP,
    splits_to_sizes={'train': 5027,
                     'val': 2981,
                     'test': 11095},
    num_classes=19,
    ignore_label=255,
    panoptic_label_divisor=1000,
    class_has_instances_list=(11, 13),
    is_video_dataset=True,
    colormap=_CITYSCAPES_COLORMAP,
    is_depth_dataset=False,
)

MOTCHALLENGE_STEP_INFORMATION = DatasetDescriptor(
    dataset_name=_MOTCHALLENGE_STEP,
    splits_to_sizes={'train': 525,  # Sequence 9.
                     'val': 600,  # Sequence 2.
                     'test': 0},
    num_classes=7,
    ignore_label=255,
    panoptic_label_divisor=1000,
    class_has_instances_list=(4,),
    is_video_dataset=True,
    colormap=_MOTCHALLENGE_COLORMAP,
    is_depth_dataset=False,
)

CITYSCAPES_DVPS_INFORMATION = DatasetDescriptor(
    dataset_name=_CITYSCAPES_DVPS,
    # The numbers of images are 2400/300/300 for train/val/test. Here, the
    # sizes are the number of consecutive frame pairs. As each sequence has 6
    # frames, the number of pairs for the train split is 2400 / 6 * 5 = 2000.
    # Similarly, we get 250 pairs for the val split and the test split.
    splits_to_sizes={'train': 2000,
                     'val': 250,
                     'test': 250},
    num_classes=19,
    ignore_label=255,
    panoptic_label_divisor=1000,
    class_has_instances_list=tuple(range(11, 19)),
    is_video_dataset=True,
    colormap=_CITYSCAPES_COLORMAP,
    is_depth_dataset=True,
)

COCO_PANOPTIC_INFORMATION = DatasetDescriptor(
    dataset_name=_COCO_PANOPTIC,
    splits_to_sizes={'train': 118287,
                     'val': 5000,
                     'test': 40670},
    num_classes=134,
    ignore_label=0,
    panoptic_label_divisor=256,
    class_has_instances_list=tuple(range(1, 81)),
    is_video_dataset=False,
    colormap=_COCO_COLORMAP,
    is_depth_dataset=False,
)

MAP_NAME_TO_DATASET_INFO = {
    _CITYSCAPES: CITYSCAPES_INFORMATION,
    _CITYSCAPES_PANOPTIC: CITYSCAPES_PANOPTIC_INFORMATION,
    _KITTI_STEP: KITTI_STEP_INFORMATION,
    _MOTCHALLENGE_STEP: MOTCHALLENGE_STEP_INFORMATION,
    _CITYSCAPES_DVPS: CITYSCAPES_DVPS_INFORMATION,
    _COCO_PANOPTIC: COCO_PANOPTIC_INFORMATION,
}

MAP_NAMES = list(MAP_NAME_TO_DATASET_INFO.keys())
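

# Usage sketch (illustrative only): look up a dataset descriptor by name and
# decode an example panoptic label with its divisor. The class id 11 below is
# simply one of the 'thing' classes listed in CITYSCAPES_PANOPTIC_INFORMATION,
# and instance id 7 is made up for demonstration.
if __name__ == '__main__':
  _info = MAP_NAME_TO_DATASET_INFO[_CITYSCAPES_PANOPTIC]
  _panoptic_label = 11 * _info.panoptic_label_divisor + 7
  _semantic_label, _instance_id = _split_panoptic_label_example(
      _panoptic_label, _info)
  print('dataset:', _info.dataset_name)
  print('semantic label:', _semantic_label, 'instance id:', _instance_id)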