Spaces:
Runtime error
Runtime error
File size: 4,646 Bytes
7e8c559 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 |
# Copyright (c) Facebook, Inc. and its affiliates.
# Copyright (c) Meta Platforms, Inc. All Rights Reserved
from detectron2.config import CfgNode as CN
def add_mask_former_default_config(cfg):
# data config
# select the dataset mapper
cfg.INPUT.DATASET_MAPPER_NAME = "mask_former_semantic"
# Color augmentation
cfg.INPUT.COLOR_AUG_SSD = False
# We retry random cropping until no single category in semantic segmentation GT occupies more
# than `SINGLE_CATEGORY_MAX_AREA` part of the crop.
cfg.INPUT.CROP.SINGLE_CATEGORY_MAX_AREA = 1.0
# Pad image and segmentation GT in dataset mapper.
cfg.INPUT.SIZE_DIVISIBILITY = -1
# solver config
# test batch size
cfg.SOLVER.TEST_IMS_PER_BATCH = 1
# weight decay on embedding
cfg.SOLVER.WEIGHT_DECAY_EMBED = 0.0
# optimizer
cfg.SOLVER.OPTIMIZER = "ADAMW"
cfg.SOLVER.BACKBONE_MULTIPLIER = 0.1
# mask_former model config
cfg.MODEL.MASK_FORMER = CN()
# loss
cfg.MODEL.MASK_FORMER.DEEP_SUPERVISION = True
cfg.MODEL.MASK_FORMER.NO_OBJECT_WEIGHT = 0.1
cfg.MODEL.MASK_FORMER.DICE_WEIGHT = 1.0
cfg.MODEL.MASK_FORMER.MASK_WEIGHT = 20.0
# transformer config
cfg.MODEL.MASK_FORMER.NHEADS = 8
cfg.MODEL.MASK_FORMER.DROPOUT = 0.1
cfg.MODEL.MASK_FORMER.DIM_FEEDFORWARD = 2048
cfg.MODEL.MASK_FORMER.ENC_LAYERS = 0
cfg.MODEL.MASK_FORMER.DEC_LAYERS = 6
cfg.MODEL.MASK_FORMER.PRE_NORM = False
cfg.MODEL.MASK_FORMER.HIDDEN_DIM = 256
cfg.MODEL.MASK_FORMER.NUM_OBJECT_QUERIES = 100
cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE = "res5"
cfg.MODEL.MASK_FORMER.ENFORCE_INPUT_PROJ = False
# mask_former inference config
cfg.MODEL.MASK_FORMER.TEST = CN()
cfg.MODEL.MASK_FORMER.TEST.PANOPTIC_ON = False
cfg.MODEL.MASK_FORMER.TEST.OBJECT_MASK_THRESHOLD = 0.0
cfg.MODEL.MASK_FORMER.TEST.OVERLAP_THRESHOLD = 0.0
cfg.MODEL.MASK_FORMER.TEST.SEM_SEG_POSTPROCESSING_BEFORE_INFERENCE = False
# Sometimes `backbone.size_divisibility` is set to 0 for some backbone (e.g. ResNet)
# you can use this config to override
cfg.MODEL.MASK_FORMER.SIZE_DIVISIBILITY = 32
# pixel decoder config
cfg.MODEL.SEM_SEG_HEAD.MASK_DIM = 256
# adding transformer in pixel decoder
cfg.MODEL.SEM_SEG_HEAD.TRANSFORMER_ENC_LAYERS = 0
# pixel decoder
cfg.MODEL.SEM_SEG_HEAD.PIXEL_DECODER_NAME = "BasePixelDecoder"
# swin transformer backbone
cfg.MODEL.SWIN = CN()
cfg.MODEL.SWIN.PRETRAIN_IMG_SIZE = 224
cfg.MODEL.SWIN.PATCH_SIZE = 4
cfg.MODEL.SWIN.EMBED_DIM = 96
cfg.MODEL.SWIN.DEPTHS = [2, 2, 6, 2]
cfg.MODEL.SWIN.NUM_HEADS = [3, 6, 12, 24]
cfg.MODEL.SWIN.WINDOW_SIZE = 7
cfg.MODEL.SWIN.MLP_RATIO = 4.0
cfg.MODEL.SWIN.QKV_BIAS = True
cfg.MODEL.SWIN.QK_SCALE = None
cfg.MODEL.SWIN.NORM_INDICES = None
cfg.MODEL.SWIN.PROJECTION = False
cfg.MODEL.SWIN.PROJECT_DIM = 256
cfg.MODEL.SWIN.DROP_RATE = 0.0
cfg.MODEL.SWIN.ATTN_DROP_RATE = 0.0
cfg.MODEL.SWIN.DROP_PATH_RATE = 0.3
cfg.MODEL.SWIN.APE = False
cfg.MODEL.SWIN.PATCH_NORM = True
cfg.MODEL.SWIN.OUT_FEATURES = ["res2", "res3", "res4", "res5"]
def add_our_config(cfg):
cfg.TEST.SLIDING_WINDOW = False
cfg.TEST.SLIDING_TILE_SIZE = 224
cfg.TEST.SLIDING_OVERLAP = 2 / 3.0
# whether to use dense crf
cfg.TEST.DENSE_CRF = False
cfg.DATASETS.SAMPLE_PER_CLASS = -1
cfg.DATASETS.SAMPLE_SEED = 0
# embedding head
cfg.MODEL.SEM_SEG_HEAD.EMBEDDING_DIM = 512
cfg.MODEL.SEM_SEG_HEAD.EMBED_HIDDEN_DIM = 1024
cfg.MODEL.SEM_SEG_HEAD.EMBED_LAYERS = 2
# clip_adapter
cfg.MODEL.CLIP_ADAPTER = CN()
cfg.MODEL.CLIP_ADAPTER.TEXT_TEMPLATES = "vild"
# for predefined
cfg.MODEL.CLIP_ADAPTER.PREDEFINED_PROMPT_TEMPLATES = ["a photo of a {}."]
# for learnable prompt
cfg.MODEL.CLIP_ADAPTER.PROMPT_CHECKPOINT = ""
cfg.MODEL.CLIP_ADAPTER.CLIP_MODEL_NAME = "ViT-B/16"
cfg.MODEL.CLIP_ADAPTER.MASK_FILL = "mean"
cfg.MODEL.CLIP_ADAPTER.MASK_EXPAND_RATIO = 1.0
cfg.MODEL.CLIP_ADAPTER.MASK_THR = 0.4
cfg.MODEL.CLIP_ADAPTER.MASK_MATTING = False
cfg.MODEL.CLIP_ADAPTER.REGION_RESIZED = True
cfg.MODEL.CLIP_ADAPTER.CLIP_ENSEMBLE = True
cfg.MODEL.CLIP_ADAPTER.CLIP_ENSEMBLE_WEIGHT = 0.7
# for mask prompt
cfg.MODEL.CLIP_ADAPTER.MASK_PROMPT_DEPTH = 3
cfg.MODEL.CLIP_ADAPTER.MASK_PROMPT_FWD = False
# wandb
cfg.WANDB = CN()
cfg.WANDB.PROJECT = "open_vocab_seg"
cfg.WANDB.NAME = None
def add_ovseg_config(cfg):
"""
Add config for open_vocab_seg.
"""
add_mask_former_default_config(cfg)
add_our_config(cfg)
|