# regionclip-demo / configs / CLIP_fast_rcnn_swin_base_C4.yaml
# Author: jwyang — "first commit" (4121bec)
# Inherit shared R-CNN C4 defaults; every key below overrides the base config.
_BASE_: "./Base-RCNN-C4.yaml"
MODEL:
  META_ARCHITECTURE: "CLIPFastRCNN"  # alternatives: "CLIPRCNN", "GeneralizedRCNN"
  BACKBONE:
    NAME: "build_clip_swin"  # alternative: "build_resnet_fpn_backbone"
    FREEZE_AT: 2  # freeze the first two backbone stages
  TEXT_BACKBONE:
    NAME: "build_clip_swin_text_backbone"
  # NOTE(review): indentation was lost in the source; SPEC is assumed to sit
  # directly under MODEL (MODEL.SPEC) with VISION/TEXT sub-specs — confirm
  # against the RegionCLIP default config.
  SPEC:
    EMBED_DIM: 512  # joint image-text embedding dimension
    VISION:
      # Swin-Base visual encoder hyperparameters.
      PATCH_SIZE: 4
      IN_CHANS: 3
      EMBED_DIM: 128
      DEPTHS: [2, 2, 18, 2]
      NUM_HEADS: [4, 8, 16, 32]
      WINDOW_SIZE: 7
      MLP_RATIO: 4.0
      QKV_BIAS: true
      APE: false  # no absolute position embedding
      PATCH_NORM: true
      DROP_RATE: 0.0
      DROP_PATH_RATE: 0.2
      OUT_FEATURES: ["stage2", "stage3", "stage4", "stage5"]
    TEXT:
      # CLIP-style text transformer.
      NAME: "transformer"
      TOKENIZER: clip
      CONTEXT_LENGTH: 77
      WIDTH: 512
      HEADS: 8
      LAYERS: 12
  WEIGHTS: ""  # e.g. "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
  MASK_ON: true
  RPN:
    HEAD_NAME: StandardRPNHead
    IN_FEATURES: ["stage4"]
  ROI_HEADS:
    NAME: "CLIPSwinROIHeads"  # alternatives: "Res5ROIHeads", "StandardROIHeads"
    IN_FEATURES: ["stage4"]
    NUM_CLASSES: 1203  # LVIS v1 category count
    SCORE_THRESH_TEST: 0.0001
  ROI_BOX_HEAD:
    NAME: ""
    NUM_FC: 0
    POOLER_RESOLUTION: 14
  ROI_MASK_HEAD:
    NAME: "MaskRCNNConvUpsampleHead"
    NUM_CONV: 0
    POOLER_RESOLUTION: 14
  # ImageNet normalization in RGB order (matches INPUT.FORMAT: "RGB").
  PIXEL_MEAN: [0.485, 0.456, 0.406]
  PIXEL_STD: [0.229, 0.224, 0.225]
# NOTE(review): the original file declares top-level INPUT twice (here and at
# the bottom of the file). Under last-key-wins YAML parsing this stanza is
# superseded by the later INPUT block, which carries the identical
# MIN_SIZE_TRAIN plus the test-time sizes — so this duplicate was dead config.
# Kept commented out to remove the duplicate-key error without losing history.
# INPUT:
#   MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
DATASETS:
  # NOTE(review): the "(…,)" values are Python tuple literals stored as YAML
  # strings — the detectron2/yacs loader evaluates them. Keep them as strings;
  # do not convert to YAML lists.
  TRAIN: ("lvis_v1_train",)
  TEST: ("lvis_v1_val",)
TEST:
  DETECTIONS_PER_IMAGE: 300  # LVIS evaluation allows up to 300 detections per image
  EVAL_PERIOD: 25000  # run evaluation every 25k iterations
SOLVER:
  IMS_PER_BATCH: 16
  BASE_LR: 0.02
  # LR decay milestones; tuple literal kept as a string for the yacs loader.
  STEPS: (120000, 160000)
  MAX_ITER: 180000  # 180000 iters * 16 ims / ~100k LVIS images ≈ 28.8 epochs
DATALOADER:
  # Repeat-factor sampling oversamples rare categories (standard for LVIS).
  SAMPLER_TRAIN: "RepeatFactorTrainingSampler"
  REPEAT_THRESHOLD: 0.001
# NOTE(review): INPUT appears twice at top level in the original file; parsers
# that accept duplicate keys take this later occurrence.
INPUT:
  MIN_SIZE_TRAIN_SAMPLING: choice  # pick one size per image from MIN_SIZE_TRAIN
  # Tuple literal kept as a string for the yacs loader.
  MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
  MAX_SIZE_TRAIN: 1333
  MIN_SIZE_TEST: 800
  MAX_SIZE_TEST: 1333
  FORMAT: "RGB"  # matches the ImageNet RGB pixel mean/std under MODEL