# ------------------------------------------------------------------------
# Semantic SAM
# Copyright (c) MicroSoft, Inc. and its affiliates.
# Modified from OpenSeed https://github.com/IDEA-Research/OpenSeed by Feng Li.
# ------------------------------------------------------------------------

##################
# Task settings
##################
# Top-level scalars; WEIGHT is left as an empty string here.
WEIGHT: ''
PORT: 53711
VERBOSE: true

OUTPUT_DIR: '../../data/output/test'
# misc
LOADER:
  JOINT: true
  KEY_DATASET: 'coco'
# model
# MODEL groups the TEXT, BACKBONE (with SWIN), ENCODER and DECODER sub-sections.
# NOTE(review): nesting was restored from a flattened copy of this file.
# Two placements should be confirmed against the upstream config:
#   - the second GROUNDING mapping is nested under DECODER.OPENIMAGE (it would
#     otherwise be a duplicate key inside DECODER);
#   - the TEST mapping that follows MAX_NUM_INSTANCE is nested under DECODER
#     (a separate top-level TEST section exists elsewhere in the file).
# Keys such as AUTOGRESSIVE and TEST_FOUCUS_ON_BOX are kept exactly as spelled
# because consumers look them up by name.
MODEL:
  NAME: interactive_mask_dino
  HEAD: general_head
  MASK_ON: false
  KEYPOINT_ON: false
  LOAD_PROPOSALS: false
  DIM_PROJ: 512
  BACKBONE_DIM: 768
  BACKGROUND: false
  WEIGHTS: ''
  TEXT:
    ARCH: noencoder  # no language encoder for training only sa-1b data
    NAME: transformer
    TOKENIZER: clip
    CONTEXT_LENGTH: 18  # 77
    WIDTH: 512
    HEADS: 8
    LAYERS: 12  # 6
    AUTOGRESSIVE: true
  BACKBONE:
    NAME: swin
    PRETRAINED: 'https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_large_patch4_window12_384_22k.pth'
    LOAD_PRETRAINED: true
    SWIN:
      PRETRAIN_IMG_SIZE: 384
      PATCH_SIZE: 4
      EMBED_DIM: 192
      DEPTHS: [ 2, 2, 18, 2 ]
      NUM_HEADS: [ 6, 12, 24, 48 ]
      WINDOW_SIZE: 12
      MLP_RATIO: 4.0
      QKV_BIAS: true
      # Explicit null (same parsed value as the original `~`).
      QK_SCALE: null
      DROP_RATE: 0.0
      ATTN_DROP_RATE: 0.0
      DROP_PATH_RATE: 0.3
      APE: false
      PATCH_NORM: true
      USE_CHECKPOINT: false
      OUT_FEATURES: [ 'res2', 'res3', 'res4', 'res5' ]
  ENCODER:
    NAME: encoder_deform
    IGNORE_VALUE: 255
    NUM_CLASSES: 1
    LOSS_WEIGHT: 1.0
    CONVS_DIM: 256
    MASK_DIM: 256
    NORM: "GN"
    IN_FEATURES: [ "res2", "res3", "res4", "res5" ]
    DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: [ "res3", "res4", "res5" ]
    COMMON_STRIDE: 4
    TRANSFORMER_ENC_LAYERS: 6
    TOTAL_NUM_FEATURE_LEVELS: 4
    NUM_FEATURE_LEVELS: 3
    FEATURE_ORDER: "low2high"
  DECODER:
    NAME: interactive_mask_dino
    TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder"
    MASK: true
    BOX: true
    PART: true
    GROUNDING:
      ENABLED: false
      MAX_LEN: 5
      TEXT_WEIGHT: 2.0
      CLASS_WEIGHT: 0.5
    CAPTION:
      ENABLED: false
      PHRASE_PROB: 0.0
      SIM_THRES: 0.95
    CAPTIONING:
      ENABLED: false
      STEP: 50
    RETRIEVAL:
      ENABLED: false
      DIM_IMG: 768
      ENSEMBLE: true
    OPENIMAGE:
      ENABLED: false
      NEGATIVE_SAMPLES: 5
      GROUNDING:
        ENABLED: false
        MAX_LEN: 5
    DEEP_SUPERVISION: true
    NO_OBJECT_WEIGHT: 0.1
    CLASS_WEIGHT: 4.0
    MASK_WEIGHT: 5.0
    DICE_WEIGHT: 5.0
    BOX_WEIGHT: 5.0
    GIOU_WEIGHT: 2.0
    IOU_WEIGHT: 1.0
    COST_CLASS_WEIGHT: 4.0
    COST_DICE_WEIGHT: 5.0
    COST_MASK_WEIGHT: 5.0
    COST_BOX_WEIGHT: 5.0
    COST_GIOU_WEIGHT: 2.0
    HIDDEN_DIM: 256
    NUM_OBJECT_QUERIES: 0
    NHEADS: 8
    DROPOUT: 0.0
    DIM_FEEDFORWARD: 2048
    ENC_LAYERS: 0
    PRE_NORM: false
    ENFORCE_INPUT_PROJ: false
    SIZE_DIVISIBILITY: 32
    DEC_LAYERS: 9  # 9 decoder layers, add one for the loss on learnable query
    TRAIN_NUM_POINTS: 12544
    OVERSAMPLE_RATIO: 3.0
    IMPORTANCE_SAMPLE_RATIO: 0.75
    TWO_STAGE: false
    INITIALIZE_BOX_TYPE: 'no'
    DN: seg
    DN_NOISE_SCALE: 0.4
    DN_NUM: 100
    INITIAL_PRED: false
    LEARN_TGT: false
    TOTAL_NUM_FEATURE_LEVELS: 4
    SEMANTIC_CE_LOSS: false
    PANO_BOX_LOSS: false
    COCO: false
    O365: false
    SAM: true
    PASCAL: false
    RE_POINT: true
    NUM_INTERACTIVE_TOKENS: 6
    MAX_NUM_INSTANCE: 60
    TEST:
      SEMANTIC_ON: true
      INSTANCE_ON: true
      PANOPTIC_ON: true
      BOX_INTERACTIVE: false
      CLASSIFICATION_ON: false
      OVERLAP_THRESHOLD: 0.8
      OBJECT_MASK_THRESHOLD: 0.25
      SEM_SEG_POSTPROCESSING_BEFORE_INFERENCE: false
      TEST_FOUCUS_ON_BOX: false
      PANO_TRANSFORM_EVAL: true
      PANO_TEMPERATURE: 0.06
# Top-level TEST section: evaluation period plus PRECISE_BN and AUG toggles
# (both disabled here).
TEST:
  EVAL_PERIOD: 500000
  PRECISE_BN:
    NUM_ITER: 1
    ENABLED: false
  AUG:
    ENABLED: false
# SAM section: INPUT, DATASET ('sam'), TEST, TRAIN and DATALOADER settings.
SAM:
  INPUT:
    MIN_SIZE_TEST: 800
    MAX_SIZE_TEST: 1333
    IMAGE_SIZE: 1024
    MIN_SCALE: 0.99
    MAX_SCALE: 1.01
    DATASET_MAPPER_NAME: "sam"
    IGNORE_VALUE: 255
    COLOR_AUG_SSD: false
    SIZE_DIVISIBILITY: 32
    RANDOM_FLIP: "horizontal"
    MASK_FORMAT: "polygon"
    FORMAT: "RGB"
    CROP:
      ENABLED: true
  DATASET:
    DATASET: 'sam'
  TEST:
    DETECTIONS_PER_IMAGE: 100
    NAME: coco_eval
    IOU_TYPE: ['bbox', 'segm']
    USE_MULTISCALE: false
    BATCH_SIZE_TOTAL: 8
    MODEL_FILE: ''
    AUG:
      ENABLED: false
  TRAIN:
    BATCH_SIZE_TOTAL: 1
    BATCH_SIZE_PER_GPU: 1
    SHUFFLE: true
  DATALOADER:
    FILTER_EMPTY_ANNOTATIONS: false
    NUM_WORKERS: 4
    LOAD_PROPOSALS: false
    SAMPLER_TRAIN: "TrainingSampler"
    ASPECT_RATIO_GROUPING: true
# COCO section: INPUT, DATASET ('coco'), TEST, TRAIN and DATALOADER settings.
COCO:
  INPUT:
    MIN_SIZE_TEST: 800
    MAX_SIZE_TEST: 1333
    IMAGE_SIZE: 1024
    MIN_SCALE: 0.1
    MAX_SCALE: 2.0
    DATASET_MAPPER_NAME: "coco_interactive_panoptic_lsj"
    IGNORE_VALUE: 255
    COLOR_AUG_SSD: false
    SIZE_DIVISIBILITY: 32
    RANDOM_FLIP: "horizontal"
    MASK_FORMAT: "polygon"
    FORMAT: "RGB"
    CROP:
      ENABLED: true
  DATASET:
    DATASET: 'coco'
  TEST:
    DETECTIONS_PER_IMAGE: 100
    NAME: coco_eval
    IOU_TYPE: ['bbox', 'segm']
    USE_MULTISCALE: false
    BATCH_SIZE_TOTAL: 1
    MODEL_FILE: ''
    AUG:
      ENABLED: false
  TRAIN:
    BATCH_SIZE_TOTAL: 1
    BATCH_SIZE_PER_GPU: 1
    SHUFFLE: true
  DATALOADER:
    FILTER_EMPTY_ANNOTATIONS: false
    NUM_WORKERS: 2
    LOAD_PROPOSALS: false
    SAMPLER_TRAIN: "TrainingSampler"
    ASPECT_RATIO_GROUPING: true
# VLP section: INPUT (mapper "vlpretrain"), TRAIN/TEST batch sizes and
# DATALOADER settings.
VLP:
  INPUT:
    IMAGE_SIZE: 224
    DATASET_MAPPER_NAME: "vlpretrain"
    IGNORE_VALUE: 255
    COLOR_AUG_SSD: false
    SIZE_DIVISIBILITY: 32
    MASK_FORMAT: "polygon"
    FORMAT: "RGB"
    CROP:
      ENABLED: true
  TRAIN:
    BATCH_SIZE_TOTAL: 2
    BATCH_SIZE_PER_GPU: 2
  TEST:
    BATCH_SIZE_TOTAL: 256
  DATALOADER:
    FILTER_EMPTY_ANNOTATIONS: false
    NUM_WORKERS: 16
    LOAD_PROPOSALS: false
    SAMPLER_TRAIN: "TrainingSampler"
    ASPECT_RATIO_GROUPING: true
# Top-level INPUT section: per-channel pixel mean and std (three values each).
INPUT:
  PIXEL_MEAN: [123.675, 116.280, 103.530]
  PIXEL_STD: [58.395, 57.120, 57.375]
# Names of the train/test dataset splits plus a few dataset-level options.
DATASETS:
  TRAIN: ["sam_train"]
  # interactive segmentation evaluation.
  TEST: ["coco_2017_val_panoptic_with_sem_seg_interactive_jointboxpoint"]
  # TEST: ["sam_minival"]

  CLASS_CONCAT: false
  SIZE_DIVISIBILITY: 32
  PROPOSAL_FILES_TRAIN: []
# Top-level DATALOADER section; the per-dataset sections in this file carry
# their own DATALOADER mappings with the same keys.
DATALOADER:
  FILTER_EMPTY_ANNOTATIONS: false
  NUM_WORKERS: 16
  LOAD_PROPOSALS: false
  SAMPLER_TRAIN: "TrainingSampler"
  ASPECT_RATIO_GROUPING: true
# Detectron2 training config for optimizer and lr scheduler
SOLVER:
  BASE_LR_END: 0.0
  MOMENTUM: 0.9
  NESTEROV: false
  CHECKPOINT_PERIOD: 5000
  IMS_PER_BATCH: 1
  REFERENCE_WORLD_SIZE: 0
  BIAS_LR_FACTOR: 1.0
  # NOTE(review): a bare `None` is not YAML null; it loads as the string
  # 'None'. Value left unchanged here; confirm what the consumer expects and
  # switch to `null` if a real null is intended.
  WEIGHT_DECAY_BIAS: None
  # original
  BASE_LR: 0.0001
  STEPS: [327778, 355092]
  MAX_ITER: 368750
  GAMMA: 0.1
  WARMUP_FACTOR: 1.0
  WARMUP_ITERS: 10
  WARMUP_METHOD: "linear"
  WEIGHT_DECAY: 0.05
  OPTIMIZER: "ADAMW"
  LR_SCHEDULER_NAME: "WarmupMultiStepLR"
  # Multipliers keyed by name (lowercase keys, unlike the rest of the file).
  LR_MULTIPLIER:
    backbone: 0.1
    lang_encoder: 0.1
  WEIGHT_DECAY_NORM: 0.0
  WEIGHT_DECAY_EMBED: 0.0
  CLIP_GRADIENTS:
    ENABLED: true
    CLIP_TYPE: "full_model"
    CLIP_VALUE: 0.01
    NORM_TYPE: 2.0
  AMP:
    ENABLED: true
# Evaluation Dataset
# ADE20K section: INPUT, DATASET ('ade'), TRAIN, TEST and DATALOADER settings.
# NOTE(review): nesting was restored from a flattened copy. IGNORE_VALUE is
# placed directly under INPUT (as in the other dataset sections of this file)
# rather than under CROP; confirm against the upstream config.
ADE20K:
  INPUT:
    MIN_SIZE_TRAIN: [320, 384, 448, 512, 576, 640, 704, 768, 832, 896, 960, 1024, 1088, 1152, 1216, 1280]
    MIN_SIZE_TRAIN_SAMPLING: "choice"
    MIN_SIZE_TEST: 640
    MAX_SIZE_TRAIN: 2560
    MAX_SIZE_TEST: 2560
    MASK_FORMAT: "polygon"
    CROP:
      ENABLED: true
      TYPE: "absolute"
      SIZE: [640, 640]
      SINGLE_CATEGORY_MAX_AREA: 1.0
    IGNORE_VALUE: 255
    COLOR_AUG_SSD: true
    SIZE_DIVISIBILITY: 640  # used in dataset mapper
    DATASET_MAPPER_NAME: "mask_former_panoptic"
    FORMAT: "RGB"
  DATASET:
    DATASET: 'ade'
  TRAIN:
    ASPECT_RATIO_GROUPING: true
    BATCH_SIZE_TOTAL: 16
    BATCH_SIZE_PER_GPU: 2
    SHUFFLE: true
  TEST:
    DETECTIONS_PER_IMAGE: 100
    NAME: coco_eval
    IOU_TYPE: ['bbox', 'segm']
    USE_MULTISCALE: false
    BATCH_SIZE_TOTAL: 8
    MODEL_FILE: ''
    AUG:
      ENABLED: false
  DATALOADER:
    FILTER_EMPTY_ANNOTATIONS: false
    NUM_WORKERS: 8
    LOAD_PROPOSALS: false
    SAMPLER_TRAIN: "TrainingSampler"
    ASPECT_RATIO_GROUPING: true
#ADE20K:
#  INPUT:
#    MIN_SIZE_TRAIN: 640
#    MIN_SIZE_TRAIN_SAMPLING: "choice"
#    MIN_SIZE_TEST: 640
#    MAX_SIZE_TRAIN: 2560
#    MAX_SIZE_TEST: 2560
#    MASK_FORMAT: "polygon"
#    CROP:
#      ENABLED: True
#      TYPE: "absolute"
#      SIZE: (640, 640)
#      SINGLE_CATEGORY_MAX_AREA: 1.0
#    COLOR_AUG_SSD: True
#    SIZE_DIVISIBILITY: 640  # used in dataset mapper
#    DATASET_MAPPER_NAME: "mask_former_panoptic"
#    FORMAT: "RGB"
#  DATASET:
#    DATASET: 'ade'
#  TEST:
#    BATCH_SIZE_TOTAL: 8
# REF section: INPUT normalisation and test sizes, DATALOADER, TEST batch size.
REF:
  INPUT:
    PIXEL_MEAN: [123.675, 116.280, 103.530]
    PIXEL_STD: [58.395, 57.120, 57.375]
    MIN_SIZE_TEST: 512
    MAX_SIZE_TEST: 1024
    FORMAT: "RGB"
  DATALOADER:
    FILTER_EMPTY_ANNOTATIONS: false
    NUM_WORKERS: 0
    LOAD_PROPOSALS: false
    SAMPLER_TRAIN: "TrainingSampler"
    ASPECT_RATIO_GROUPING: false
  TEST:
    BATCH_SIZE_TOTAL: 8
# SUN section: INPUT normalisation and test sizes, DATALOADER, TEST batch size.
SUN:
  INPUT:
    PIXEL_MEAN: [123.675, 116.280, 103.530]
    PIXEL_STD: [58.395, 57.120, 57.375]
    MIN_SIZE_TEST: 512
    MAX_SIZE_TEST: 1024
  DATALOADER:
    FILTER_EMPTY_ANNOTATIONS: false
    NUM_WORKERS: 0
    LOAD_PROPOSALS: false
    SAMPLER_TRAIN: "TrainingSampler"
    ASPECT_RATIO_GROUPING: false
  TEST:
    BATCH_SIZE_TOTAL: 8
# SCAN section: INPUT normalisation and test sizes, DATALOADER, TEST batch size.
SCAN:
  INPUT:
    PIXEL_MEAN: [123.675, 116.280, 103.530]
    PIXEL_STD: [58.395, 57.120, 57.375]
    MIN_SIZE_TEST: 512
    MAX_SIZE_TEST: 1024
  DATALOADER:
    FILTER_EMPTY_ANNOTATIONS: false
    NUM_WORKERS: 0
    LOAD_PROPOSALS: false
    SAMPLER_TRAIN: "TrainingSampler"
    ASPECT_RATIO_GROUPING: false
  TEST:
    BATCH_SIZE_TOTAL: 8
# BDD section: INPUT normalisation and test sizes, DATALOADER, TEST batch size.
BDD:
  INPUT:
    PIXEL_MEAN: [123.675, 116.280, 103.530]
    PIXEL_STD: [58.395, 57.120, 57.375]
    MIN_SIZE_TEST: 800
    MAX_SIZE_TEST: 1333
  DATALOADER:
    FILTER_EMPTY_ANNOTATIONS: false
    NUM_WORKERS: 0
    LOAD_PROPOSALS: false
    SAMPLER_TRAIN: "TrainingSampler"
    ASPECT_RATIO_GROUPING: false
  TEST:
    BATCH_SIZE_TOTAL: 8
# CITY section: INPUT (multi-size training, absolute crop), TEST (with AUG
# sizes), DATALOADER and TRAIN settings.
# NOTE(review): nesting was restored from a flattened copy. IGNORE_VALUE is
# placed directly under INPUT (as in the other dataset sections of this file)
# rather than under CROP, and MIN_SIZES/MAX_SIZE/FLIP are placed under
# TEST.AUG; confirm against the upstream config.
CITY:
  INPUT:
    MIN_SIZE_TRAIN: [ 512, 614, 716, 819, 921, 1024, 1126, 1228, 1331, 1433, 1536, 1638, 1740, 1843, 1945, 2048 ]
    MIN_SIZE_TRAIN_SAMPLING: "choice"
    MIN_SIZE_TEST: 1024
    MAX_SIZE_TRAIN: 4096
    MAX_SIZE_TEST: 2048
    CROP:
      ENABLED: true
      TYPE: "absolute"
      SIZE: [ 512, 1024 ]
      SINGLE_CATEGORY_MAX_AREA: 1.0
    IGNORE_VALUE: 255
    COLOR_AUG_SSD: true
    SIZE_DIVISIBILITY: -1
    FORMAT: "RGB"
    DATASET_MAPPER_NAME: "mask_former_panoptic"
    MASK_FORMAT: "polygon"
  TEST:
    EVAL_PERIOD: 5000
    BATCH_SIZE_TOTAL: 1
    AUG:
      ENABLED: false
      MIN_SIZES: [ 512, 768, 1024, 1280, 1536, 1792 ]
      MAX_SIZE: 4096
      FLIP: true
  DATALOADER:
    FILTER_EMPTY_ANNOTATIONS: true
    NUM_WORKERS: 2
    LOAD_PROPOSALS: false
    SAMPLER_TRAIN: "TrainingSampler"
    ASPECT_RATIO_GROUPING: true
  TRAIN:
    ASPECT_RATIO_GROUPING: true
    BATCH_SIZE_TOTAL: 2
    BATCH_SIZE_PER_GPU: 2
    SHUFFLE: true
# Section using the "pascal_part_lsj" dataset mapper: INPUT, MODEL overrides,
# TEST, TRAIN and DATALOADER settings.
# NOTE(review): the key is spelled PSACAL_PART (compare the PASCAL flag under
# MODEL.DECODER). It is left unchanged because consumers look sections up by
# name; confirm the spelling the loader expects before renaming.
PSACAL_PART:
  INPUT:
    MIN_SIZE_TEST: 800
    MAX_SIZE_TEST: 1333
    IMAGE_SIZE: 1024
    MIN_SCALE: 0.1
    MAX_SCALE: 2.0
    DATASET_MAPPER_NAME: "pascal_part_lsj"
    IGNORE_VALUE: 255
    COLOR_AUG_SSD: false
    SIZE_DIVISIBILITY: 32
    RANDOM_FLIP: "horizontal"
    MASK_FORMAT: "polygon"
    FORMAT: "RGB"
    CROP:
      ENABLED: true
  MODEL:
    MASK_ON: true
    KEYPOINT_ON: false
    LOAD_PROPOSALS: false
  # DATASET:
  #   DATASET: 'coco'
  TEST:
    DETECTIONS_PER_IMAGE: 100
    NAME: coco_eval
    IOU_TYPE: ['bbox', 'segm']
    USE_MULTISCALE: false
    BATCH_SIZE_TOTAL: 8
    MODEL_FILE: ''
    AUG:
      ENABLED: false
  TRAIN:
    BATCH_SIZE_TOTAL: 1
    BATCH_SIZE_PER_GPU: 1
    SHUFFLE: true
  DATALOADER:
    FILTER_EMPTY_ANNOTATIONS: false
    NUM_WORKERS: 2
    LOAD_PROPOSALS: false
    SAMPLER_TRAIN: "TrainingSampler"
    ASPECT_RATIO_GROUPING: true