diff --git a/.gitignore b/.gitignore index cedaca8..6aaa755 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,7 @@ +# Model checkpoints +*.pth +*.pt + # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] diff --git a/mm_agents/SoM_agent.py b/mm_agents/SoM_agent.py index e115203..e3b3e59 100644 --- a/mm_agents/SoM_agent.py +++ b/mm_agents/SoM_agent.py @@ -4,7 +4,6 @@ import os import re import base64 import PIL.Image -from desktop_env.envs.desktop_env import Action, MouseClick import json import requests @@ -15,7 +14,7 @@ import argparse from seem.modeling.BaseModel import BaseModel as BaseModel_Seem from seem.utils.distributed import init_distributed as init_distributed_seem from seem.modeling import build_model as build_model_seem -from task_adapter.seem.tasks import interactive_seem_m2m_auto, inference_seem_pano, inference_seem_interactive +from task_adapter.seem.tasks import inference_seem_pano # semantic sam from semantic_sam.BaseModel import BaseModel @@ -28,9 +27,9 @@ from task_adapter.semantic_sam.tasks import inference_semsam_m2m_auto, prompt_sw # sam from segment_anything import sam_model_registry from task_adapter.sam.tasks.inference_sam_m2m_auto import inference_sam_m2m_auto -from task_adapter.sam.tasks.inference_sam_m2m_interactive import inference_sam_m2m_interactive from scipy.ndimage import label +from io import BytesIO import numpy as np SYS_PROMPT = ''' @@ -45,7 +44,7 @@ Firstly you need to predict the class of your action, select from one below: for example, format as: ``` { - "action_type": "MOUSE_MOVE", + "action_type": "CLICK", "label": 7 } ``` @@ -107,7 +106,11 @@ def inference(image, slider, mode, alpha, label_mode, anno_mode, *args, **kwargs else: level = [6, 1, 2, 3, 4, 5] - label_mode = 'a' if label_mode == 'Alphabet' else '1' + if label_mode == 'Alphabet': + label_mode = 'a' + else: + label_mode = '1' + text_size, hole_scale, island_scale = 1280, 100, 100 text, text_part, text_thresh = '', '', '0.0' @@ -126,11 +129,15 @@ def inference(image, slider, mode, alpha, label_mode, anno_mode, *args, **kwargs model = model_seem output, mask = inference_seem_pano(model, image, text_size, label_mode, alpha, anno_mode) - return output + return output, mask # Function to encode the image def encode_image(image): - return base64.b64encode(image).decode('utf-8') + pil_img = PIL.Image.fromarray(image) + buff = BytesIO() + pil_img.save(buff, format="JPEG") + new_image_string = base64.b64encode(buff.getvalue()).decode("utf-8") + return new_image_string def parse_actions_from_string(input_string): # Search for a JSON string within the input string @@ -187,7 +194,8 @@ class GPT4v_Agent: ] def predict(self, obs): - obs = inference(obs, slider=2.0, mode="Automatic", alpha=0.1, label_mode="Number", anno_mode=["Mark", "Box"]) + obs, mask = inference(obs, slider=3.0, mode="Automatic", alpha=0.1, label_mode="Number", anno_mode=["Mark", "Box"]) + PIL.Image.fromarray(obs).save("desktop.jpeg") base64_image = encode_image(obs) self.trajectory.append({ "role": "user", @@ -218,14 +226,14 @@ class GPT4v_Agent: response = requests.post("https://api.openai.com/v1/chat/completions", headers=self.headers, json=payload) try: - actions = self.parse_actions(response.json()['choices'][0]['message']['content']) + actions = self.parse_actions(response.json()['choices'][0]['message']['content'], mask) except: print("Failed to parse action from response:", response.json()['choices'][0]['message']['content']) actions = None return actions - def parse_actions(self, response: str): + def 
parse_actions(self, response: str, mask): # response example """ ```json @@ -238,6 +246,7 @@ class GPT4v_Agent: # parse from the response actions = parse_actions_from_string(response) + print(actions) # add action into the trajectory self.trajectory.append({ @@ -253,24 +262,14 @@ class GPT4v_Agent: # parse action parsed_actions = [] for action in actions: - parsed_action = {} - action_type = Action[action['action_type']].value - parsed_action["action_type"] = action_type + action_type = action['action_type'] + if action_type == "CLICK": + label = int(action['label']) + x, y, w, h = mask[label-1]['bbox'] + parsed_actions.append({"action_type": action_type, "x": int(x + w//2) , "y": int(y + h//2)}) - if action_type == Action.CLICK.value or action_type == Action.MOUSE_DOWN.value or action_type == Action.MOUSE_UP.value: - parsed_action["click_type"] = MouseClick[action['click_type']].value - - if action_type == Action.MOUSE_MOVE.value: - parsed_action["x"] = action["x"] - parsed_action["y"] = action["y"] - - if action_type == Action.KEY.value: - parsed_action["key"] = action["key"] # handle the condition of single key and multiple keys - - if action_type == Action.TYPE.value: - parsed_action["text"] = action["text"] - - parsed_actions.append(parsed_action) + if action_type == "TYPE": + parsed_actions.append({"action_type": action_type, "text": action["text"]}) return parsed_actions @@ -279,6 +278,6 @@ if __name__ == '__main__': # OpenAI API Key api_key = os.environ.get("OPENAI_API_KEY") - agent = GPT4v_Agent(api_key=api_key, instruction="Open Google Sheet") - obs = PIL.Image.open('stackoverflow.png') + agent = GPT4v_Agent(api_key=api_key, instruction="Open Firefox") + obs = PIL.Image.open('desktop.png') print(agent.predict(obs=obs)) \ No newline at end of file diff --git a/mm_agents/chrome_start.png b/mm_agents/chrome_start.png deleted file mode 100644 index 24a65c2..0000000 Binary files a/mm_agents/chrome_start.png and /dev/null differ diff --git a/mm_agents/configs/seem_focall_unicl_lang_v1.yaml b/mm_agents/configs/seem_focall_unicl_lang_v1.yaml new file mode 100644 index 0000000..23efe54 --- /dev/null +++ b/mm_agents/configs/seem_focall_unicl_lang_v1.yaml @@ -0,0 +1,401 @@ +# -------------------------------------------------------- +# X-Decoder -- Generalized Decoding for Pixel, Image, and Language +# Copyright (c) 2022 Microsoft +# Licensed under The MIT License [see LICENSE for details] +# Written by Xueyan Zou (xueyan@cs.wisc.edu) +# -------------------------------------------------------- + +# Define Test/Trainer/Saving +PIPELINE: XDecoderPipeline +TRAINER: xdecoder +SAVE_DIR: '../../data/output/test' +base_path: "./" + +# Resume Logistic +RESUME: false +WEIGHT: false +RESUME_FROM: '' +EVAL_AT_START: False + +# Logging and Debug +WANDB: False +LOG_EVERY: 100 +FIND_UNUSED_PARAMETERS: false + +# Speed up training +FP16: false +PORT: '36873' + +# misc +LOADER: + JOINT: False + KEY_DATASET: 'coco' + +################## +# Task settings +################## +VERBOSE: true +MODEL: + NAME: seem_model_v1 + HEAD: xdecoder_head + MASK_ON: false + KEYPOINT_ON: false + LOAD_PROPOSALS: false + DIM_PROJ: 512 + TEXT: + ARCH: vlpencoder + NAME: transformer + TOKENIZER: clip + CONTEXT_LENGTH: 77 # 77 + WIDTH: 512 + HEADS: 8 + LAYERS: 12 # 6 + AUTOGRESSIVE: True + BACKBONE: + NAME: focal + PRETRAINED: '' + LOAD_PRETRAINED: false + FOCAL: + PRETRAIN_IMG_SIZE: 224 + PATCH_SIZE: 4 + EMBED_DIM: 192 + DEPTHS: [2, 2, 18, 2] + FOCAL_LEVELS: [4, 4, 4, 4] + FOCAL_WINDOWS: [3, 3, 3, 3] + DROP_PATH_RATE: 0.3 + 
MLP_RATIO: 4.0 + DROP_RATE: 0.0 + PATCH_NORM: True + USE_CONV_EMBED: True + SCALING_MODULATOR: True + USE_CHECKPOINT: False + USE_POSTLN: true + USE_POSTLN_IN_MODULATION: false + USE_LAYERSCALE: True + OUT_FEATURES: ["res2", "res3", "res4", "res5"] + OUT_INDICES: [0, 1, 2, 3] + ENCODER: + NAME: transformer_encoder_fpn + IGNORE_VALUE: 255 + NUM_CLASSES: 133 + LOSS_WEIGHT: 1.0 + CONVS_DIM: 512 + MASK_DIM: 512 + NORM: "GN" + IN_FEATURES: ["res2", "res3", "res4", "res5"] + DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] + COMMON_STRIDE: 4 + TRANSFORMER_ENC_LAYERS: 6 + DECODER: + NAME: seem_v1 + TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" + MASK: + ENABLED: True + DETECTION: False + SPATIAL: + ENABLED: True + MAX_ITER: 1 + GROUNDING: + ENABLED: True + MAX_LEN: 5 + TEXT_WEIGHT: 2.0 + CLASS_WEIGHT: 0.5 + RETRIEVAL: + ENABLED: False + LVIS: + ENABLED: True + THRES: 0.7 + OPENIMAGE: + ENABLED: False + NEGATIVE_SAMPLES: 5 + GROUNDING: + ENABLED: False + MAX_LEN: 5 + CAPTION: + ENABLED: False + PHRASE_PROB: 0.5 + SIM_THRES: 0.95 + DEEP_SUPERVISION: True + NO_OBJECT_WEIGHT: 0.1 + GCLASS_WEIGHT: 0.4 + GMASK_WEIGHT: 1.0 + GDICE_WEIGHT: 1.0 + SCLASS_WEIGHT: 0.4 + SMASK_WEIGHT: 1.0 + SDICE_WEIGHT: 1.0 + OCLASS_WEIGHT: 0.4 + OMASK_WEIGHT: 1.0 + ODICE_WEIGHT: 1.0 + CLASS_WEIGHT: 2.0 + MASK_WEIGHT: 5.0 + DICE_WEIGHT: 5.0 + BBOX_WEIGHT: 5.0 + GIOU_WEIGHT: 2.0 + CAPTION_WEIGHT: 2.0 + COST_SPATIAL: + CLASS_WEIGHT: 5.0 + MASK_WEIGHT: 2.0 + DICE_WEIGHT: 2.0 + HIDDEN_DIM: 512 + NUM_OBJECT_QUERIES: 101 + NHEADS: 8 + DROPOUT: 0.0 + DIM_FEEDFORWARD: 2048 + MAX_SPATIAL_LEN: [512, 512, 512, 512] + # ENC_LAYERS: 0 + PRE_NORM: False + ENFORCE_INPUT_PROJ: False + SIZE_DIVISIBILITY: 32 + TRAIN_NUM_POINTS: 12544 + OVERSAMPLE_RATIO: 3.0 + IMPORTANCE_SAMPLE_RATIO: 0.75 + DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query + TOP_GROUNDING_LAYERS: 10 + TOP_CAPTION_LAYERS: 10 + TOP_SPATIAL_LAYERS: 10 + TOP_OPENIMAGE_LAYERS: 10 + TEST: + SEMANTIC_ON: True + INSTANCE_ON: True + PANOPTIC_ON: True + OVERLAP_THRESHOLD: 0.8 + OBJECT_MASK_THRESHOLD: 0.8 + SEM_SEG_POSTPROCESSING_BEFORE_INFERENCE: false + +# Spatial sampler +STROKE_SAMPLER: + MAX_CANDIDATE: 1 + CANDIDATE_PROBS: [0.25, 0.25, 0.25, 0.25] # for training only + CANDIDATE_NAMES: ["Point", "Polygon", "Scribble", "Circle"] + DILATION: 3 + CIRCLE: + NUM_STROKES: 5 + STROKE_PRESET: ['object_like', 'object_like_middle', 'object_like_small'] + STROKE_PROB: [0.33, 0.33, 0.33] + SCRIBBLE: + NUM_STROKES: 5 + STROKE_PRESET: ['rand_curve', 'rand_curve_small'] + STROKE_PROB: [0.5, 0.5] + POINT: + NUM_POINTS: 20 + POLYGON: + MAX_POINTS: 9 + EVAL: + MODE: 'best' # best/random/best_random + NEGATIVE: False + MAX_ITER: 20 + IOU_ITER: 1 + GROUNDING: False + +# Multi-modal Architecture, order matters +ATTENTION_ARCH: + VARIABLE: + queries: ['object', 'grounding', 'spatial'] + tokens: ['grounding', 'spatial'] + memories: ['spatial'] + SELF_ATTENTION: + queries: + object: ['queries_object'] + grounding: ['queries_grounding', 'tokens_grounding'] + spatial: ['queries_spatial', 'tokens_spatial', 'memories_spatial'] + tokens: + grounding: ['queries_grounding', 'tokens_grounding'] + spatial: ['tokens_spatial'] + memories: + spatial: ['memories_spatial'] + CROSS_ATTENTION: + queries: + object: True + grounding: True + spatial: True + memories: + spatial: True + tokens: + grounding: False + spatial: False + MASKING: ['tokens_spatial', 'tokens_grounding'] + DUPLICATION: + queries: + grounding: 'queries_object' + spatial: 'queries_object' + 
SPATIAL_MEMORIES: 32 + QUERY_NUMBER: 3 + +DATASETS: + TRAIN: ["coco_2017_train_panoptic_filtrefgumdval_with_sem_seg_caption_grounding_lvis",] + # TRAIN: ["coco_2017_train_panoptic_with_sem_seg_caption_grounding",] + TEST: ["coco_2017_val_panoptic_with_sem_seg", "pascalvoc_val_Point", "refcocog_val_umd"] # to evaluate instance and semantic performance as well + # TEST: ["pascalvoc_val_Point"] # [pascalvoc, openimage600, ade600, davis, cocomini], [Point, Scribble, Polygon, Circle, Box] + # TEST: ["cocomini_val_Point", "cocomini_val_Circle", "cocomini_val_Scribble", "cocomini_val_Polygon", "cocomini_val_Box"] # [pascalvoc, openimage600, ade600, davis, cocomini], [Point, Scribble, Polygon, Circle, Box] + # TEST: ["ade600_val_Point", "ade600_val_Circle", "ade600_val_Scribble", "ade600_val_Polygon", "ade600_val_Box"] # [pascalvoc, openimage600, ade600, davis, cocomini], [Point, Scribble, Polygon, Circle, Box] + # TEST: ["openimage600_val_Point", "openimage600_val_Circle", "openimage600_val_Scribble", "openimage600_val_Polygon", "openimage600_val_Box"] # [pascalvoc, openimage600, ade600, davis, cocomini], [Point, Scribble, Polygon, Circle, Box] + CLASS_CONCAT: false + SIZE_DIVISIBILITY: 32 + PROPOSAL_FILES_TRAIN: [] + +INPUT: + PIXEL_MEAN: [123.675, 116.280, 103.530] + PIXEL_STD: [58.395, 57.120, 57.375] + +TRAIN: + ASPECT_RATIO_GROUPING: true + BATCH_SIZE_TOTAL: 4 + BATCH_SIZE_PER_GPU: 4 + SHUFFLE: true + +TEST: + DETECTIONS_PER_IMAGE: 100 + NAME: coco_eval + IOU_TYPE: ['bbox', 'segm'] + USE_MULTISCALE: false + BATCH_SIZE_TOTAL: 8 + MODEL_FILE: '' + AUG: + ENABLED: False + +DATALOADER: + FILTER_EMPTY_ANNOTATIONS: False + NUM_WORKERS: 8 + LOAD_PROPOSALS: False + SAMPLER_TRAIN: "TrainingSampler" + ASPECT_RATIO_GROUPING: True + +COCO: + INPUT: + MIN_SIZE_TRAIN: 800 + MAX_SIZE_TRAIN: 1333 + MIN_SIZE_TRAIN_SAMPLING: 'choice' + MIN_SIZE_TEST: 800 + MAX_SIZE_TEST: 1333 + IMAGE_SIZE: 1024 + MIN_SCALE: 0.1 + MAX_SCALE: 2.0 + DATASET_MAPPER_NAME: "coco_interactive" + IGNORE_VALUE: 255 + COLOR_AUG_SSD: False + SIZE_DIVISIBILITY: 32 + RANDOM_FLIP: "horizontal" + MASK_FORMAT: "polygon" + FORMAT: "RGB" + CROP: + ENABLED: True + DATASET: + DATASET: 'coco' + +# Validation dataset +ADE20K: + INPUT: + MIN_SIZE_TRAIN: 640 + MIN_SIZE_TRAIN_SAMPLING: "choice" + MIN_SIZE_TEST: 640 + MAX_SIZE_TRAIN: 2560 + MAX_SIZE_TEST: 2560 + MASK_FORMAT: "polygon" + CROP: + ENABLED: True + TYPE: "absolute" + SIZE: (640, 640) + SINGLE_CATEGORY_MAX_AREA: 1.0 + COLOR_AUG_SSD: True + SIZE_DIVISIBILITY: 640 # used in dataset mapper + DATASET_MAPPER_NAME: "mask_former_panoptic" + FORMAT: "RGB" + DATASET: + DATASET: 'ade' + +SBD: + INPUT: + MIN_SIZE_TEST: 800 + MAX_SIZE_TEST: 1333 + DATALOADER: + FILTER_EMPTY_ANNOTATIONS: False + NUM_WORKERS: 0 + LOAD_PROPOSALS: False + SAMPLER_TRAIN: "TrainingSampler" + ASPECT_RATIO_GROUPING: False + TEST: + BATCH_SIZE_TOTAL: 1 + +VOC: + INPUT: + MIN_SIZE_TEST: 800 + MAX_SIZE_TEST: 1333 + DATALOADER: + FILTER_EMPTY_ANNOTATIONS: False + NUM_WORKERS: 0 + LOAD_PROPOSALS: False + SAMPLER_TRAIN: "TrainingSampler" + ASPECT_RATIO_GROUPING: False + TEST: + BATCH_SIZE_TOTAL: 8 + +DAVIS: + INPUT: + MIN_SIZE_TEST: 800 + MAX_SIZE_TEST: 1333 + DATALOADER: + FILTER_EMPTY_ANNOTATIONS: False + NUM_WORKERS: 0 + LOAD_PROPOSALS: False + SAMPLER_TRAIN: "TrainingSampler" + ASPECT_RATIO_GROUPING: False + TEST: + BATCH_SIZE_TOTAL: 8 + +VOS: + INPUT: + MIN_SIZE_TEST: 800 + MAX_SIZE_TEST: 1333 + DATALOADER: + FILTER_EMPTY_ANNOTATIONS: False + NUM_WORKERS: 0 + LOAD_PROPOSALS: False + SAMPLER_TRAIN: "TrainingSampler" + 
ASPECT_RATIO_GROUPING: False + TEST: + BATCH_SIZE_TOTAL: 1 + +REF: + INPUT: + PIXEL_MEAN: [123.675, 116.280, 103.530] + PIXEL_STD: [58.395, 57.120, 57.375] + MIN_SIZE_TEST: 512 + MAX_SIZE_TEST: 1024 + FORMAT: "RGB" + SPATIAL: False + DATALOADER: + FILTER_EMPTY_ANNOTATIONS: False + NUM_WORKERS: 4 + LOAD_PROPOSALS: False + SAMPLER_TRAIN: "TrainingSampler" + ASPECT_RATIO_GROUPING: False + TEST: + BATCH_SIZE_TOTAL: 8 + +# Detectron2 training config for optimizer and lr scheduler +SOLVER: + BASE_LR: 0.0001 + STEPS: [0.88889, 0.96296] + MAX_ITER: 1 + GAMMA: 0.1 + WARMUP_FACTOR: 1.0 + WARMUP_ITERS: 10 + WARMUP_METHOD: "linear" + WEIGHT_DECAY: 0.05 + OPTIMIZER: "ADAMW" + LR_SCHEDULER_NAME: "WarmupMultiStepLR" + LR_MULTIPLIER: + backbone: 0.1 + lang_encoder: 0.1 + FIX_PARAM: + backbone: True + lang_encoder: True + pixel_decoder: True + WEIGHT_DECAY_NORM: 0.0 + WEIGHT_DECAY_EMBED: 0.0 + CLIP_GRADIENTS: + ENABLED: True + CLIP_TYPE: "full_model" + CLIP_VALUE: 5.0 # 0.01 + NORM_TYPE: 2.0 + MAX_NUM_EPOCHS: 50 \ No newline at end of file diff --git a/mm_agents/configs/semantic_sam_only_sa-1b_swinL.yaml b/mm_agents/configs/semantic_sam_only_sa-1b_swinL.yaml new file mode 100644 index 0000000..93abac6 --- /dev/null +++ b/mm_agents/configs/semantic_sam_only_sa-1b_swinL.yaml @@ -0,0 +1,524 @@ +# ------------------------------------------------------------------------ +# Semantic SAM +# Copyright (c) MicroSoft, Inc. and its affiliates. +# Modified from OpenSeed https://github.com/IDEA-Research/OpenSeed by Feng Li. +# ------------------------------------------------------------------------ + +################## +# Task settings +################## +WEIGHT: '' +PORT: 53711 +VERBOSE: true + +OUTPUT_DIR: '../../data/output/test' +# misc +LOADER: + JOINT: True + KEY_DATASET: 'coco' +# model +MODEL: + NAME: interactive_mask_dino + HEAD: general_head + MASK_ON: false + KEYPOINT_ON: false + LOAD_PROPOSALS: false + DIM_PROJ: 512 + BACKBONE_DIM: 768 + BACKGROUND: False + WEIGHTS: '' + TEXT: + ARCH: noencoder # no language encoder for training only sa-1b data + NAME: transformer + TOKENIZER: clip + CONTEXT_LENGTH: 18 # 77 + WIDTH: 512 + HEADS: 8 + LAYERS: 12 # 6 + AUTOGRESSIVE: True + BACKBONE: + NAME: swin + PRETRAINED: 'https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_large_patch4_window12_384_22k.pth' + LOAD_PRETRAINED: true + SWIN: + PRETRAIN_IMG_SIZE: 384 + PATCH_SIZE: 4 + EMBED_DIM: 192 + DEPTHS: [ 2, 2, 18, 2 ] + NUM_HEADS: [ 6, 12, 24, 48 ] + WINDOW_SIZE: 12 + MLP_RATIO: 4.0 + QKV_BIAS: true + QK_SCALE: ~ + DROP_RATE: 0.0 + ATTN_DROP_RATE: 0.0 + DROP_PATH_RATE: 0.3 + APE: false + PATCH_NORM: true + USE_CHECKPOINT: false + OUT_FEATURES: [ 'res2', 'res3', 'res4', 'res5' ] + ENCODER: + NAME: encoder_deform + IGNORE_VALUE: 255 + NUM_CLASSES: 1 + LOSS_WEIGHT: 1.0 + CONVS_DIM: 256 + MASK_DIM: 256 + NORM: "GN" + IN_FEATURES: [ "res2", "res3", "res4", "res5" ] + DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: [ "res3", "res4", "res5" ] + COMMON_STRIDE: 4 + TRANSFORMER_ENC_LAYERS: 6 + TOTAL_NUM_FEATURE_LEVELS: 4 + NUM_FEATURE_LEVELS: 3 + FEATURE_ORDER: "low2high" + DECODER: + NAME: interactive_mask_dino + TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" + MASK: True + BOX: True + PART: True + GROUNDING: + ENABLED: False + MAX_LEN: 5 + TEXT_WEIGHT: 2.0 + CLASS_WEIGHT: 0.5 + CAPTION: + ENABLED: False + PHRASE_PROB: 0.0 + SIM_THRES: 0.95 + CAPTIONING: + ENABLED: False + STEP: 50 + RETRIEVAL: + ENABLED: False + DIM_IMG: 768 + ENSEMBLE: True + OPENIMAGE: + ENABLED: False + NEGATIVE_SAMPLES: 5 + 
GROUNDING: + ENABLED: False + MAX_LEN: 5 + DEEP_SUPERVISION: True + NO_OBJECT_WEIGHT: 0.1 + CLASS_WEIGHT: 4.0 + MASK_WEIGHT: 5.0 + DICE_WEIGHT: 5.0 + BOX_WEIGHT: 5.0 + GIOU_WEIGHT: 2.0 + IOU_WEIGHT: 1.0 + COST_CLASS_WEIGHT: 4.0 + COST_DICE_WEIGHT: 5.0 + COST_MASK_WEIGHT: 5.0 + COST_BOX_WEIGHT: 5.0 + COST_GIOU_WEIGHT: 2.0 + HIDDEN_DIM: 256 + NUM_OBJECT_QUERIES: 0 + NHEADS: 8 + DROPOUT: 0.0 + DIM_FEEDFORWARD: 2048 + ENC_LAYERS: 0 + PRE_NORM: False + ENFORCE_INPUT_PROJ: False + SIZE_DIVISIBILITY: 32 + DEC_LAYERS: 9 # 9 decoder layers, add one for the loss on learnable query + TRAIN_NUM_POINTS: 12544 + OVERSAMPLE_RATIO: 3.0 + IMPORTANCE_SAMPLE_RATIO: 0.75 + TWO_STAGE: False + INITIALIZE_BOX_TYPE: 'no' + DN: seg + DN_NOISE_SCALE: 0.4 + DN_NUM: 100 + INITIAL_PRED: False + LEARN_TGT: False + TOTAL_NUM_FEATURE_LEVELS: 4 + SEMANTIC_CE_LOSS: False + PANO_BOX_LOSS: False + COCO: False + O365: False + SAM: True + PASCAL: False + RE_POINT: True + NUM_INTERACTIVE_TOKENS: 6 + MAX_NUM_INSTANCE: 60 + TEST: + SEMANTIC_ON: True + INSTANCE_ON: True + PANOPTIC_ON: True + BOX_INTERACTIVE: False + CLASSIFICATION_ON: False + OVERLAP_THRESHOLD: 0.8 + OBJECT_MASK_THRESHOLD: 0.25 + SEM_SEG_POSTPROCESSING_BEFORE_INFERENCE: false + TEST_FOUCUS_ON_BOX: False + PANO_TRANSFORM_EVAL: True + PANO_TEMPERATURE: 0.06 + +TEST: + EVAL_PERIOD: 500000 + PRECISE_BN: + NUM_ITER: 1 + ENABLED: False + AUG: + ENABLED: False + +SAM: + INPUT: + MIN_SIZE_TEST: 800 + MAX_SIZE_TEST: 1333 + IMAGE_SIZE: 1024 + MIN_SCALE: 0.99 + MAX_SCALE: 1.01 + DATASET_MAPPER_NAME: "sam" + IGNORE_VALUE: 255 + COLOR_AUG_SSD: False + SIZE_DIVISIBILITY: 32 + RANDOM_FLIP: "horizontal" + MASK_FORMAT: "polygon" + FORMAT: "RGB" + CROP: + ENABLED: True + DATASET: + DATASET: 'sam' + TEST: + DETECTIONS_PER_IMAGE: 100 + NAME: coco_eval + IOU_TYPE: ['bbox', 'segm'] + USE_MULTISCALE: false + BATCH_SIZE_TOTAL: 8 + MODEL_FILE: '' + AUG: + ENABLED: False + TRAIN: + BATCH_SIZE_TOTAL: 1 + BATCH_SIZE_PER_GPU: 1 + SHUFFLE: true + DATALOADER: + FILTER_EMPTY_ANNOTATIONS: False + NUM_WORKERS: 4 + LOAD_PROPOSALS: False + SAMPLER_TRAIN: "TrainingSampler" + ASPECT_RATIO_GROUPING: True + +COCO: + INPUT: + MIN_SIZE_TEST: 800 + MAX_SIZE_TEST: 1333 + IMAGE_SIZE: 1024 + MIN_SCALE: 0.1 + MAX_SCALE: 2.0 + DATASET_MAPPER_NAME: "coco_interactive_panoptic_lsj" + IGNORE_VALUE: 255 + COLOR_AUG_SSD: False + SIZE_DIVISIBILITY: 32 + RANDOM_FLIP: "horizontal" + MASK_FORMAT: "polygon" + FORMAT: "RGB" + CROP: + ENABLED: True + DATASET: + DATASET: 'coco' + TEST: + DETECTIONS_PER_IMAGE: 100 + NAME: coco_eval + IOU_TYPE: ['bbox', 'segm'] + USE_MULTISCALE: false + BATCH_SIZE_TOTAL: 1 + MODEL_FILE: '' + AUG: + ENABLED: False + TRAIN: + BATCH_SIZE_TOTAL: 1 + BATCH_SIZE_PER_GPU: 1 + SHUFFLE: true + DATALOADER: + FILTER_EMPTY_ANNOTATIONS: False + NUM_WORKERS: 2 + LOAD_PROPOSALS: False + SAMPLER_TRAIN: "TrainingSampler" + ASPECT_RATIO_GROUPING: True + +VLP: + INPUT: + IMAGE_SIZE: 224 + DATASET_MAPPER_NAME: "vlpretrain" + IGNORE_VALUE: 255 + COLOR_AUG_SSD: False + SIZE_DIVISIBILITY: 32 + MASK_FORMAT: "polygon" + FORMAT: "RGB" + CROP: + ENABLED: True + TRAIN: + BATCH_SIZE_TOTAL: 2 + BATCH_SIZE_PER_GPU: 2 + TEST: + BATCH_SIZE_TOTAL: 256 + DATALOADER: + FILTER_EMPTY_ANNOTATIONS: False + NUM_WORKERS: 16 + LOAD_PROPOSALS: False + SAMPLER_TRAIN: "TrainingSampler" + ASPECT_RATIO_GROUPING: True + +INPUT: + PIXEL_MEAN: [123.675, 116.280, 103.530] + PIXEL_STD: [58.395, 57.120, 57.375] + +DATASETS: + TRAIN: ["sam_train"] + # interactive segmentation evaluation. 
+ TEST: ["coco_2017_val_panoptic_with_sem_seg_interactive_jointboxpoint"] +# TEST: ["sam_minival"] + + CLASS_CONCAT: false + SIZE_DIVISIBILITY: 32 + PROPOSAL_FILES_TRAIN: [] + +DATALOADER: + FILTER_EMPTY_ANNOTATIONS: False + NUM_WORKERS: 16 + LOAD_PROPOSALS: False + SAMPLER_TRAIN: "TrainingSampler" + ASPECT_RATIO_GROUPING: True + +# Detectron2 training config for optimizer and lr scheduler +SOLVER: + BASE_LR_END: 0.0 + MOMENTUM: 0.9 + NESTEROV: False + CHECKPOINT_PERIOD: 5000 + IMS_PER_BATCH: 1 + REFERENCE_WORLD_SIZE: 0 + BIAS_LR_FACTOR: 1.0 + WEIGHT_DECAY_BIAS: None + # original + BASE_LR: 0.0001 + STEPS: [327778, 355092] + MAX_ITER: 368750 + GAMMA: 0.1 + WARMUP_FACTOR: 1.0 + WARMUP_ITERS: 10 + WARMUP_METHOD: "linear" + WEIGHT_DECAY: 0.05 + OPTIMIZER: "ADAMW" + LR_SCHEDULER_NAME: "WarmupMultiStepLR" + LR_MULTIPLIER: + backbone: 0.1 + lang_encoder: 0.1 + WEIGHT_DECAY_NORM: 0.0 + WEIGHT_DECAY_EMBED: 0.0 + CLIP_GRADIENTS: + ENABLED: True + CLIP_TYPE: "full_model" + CLIP_VALUE: 0.01 + NORM_TYPE: 2.0 + AMP: + ENABLED: True + +# Evaluation Dataset +ADE20K: + INPUT: + MIN_SIZE_TRAIN: [320, 384, 448, 512, 576, 640, 704, 768, 832, 896, 960, 1024, 1088, 1152, 1216, 1280] + MIN_SIZE_TRAIN_SAMPLING: "choice" + MIN_SIZE_TEST: 640 + MAX_SIZE_TRAIN: 2560 + MAX_SIZE_TEST: 2560 + MASK_FORMAT: "polygon" + CROP: + ENABLED: True + TYPE: "absolute" + SIZE: [640, 640] + SINGLE_CATEGORY_MAX_AREA: 1.0 + IGNORE_VALUE: 255 + COLOR_AUG_SSD: True + SIZE_DIVISIBILITY: 640 # used in dataset mapper + DATASET_MAPPER_NAME: "mask_former_panoptic" + FORMAT: "RGB" + DATASET: + DATASET: 'ade' + TRAIN: + ASPECT_RATIO_GROUPING: true + BATCH_SIZE_TOTAL: 16 + BATCH_SIZE_PER_GPU: 2 + SHUFFLE: true + TEST: + DETECTIONS_PER_IMAGE: 100 + NAME: coco_eval + IOU_TYPE: ['bbox', 'segm'] + USE_MULTISCALE: false + BATCH_SIZE_TOTAL: 8 + MODEL_FILE: '' + AUG: + ENABLED: False + DATALOADER: + FILTER_EMPTY_ANNOTATIONS: False + NUM_WORKERS: 8 + LOAD_PROPOSALS: False + SAMPLER_TRAIN: "TrainingSampler" + ASPECT_RATIO_GROUPING: True +#ADE20K: +# INPUT: +# MIN_SIZE_TRAIN: 640 +# MIN_SIZE_TRAIN_SAMPLING: "choice" +# MIN_SIZE_TEST: 640 +# MAX_SIZE_TRAIN: 2560 +# MAX_SIZE_TEST: 2560 +# MASK_FORMAT: "polygon" +# CROP: +# ENABLED: True +# TYPE: "absolute" +# SIZE: (640, 640) +# SINGLE_CATEGORY_MAX_AREA: 1.0 +# COLOR_AUG_SSD: True +# SIZE_DIVISIBILITY: 640 # used in dataset mapper +# DATASET_MAPPER_NAME: "mask_former_panoptic" +# FORMAT: "RGB" +# DATASET: +# DATASET: 'ade' +# TEST: +# BATCH_SIZE_TOTAL: 8 + + +REF: + INPUT: + PIXEL_MEAN: [123.675, 116.280, 103.530] + PIXEL_STD: [58.395, 57.120, 57.375] + MIN_SIZE_TEST: 512 + MAX_SIZE_TEST: 1024 + FORMAT: "RGB" + DATALOADER: + FILTER_EMPTY_ANNOTATIONS: False + NUM_WORKERS: 0 + LOAD_PROPOSALS: False + SAMPLER_TRAIN: "TrainingSampler" + ASPECT_RATIO_GROUPING: False + TEST: + BATCH_SIZE_TOTAL: 8 + +SUN: + INPUT: + PIXEL_MEAN: [123.675, 116.280, 103.530] + PIXEL_STD: [58.395, 57.120, 57.375] + MIN_SIZE_TEST: 512 + MAX_SIZE_TEST: 1024 + DATALOADER: + FILTER_EMPTY_ANNOTATIONS: False + NUM_WORKERS: 0 + LOAD_PROPOSALS: False + SAMPLER_TRAIN: "TrainingSampler" + ASPECT_RATIO_GROUPING: False + TEST: + BATCH_SIZE_TOTAL: 8 + +SCAN: + INPUT: + PIXEL_MEAN: [123.675, 116.280, 103.530] + PIXEL_STD: [58.395, 57.120, 57.375] + MIN_SIZE_TEST: 512 + MAX_SIZE_TEST: 1024 + DATALOADER: + FILTER_EMPTY_ANNOTATIONS: False + NUM_WORKERS: 0 + LOAD_PROPOSALS: False + SAMPLER_TRAIN: "TrainingSampler" + ASPECT_RATIO_GROUPING: False + TEST: + BATCH_SIZE_TOTAL: 8 + +BDD: + INPUT: + PIXEL_MEAN: [123.675, 116.280, 103.530] + PIXEL_STD: 
[58.395, 57.120, 57.375] + MIN_SIZE_TEST: 800 + MAX_SIZE_TEST: 1333 + DATALOADER: + FILTER_EMPTY_ANNOTATIONS: False + NUM_WORKERS: 0 + LOAD_PROPOSALS: False + SAMPLER_TRAIN: "TrainingSampler" + ASPECT_RATIO_GROUPING: False + TEST: + BATCH_SIZE_TOTAL: 8 + +CITY: + INPUT: + MIN_SIZE_TRAIN: [ 512, 614, 716, 819, 921, 1024, 1126, 1228, 1331, 1433, 1536, 1638, 1740, 1843, 1945, 2048 ] + MIN_SIZE_TRAIN_SAMPLING: "choice" + MIN_SIZE_TEST: 1024 + MAX_SIZE_TRAIN: 4096 + MAX_SIZE_TEST: 2048 + CROP: + ENABLED: True + TYPE: "absolute" + SIZE: [ 512, 1024 ] + SINGLE_CATEGORY_MAX_AREA: 1.0 + IGNORE_VALUE: 255 + COLOR_AUG_SSD: True + SIZE_DIVISIBILITY: -1 + FORMAT: "RGB" + DATASET_MAPPER_NAME: "mask_former_panoptic" + MASK_FORMAT: "polygon" + TEST: + EVAL_PERIOD: 5000 + BATCH_SIZE_TOTAL: 1 + AUG: + ENABLED: False + MIN_SIZES: [ 512, 768, 1024, 1280, 1536, 1792 ] + MAX_SIZE: 4096 + FLIP: True + DATALOADER: + FILTER_EMPTY_ANNOTATIONS: True + NUM_WORKERS: 2 + LOAD_PROPOSALS: False + SAMPLER_TRAIN: "TrainingSampler" + ASPECT_RATIO_GROUPING: True + TRAIN: + ASPECT_RATIO_GROUPING: true + BATCH_SIZE_TOTAL: 2 + BATCH_SIZE_PER_GPU: 2 + SHUFFLE: true + +PSACAL_PART: + INPUT: + MIN_SIZE_TEST: 800 + MAX_SIZE_TEST: 1333 + IMAGE_SIZE: 1024 + MIN_SCALE: 0.1 + MAX_SCALE: 2.0 + DATASET_MAPPER_NAME: "pascal_part_lsj" + IGNORE_VALUE: 255 + COLOR_AUG_SSD: False + SIZE_DIVISIBILITY: 32 + RANDOM_FLIP: "horizontal" + MASK_FORMAT: "polygon" + FORMAT: "RGB" + CROP: + ENABLED: True + MODEL: + MASK_ON: True + KEYPOINT_ON: False + LOAD_PROPOSALS: False + # DATASET: + # DATASET: 'coco' + TEST: + DETECTIONS_PER_IMAGE: 100 + NAME: coco_eval + IOU_TYPE: ['bbox', 'segm'] + USE_MULTISCALE: false + BATCH_SIZE_TOTAL: 8 + MODEL_FILE: '' + AUG: + ENABLED: False + TRAIN: + BATCH_SIZE_TOTAL: 1 + BATCH_SIZE_PER_GPU: 1 + SHUFFLE: true + DATALOADER: + FILTER_EMPTY_ANNOTATIONS: False + NUM_WORKERS: 2 + LOAD_PROPOSALS: False + SAMPLER_TRAIN: "TrainingSampler" + ASPECT_RATIO_GROUPING: True diff --git a/mm_agents/desktop.png b/mm_agents/desktop.png new file mode 100644 index 0000000..17c8885 Binary files /dev/null and b/mm_agents/desktop.png differ diff --git a/mm_agents/download_ckpt.sh b/mm_agents/download_ckpt.sh new file mode 100644 index 0000000..146fcea --- /dev/null +++ b/mm_agents/download_ckpt.sh @@ -0,0 +1,3 @@ +wget https://github.com/UX-Decoder/Semantic-SAM/releases/download/checkpoint/swinl_only_sam_many2many.pth +wget https://huggingface.co/xdecoder/SEEM/resolve/main/seem_focall_v1.pt +wget https://dl.fbaipublicfiles.com/segment_anything/sam_vit_h_4b8939.pth \ No newline at end of file diff --git a/mm_agents/ops/functions/__init__.py b/mm_agents/ops/functions/__init__.py new file mode 100644 index 0000000..2b06b5a --- /dev/null +++ b/mm_agents/ops/functions/__init__.py @@ -0,0 +1,13 @@ +# ------------------------------------------------------------------------------------------------ +# Deformable DETR +# Copyright (c) 2020 SenseTime. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 [see LICENSE for details] +# ------------------------------------------------------------------------------------------------ +# Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 +# ------------------------------------------------------------------------------------------------ + +# Copyright (c) Facebook, Inc. and its affiliates. 
+# Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR + +from .ms_deform_attn_func import MSDeformAttnFunction + diff --git a/mm_agents/ops/functions/ms_deform_attn_func.py b/mm_agents/ops/functions/ms_deform_attn_func.py new file mode 100644 index 0000000..94a36ab --- /dev/null +++ b/mm_agents/ops/functions/ms_deform_attn_func.py @@ -0,0 +1,72 @@ +# ------------------------------------------------------------------------------------------------ +# Deformable DETR +# Copyright (c) 2020 SenseTime. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 [see LICENSE for details] +# ------------------------------------------------------------------------------------------------ +# Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 +# ------------------------------------------------------------------------------------------------ + +# Copyright (c) Facebook, Inc. and its affiliates. +# Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR + +from __future__ import absolute_import +from __future__ import print_function +from __future__ import division + +import torch +import torch.nn.functional as F +from torch.autograd import Function +from torch.autograd.function import once_differentiable + +try: + import MultiScaleDeformableAttention as MSDA +except ModuleNotFoundError as e: + info_string = ( + "\n\nPlease compile MultiScaleDeformableAttention CUDA op with the following commands:\n" + "\t`cd mask2former/modeling/pixel_decoder/ops`\n" + "\t`sh make.sh`\n" + ) + raise ModuleNotFoundError(info_string) + + +class MSDeformAttnFunction(Function): + @staticmethod + def forward(ctx, value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, im2col_step): + ctx.im2col_step = im2col_step + output = MSDA.ms_deform_attn_forward( + value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, ctx.im2col_step) + ctx.save_for_backward(value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights) + return output + + @staticmethod + @once_differentiable + def backward(ctx, grad_output): + value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights = ctx.saved_tensors + grad_value, grad_sampling_loc, grad_attn_weight = \ + MSDA.ms_deform_attn_backward( + value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, grad_output, ctx.im2col_step) + + return grad_value, None, None, grad_sampling_loc, grad_attn_weight, None + + +def ms_deform_attn_core_pytorch(value, value_spatial_shapes, sampling_locations, attention_weights): + # for debug and test only, + # need to use cuda version instead + N_, S_, M_, D_ = value.shape + _, Lq_, M_, L_, P_, _ = sampling_locations.shape + value_list = value.split([H_ * W_ for H_, W_ in value_spatial_shapes], dim=1) + sampling_grids = 2 * sampling_locations - 1 + sampling_value_list = [] + for lid_, (H_, W_) in enumerate(value_spatial_shapes): + # N_, H_*W_, M_, D_ -> N_, H_*W_, M_*D_ -> N_, M_*D_, H_*W_ -> N_*M_, D_, H_, W_ + value_l_ = value_list[lid_].flatten(2).transpose(1, 2).reshape(N_*M_, D_, H_, W_) + # N_, Lq_, M_, P_, 2 -> N_, M_, Lq_, P_, 2 -> N_*M_, Lq_, P_, 2 + sampling_grid_l_ = sampling_grids[:, :, :, lid_].transpose(1, 2).flatten(0, 1) + # N_*M_, D_, Lq_, P_ + sampling_value_l_ = F.grid_sample(value_l_, sampling_grid_l_, + mode='bilinear', padding_mode='zeros', align_corners=False) + 
sampling_value_list.append(sampling_value_l_) + # (N_, Lq_, M_, L_, P_) -> (N_, M_, Lq_, L_, P_) -> (N_, M_, 1, Lq_, L_*P_) + attention_weights = attention_weights.transpose(1, 2).reshape(N_*M_, 1, Lq_, L_*P_) + output = (torch.stack(sampling_value_list, dim=-2).flatten(-2) * attention_weights).sum(-1).view(N_, M_*D_, Lq_) + return output.transpose(1, 2).contiguous() diff --git a/mm_agents/ops/make.sh b/mm_agents/ops/make.sh new file mode 100755 index 0000000..7b38cdb --- /dev/null +++ b/mm_agents/ops/make.sh @@ -0,0 +1,13 @@ +#!/usr/bin/env bash +# ------------------------------------------------------------------------------------------------ +# Deformable DETR +# Copyright (c) 2020 SenseTime. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 [see LICENSE for details] +# ------------------------------------------------------------------------------------------------ +# Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 +# ------------------------------------------------------------------------------------------------ + +# Copyright (c) Facebook, Inc. and its affiliates. +# Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR + +python setup.py build install diff --git a/mm_agents/ops/modules/__init__.py b/mm_agents/ops/modules/__init__.py new file mode 100644 index 0000000..6fdbf03 --- /dev/null +++ b/mm_agents/ops/modules/__init__.py @@ -0,0 +1,12 @@ +# ------------------------------------------------------------------------------------------------ +# Deformable DETR +# Copyright (c) 2020 SenseTime. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 [see LICENSE for details] +# ------------------------------------------------------------------------------------------------ +# Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 +# ------------------------------------------------------------------------------------------------ + +# Copyright (c) Facebook, Inc. and its affiliates. +# Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR + +from .ms_deform_attn import MSDeformAttn diff --git a/mm_agents/ops/modules/ms_deform_attn.py b/mm_agents/ops/modules/ms_deform_attn.py new file mode 100644 index 0000000..e7b4c42 --- /dev/null +++ b/mm_agents/ops/modules/ms_deform_attn.py @@ -0,0 +1,125 @@ +# ------------------------------------------------------------------------------------------------ +# Deformable DETR +# Copyright (c) 2020 SenseTime. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 [see LICENSE for details] +# ------------------------------------------------------------------------------------------------ +# Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 +# ------------------------------------------------------------------------------------------------ + +# Copyright (c) Facebook, Inc. and its affiliates. 
+# Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR + +from __future__ import absolute_import +from __future__ import print_function +from __future__ import division + +import warnings +import math + +import torch +from torch import nn +import torch.nn.functional as F +from torch.nn.init import xavier_uniform_, constant_ + +from ..functions import MSDeformAttnFunction +from ..functions.ms_deform_attn_func import ms_deform_attn_core_pytorch + + +def _is_power_of_2(n): + if (not isinstance(n, int)) or (n < 0): + raise ValueError("invalid input for _is_power_of_2: {} (type: {})".format(n, type(n))) + return (n & (n-1) == 0) and n != 0 + + +class MSDeformAttn(nn.Module): + def __init__(self, d_model=256, n_levels=4, n_heads=8, n_points=4): + """ + Multi-Scale Deformable Attention Module + :param d_model hidden dimension + :param n_levels number of feature levels + :param n_heads number of attention heads + :param n_points number of sampling points per attention head per feature level + """ + super().__init__() + if d_model % n_heads != 0: + raise ValueError('d_model must be divisible by n_heads, but got {} and {}'.format(d_model, n_heads)) + _d_per_head = d_model // n_heads + # you'd better set _d_per_head to a power of 2 which is more efficient in our CUDA implementation + if not _is_power_of_2(_d_per_head): + warnings.warn("You'd better set d_model in MSDeformAttn to make the dimension of each attention head a power of 2 " + "which is more efficient in our CUDA implementation.") + + self.im2col_step = 128 + + self.d_model = d_model + self.n_levels = n_levels + self.n_heads = n_heads + self.n_points = n_points + + self.sampling_offsets = nn.Linear(d_model, n_heads * n_levels * n_points * 2) + self.attention_weights = nn.Linear(d_model, n_heads * n_levels * n_points) + self.value_proj = nn.Linear(d_model, d_model) + self.output_proj = nn.Linear(d_model, d_model) + + self._reset_parameters() + + def _reset_parameters(self): + constant_(self.sampling_offsets.weight.data, 0.) + thetas = torch.arange(self.n_heads, dtype=torch.float32) * (2.0 * math.pi / self.n_heads) + grid_init = torch.stack([thetas.cos(), thetas.sin()], -1) + grid_init = (grid_init / grid_init.abs().max(-1, keepdim=True)[0]).view(self.n_heads, 1, 1, 2).repeat(1, self.n_levels, self.n_points, 1) + for i in range(self.n_points): + grid_init[:, :, i, :] *= i + 1 + with torch.no_grad(): + self.sampling_offsets.bias = nn.Parameter(grid_init.view(-1)) + constant_(self.attention_weights.weight.data, 0.) + constant_(self.attention_weights.bias.data, 0.) + xavier_uniform_(self.value_proj.weight.data) + constant_(self.value_proj.bias.data, 0.) + xavier_uniform_(self.output_proj.weight.data) + constant_(self.output_proj.bias.data, 0.) 
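# --- Editor's aside (illustrative sketch, not part of this diff): the grid
# initialization in _reset_parameters above seeds self.sampling_offsets.bias
# so that each attention head starts with its sampling points fanned out
# along a distinct direction on the unit square. A minimal standalone
# reproduction, assuming the default sizes n_heads=8, n_levels=4, n_points=4:
import math
import torch

n_heads, n_levels, n_points = 8, 4, 4
thetas = torch.arange(n_heads, dtype=torch.float32) * (2.0 * math.pi / n_heads)
grid = torch.stack([thetas.cos(), thetas.sin()], -1)   # (8, 2): one direction per head
grid = grid / grid.abs().max(-1, keepdim=True)[0]      # project onto the unit square
grid = grid.view(n_heads, 1, 1, 2).repeat(1, n_levels, n_points, 1)
for i in range(n_points):
    grid[:, :, i, :] *= i + 1                          # point i sits i+1 steps outward
# grid.view(-1) is exactly the initial sampling_offsets bias: before training,
# head m samples collinear offsets along direction 2*pi*m/n_heads.
# --- end aside ---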
+ + def forward(self, query, reference_points, input_flatten, input_spatial_shapes, input_level_start_index, input_padding_mask=None): + """ + :param query (N, Length_{query}, C) + :param reference_points (N, Length_{query}, n_levels, 2), range in [0, 1], top-left (0,0), bottom-right (1, 1), including padding area + or (N, Length_{query}, n_levels, 4), add additional (w, h) to form reference boxes + :param input_flatten (N, \sum_{l=0}^{L-1} H_l \cdot W_l, C) + :param input_spatial_shapes (n_levels, 2), [(H_0, W_0), (H_1, W_1), ..., (H_{L-1}, W_{L-1})] + :param input_level_start_index (n_levels, ), [0, H_0*W_0, H_0*W_0+H_1*W_1, H_0*W_0+H_1*W_1+H_2*W_2, ..., H_0*W_0+H_1*W_1+...+H_{L-1}*W_{L-1}] + :param input_padding_mask (N, \sum_{l=0}^{L-1} H_l \cdot W_l), True for padding elements, False for non-padding elements + + :return output (N, Length_{query}, C) + """ + N, Len_q, _ = query.shape + N, Len_in, _ = input_flatten.shape + assert (input_spatial_shapes[:, 0] * input_spatial_shapes[:, 1]).sum() == Len_in + + value = self.value_proj(input_flatten) + if input_padding_mask is not None: + value = value.masked_fill(input_padding_mask[..., None], float(0)) + value = value.view(N, Len_in, self.n_heads, self.d_model // self.n_heads) + sampling_offsets = self.sampling_offsets(query).view(N, Len_q, self.n_heads, self.n_levels, self.n_points, 2) + attention_weights = self.attention_weights(query).view(N, Len_q, self.n_heads, self.n_levels * self.n_points) + attention_weights = F.softmax(attention_weights, -1).view(N, Len_q, self.n_heads, self.n_levels, self.n_points) + # N, Len_q, n_heads, n_levels, n_points, 2 + if reference_points.shape[-1] == 2: + offset_normalizer = torch.stack([input_spatial_shapes[..., 1], input_spatial_shapes[..., 0]], -1) + sampling_locations = reference_points[:, :, None, :, None, :] \ + + sampling_offsets / offset_normalizer[None, None, None, :, None, :] + elif reference_points.shape[-1] == 4: + sampling_locations = reference_points[:, :, None, :, None, :2] \ + + sampling_offsets / self.n_points * reference_points[:, :, None, :, None, 2:] * 0.5 + else: + raise ValueError( + 'Last dim of reference_points must be 2 or 4, but get {} instead.'.format(reference_points.shape[-1])) + try: + output = MSDeformAttnFunction.apply( + value, input_spatial_shapes, input_level_start_index, sampling_locations, attention_weights, self.im2col_step) + except: + # CPU + output = ms_deform_attn_core_pytorch(value, input_spatial_shapes, sampling_locations, attention_weights) + # # For FLOPs calculation only + # output = ms_deform_attn_core_pytorch(value, input_spatial_shapes, sampling_locations, attention_weights) + output = self.output_proj(output) + return output diff --git a/mm_agents/ops/setup.py b/mm_agents/ops/setup.py new file mode 100644 index 0000000..3b57ad3 --- /dev/null +++ b/mm_agents/ops/setup.py @@ -0,0 +1,78 @@ +# ------------------------------------------------------------------------------------------------ +# Deformable DETR +# Copyright (c) 2020 SenseTime. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 [see LICENSE for details] +# ------------------------------------------------------------------------------------------------ +# Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 +# ------------------------------------------------------------------------------------------------ + +# Copyright (c) Facebook, Inc. and its affiliates. 
+# Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR + +import os +import glob + +import torch + +from torch.utils.cpp_extension import CUDA_HOME +from torch.utils.cpp_extension import CppExtension +from torch.utils.cpp_extension import CUDAExtension + +from setuptools import find_packages +from setuptools import setup + +requirements = ["torch", "torchvision"] + +def get_extensions(): + this_dir = os.path.dirname(os.path.abspath(__file__)) + extensions_dir = os.path.join(this_dir, "src") + + main_file = glob.glob(os.path.join(extensions_dir, "*.cpp")) + source_cpu = glob.glob(os.path.join(extensions_dir, "cpu", "*.cpp")) + source_cuda = glob.glob(os.path.join(extensions_dir, "cuda", "*.cu")) + + sources = main_file + source_cpu + extension = CppExtension + extra_compile_args = {"cxx": []} + define_macros = [] + + # Force cuda since torch ask for a device, not if cuda is in fact available. + if (os.environ.get('FORCE_CUDA') or torch.cuda.is_available()) and CUDA_HOME is not None: + extension = CUDAExtension + sources += source_cuda + define_macros += [("WITH_CUDA", None)] + extra_compile_args["nvcc"] = [ + "-DCUDA_HAS_FP16=1", + "-D__CUDA_NO_HALF_OPERATORS__", + "-D__CUDA_NO_HALF_CONVERSIONS__", + "-D__CUDA_NO_HALF2_OPERATORS__", + ] + else: + if CUDA_HOME is None: + raise NotImplementedError('CUDA_HOME is None. Please set environment variable CUDA_HOME.') + else: + raise NotImplementedError('No CUDA runtime is found. Please set FORCE_CUDA=1 or test it by running torch.cuda.is_available().') + + sources = [os.path.join(extensions_dir, s) for s in sources] + include_dirs = [extensions_dir] + ext_modules = [ + extension( + "MultiScaleDeformableAttention", + sources, + include_dirs=include_dirs, + define_macros=define_macros, + extra_compile_args=extra_compile_args, + ) + ] + return ext_modules + +setup( + name="MultiScaleDeformableAttention", + version="1.0", + author="Weijie Su", + url="https://github.com/fundamentalvision/Deformable-DETR", + description="PyTorch Wrapper for CUDA Functions of Multi-Scale Deformable Attention", + packages=find_packages(exclude=("configs", "tests",)), + ext_modules=get_extensions(), + cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension}, +) diff --git a/mm_agents/ops/src/cpu/ms_deform_attn_cpu.cpp b/mm_agents/ops/src/cpu/ms_deform_attn_cpu.cpp new file mode 100644 index 0000000..48757e2 --- /dev/null +++ b/mm_agents/ops/src/cpu/ms_deform_attn_cpu.cpp @@ -0,0 +1,46 @@ +/*! +************************************************************************************************** +* Deformable DETR +* Copyright (c) 2020 SenseTime. All Rights Reserved. +* Licensed under the Apache License, Version 2.0 [see LICENSE for details] +************************************************************************************************** +* Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 +************************************************************************************************** +*/ + +/*! +* Copyright (c) Facebook, Inc. and its affiliates. 
+* Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
+*/
+
+#include <vector>
+
+#include <ATen/ATen.h>
+#include <ATen/cuda/CUDAContext.h>
+
+
+at::Tensor
+ms_deform_attn_cpu_forward(
+    const at::Tensor &value,
+    const at::Tensor &spatial_shapes,
+    const at::Tensor &level_start_index,
+    const at::Tensor &sampling_loc,
+    const at::Tensor &attn_weight,
+    const int im2col_step)
+{
+    AT_ERROR("Not implement on cpu");
+}
+
+std::vector<at::Tensor>
+ms_deform_attn_cpu_backward(
+    const at::Tensor &value,
+    const at::Tensor &spatial_shapes,
+    const at::Tensor &level_start_index,
+    const at::Tensor &sampling_loc,
+    const at::Tensor &attn_weight,
+    const at::Tensor &grad_output,
+    const int im2col_step)
+{
+    AT_ERROR("Not implement on cpu");
+}
+
diff --git a/mm_agents/ops/src/cpu/ms_deform_attn_cpu.h b/mm_agents/ops/src/cpu/ms_deform_attn_cpu.h
new file mode 100644
index 0000000..51bb27e
--- /dev/null
+++ b/mm_agents/ops/src/cpu/ms_deform_attn_cpu.h
@@ -0,0 +1,38 @@
+/*!
+**************************************************************************************************
+* Deformable DETR
+* Copyright (c) 2020 SenseTime. All Rights Reserved.
+* Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+**************************************************************************************************
+* Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
+**************************************************************************************************
+*/
+
+/*!
+* Copyright (c) Facebook, Inc. and its affiliates.
+* Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
+*/
+
+#pragma once
+#include <torch/extension.h>
+
+at::Tensor
+ms_deform_attn_cpu_forward(
+    const at::Tensor &value,
+    const at::Tensor &spatial_shapes,
+    const at::Tensor &level_start_index,
+    const at::Tensor &sampling_loc,
+    const at::Tensor &attn_weight,
+    const int im2col_step);
+
+std::vector<at::Tensor>
+ms_deform_attn_cpu_backward(
+    const at::Tensor &value,
+    const at::Tensor &spatial_shapes,
+    const at::Tensor &level_start_index,
+    const at::Tensor &sampling_loc,
+    const at::Tensor &attn_weight,
+    const at::Tensor &grad_output,
+    const int im2col_step);
+
+
diff --git a/mm_agents/ops/src/cuda/ms_deform_attn_cuda.cu b/mm_agents/ops/src/cuda/ms_deform_attn_cuda.cu
new file mode 100644
index 0000000..0c465da
--- /dev/null
+++ b/mm_agents/ops/src/cuda/ms_deform_attn_cuda.cu
@@ -0,0 +1,158 @@
+/*!
+**************************************************************************************************
+* Deformable DETR
+* Copyright (c) 2020 SenseTime. All Rights Reserved.
+* Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+**************************************************************************************************
+* Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
+**************************************************************************************************
+*/
+
+/*!
+* Copyright (c) Facebook, Inc. and its affiliates.
+* Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
+*/
+
+#include <vector>
+#include "cuda/ms_deform_im2col_cuda.cuh"
+
+#include <ATen/ATen.h>
+#include <ATen/cuda/CUDAContext.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+
+at::Tensor ms_deform_attn_cuda_forward(
+    const at::Tensor &value,
+    const at::Tensor &spatial_shapes,
+    const at::Tensor &level_start_index,
+    const at::Tensor &sampling_loc,
+    const at::Tensor &attn_weight,
+    const int im2col_step)
+{
+    AT_ASSERTM(value.is_contiguous(), "value tensor has to be contiguous");
+    AT_ASSERTM(spatial_shapes.is_contiguous(), "spatial_shapes tensor has to be contiguous");
+    AT_ASSERTM(level_start_index.is_contiguous(), "level_start_index tensor has to be contiguous");
+    AT_ASSERTM(sampling_loc.is_contiguous(), "sampling_loc tensor has to be contiguous");
+    AT_ASSERTM(attn_weight.is_contiguous(), "attn_weight tensor has to be contiguous");
+
+    AT_ASSERTM(value.type().is_cuda(), "value must be a CUDA tensor");
+    AT_ASSERTM(spatial_shapes.type().is_cuda(), "spatial_shapes must be a CUDA tensor");
+    AT_ASSERTM(level_start_index.type().is_cuda(), "level_start_index must be a CUDA tensor");
+    AT_ASSERTM(sampling_loc.type().is_cuda(), "sampling_loc must be a CUDA tensor");
+    AT_ASSERTM(attn_weight.type().is_cuda(), "attn_weight must be a CUDA tensor");
+
+    const int batch = value.size(0);
+    const int spatial_size = value.size(1);
+    const int num_heads = value.size(2);
+    const int channels = value.size(3);
+
+    const int num_levels = spatial_shapes.size(0);
+
+    const int num_query = sampling_loc.size(1);
+    const int num_point = sampling_loc.size(4);
+
+    const int im2col_step_ = std::min(batch, im2col_step);
+
+    AT_ASSERTM(batch % im2col_step_ == 0, "batch(%d) must divide im2col_step(%d)", batch, im2col_step_);
+
+    auto output = at::zeros({batch, num_query, num_heads, channels}, value.options());
+
+    const int batch_n = im2col_step_;
+    auto output_n = output.view({batch/im2col_step_, batch_n, num_query, num_heads, channels});
+    auto per_value_size = spatial_size * num_heads * channels;
+    auto per_sample_loc_size = num_query * num_heads * num_levels * num_point * 2;
+    auto per_attn_weight_size = num_query * num_heads * num_levels * num_point;
+    for (int n = 0; n < batch/im2col_step_; ++n)
+    {
+        auto columns = output_n.select(0, n);
+        AT_DISPATCH_FLOATING_TYPES(value.type(), "ms_deform_attn_forward_cuda", ([&] {
+            ms_deformable_im2col_cuda(at::cuda::getCurrentCUDAStream(),
+                value.data<scalar_t>() + n * im2col_step_ * per_value_size,
+                spatial_shapes.data<int64_t>(),
+                level_start_index.data<int64_t>(),
+                sampling_loc.data<scalar_t>() + n * im2col_step_ * per_sample_loc_size,
+                attn_weight.data<scalar_t>() + n * im2col_step_ * per_attn_weight_size,
+                batch_n, spatial_size, num_heads, channels, num_levels, num_query, num_point,
+                columns.data<scalar_t>());
+
+        }));
+    }
+
+    output = output.view({batch, num_query, num_heads*channels});
+
+    return output;
+}
+
+
+std::vector<at::Tensor> ms_deform_attn_cuda_backward(
+    const at::Tensor &value,
+    const at::Tensor &spatial_shapes,
+    const at::Tensor &level_start_index,
+    const at::Tensor &sampling_loc,
+    const at::Tensor &attn_weight,
+    const at::Tensor &grad_output,
+    const int im2col_step)
+{
+
+    AT_ASSERTM(value.is_contiguous(), "value tensor has to be contiguous");
+    AT_ASSERTM(spatial_shapes.is_contiguous(), "spatial_shapes tensor has to be contiguous");
+    AT_ASSERTM(level_start_index.is_contiguous(), "level_start_index tensor has to be contiguous");
+    AT_ASSERTM(sampling_loc.is_contiguous(), "sampling_loc tensor has to be contiguous");
+    AT_ASSERTM(attn_weight.is_contiguous(), "attn_weight tensor has to be contiguous");
+    AT_ASSERTM(grad_output.is_contiguous(), "grad_output tensor has to be contiguous");
+
+    AT_ASSERTM(value.type().is_cuda(), "value must be a CUDA tensor");
+    AT_ASSERTM(spatial_shapes.type().is_cuda(), "spatial_shapes must be a CUDA tensor");
+    AT_ASSERTM(level_start_index.type().is_cuda(), "level_start_index must be a CUDA tensor");
+    AT_ASSERTM(sampling_loc.type().is_cuda(), "sampling_loc must be a CUDA tensor");
+    AT_ASSERTM(attn_weight.type().is_cuda(), "attn_weight must be a CUDA tensor");
+    AT_ASSERTM(grad_output.type().is_cuda(), "grad_output must be a CUDA tensor");
+
+    const int batch = value.size(0);
+    const int spatial_size = value.size(1);
+    const int num_heads = value.size(2);
+    const int channels = value.size(3);
+
+    const int num_levels = spatial_shapes.size(0);
+
+    const int num_query = sampling_loc.size(1);
+    const int num_point = sampling_loc.size(4);
+
+    const int im2col_step_ = std::min(batch, im2col_step);
+
+    AT_ASSERTM(batch % im2col_step_ == 0, "batch(%d) must divide im2col_step(%d)", batch, im2col_step_);
+
+    auto grad_value = at::zeros_like(value);
+    auto grad_sampling_loc = at::zeros_like(sampling_loc);
+    auto grad_attn_weight = at::zeros_like(attn_weight);
+
+    const int batch_n = im2col_step_;
+    auto per_value_size = spatial_size * num_heads * channels;
+    auto per_sample_loc_size = num_query * num_heads * num_levels * num_point * 2;
+    auto per_attn_weight_size = num_query * num_heads * num_levels * num_point;
+    auto grad_output_n = grad_output.view({batch/im2col_step_, batch_n, num_query, num_heads, channels});
+
+    for (int n = 0; n < batch/im2col_step_; ++n)
+    {
+        auto grad_output_g = grad_output_n.select(0, n);
+        AT_DISPATCH_FLOATING_TYPES(value.type(), "ms_deform_attn_backward_cuda", ([&] {
+            ms_deformable_col2im_cuda(at::cuda::getCurrentCUDAStream(),
+                grad_output_g.data<scalar_t>(),
+                value.data<scalar_t>() + n * im2col_step_ * per_value_size,
+                spatial_shapes.data<int64_t>(),
+                level_start_index.data<int64_t>(),
+                sampling_loc.data<scalar_t>() + n * im2col_step_ * per_sample_loc_size,
+                attn_weight.data<scalar_t>() + n * im2col_step_ * per_attn_weight_size,
+                batch_n, spatial_size, num_heads, channels, num_levels, num_query, num_point,
+                grad_value.data<scalar_t>() + n * im2col_step_ * per_value_size,
+                grad_sampling_loc.data<scalar_t>() + n * im2col_step_ * per_sample_loc_size,
+                grad_attn_weight.data<scalar_t>() + n * im2col_step_ * per_attn_weight_size);
+
+        }));
+    }
+
+    return {
+        grad_value, grad_sampling_loc, grad_attn_weight
+    };
+}
\ No newline at end of file
diff --git a/mm_agents/ops/src/cuda/ms_deform_attn_cuda.h b/mm_agents/ops/src/cuda/ms_deform_attn_cuda.h
new file mode 100644
index 0000000..4f0658e
--- /dev/null
+++ b/mm_agents/ops/src/cuda/ms_deform_attn_cuda.h
@@ -0,0 +1,35 @@
+/*!
+**************************************************************************************************
+* Deformable DETR
+* Copyright (c) 2020 SenseTime. All Rights Reserved.
+* Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+**************************************************************************************************
+* Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
+**************************************************************************************************
+*/
+
+/*!
+* Copyright (c) Facebook, Inc. and its affiliates.
+* Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR +*/ + +#pragma once +#include + +at::Tensor ms_deform_attn_cuda_forward( + const at::Tensor &value, + const at::Tensor &spatial_shapes, + const at::Tensor &level_start_index, + const at::Tensor &sampling_loc, + const at::Tensor &attn_weight, + const int im2col_step); + +std::vector ms_deform_attn_cuda_backward( + const at::Tensor &value, + const at::Tensor &spatial_shapes, + const at::Tensor &level_start_index, + const at::Tensor &sampling_loc, + const at::Tensor &attn_weight, + const at::Tensor &grad_output, + const int im2col_step); + diff --git a/mm_agents/ops/src/cuda/ms_deform_im2col_cuda.cuh b/mm_agents/ops/src/cuda/ms_deform_im2col_cuda.cuh new file mode 100644 index 0000000..c04e0d4 --- /dev/null +++ b/mm_agents/ops/src/cuda/ms_deform_im2col_cuda.cuh @@ -0,0 +1,1332 @@ +/*! +************************************************************************** +* Deformable DETR +* Copyright (c) 2020 SenseTime. All Rights Reserved. +* Licensed under the Apache License, Version 2.0 [see LICENSE for details] +************************************************************************** +* Modified from DCN (https://github.com/msracver/Deformable-ConvNets) +* Copyright (c) 2018 Microsoft +************************************************************************** +*/ + +/*! +* Copyright (c) Facebook, Inc. and its affiliates. +* Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR +*/ + +#include +#include +#include + +#include +#include + +#include + +#define CUDA_KERNEL_LOOP(i, n) \ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; \ + i < (n); \ + i += blockDim.x * gridDim.x) + +const int CUDA_NUM_THREADS = 1024; +inline int GET_BLOCKS(const int N, const int num_threads) +{ + return (N + num_threads - 1) / num_threads; +} + + +template +__device__ scalar_t ms_deform_attn_im2col_bilinear(const scalar_t* &bottom_data, + const int &height, const int &width, const int &nheads, const int &channels, + const scalar_t &h, const scalar_t &w, const int &m, const int &c) +{ + const int h_low = floor(h); + const int w_low = floor(w); + const int h_high = h_low + 1; + const int w_high = w_low + 1; + + const scalar_t lh = h - h_low; + const scalar_t lw = w - w_low; + const scalar_t hh = 1 - lh, hw = 1 - lw; + + const int w_stride = nheads * channels; + const int h_stride = width * w_stride; + const int h_low_ptr_offset = h_low * h_stride; + const int h_high_ptr_offset = h_low_ptr_offset + h_stride; + const int w_low_ptr_offset = w_low * w_stride; + const int w_high_ptr_offset = w_low_ptr_offset + w_stride; + const int base_ptr = m * channels + c; + + scalar_t v1 = 0; + if (h_low >= 0 && w_low >= 0) + { + const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr; + v1 = bottom_data[ptr1]; + } + scalar_t v2 = 0; + if (h_low >= 0 && w_high <= width - 1) + { + const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr; + v2 = bottom_data[ptr2]; + } + scalar_t v3 = 0; + if (h_high <= height - 1 && w_low >= 0) + { + const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr; + v3 = bottom_data[ptr3]; + } + scalar_t v4 = 0; + if (h_high <= height - 1 && w_high <= width - 1) + { + const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr; + v4 = bottom_data[ptr4]; + } + + const scalar_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw; + + const scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); + return val; +} + + +template +__device__ void 
+template <typename scalar_t>
+__device__ void ms_deform_attn_col2im_bilinear(const scalar_t* &bottom_data,
+                                               const int &height, const int &width, const int &nheads, const int &channels,
+                                               const scalar_t &h, const scalar_t &w, const int &m, const int &c,
+                                               const scalar_t &top_grad,
+                                               const scalar_t &attn_weight,
+                                               scalar_t* &grad_value,
+                                               scalar_t* grad_sampling_loc,
+                                               scalar_t* grad_attn_weight)
+{
+  const int h_low = floor(h);
+  const int w_low = floor(w);
+  const int h_high = h_low + 1;
+  const int w_high = w_low + 1;
+
+  const scalar_t lh = h - h_low;
+  const scalar_t lw = w - w_low;
+  const scalar_t hh = 1 - lh, hw = 1 - lw;
+
+  const int w_stride = nheads * channels;
+  const int h_stride = width * w_stride;
+  const int h_low_ptr_offset = h_low * h_stride;
+  const int h_high_ptr_offset = h_low_ptr_offset + h_stride;
+  const int w_low_ptr_offset = w_low * w_stride;
+  const int w_high_ptr_offset = w_low_ptr_offset + w_stride;
+  const int base_ptr = m * channels + c;
+
+  const scalar_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;
+  const scalar_t top_grad_value = top_grad * attn_weight;
+  scalar_t grad_h_weight = 0, grad_w_weight = 0;
+
+  scalar_t v1 = 0;
+  if (h_low >= 0 && w_low >= 0)
+  {
+    const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr;
+    v1 = bottom_data[ptr1];
+    grad_h_weight -= hw * v1;
+    grad_w_weight -= hh * v1;
+    atomicAdd(grad_value+ptr1, w1*top_grad_value);
+  }
+  scalar_t v2 = 0;
+  if (h_low >= 0 && w_high <= width - 1)
+  {
+    const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr;
+    v2 = bottom_data[ptr2];
+    grad_h_weight -= lw * v2;
+    grad_w_weight += hh * v2;
+    atomicAdd(grad_value+ptr2, w2*top_grad_value);
+  }
+  scalar_t v3 = 0;
+  if (h_high <= height - 1 && w_low >= 0)
+  {
+    const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr;
+    v3 = bottom_data[ptr3];
+    grad_h_weight += hw * v3;
+    grad_w_weight -= lh * v3;
+    atomicAdd(grad_value+ptr3, w3*top_grad_value);
+  }
+  scalar_t v4 = 0;
+  if (h_high <= height - 1 && w_high <= width - 1)
+  {
+    const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr;
+    v4 = bottom_data[ptr4];
+    grad_h_weight += lw * v4;
+    grad_w_weight += lh * v4;
+    atomicAdd(grad_value+ptr4, w4*top_grad_value);
+  }
+
+  const scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
+  *grad_attn_weight = top_grad * val;
+  *grad_sampling_loc = width * grad_w_weight * top_grad_value;
+  *(grad_sampling_loc + 1) = height * grad_h_weight * top_grad_value;
+}
+
+
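+// Same computation as ms_deform_attn_col2im_bilinear, but the location and
+// attention-weight gradients are also accumulated with atomicAdd ("gm" =
+// global memory), so several threads may safely target the same slot.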
+template <typename scalar_t>
+__device__ void ms_deform_attn_col2im_bilinear_gm(const scalar_t* &bottom_data,
+                                                  const int &height, const int &width, const int &nheads, const int &channels,
+                                                  const scalar_t &h, const scalar_t &w, const int &m, const int &c,
+                                                  const scalar_t &top_grad,
+                                                  const scalar_t &attn_weight,
+                                                  scalar_t* &grad_value,
+                                                  scalar_t* grad_sampling_loc,
+                                                  scalar_t* grad_attn_weight)
+{
+  const int h_low = floor(h);
+  const int w_low = floor(w);
+  const int h_high = h_low + 1;
+  const int w_high = w_low + 1;
+
+  const scalar_t lh = h - h_low;
+  const scalar_t lw = w - w_low;
+  const scalar_t hh = 1 - lh, hw = 1 - lw;
+
+  const int w_stride = nheads * channels;
+  const int h_stride = width * w_stride;
+  const int h_low_ptr_offset = h_low * h_stride;
+  const int h_high_ptr_offset = h_low_ptr_offset + h_stride;
+  const int w_low_ptr_offset = w_low * w_stride;
+  const int w_high_ptr_offset = w_low_ptr_offset + w_stride;
+  const int base_ptr = m * channels + c;
+
+  const scalar_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;
+  const scalar_t top_grad_value = top_grad * attn_weight;
+  scalar_t grad_h_weight = 0, grad_w_weight = 0;
+
+  scalar_t v1 = 0;
+  if (h_low >= 0 && w_low >= 0)
+  {
+    const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr;
+    v1 = bottom_data[ptr1];
+    grad_h_weight -= hw * v1;
+    grad_w_weight -= hh * v1;
+    atomicAdd(grad_value+ptr1, w1*top_grad_value);
+  }
+  scalar_t v2 = 0;
+  if (h_low >= 0 && w_high <= width - 1)
+  {
+    const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr;
+    v2 = bottom_data[ptr2];
+    grad_h_weight -= lw * v2;
+    grad_w_weight += hh * v2;
+    atomicAdd(grad_value+ptr2, w2*top_grad_value);
+  }
+  scalar_t v3 = 0;
+  if (h_high <= height - 1 && w_low >= 0)
+  {
+    const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr;
+    v3 = bottom_data[ptr3];
+    grad_h_weight += hw * v3;
+    grad_w_weight -= lh * v3;
+    atomicAdd(grad_value+ptr3, w3*top_grad_value);
+  }
+  scalar_t v4 = 0;
+  if (h_high <= height - 1 && w_high <= width - 1)
+  {
+    const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr;
+    v4 = bottom_data[ptr4];
+    grad_h_weight += lw * v4;
+    grad_w_weight += lh * v4;
+    atomicAdd(grad_value+ptr4, w4*top_grad_value);
+  }
+
+  const scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
+  atomicAdd(grad_attn_weight, top_grad * val);
+  atomicAdd(grad_sampling_loc, width * grad_w_weight * top_grad_value);
+  atomicAdd(grad_sampling_loc + 1, height * grad_h_weight * top_grad_value);
+}
+
+
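+// Forward kernel: one thread per output element, indexed over
+// (batch, query, head, channel); each thread loops over all levels and
+// sampling points, bilinearly samples the value tensor and accumulates the
+// attention-weighted results.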
+template <typename scalar_t>
+__global__ void ms_deformable_im2col_gpu_kernel(
+    const int n, const scalar_t *data_value,
+    const int64_t *data_spatial_shapes, const int64_t *data_level_start_index,
+    const scalar_t *data_sampling_loc, const scalar_t *data_attn_weight,
+    const int batch_size, const int spatial_size, const int num_heads, const int channels,
+    const int num_levels, const int num_query, const int num_point,
+    scalar_t *data_col)
+{
+  CUDA_KERNEL_LOOP(index, n)
+  {
+    int _temp = index;
+    const int c_col = _temp % channels;
+    _temp /= channels;
+    const int sampling_index = _temp;
+    const int m_col = _temp % num_heads;
+    _temp /= num_heads;
+    const int q_col = _temp % num_query;
+    _temp /= num_query;
+    const int b_col = _temp;
+
+    scalar_t *data_col_ptr = data_col + index;
+    int data_weight_ptr = sampling_index * num_levels * num_point;
+    int data_loc_w_ptr = data_weight_ptr << 1;
+    const int qid_stride = num_heads * channels;
+    const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;
+    scalar_t col = 0;
+
+    for (int l_col=0; l_col < num_levels; ++l_col)
+    {
+      const int level_start_id = data_level_start_index[l_col];
+      const int spatial_h_ptr = l_col << 1;
+      const int spatial_h = data_spatial_shapes[spatial_h_ptr];
+      const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
+      const scalar_t *data_value_ptr = data_value + (data_value_ptr_init_offset + level_start_id * qid_stride);
+      for (int p_col=0; p_col < num_point; ++p_col)
+      {
+        const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];
+        const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
+        const scalar_t weight = data_attn_weight[data_weight_ptr];
+
+        const scalar_t h_im = loc_h * spatial_h - 0.5;
+        const scalar_t w_im = loc_w * spatial_w - 0.5;
+
+        if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w)
+        {
+          col += ms_deform_attn_im2col_bilinear(data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col) * weight;
+        }
+
+        data_weight_ptr += 1;
+        data_loc_w_ptr += 2;
+      }
+    }
+    *data_col_ptr = col;
+  }
+}
+
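+// Backward kernel for small channel counts: per-thread partial gradients are
+// staged in statically sized shared-memory caches (blockSize is a template
+// parameter) and thread 0 reduces them serially before writing the result.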
+template <typename scalar_t, unsigned int blockSize>
+__global__ void ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1(
+    const int n, const scalar_t *grad_col, const scalar_t *data_value,
+    const int64_t *data_spatial_shapes, const int64_t *data_level_start_index,
+    const scalar_t *data_sampling_loc, const scalar_t *data_attn_weight,
+    const int batch_size, const int spatial_size, const int num_heads, const int channels,
+    const int num_levels, const int num_query, const int num_point,
+    scalar_t *grad_value, scalar_t *grad_sampling_loc, scalar_t *grad_attn_weight)
+{
+  CUDA_KERNEL_LOOP(index, n)
+  {
+    __shared__ scalar_t cache_grad_sampling_loc[blockSize * 2];
+    __shared__ scalar_t cache_grad_attn_weight[blockSize];
+    unsigned int tid = threadIdx.x;
+    int _temp = index;
+    const int c_col = _temp % channels;
+    _temp /= channels;
+    const int sampling_index = _temp;
+    const int m_col = _temp % num_heads;
+    _temp /= num_heads;
+    const int q_col = _temp % num_query;
+    _temp /= num_query;
+    const int b_col = _temp;
+
+    const scalar_t top_grad = grad_col[index];
+
+    int data_weight_ptr = sampling_index * num_levels * num_point;
+    int data_loc_w_ptr = data_weight_ptr << 1;
+    const int grad_sampling_ptr = data_weight_ptr;
+    grad_sampling_loc += grad_sampling_ptr << 1;
+    grad_attn_weight += grad_sampling_ptr;
+    const int grad_weight_stride = 1;
+    const int grad_loc_stride = 2;
+    const int qid_stride = num_heads * channels;
+    const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;
+
+    for (int l_col=0; l_col < num_levels; ++l_col)
+    {
+      const int level_start_id = data_level_start_index[l_col];
+      const int spatial_h_ptr = l_col << 1;
+      const int spatial_h = data_spatial_shapes[spatial_h_ptr];
+      const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
+      const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride;
+      const scalar_t *data_value_ptr = data_value + value_ptr_offset;
+      scalar_t *grad_value_ptr = grad_value + value_ptr_offset;
+
+      for (int p_col=0; p_col < num_point; ++p_col)
+      {
+        const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];
+        const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
+        const scalar_t weight = data_attn_weight[data_weight_ptr];
+
+        const scalar_t h_im = loc_h * spatial_h - 0.5;
+        const scalar_t w_im = loc_w * spatial_w - 0.5;
+        *(cache_grad_sampling_loc+(threadIdx.x << 1)) = 0;
+        *(cache_grad_sampling_loc+((threadIdx.x << 1) + 1)) = 0;
+        *(cache_grad_attn_weight+threadIdx.x)=0;
+        if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w)
+        {
+          ms_deform_attn_col2im_bilinear(
+            data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col,
+            top_grad, weight, grad_value_ptr,
+            cache_grad_sampling_loc+(threadIdx.x << 1), cache_grad_attn_weight+threadIdx.x);
+        }
+
+        __syncthreads();
+        if (tid == 0)
+        {
+          scalar_t _grad_w=cache_grad_sampling_loc[0], _grad_h=cache_grad_sampling_loc[1], _grad_a=cache_grad_attn_weight[0];
+          int sid=2;
+          for (unsigned int tid = 1; tid < blockSize; ++tid)
+          {
+            _grad_w += cache_grad_sampling_loc[sid];
+            _grad_h += cache_grad_sampling_loc[sid + 1];
+            _grad_a += cache_grad_attn_weight[tid];
+            sid += 2;
+          }
+
+          *grad_sampling_loc = _grad_w;
+          *(grad_sampling_loc + 1) = _grad_h;
+          *grad_attn_weight = _grad_a;
+        }
+        __syncthreads();
+
+        data_weight_ptr += 1;
+        data_loc_w_ptr += 2;
+        grad_attn_weight += grad_weight_stride;
+        grad_sampling_loc += grad_loc_stride;
+      }
+    }
+  }
+}
+
+
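+// Variant of the kernel above that replaces the serial reduction with a
+// shared-memory tree reduction; blockSize must be a power of two for the
+// halving loop to cover every element.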
+template <typename scalar_t, unsigned int blockSize>
+__global__ void ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2(
+    const int n, const scalar_t *grad_col, const scalar_t *data_value,
+    const int64_t *data_spatial_shapes, const int64_t *data_level_start_index,
+    const scalar_t *data_sampling_loc, const scalar_t *data_attn_weight,
+    const int batch_size, const int spatial_size, const int num_heads, const int channels,
+    const int num_levels, const int num_query, const int num_point,
+    scalar_t *grad_value, scalar_t *grad_sampling_loc, scalar_t *grad_attn_weight)
+{
+  CUDA_KERNEL_LOOP(index, n)
+  {
+    __shared__ scalar_t cache_grad_sampling_loc[blockSize * 2];
+    __shared__ scalar_t cache_grad_attn_weight[blockSize];
+    unsigned int tid = threadIdx.x;
+    int _temp = index;
+    const int c_col = _temp % channels;
+    _temp /= channels;
+    const int sampling_index = _temp;
+    const int m_col = _temp % num_heads;
+    _temp /= num_heads;
+    const int q_col = _temp % num_query;
+    _temp /= num_query;
+    const int b_col = _temp;
+
+    const scalar_t top_grad = grad_col[index];
+
+    int data_weight_ptr = sampling_index * num_levels * num_point;
+    int data_loc_w_ptr = data_weight_ptr << 1;
+    const int grad_sampling_ptr = data_weight_ptr;
+    grad_sampling_loc += grad_sampling_ptr << 1;
+    grad_attn_weight += grad_sampling_ptr;
+    const int grad_weight_stride = 1;
+    const int grad_loc_stride = 2;
+    const int qid_stride = num_heads * channels;
+    const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;
+
+    for (int l_col=0; l_col < num_levels; ++l_col)
+    {
+      const int level_start_id = data_level_start_index[l_col];
+      const int spatial_h_ptr = l_col << 1;
+      const int spatial_h = data_spatial_shapes[spatial_h_ptr];
+      const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
+      const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride;
+      const scalar_t *data_value_ptr = data_value + value_ptr_offset;
+      scalar_t *grad_value_ptr = grad_value + value_ptr_offset;
+
+      for (int p_col=0; p_col < num_point; ++p_col)
+      {
+        const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];
+        const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
+        const scalar_t weight = data_attn_weight[data_weight_ptr];
+
+        const scalar_t h_im = loc_h * spatial_h - 0.5;
+        const scalar_t w_im = loc_w * spatial_w - 0.5;
+        *(cache_grad_sampling_loc+(threadIdx.x << 1)) = 0;
+        *(cache_grad_sampling_loc+((threadIdx.x << 1) + 1)) = 0;
+        *(cache_grad_attn_weight+threadIdx.x)=0;
+        if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w)
+        {
+          ms_deform_attn_col2im_bilinear(
+            data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col,
+            top_grad, weight, grad_value_ptr,
+            cache_grad_sampling_loc+(threadIdx.x << 1), cache_grad_attn_weight+threadIdx.x);
+        }
+
+        __syncthreads();
+
+        for (unsigned int s=blockSize/2; s>0; s>>=1)
+        {
+          if (tid < s) {
+            const unsigned int xid1 = tid << 1;
+            const unsigned int xid2 = (tid + s) << 1;
+            cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + s];
+            cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2];
+            cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1];
+          }
+          __syncthreads();
+        }
+
+        if (tid == 0)
+        {
+          *grad_sampling_loc = cache_grad_sampling_loc[0];
+          *(grad_sampling_loc + 1) = cache_grad_sampling_loc[1];
+          *grad_attn_weight = cache_grad_attn_weight[0];
+        }
+        __syncthreads();
+
+        data_weight_ptr += 1;
+        data_loc_w_ptr += 2;
+        grad_attn_weight += grad_weight_stride;
+        grad_sampling_loc += grad_loc_stride;
+      }
+    }
+  }
+}
+
+
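+// Fallback of the "_v1" serial reduction for block sizes only known at
+// runtime: the caches live in dynamically allocated shared memory and
+// thread 0 reduces blockDim.x partial results serially.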
+template <typename scalar_t>
+__global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v1(
+    const int n, const scalar_t *grad_col, const scalar_t *data_value,
+    const int64_t *data_spatial_shapes, const int64_t *data_level_start_index,
+    const scalar_t *data_sampling_loc, const scalar_t *data_attn_weight,
+    const int batch_size, const int spatial_size, const int num_heads, const int channels,
+    const int num_levels, const int num_query, const int num_point,
+    scalar_t *grad_value, scalar_t *grad_sampling_loc, scalar_t *grad_attn_weight)
+{
+  CUDA_KERNEL_LOOP(index, n)
+  {
+    extern __shared__ int _s[];
+    scalar_t* cache_grad_sampling_loc = (scalar_t*)_s;
+    scalar_t* cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x;
+    unsigned int tid = threadIdx.x;
+    int _temp = index;
+    const int c_col = _temp % channels;
+    _temp /= channels;
+    const int sampling_index = _temp;
+    const int m_col = _temp % num_heads;
+    _temp /= num_heads;
+    const int q_col = _temp % num_query;
+    _temp /= num_query;
+    const int b_col = _temp;
+
+    const scalar_t top_grad = grad_col[index];
+
+    int data_weight_ptr = sampling_index * num_levels * num_point;
+    int data_loc_w_ptr = data_weight_ptr << 1;
+    const int grad_sampling_ptr = data_weight_ptr;
+    grad_sampling_loc += grad_sampling_ptr << 1;
+    grad_attn_weight += grad_sampling_ptr;
+    const int grad_weight_stride = 1;
+    const int grad_loc_stride = 2;
+    const int qid_stride = num_heads * channels;
+    const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;
+
+    for (int l_col=0; l_col < num_levels; ++l_col)
+    {
+      const int level_start_id = data_level_start_index[l_col];
+      const int spatial_h_ptr = l_col << 1;
+      const int spatial_h = data_spatial_shapes[spatial_h_ptr];
+      const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
+      const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride;
+      const scalar_t *data_value_ptr = data_value + value_ptr_offset;
+      scalar_t *grad_value_ptr = grad_value + value_ptr_offset;
+
+      for (int p_col=0; p_col < num_point; ++p_col)
+      {
+        const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];
+        const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
+        const scalar_t weight = data_attn_weight[data_weight_ptr];
+
+        const scalar_t h_im = loc_h * spatial_h - 0.5;
+        const scalar_t w_im = loc_w * spatial_w - 0.5;
+        *(cache_grad_sampling_loc+(threadIdx.x << 1)) = 0;
+        *(cache_grad_sampling_loc+((threadIdx.x << 1) + 1)) = 0;
+        *(cache_grad_attn_weight+threadIdx.x)=0;
+        if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w)
+        {
+          ms_deform_attn_col2im_bilinear(
+            data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col,
+            top_grad, weight, grad_value_ptr,
+            cache_grad_sampling_loc+(threadIdx.x << 1), cache_grad_attn_weight+threadIdx.x);
+        }
+
+        __syncthreads();
+        if (tid == 0)
+        {
+          scalar_t _grad_w=cache_grad_sampling_loc[0], _grad_h=cache_grad_sampling_loc[1], _grad_a=cache_grad_attn_weight[0];
+          int sid=2;
+          for (unsigned int tid = 1; tid < blockDim.x; ++tid)
+          {
+            _grad_w += cache_grad_sampling_loc[sid];
+            _grad_h += cache_grad_sampling_loc[sid + 1];
+            _grad_a += cache_grad_attn_weight[tid];
+            sid += 2;
+          }
+
+          *grad_sampling_loc = _grad_w;
+          *(grad_sampling_loc + 1) = _grad_h;
+          *grad_attn_weight = _grad_a;
+        }
+        __syncthreads();
+
+        data_weight_ptr += 1;
+        data_loc_w_ptr += 2;
+        grad_attn_weight += grad_weight_stride;
+        grad_sampling_loc += grad_loc_stride;
+      }
+    }
+  }
+}
+
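+// Runtime-sized tree reduction: the extra "spre" bookkeeping folds in the
+// trailing element whenever the remaining width is odd, so blockDim.x does
+// not have to be a power of two.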
+template <typename scalar_t>
+__global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v2(
+    const int n, const scalar_t *grad_col, const scalar_t *data_value,
+    const int64_t *data_spatial_shapes, const int64_t *data_level_start_index,
+    const scalar_t *data_sampling_loc, const scalar_t *data_attn_weight,
+    const int batch_size, const int spatial_size, const int num_heads, const int channels,
+    const int num_levels, const int num_query, const int num_point,
+    scalar_t *grad_value, scalar_t *grad_sampling_loc, scalar_t *grad_attn_weight)
+{
+  CUDA_KERNEL_LOOP(index, n)
+  {
+    extern __shared__ int _s[];
+    scalar_t* cache_grad_sampling_loc = (scalar_t*)_s;
+    scalar_t* cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x;
+    unsigned int tid = threadIdx.x;
+    int _temp = index;
+    const int c_col = _temp % channels;
+    _temp /= channels;
+    const int sampling_index = _temp;
+    const int m_col = _temp % num_heads;
+    _temp /= num_heads;
+    const int q_col = _temp % num_query;
+    _temp /= num_query;
+    const int b_col = _temp;
+
+    const scalar_t top_grad = grad_col[index];
+
+    int data_weight_ptr = sampling_index * num_levels * num_point;
+    int data_loc_w_ptr = data_weight_ptr << 1;
+    const int grad_sampling_ptr = data_weight_ptr;
+    grad_sampling_loc += grad_sampling_ptr << 1;
+    grad_attn_weight += grad_sampling_ptr;
+    const int grad_weight_stride = 1;
+    const int grad_loc_stride = 2;
+    const int qid_stride = num_heads * channels;
+    const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;
+
+    for (int l_col=0; l_col < num_levels; ++l_col)
+    {
+      const int level_start_id = data_level_start_index[l_col];
+      const int spatial_h_ptr = l_col << 1;
+      const int spatial_h = data_spatial_shapes[spatial_h_ptr];
+      const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
+      const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride;
+      const scalar_t *data_value_ptr = data_value + value_ptr_offset;
+      scalar_t *grad_value_ptr = grad_value + value_ptr_offset;
+
+      for (int p_col=0; p_col < num_point; ++p_col)
+      {
+        const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];
+        const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
+        const scalar_t weight = data_attn_weight[data_weight_ptr];
+
+        const scalar_t h_im = loc_h * spatial_h - 0.5;
+        const scalar_t w_im = loc_w * spatial_w - 0.5;
+        *(cache_grad_sampling_loc+(threadIdx.x << 1)) = 0;
+        *(cache_grad_sampling_loc+((threadIdx.x << 1) + 1)) = 0;
+        *(cache_grad_attn_weight+threadIdx.x)=0;
+        if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w)
+        {
+          ms_deform_attn_col2im_bilinear(
+            data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col,
+            top_grad, weight, grad_value_ptr,
+            cache_grad_sampling_loc+(threadIdx.x << 1), cache_grad_attn_weight+threadIdx.x);
+        }
+
+        __syncthreads();
+
+        for (unsigned int s=blockDim.x/2, spre=blockDim.x; s>0; s>>=1, spre>>=1)
+        {
+          if (tid < s) {
+            const unsigned int xid1 = tid << 1;
+            const unsigned int xid2 = (tid + s) << 1;
+            cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + s];
+            cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2];
+            cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1];
+            if (tid + (s << 1) < spre)
+            {
+              cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + (s << 1)];
+              cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2 + (s << 1)];
+              cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1 + (s << 1)];
+            }
+          }
+          __syncthreads();
+        }
+
+        if (tid == 0)
+        {
+          *grad_sampling_loc = cache_grad_sampling_loc[0];
+          *(grad_sampling_loc + 1) = cache_grad_sampling_loc[1];
+          *grad_attn_weight = cache_grad_attn_weight[0];
+        }
+        __syncthreads();
+
+        data_weight_ptr += 1;
+        data_loc_w_ptr += 2;
+        grad_attn_weight += grad_weight_stride;
+        grad_sampling_loc += grad_loc_stride;
+      }
+    }
+  }
+}
+
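+// Same tree reduction as "_shm_reduce_v2", but the per-block results are
+// committed with atomicAdd, because several blocks may write gradients for
+// the same sampling location when channels exceed the block size.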
+template <typename scalar_t>
+__global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v2_multi_blocks(
+    const int n, const scalar_t *grad_col, const scalar_t *data_value,
+    const int64_t *data_spatial_shapes, const int64_t *data_level_start_index,
+    const scalar_t *data_sampling_loc, const scalar_t *data_attn_weight,
+    const int batch_size, const int spatial_size, const int num_heads, const int channels,
+    const int num_levels, const int num_query, const int num_point,
+    scalar_t *grad_value, scalar_t *grad_sampling_loc, scalar_t *grad_attn_weight)
+{
+  CUDA_KERNEL_LOOP(index, n)
+  {
+    extern __shared__ int _s[];
+    scalar_t* cache_grad_sampling_loc = (scalar_t*)_s;
+    scalar_t* cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x;
+    unsigned int tid = threadIdx.x;
+    int _temp = index;
+    const int c_col = _temp % channels;
+    _temp /= channels;
+    const int sampling_index = _temp;
+    const int m_col = _temp % num_heads;
+    _temp /= num_heads;
+    const int q_col = _temp % num_query;
+    _temp /= num_query;
+    const int b_col = _temp;
+
+    const scalar_t top_grad = grad_col[index];
+
+    int data_weight_ptr = sampling_index * num_levels * num_point;
+    int data_loc_w_ptr = data_weight_ptr << 1;
+    const int grad_sampling_ptr = data_weight_ptr;
+    grad_sampling_loc += grad_sampling_ptr << 1;
+    grad_attn_weight += grad_sampling_ptr;
+    const int grad_weight_stride = 1;
+    const int grad_loc_stride = 2;
+    const int qid_stride = num_heads * channels;
+    const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;
+
+    for (int l_col=0; l_col < num_levels; ++l_col)
+    {
+      const int level_start_id = data_level_start_index[l_col];
+      const int spatial_h_ptr = l_col << 1;
+      const int spatial_h = data_spatial_shapes[spatial_h_ptr];
+      const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
+      const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride;
+      const scalar_t *data_value_ptr = data_value + value_ptr_offset;
+      scalar_t *grad_value_ptr = grad_value + value_ptr_offset;
+
+      for (int p_col=0; p_col < num_point; ++p_col)
+      {
+        const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];
+        const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
+        const scalar_t weight = data_attn_weight[data_weight_ptr];
+
+        const scalar_t h_im = loc_h * spatial_h - 0.5;
+        const scalar_t w_im = loc_w * spatial_w - 0.5;
+        *(cache_grad_sampling_loc+(threadIdx.x << 1)) = 0;
+        *(cache_grad_sampling_loc+((threadIdx.x << 1) + 1)) = 0;
+        *(cache_grad_attn_weight+threadIdx.x)=0;
+        if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w)
+        {
+          ms_deform_attn_col2im_bilinear(
+            data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col,
+            top_grad, weight, grad_value_ptr,
+            cache_grad_sampling_loc+(threadIdx.x << 1), cache_grad_attn_weight+threadIdx.x);
+        }
+
+        __syncthreads();
+
+        for (unsigned int s=blockDim.x/2, spre=blockDim.x; s>0; s>>=1, spre>>=1)
+        {
+          if (tid < s) {
+            const unsigned int xid1 = tid << 1;
+            const unsigned int xid2 = (tid + s) << 1;
+            cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + s];
+            cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2];
+            cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1];
+            if (tid + (s << 1) < spre)
+            {
+              cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + (s << 1)];
+              cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2 + (s << 1)];
+              cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1 + (s << 1)];
+            }
+          }
+          __syncthreads();
+        }
+
+        if (tid == 0)
+        {
+          atomicAdd(grad_sampling_loc, cache_grad_sampling_loc[0]);
+          atomicAdd(grad_sampling_loc + 1, cache_grad_sampling_loc[1]);
+          atomicAdd(grad_attn_weight, cache_grad_attn_weight[0]);
+        }
+        __syncthreads();
+
+        data_weight_ptr += 1;
+        data_loc_w_ptr += 2;
+        grad_attn_weight += grad_weight_stride;
+        grad_sampling_loc += grad_loc_stride;
+      }
+    }
+  }
+}
+
+
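+// Reduction-free fallback: every thread accumulates all three gradients
+// directly in global memory through ms_deform_attn_col2im_bilinear_gm.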
+template <typename scalar_t>
+__global__ void ms_deformable_col2im_gpu_kernel_gm(
+    const int n, const scalar_t *grad_col, const scalar_t *data_value,
+    const int64_t *data_spatial_shapes, const int64_t *data_level_start_index,
+    const scalar_t *data_sampling_loc, const scalar_t *data_attn_weight,
+    const int batch_size, const int spatial_size, const int num_heads, const int channels,
+    const int num_levels, const int num_query, const int num_point,
+    scalar_t *grad_value, scalar_t *grad_sampling_loc, scalar_t *grad_attn_weight)
+{
+  CUDA_KERNEL_LOOP(index, n)
+  {
+    int _temp = index;
+    const int c_col = _temp % channels;
+    _temp /= channels;
+    const int sampling_index = _temp;
+    const int m_col = _temp % num_heads;
+    _temp /= num_heads;
+    const int q_col = _temp % num_query;
+    _temp /= num_query;
+    const int b_col = _temp;
+
+    const scalar_t top_grad = grad_col[index];
+
+    int data_weight_ptr = sampling_index * num_levels * num_point;
+    int data_loc_w_ptr = data_weight_ptr << 1;
+    const int grad_sampling_ptr = data_weight_ptr;
+    grad_sampling_loc += grad_sampling_ptr << 1;
+    grad_attn_weight += grad_sampling_ptr;
+    const int grad_weight_stride = 1;
+    const int grad_loc_stride = 2;
+    const int qid_stride = num_heads * channels;
+    const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;
+
+    for (int l_col=0; l_col < num_levels; ++l_col)
+    {
+      const int level_start_id = data_level_start_index[l_col];
+      const int spatial_h_ptr = l_col << 1;
+      const int spatial_h = data_spatial_shapes[spatial_h_ptr];
+      const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
+      const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride;
+      const scalar_t *data_value_ptr = data_value + value_ptr_offset;
+      scalar_t *grad_value_ptr = grad_value + value_ptr_offset;
+
+      for (int p_col=0; p_col < num_point; ++p_col)
+      {
+        const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];
+        const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
+        const scalar_t weight = data_attn_weight[data_weight_ptr];
+
+        const scalar_t h_im = loc_h * spatial_h - 0.5;
+        const scalar_t w_im = loc_w * spatial_w - 0.5;
+        if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w)
+        {
+          ms_deform_attn_col2im_bilinear_gm(
+            data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col,
+            top_grad, weight, grad_value_ptr,
+            grad_sampling_loc, grad_attn_weight);
+        }
+        data_weight_ptr += 1;
+        data_loc_w_ptr += 2;
+        grad_attn_weight += grad_weight_stride;
+        grad_sampling_loc += grad_loc_stride;
+      }
+    }
+  }
+}
+
+
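+// Host-side launcher for the forward pass: one thread per output element,
+// grid-sized with GET_BLOCKS and followed by a cudaGetLastError check.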
+template <typename scalar_t>
+void ms_deformable_im2col_cuda(cudaStream_t stream,
+                               const scalar_t* data_value,
+                               const int64_t* data_spatial_shapes,
+                               const int64_t* data_level_start_index,
+                               const scalar_t* data_sampling_loc,
+                               const scalar_t* data_attn_weight,
+                               const int batch_size, const int spatial_size,
+                               const int num_heads, const int channels,
+                               const int num_levels, const int num_query, const int num_point,
+                               scalar_t* data_col)
+{
+  const int num_kernels = batch_size * num_query * num_heads * channels;
+  const int num_actual_kernels = batch_size * num_query * num_heads * channels;
+  const int num_threads = CUDA_NUM_THREADS;
+  ms_deformable_im2col_gpu_kernel<scalar_t>
+    <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0, stream>>>(
+      num_kernels, data_value, data_spatial_shapes, data_level_start_index, data_sampling_loc, data_attn_weight,
+      batch_size, spatial_size, num_heads, channels, num_levels, num_query, num_point, data_col);
+
+  cudaError_t err = cudaGetLastError();
+  if (err != cudaSuccess)
+  {
+    printf("error in ms_deformable_im2col_cuda: %s\n", cudaGetErrorString(err));
+  }
+
+}
+
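+// Host-side launcher for the backward pass. The block size equals the channel
+// count capped at CUDA_NUM_THREADS. Channel counts above 1024 use the
+// multi-block shared-memory kernel when divisible by 1024 and the global
+// memory fallback otherwise; power-of-two counts up to 1024 dispatch to
+// template-specialized reductions, and all remaining counts use the
+// runtime-sized shared-memory kernels.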
+template <typename scalar_t>
+void ms_deformable_col2im_cuda(cudaStream_t stream,
+                               const scalar_t* grad_col,
+                               const scalar_t* data_value,
+                               const int64_t * data_spatial_shapes,
+                               const int64_t * data_level_start_index,
+                               const scalar_t * data_sampling_loc,
+                               const scalar_t * data_attn_weight,
+                               const int batch_size, const int spatial_size,
+                               const int num_heads, const int channels,
+                               const int num_levels, const int num_query, const int num_point,
+                               scalar_t* grad_value,
+                               scalar_t* grad_sampling_loc,
+                               scalar_t* grad_attn_weight)
+{
+  const int num_threads = (channels > CUDA_NUM_THREADS)?CUDA_NUM_THREADS:channels;
+  const int num_kernels = batch_size * num_query * num_heads * channels;
+  const int num_actual_kernels = batch_size * num_query * num_heads * channels;
+  if (channels > 1024)
+  {
+    if ((channels & 1023) == 0)
+    {
+      ms_deformable_col2im_gpu_kernel_shm_reduce_v2_multi_blocks<scalar_t>
+        <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, num_threads*3*sizeof(scalar_t), stream>>>(
+          num_kernels, grad_col, data_value, data_spatial_shapes, data_level_start_index,
+          data_sampling_loc, data_attn_weight, batch_size, spatial_size, num_heads, channels,
+          num_levels, num_query, num_point, grad_value, grad_sampling_loc, grad_attn_weight);
+    }
+    else
+    {
+      ms_deformable_col2im_gpu_kernel_gm<scalar_t>
+        <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0, stream>>>(
+          num_kernels, grad_col, data_value, data_spatial_shapes, data_level_start_index,
+          data_sampling_loc, data_attn_weight, batch_size, spatial_size, num_heads, channels,
+          num_levels, num_query, num_point, grad_value, grad_sampling_loc, grad_attn_weight);
+    }
+  }
+  else
+  {
+    switch(channels)
+    {
+      case 1:
+        ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t, 1>
+          <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0, stream>>>(
+            num_kernels, grad_col, data_value, data_spatial_shapes, data_level_start_index,
+            data_sampling_loc, data_attn_weight, batch_size, spatial_size, num_heads, channels,
+            num_levels, num_query, num_point, grad_value, grad_sampling_loc, grad_attn_weight);
+        break;
+      case 2:
+        ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t, 2>
+          <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0, stream>>>(
+            num_kernels, grad_col, data_value, data_spatial_shapes, data_level_start_index,
+            data_sampling_loc, data_attn_weight, batch_size, spatial_size, num_heads, channels,
+            num_levels, num_query, num_point, grad_value, grad_sampling_loc, grad_attn_weight);
+        break;
+      case 4:
+        ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t, 4>
+          <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0, stream>>>(
+            num_kernels, grad_col, data_value, data_spatial_shapes, data_level_start_index,
+            data_sampling_loc, data_attn_weight, batch_size, spatial_size, num_heads, channels,
+            num_levels, num_query, num_point, grad_value, grad_sampling_loc, grad_attn_weight);
+        break;
+      case 8:
+        ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t, 8>
+          <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0, stream>>>(
+            num_kernels, grad_col, data_value, data_spatial_shapes, data_level_start_index,
+            data_sampling_loc, data_attn_weight, batch_size, spatial_size, num_heads, channels,
+            num_levels, num_query, num_point, grad_value, grad_sampling_loc, grad_attn_weight);
+        break;
+      case 16:
+        ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t, 16>
+          <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0, stream>>>(
+            num_kernels, grad_col, data_value, data_spatial_shapes, data_level_start_index,
+            data_sampling_loc, data_attn_weight, batch_size, spatial_size, num_heads, channels,
+            num_levels, num_query, num_point, grad_value, grad_sampling_loc, grad_attn_weight);
+        break;
+      case 32:
+        ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t, 32>
+          <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0, stream>>>(
+            num_kernels, grad_col, data_value, data_spatial_shapes, data_level_start_index,
+            data_sampling_loc, data_attn_weight, batch_size, spatial_size, num_heads, channels,
+            num_levels, num_query, num_point, grad_value, grad_sampling_loc, grad_attn_weight);
+        break;
+      case 64:
+        ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2<scalar_t, 64>
+          <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0, stream>>>(
+            num_kernels, grad_col, data_value, data_spatial_shapes, data_level_start_index,
+            data_sampling_loc, data_attn_weight, batch_size, spatial_size, num_heads, channels,
+            num_levels, num_query, num_point, grad_value, grad_sampling_loc, grad_attn_weight);
+        break;
+      case 128:
+        ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2<scalar_t, 128>
+          <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0, stream>>>(
+            num_kernels, grad_col, data_value, data_spatial_shapes, data_level_start_index,
+            data_sampling_loc, data_attn_weight, batch_size, spatial_size, num_heads, channels,
+            num_levels, num_query, num_point, grad_value, grad_sampling_loc, grad_attn_weight);
+        break;
+      case 256:
+        ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2<scalar_t, 256>
+          <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0, stream>>>(
+            num_kernels, grad_col, data_value, data_spatial_shapes, data_level_start_index,
+            data_sampling_loc, data_attn_weight, batch_size, spatial_size, num_heads, channels,
+            num_levels, num_query, num_point, grad_value, grad_sampling_loc, grad_attn_weight);
+        break;
+      case 512:
+        ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2<scalar_t, 512>
+          <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0, stream>>>(
+            num_kernels, grad_col, data_value, data_spatial_shapes, data_level_start_index,
+            data_sampling_loc, data_attn_weight, batch_size, spatial_size, num_heads, channels,
+            num_levels, num_query, num_point, grad_value, grad_sampling_loc, grad_attn_weight);
+        break;
+      case 1024:
+        ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2<scalar_t, 1024>
+          <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0, stream>>>(
+            num_kernels, grad_col, data_value, data_spatial_shapes, data_level_start_index,
+            data_sampling_loc, data_attn_weight, batch_size, spatial_size, num_heads, channels,
+            num_levels, num_query, num_point, grad_value, grad_sampling_loc, grad_attn_weight);
+        break;
+      default:
+        if (channels < 64)
+        {
+          ms_deformable_col2im_gpu_kernel_shm_reduce_v1<scalar_t>
+            <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, num_threads*3*sizeof(scalar_t), stream>>>(
+              num_kernels, grad_col, data_value, data_spatial_shapes, data_level_start_index,
+              data_sampling_loc, data_attn_weight, batch_size, spatial_size, num_heads, channels,
+              num_levels, num_query, num_point, grad_value, grad_sampling_loc, grad_attn_weight);
+        }
+        else
+        {
+          ms_deformable_col2im_gpu_kernel_shm_reduce_v2<scalar_t>
+            <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, num_threads*3*sizeof(scalar_t), stream>>>(
+              num_kernels, grad_col, data_value, data_spatial_shapes, data_level_start_index,
+              data_sampling_loc, data_attn_weight, batch_size, spatial_size, num_heads, channels,
+              num_levels, num_query, num_point, grad_value, grad_sampling_loc, grad_attn_weight);
+        }
+    }
+  }
+  cudaError_t err = cudaGetLastError();
+  if (err != cudaSuccess)
+  {
+    printf("error in ms_deformable_col2im_cuda: %s\n", cudaGetErrorString(err));
+  }
+
+}
\ No newline at end of file
diff --git a/mm_agents/ops/src/ms_deform_attn.h b/mm_agents/ops/src/ms_deform_attn.h
new file mode 100644
index 0000000..2f80a1b
--- /dev/null
+++ b/mm_agents/ops/src/ms_deform_attn.h
@@ -0,0 +1,67 @@
+/*!
+**************************************************************************************************
+* Deformable DETR
+* Copyright (c) 2020 SenseTime. All Rights Reserved.
+* Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+**************************************************************************************************
+* Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
+**************************************************************************************************
+*/
+
+/*!
+* Copyright (c) Facebook, Inc. and its affiliates.
+* Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
+*/
+
+#pragma once
+
+#include "cpu/ms_deform_attn_cpu.h"
+
+#ifdef WITH_CUDA
+#include "cuda/ms_deform_attn_cuda.h"
+#endif
+
+
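+// Dispatch wrappers: tensors on the GPU are routed to the CUDA implementation
+// (when the extension was built with WITH_CUDA); no CPU kernel exists, so CPU
+// inputs raise an error.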
+at::Tensor
+ms_deform_attn_forward(
+    const at::Tensor &value,
+    const at::Tensor &spatial_shapes,
+    const at::Tensor &level_start_index,
+    const at::Tensor &sampling_loc,
+    const at::Tensor &attn_weight,
+    const int im2col_step)
+{
+    if (value.type().is_cuda())
+    {
+#ifdef WITH_CUDA
+        return ms_deform_attn_cuda_forward(
+            value, spatial_shapes, level_start_index, sampling_loc, attn_weight, im2col_step);
+#else
+        AT_ERROR("Not compiled with GPU support");
+#endif
+    }
+    AT_ERROR("Not implemented on the CPU");
+}
+
+std::vector<at::Tensor>
+ms_deform_attn_backward(
+    const at::Tensor &value,
+    const at::Tensor &spatial_shapes,
+    const at::Tensor &level_start_index,
+    const at::Tensor &sampling_loc,
+    const at::Tensor &attn_weight,
+    const at::Tensor &grad_output,
+    const int im2col_step)
+{
+    if (value.type().is_cuda())
+    {
+#ifdef WITH_CUDA
+        return ms_deform_attn_cuda_backward(
+            value, spatial_shapes, level_start_index, sampling_loc, attn_weight, grad_output, im2col_step);
+#else
+        AT_ERROR("Not compiled with GPU support");
+#endif
+    }
+    AT_ERROR("Not implemented on the CPU");
+}
+
diff --git a/mm_agents/ops/src/vision.cpp b/mm_agents/ops/src/vision.cpp
new file mode 100644
index 0000000..4a08821
--- /dev/null
+++ b/mm_agents/ops/src/vision.cpp
@@ -0,0 +1,21 @@
+/*!
+**************************************************************************************************
+* Deformable DETR
+* Copyright (c) 2020 SenseTime. All Rights Reserved.
+* Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+**************************************************************************************************
+* Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
+**************************************************************************************************
+*/
+
+/*!
+* Copyright (c) Facebook, Inc. and its affiliates.
+* Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
+*/
+
+#include "ms_deform_attn.h"
+
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
+  m.def("ms_deform_attn_forward", &ms_deform_attn_forward, "ms_deform_attn_forward");
+  m.def("ms_deform_attn_backward", &ms_deform_attn_backward, "ms_deform_attn_backward");
+}
diff --git a/mm_agents/ops/test.py b/mm_agents/ops/test.py
new file mode 100644
index 0000000..6e1b545
--- /dev/null
+++ b/mm_agents/ops/test.py
@@ -0,0 +1,92 @@
+# ------------------------------------------------------------------------------------------------
+# Deformable DETR
+# Copyright (c) 2020 SenseTime. All Rights Reserved.
+# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+# ------------------------------------------------------------------------------------------------
+# Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
+# ------------------------------------------------------------------------------------------------
+
+# Copyright (c) Facebook, Inc. and its affiliates.
+# Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
+
+from __future__ import absolute_import
+from __future__ import print_function
+from __future__ import division
+
+import time
+import torch
+import torch.nn as nn
+from torch.autograd import gradcheck
+
+from functions.ms_deform_attn_func import MSDeformAttnFunction, ms_deform_attn_core_pytorch
+
+
+N, M, D = 1, 2, 2
+Lq, L, P = 2, 2, 2
+shapes = torch.as_tensor([(6, 4), (3, 2)], dtype=torch.long).cuda()
+level_start_index = torch.cat((shapes.new_zeros((1, )), shapes.prod(1).cumsum(0)[:-1]))
+S = sum([(H*W).item() for H, W in shapes])
+
+
+torch.manual_seed(3)
+
+
+@torch.no_grad()
+def check_forward_equal_with_pytorch_double():
+    value = torch.rand(N, S, M, D).cuda() * 0.01
+    sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda()
+    attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5
+    attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True)
+    im2col_step = 2
+    output_pytorch = ms_deform_attn_core_pytorch(value.double(), shapes, sampling_locations.double(), attention_weights.double()).detach().cpu()
+    output_cuda = MSDeformAttnFunction.apply(value.double(), shapes, level_start_index, sampling_locations.double(), attention_weights.double(), im2col_step).detach().cpu()
+    fwdok = torch.allclose(output_cuda, output_pytorch)
+    max_abs_err = (output_cuda - output_pytorch).abs().max()
+    max_rel_err = ((output_cuda - output_pytorch).abs() / output_pytorch.abs()).max()
+
+    print(f'* {fwdok} check_forward_equal_with_pytorch_double: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}')
+
+
+@torch.no_grad()
+def check_forward_equal_with_pytorch_float():
+    value = torch.rand(N, S, M, D).cuda() * 0.01
+    sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda()
+    attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5
+    attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True)
+    im2col_step = 2
+    output_pytorch = ms_deform_attn_core_pytorch(value, shapes, sampling_locations, attention_weights).detach().cpu()
+    output_cuda = MSDeformAttnFunction.apply(value, shapes, level_start_index, sampling_locations, attention_weights, im2col_step).detach().cpu()
+    fwdok = torch.allclose(output_cuda, output_pytorch, rtol=1e-2, atol=1e-3)
+    max_abs_err = (output_cuda - output_pytorch).abs().max()
+    max_rel_err = ((output_cuda - output_pytorch).abs() / output_pytorch.abs()).max()
+
+    print(f'* {fwdok} check_forward_equal_with_pytorch_float: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}')
+
+
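+# gradcheck compares the analytic gradients of the custom CUDA op against
+# numerical finite differences in double precision, exercising grad_value,
+# grad_sampling_loc and grad_attn_weight in a single call.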
+def check_gradient_numerical(channels=4, grad_value=True, grad_sampling_loc=True, grad_attn_weight=True):
+    value = torch.rand(N, S, M, channels).cuda() * 0.01
+    sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda()
+    attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5
+    attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True)
+    im2col_step = 2
+    func = MSDeformAttnFunction.apply
+
+    value.requires_grad = grad_value
+    sampling_locations.requires_grad = grad_sampling_loc
+    attention_weights.requires_grad = grad_attn_weight
+
+    gradok = gradcheck(func, (value.double(), shapes, level_start_index, sampling_locations.double(), attention_weights.double(), im2col_step))
+
+    print(f'* {gradok} check_gradient_numerical(D={channels})')
+
+
+if __name__ == '__main__':
+    check_forward_equal_with_pytorch_double()
+    check_forward_equal_with_pytorch_float()
+
+    for channels in [30, 32, 64, 71, 1025, 2048, 3096]:
+        check_gradient_numerical(channels, True, True, True)
+
+
diff --git a/mm_agents/stackoverflow.png b/mm_agents/stackoverflow.png
deleted file mode 100644
index 420ba20..0000000
Binary files a/mm_agents/stackoverflow.png and /dev/null differ