update SoM_agent

2023-12-31 19:13:17 +08:00
parent f04e625ad9
commit 7560f4dc46
19 changed files with 3729 additions and 49 deletions
--- a/mm_agents/task_adapter/seem/init.py
+++ b/mm_agents/task_adapter/seem/init.py
--- a/mm_agents/task_adapter/seem/tasks/init.py
+++ b/mm_agents/task_adapter/seem/tasks/init.py
@@ -0,0 +1,3 @@
+from .interactive_seem_m2m_auto import *
+from .inference_seem_pano import *
+from .inference_seem_interactive import *
--- a/mm_agents/task_adapter/seem/tasks/automatic_mask_generator.py
+++ b/mm_agents/task_adapter/seem/tasks/automatic_mask_generator.py
@@ -0,0 +1,382 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+import numpy as np
+import torch
+import torch.nn as nn
+from torchvision.ops.boxes import batched_nms, box_area  # type: ignore
+
+from typing import Any, Dict, List, Optional, Tuple
+
+from segment_anything.modeling import Sam
+from segment_anything.utils.amg import (
+    MaskData,
+    area_from_rle,
+    batch_iterator,
+    batched_mask_to_box,
+    box_xyxy_to_xywh,
+    build_all_layer_point_grids,
+    calculate_stability_score,
+    coco_encode_rle,
+    generate_crop_boxes,
+    is_box_near_crop_edge,
+    mask_to_rle_pytorch,
+    remove_small_regions,
+    rle_to_mask,
+    uncrop_boxes_xyxy,
+    uncrop_masks,
+    uncrop_points,
+)
+
+
+class SeemAutomaticMaskGenerator:
+    def __init__(
+        self,
+        model: Sam,
+        points_per_side: Optional[int] = 32,
+        points_per_batch: int = 64,
+        pred_iou_thresh: float = 0.9,
+        stability_score_thresh: float = 0.5,
+        stability_score_offset: float = 1.0,
+        box_nms_thresh: float = 0.7,
+        crop_n_layers: int = 0,
+        crop_nms_thresh: float = 0.7,
+        crop_overlap_ratio: float = 512 / 1500,
+        crop_n_points_downscale_factor: int = 1,
+        point_grids: Optional[List[np.ndarray]] = None,
+        min_mask_region_area: int = 0,
+        output_mode: str = "binary_mask",
+    ) -> None:
+        """
+        Using a SAM model, generates masks for the entire image.
+        Generates a grid of point prompts over the image, then filters
+        low quality and duplicate masks. The default settings are chosen
+        for SAM with a ViT-H backbone.
+
+        Arguments:
+          model (Sam): The SAM model to use for mask prediction.
+          points_per_side (int or None): The number of points to be sampled
+            along one side of the image. The total number of points is
+            points_per_side**2. If None, 'point_grids' must provide explicit
+            point sampling.
+          points_per_batch (int): Sets the number of points run simultaneously
+            by the model. Higher numbers may be faster but use more GPU memory.
+          pred_iou_thresh (float): A filtering threshold in [0,1], using the
+            model's predicted mask quality.
+          stability_score_thresh (float): A filtering threshold in [0,1], using
+            the stability of the mask under changes to the cutoff used to binarize
+            the model's mask predictions.
+          stability_score_offset (float): The amount to shift the cutoff when
+            calculated the stability score.
+          box_nms_thresh (float): The box IoU cutoff used by non-maximal
+            suppression to filter duplicate masks.
+          crop_n_layers (int): If >0, mask prediction will be run again on
+            crops of the image. Sets the number of layers to run, where each
+            layer has 2**i_layer number of image crops.
+          crop_nms_thresh (float): The box IoU cutoff used by non-maximal
+            suppression to filter duplicate masks between different crops.
+          crop_overlap_ratio (float): Sets the degree to which crops overlap.
+            In the first crop layer, crops will overlap by this fraction of
+            the image length. Later layers with more crops scale down this overlap.
+          crop_n_points_downscale_factor (int): The number of points-per-side
+            sampled in layer n is scaled down by crop_n_points_downscale_factor**n.
+          point_grids (list(np.ndarray) or None): A list over explicit grids
+            of points used for sampling, normalized to [0,1]. The nth grid in the
+            list is used in the nth crop layer. Exclusive with points_per_side.
+          min_mask_region_area (int): If >0, postprocessing will be applied
+            to remove disconnected regions and holes in masks with area smaller
+            than min_mask_region_area. Requires opencv.
+          output_mode (str): The form masks are returned in. Can be 'binary_mask',
+            'uncompressed_rle', or 'coco_rle'. 'coco_rle' requires pycocotools.
+            For large resolutions, 'binary_mask' may consume large amounts of
+            memory.
+        """
+
+        assert (points_per_side is None) != (
+            point_grids is None
+        ), "Exactly one of points_per_side or point_grid must be provided."
+        if points_per_side is not None:
+            self.point_grids = build_all_layer_point_grids(
+                points_per_side,
+                crop_n_layers,
+                crop_n_points_downscale_factor,
+            )
+        elif point_grids is not None:
+            self.point_grids = point_grids
+        else:
+            raise ValueError("Can't have both points_per_side and point_grid be None.")
+
+        assert output_mode in [
+            "binary_mask",
+            "uncompressed_rle",
+            "coco_rle",
+        ], f"Unknown output_mode {output_mode}."
+        if output_mode == "coco_rle":
+            from pycocotools import mask as mask_utils  # type: ignore # noqa: F401
+
+        if min_mask_region_area > 0:
+            import cv2  # type: ignore # noqa: F401
+
+        self.predictor = model
+        self.points_per_batch = points_per_batch
+        self.pred_iou_thresh = pred_iou_thresh
+        self.stability_score_thresh = stability_score_thresh
+        self.stability_score_offset = stability_score_offset
+        self.box_nms_thresh = box_nms_thresh
+        self.crop_n_layers = crop_n_layers
+        self.crop_nms_thresh = crop_nms_thresh
+        self.crop_overlap_ratio = crop_overlap_ratio
+        self.crop_n_points_downscale_factor = crop_n_points_downscale_factor
+        self.min_mask_region_area = min_mask_region_area
+        self.output_mode = output_mode
+
+        # dilate conv
+        self.dilation = nn.Conv2d(in_channels=1, out_channels=1, kernel_size=7, stride=1, padding=3, bias=False)
+        self.dilation.weight.data.fill_(1.0)
+        self.dilation.cuda()
+
+    @torch.no_grad()
+    def generate(self, image: np.ndarray) -> List[Dict[str, Any]]:
+        """
+        Generates masks for the given image.
+
+        Arguments:
+          image (np.ndarray): The image to generate masks for, in HWC uint8 format.
+
+        Returns:
+           list(dict(str, any)): A list over records for masks. Each record is
+             a dict containing the following keys:
+               segmentation (dict(str, any) or np.ndarray): The mask. If
+                 output_mode='binary_mask', is an array of shape HW. Otherwise,
+                 is a dictionary containing the RLE.
+               bbox (list(float)): The box around the mask, in XYWH format.
+               area (int): The area in pixels of the mask.
+               predicted_iou (float): The model's own prediction of the mask's
+                 quality. This is filtered by the pred_iou_thresh parameter.
+               point_coords (list(list(float))): The point coordinates input
+                 to the model to generate this mask.
+               stability_score (float): A measure of the mask's quality. This
+                 is filtered on using the stability_score_thresh parameter.
+               crop_box (list(float)): The crop of the image used to generate
+                 the mask, given in XYWH format.
+        """
+
+        # Generate masks
+        mask_data = self._generate_masks(image)
+
+        # Filter small disconnected regions and holes in masks
+        if self.min_mask_region_area > 0:
+            mask_data = self.postprocess_small_regions(
+                mask_data,
+                self.min_mask_region_area,
+                max(self.box_nms_thresh, self.crop_nms_thresh),
+            )
+        # Encode masks
+        if self.output_mode == "coco_rle":
+            mask_data["segmentations"] = [coco_encode_rle(rle) for rle in mask_data["rles"]]
+        elif self.output_mode == "binary_mask":
+            mask_data["segmentations"] = [rle_to_mask(rle) for rle in mask_data["rles"]]
+        else:
+            mask_data["segmentations"] = mask_data["rles"]
+
+        # Write mask records
+        curr_anns = []
+        for idx in range(len(mask_data["segmentations"])):
+            ann = {
+                "segmentation": mask_data["segmentations"][idx],
+                "area": area_from_rle(mask_data["rles"][idx]),
+                "bbox": box_xyxy_to_xywh(mask_data["boxes"][idx]).tolist(),
+                "predicted_iou": mask_data["iou_preds"][idx].item(),
+                "point_coords": [mask_data["points"][idx].tolist()],
+                "stability_score": mask_data["stability_score"][idx].item(),
+                "crop_box": box_xyxy_to_xywh(mask_data["crop_boxes"][idx]).tolist(),
+            }
+            curr_anns.append(ann)
+
+        return curr_anns
+
+    def _generate_masks(self, image: np.ndarray) -> MaskData:
+        orig_size = image.shape[-2:]
+        crop_boxes, layer_idxs = generate_crop_boxes(
+            orig_size, self.crop_n_layers, self.crop_overlap_ratio
+        )
+
+        # Iterate over image crops
+        data = MaskData()
+        for crop_box, layer_idx in zip(crop_boxes, layer_idxs):
+            crop_data = self._process_crop(image, crop_box, layer_idx, orig_size)
+            data.cat(crop_data)
+
+        # Remove duplicate masks between crops
+        if len(crop_boxes) > 1:
+            # Prefer masks from smaller crops
+            scores = 1 / box_area(data["crop_boxes"])
+            scores = scores.to(data["boxes"].device)
+            keep_by_nms = batched_nms(
+                data["boxes"].float(),
+                scores,
+                torch.zeros_like(data["boxes"][:, 0]),  # categories
+                iou_threshold=self.crop_nms_thresh,
+            )
+            data.filter(keep_by_nms)
+
+        data.to_numpy()
+        return data
+
+    def _process_crop(
+        self,
+        image: np.ndarray,
+        crop_box: List[int],
+        crop_layer_idx: int,
+        orig_size: Tuple[int, ...],
+    ) -> MaskData:
+        # Crop the image and calculate embeddings
+        x0, y0, x1, y1 = crop_box
+        cropped_im = image#[y0:y1, x0:x1, :]
+        cropped_im_size = cropped_im.shape[-2:]
+        # self.predictor.set_image(cropped_im)
+
+        # Get points for this crop
+        points_scale = np.array(cropped_im_size)[None, ::-1]
+        points_for_image = self.point_grids[crop_layer_idx] #* points_scale
+
+        # Generate masks for this crop in batches
+        data = MaskData()
+        self.enc_features=None
+
+        for (points,) in batch_iterator(self.points_per_batch, points_for_image):
+            batch_data = self._process_batch(cropped_im, points, cropped_im_size, crop_box, orig_size)
+            data.cat(batch_data)
+            del batch_data
+
+        # Remove duplicates within this crop.
+        keep_by_nms = batched_nms(
+            data["boxes"].float(),
+            data["iou_preds"],
+            torch.zeros(len(data["boxes"])),  # categories
+            iou_threshold=self.box_nms_thresh,
+        )
+
+        data.filter(keep_by_nms)
+
+        # Return to the original image frame
+        data["boxes"] = uncrop_boxes_xyxy(data["boxes"], crop_box)
+        data["crop_boxes"] = torch.tensor([crop_box for _ in range(len(data["rles"]))])
+
+        return data
+
+    def _process_batch(
+        self,
+        images,
+        points: np.ndarray,
+        im_size: Tuple[int, ...],
+        crop_box: List[int],
+        orig_size: Tuple[int, ...],
+    ) -> MaskData:
+        orig_h, orig_w = orig_size
+
+        data = {"image": images, "height": orig_h, "width": orig_w}
+        points = torch.tensor(points,dtype=torch.float).to(images.device)
+        
+        # prepare interactive mask for seem
+        abs_points = (points * torch.tensor(orig_size)[None,:].to(points.device)).long()
+        abs_masks = torch.zeros((len(points), orig_h, orig_w), dtype=torch.bool).to(device=points.device)
+        abs_masks[torch.arange(0, abs_points.size(0))[:,None], abs_points[:,0:1], abs_points[:,1:2]] = True
+        abs_masks = self.dilation(abs_masks[:,None].float())[:,0] > 0
+        data['spatial_query'] = {'rand_shape': abs_masks[:,None]}
+
+        batch_inputs = [data]
+        if self.enc_features is None:
+            masks, iou_preds, mask_features, transformer_encoder_features, multi_scale_features = self.predictor.model.evaluate_demo(batch_inputs, None, None, return_features=True)
+            self.enc_features = (mask_features, transformer_encoder_features, multi_scale_features)
+        else:
+            masks, iou_preds = self.predictor.model.evaluate_demo(batch_inputs, self.enc_features[0], self.enc_features[1], self.enc_features[2])
+
+        data = MaskData(
+            masks=masks,
+            iou_preds=iou_preds,
+            points=points,
+        )
+        del masks
+        # Filter by predicted IoU
+        if self.pred_iou_thresh > 0.0:
+            keep_mask = data["iou_preds"] > self.pred_iou_thresh
+            data.filter(keep_mask)
+
+        # Calculate stability score
+        data["stability_score"] = calculate_stability_score(
+            data["masks"], 0.0, self.stability_score_offset
+        )
+        if self.stability_score_thresh > 0.0:
+            keep_mask = data["stability_score"] >= self.stability_score_thresh
+            data.filter(keep_mask)
+
+        # Threshold masks and calculate boxes
+        data["masks"] = data["masks"] > 0.0
+        data["boxes"] = batched_mask_to_box(data["masks"])
+
+        # Filter boxes that touch crop boundaries
+        keep_mask = ~is_box_near_crop_edge(data["boxes"], crop_box, [0, 0, orig_w, orig_h])
+        if not torch.all(keep_mask):
+            data.filter(keep_mask)
+
+        # Compress to RLE
+        data["masks"] = uncrop_masks(data["masks"], crop_box, orig_h, orig_w)
+        data["rles"] = mask_to_rle_pytorch(data["masks"])
+        del data["masks"]
+
+        return data
+
+    @staticmethod
+    def postprocess_small_regions(
+        mask_data: MaskData, min_area: int, nms_thresh: float
+    ) -> MaskData:
+        """
+        Removes small disconnected regions and holes in masks, then reruns
+        box NMS to remove any new duplicates.
+
+        Edits mask_data in place.
+
+        Requires open-cv as a dependency.
+        """
+        if len(mask_data["rles"]) == 0:
+            return mask_data
+
+        # Filter small disconnected regions and holes
+        new_masks = []
+        scores = []
+        for rle in mask_data["rles"]:
+            mask = rle_to_mask(rle)
+
+            mask, changed = remove_small_regions(mask, min_area, mode="holes")
+            unchanged = not changed
+            mask, changed = remove_small_regions(mask, min_area, mode="islands")
+            unchanged = unchanged and not changed
+
+            new_masks.append(torch.as_tensor(mask).unsqueeze(0))
+            # Give score=0 to changed masks and score=1 to unchanged masks
+            # so NMS will prefer ones that didn't need postprocessing
+            scores.append(float(unchanged))
+
+        # Recalculate boxes and remove any new duplicates
+        masks = torch.cat(new_masks, dim=0)
+        boxes = batched_mask_to_box(masks)
+        keep_by_nms = batched_nms(
+            boxes.float(),
+            torch.as_tensor(scores),
+            torch.zeros_like(boxes[:, 0]),  # categories
+            iou_threshold=nms_thresh,
+        )
+
+        # Only recalculate RLEs for masks that have changed
+        for i_mask in keep_by_nms:
+            if scores[i_mask] == 0.0:
+                mask_torch = masks[i_mask].unsqueeze(0)
+                mask_data["rles"][i_mask] = mask_to_rle_pytorch(mask_torch)[0]
+                mask_data["boxes"][i_mask] = boxes[i_mask]  # update res directly
+        mask_data.filter(keep_by_nms)
+
+        return mask_data
--- a/mm_agents/task_adapter/seem/tasks/inference_seem_interactive.py
+++ b/mm_agents/task_adapter/seem/tasks/inference_seem_interactive.py
@@ -0,0 +1,169 @@
+# --------------------------------------------------------
+# Semantic-SAM: Segment and Recognize Anything at Any Granularity
+# Copyright (c) 2023 Microsoft
+# Licensed under The MIT License [see LICENSE for details]
+# Written by Hao Zhang (hzhangcx@connect.ust.hk)
+# --------------------------------------------------------
+
+import torch
+import torch.nn.functional as F
+import numpy as np
+from torchvision import transforms
+from task_adapter.utils.visualizer import Visualizer
+from typing import Tuple
+from PIL import Image
+from detectron2.data import MetadataCatalog
+import matplotlib.pyplot as plt
+import cv2
+import io
+from .automatic_mask_generator import SeemAutomaticMaskGenerator
+# Catalog entry for 'coco_2017_train_panoptic'; assigned to model.model.metadata
+# and passed to the Visualizer in inference_seem_interactive below.
+metadata = MetadataCatalog.get('coco_2017_train_panoptic')
+
+from segment_anything.utils.amg import (
+    MaskData,
+    area_from_rle,
+    batch_iterator,
+    batched_mask_to_box,
+    box_xyxy_to_xywh,
+    build_all_layer_point_grids,
+    calculate_stability_score,
+    coco_encode_rle,
+    generate_crop_boxes,
+    is_box_near_crop_edge,
+    mask_to_rle_pytorch,
+    remove_small_regions,
+    rle_to_mask,
+    uncrop_boxes_xyxy,
+    uncrop_masks,
+    uncrop_points,
+)
+
+
+def inference_seem_interactive(model, image, spatial_masks, text_size, label_mode='1', alpha=0.1, anno_mode=['Mask']):
+    t = []
+    t.append(transforms.Resize(int(text_size), interpolation=Image.BICUBIC))
+    transform1 = transforms.Compose(t)
+    image_ori = transform1(image)
+
+    image_ori = np.asarray(image_ori)
+    images = torch.from_numpy(image_ori.copy()).permute(2,0,1).cuda()
+
+    orig_size = images.shape[-2:]
+    orig_h, orig_w = orig_size
+    crop_box = [0,0,orig_w,orig_h]
+
+    data = {"image": images, "height": orig_h, "width": orig_w}
+
+    spatial_masks = spatial_masks[:, None].float().cuda()
+    spatial_masks = F.interpolate(spatial_masks, size=(orig_h, orig_w), mode='bicubic', align_corners=False) > 0
+    data['spatial_query'] = {'rand_shape': spatial_masks}
+
+    model.model.metadata = metadata
+    masks, _ = model.model.evaluate_demo([data])
+    masks = masks > 0.0
+    iou_preds = torch.ones(masks.shape[0], dtype=torch.float32)
+    points = torch.zeros((masks.shape[0], 2), dtype=torch.float32)
+
+    mask_data = MaskData(
+        masks=masks,
+        iou_preds=iou_preds,
+        points=points,
+    )
+
+    mask_data["stability_score"] = torch.ones(masks.shape[0], dtype=torch.float32)
+    del masks
+
+    mask_data["boxes"] = batched_mask_to_box(mask_data["masks"])
+    mask_data["crop_boxes"] = torch.tensor([crop_box for _ in range(len(mask_data["boxes"]))])
+
+    # Compress to RLE
+    mask_data["masks"] = uncrop_masks(mask_data["masks"], crop_box, orig_h, orig_w)
+    mask_data["rles"] = mask_to_rle_pytorch(mask_data["masks"])
+    del mask_data["masks"]
+    mask_data["segmentations"] = [rle_to_mask(rle) for rle in mask_data["rles"]]
+
+    # Write mask records
+    outputs = []
+    for idx in range(len(mask_data["segmentations"])):
+        ann = {
+            "segmentation": mask_data["segmentations"][idx],
+            "area": area_from_rle(mask_data["rles"][idx]),
+            "bbox": box_xyxy_to_xywh(mask_data["boxes"][idx]).tolist(),
+            "predicted_iou": mask_data["iou_preds"][idx].item(),
+            "point_coords": [mask_data["points"][idx].tolist()],
+            "stability_score": mask_data["stability_score"][idx].item(),
+            "crop_box": box_xyxy_to_xywh(mask_data["crop_boxes"][idx]).tolist(),
+        }
+        outputs.append(ann)
+
+    from task_adapter.utils.visualizer import Visualizer
+    visual = Visualizer(image_ori, metadata=metadata)
+    sorted_anns = sorted(outputs, key=(lambda x: x['area']), reverse=True)
+    label = 1
+    # for ann in sorted_anns:
+    #     mask = ann['segmentation']
+    #     color_mask = np.random.random((1, 3)).tolist()[0]
+    #     # color_mask = [int(c*255) for c in color_mask]
+    #     demo = visual.draw_binary_mask_with_number(mask, text=str(label), label_mode=label_mode, alpha=alpha, anno_mode=anno_mode)
+    #     label += 1
+    # im = demo.get_image()
+
+    mask_map = np.zeros(image_ori.shape, dtype=np.uint8)    
+    for i, ann in enumerate(sorted_anns):
+        mask = ann['segmentation']
+        color_mask = np.random.random((1, 3)).tolist()[0]
+        # color_mask = [int(c*255) for c in color_mask]
+        demo = visual.draw_binary_mask_with_number(mask, text=str(label), label_mode=label_mode, alpha=alpha, anno_mode=anno_mode)
+        # assign the mask to the mask_map
+        mask_map[mask == 1] = label
+        label += 1
+    im = demo.get_image()
+    # fig=plt.figure(figsize=(10, 10))
+    # plt.imshow(image_ori)
+    # show_anns(outputs)
+    # fig.canvas.draw()
+    # im=Image.frombytes('RGB', fig.canvas.get_width_height(), fig.canvas.tostring_rgb())
+    return im, sorted_anns
+
+
+def remove_small_regions(
+    mask: np.ndarray, area_thresh: float, mode: str
+) -> Tuple[np.ndarray, bool]:
+    """
+    Removes small disconnected regions and holes in a mask. Returns the
+    mask and an indicator of if the mask has been modified.
+    """
+    import cv2  # type: ignore
+
+    assert mode in ["holes", "islands"]
+    correct_holes = mode == "holes"
+    working_mask = (correct_holes ^ mask).astype(np.uint8)
+    n_labels, regions, stats, _ = cv2.connectedComponentsWithStats(working_mask, 8)
+    sizes = stats[:, -1][1:]  # Row 0 is background label
+    small_regions = [i + 1 for i, s in enumerate(sizes) if s < area_thresh]
+    if len(small_regions) == 0:
+        return mask, False
+    fill_labels = [0] + small_regions
+    if not correct_holes:
+        fill_labels = [i for i in range(n_labels) if i not in fill_labels]
+        # If every region is below threshold, keep largest
+        if len(fill_labels) == 0:
+            fill_labels = [int(np.argmax(sizes)) + 1]
+    mask = np.isin(regions, fill_labels)
+    return mask, True
+
+def show_anns(anns):
+    if len(anns) == 0:
+        return
+    sorted_anns = sorted(anns, key=(lambda x: x['area']), reverse=True)
+    ax = plt.gca()
+    ax.set_autoscale_on(False)
+    polygons = []
+    color = []
+    for ann in sorted_anns:
+        m = ann['segmentation']
+        img = np.ones((m.shape[0], m.shape[1], 3))
+        color_mask = np.random.random((1, 3)).tolist()[0]
+        for i in range(3):
+            img[:,:,i] = color_mask[i]
+        ax.imshow(np.dstack((img, m*0.35)))
--- a/mm_agents/task_adapter/seem/tasks/inference_seem_pano.py
+++ b/mm_agents/task_adapter/seem/tasks/inference_seem_pano.py
@@ -0,0 +1,164 @@
+# --------------------------------------------------------
+# Semantic-SAM: Segment and Recognize Anything at Any Granularity
+# Copyright (c) 2023 Microsoft
+# Licensed under The MIT License [see LICENSE for details]
+# Written by Hao Zhang (hzhangcx@connect.ust.hk)
+# --------------------------------------------------------
+
+import torch
+import numpy as np
+from torchvision import transforms
+from task_adapter.utils.visualizer import Visualizer
+from typing import Tuple
+from PIL import Image
+from detectron2.data import MetadataCatalog
+import matplotlib.pyplot as plt
+import cv2
+import io
+from .automatic_mask_generator import SeemAutomaticMaskGenerator
+# Catalog entry for 'coco_2017_train_panoptic'; assigned to model.model.metadata
+# and passed to the Visualizer in inference_seem_pano below.
+metadata = MetadataCatalog.get('coco_2017_train_panoptic')
+
+from segment_anything.utils.amg import (
+    MaskData,
+    area_from_rle,
+    batch_iterator,
+    batched_mask_to_box,
+    box_xyxy_to_xywh,
+    build_all_layer_point_grids,
+    calculate_stability_score,
+    coco_encode_rle,
+    generate_crop_boxes,
+    is_box_near_crop_edge,
+    mask_to_rle_pytorch,
+    remove_small_regions,
+    rle_to_mask,
+    uncrop_boxes_xyxy,
+    uncrop_masks,
+    uncrop_points,
+)
+
+
+def inference_seem_pano(model, image, text_size, label_mode='1', alpha=0.1, anno_mode=['Mask']):
+    t = []
+    t.append(transforms.Resize(int(text_size), interpolation=Image.BICUBIC))
+    transform1 = transforms.Compose(t)
+    image_ori = transform1(image)
+
+    image_ori = np.asarray(image_ori)
+    images = torch.from_numpy(image_ori.copy()).permute(2,0,1).cuda()
+
+    orig_size = images.shape[-2:]
+    orig_h, orig_w = orig_size
+    crop_box = [0,0,orig_w,orig_h]
+
+    data = {"image": images, "height": orig_h, "width": orig_w}
+    batch_inputs = [data]
+
+    model.model.metadata = metadata
+    outputs = model.model.evaluate(batch_inputs)
+
+    pano_mask = outputs[0]['panoptic_seg'][0]
+    pano_info = outputs[0]['panoptic_seg'][1]
+
+    masks = []
+    for seg_info in pano_info:
+        masks += [pano_mask == seg_info['id']]
+    masks = torch.stack(masks, dim=0)
+    iou_preds = torch.ones(masks.shape[0], dtype=torch.float32)
+    points = torch.zeros((masks.shape[0], 2), dtype=torch.float32)
+
+    mask_data = MaskData(
+        masks=masks,
+        iou_preds=iou_preds,
+        points=points,
+    )
+    mask_data["stability_score"] = torch.ones(masks.shape[0], dtype=torch.float32)
+    del masks
+
+    mask_data["boxes"] = batched_mask_to_box(mask_data["masks"])
+    mask_data["crop_boxes"] = torch.tensor([crop_box for _ in range(len(mask_data["boxes"]))])
+
+    # Compress to RLE
+    mask_data["masks"] = uncrop_masks(mask_data["masks"], crop_box, orig_h, orig_w)
+    mask_data["rles"] = mask_to_rle_pytorch(mask_data["masks"])
+    del mask_data["masks"]
+    mask_data["segmentations"] = [rle_to_mask(rle) for rle in mask_data["rles"]]
+
+    # Write mask records
+    outputs = []
+    for idx in range(len(mask_data["segmentations"])):
+        ann = {
+            "segmentation": mask_data["segmentations"][idx],
+            "area": area_from_rle(mask_data["rles"][idx]),
+            "bbox": box_xyxy_to_xywh(mask_data["boxes"][idx]).tolist(),
+            "predicted_iou": mask_data["iou_preds"][idx].item(),
+            "point_coords": [mask_data["points"][idx].tolist()],
+            "stability_score": mask_data["stability_score"][idx].item(),
+            "crop_box": box_xyxy_to_xywh(mask_data["crop_boxes"][idx]).tolist(),
+        }
+        outputs.append(ann)
+
+    from task_adapter.utils.visualizer import Visualizer
+    visual = Visualizer(image_ori, metadata=metadata)
+    # create a full zero image as the image_orig
+    sorted_anns = sorted(outputs, key=(lambda x: x['area']), reverse=True)
+    label = 1
+    mask_map = np.zeros(image_ori.shape, dtype=np.uint8)    
+    for i, ann in enumerate(sorted_anns):
+        mask = ann['segmentation']
+        color_mask = np.random.random((1, 3)).tolist()[0]
+        # color_mask = [int(c*255) for c in color_mask]
+        demo = visual.draw_binary_mask_with_number(mask, text=str(label), label_mode=label_mode, alpha=alpha, anno_mode=anno_mode)
+        # assign the mask to the mask_map
+        mask_map[mask == 1] = label
+        label += 1
+    im = demo.get_image()
+    # fig=plt.figure(figsize=(10, 10))
+    # plt.imshow(image_ori)
+    # show_anns(outputs)
+    # fig.canvas.draw()
+    # im=Image.frombytes('RGB', fig.canvas.get_width_height(), fig.canvas.tostring_rgb())
+    return im, sorted_anns
+
+
+def remove_small_regions(
+    mask: np.ndarray, area_thresh: float, mode: str
+) -> Tuple[np.ndarray, bool]:
+    """
+    Removes small disconnected regions and holes in a mask. Returns the
+    mask and an indicator of if the mask has been modified.
+    """
+    import cv2  # type: ignore
+
+    assert mode in ["holes", "islands"]
+    correct_holes = mode == "holes"
+    working_mask = (correct_holes ^ mask).astype(np.uint8)
+    n_labels, regions, stats, _ = cv2.connectedComponentsWithStats(working_mask, 8)
+    sizes = stats[:, -1][1:]  # Row 0 is background label
+    small_regions = [i + 1 for i, s in enumerate(sizes) if s < area_thresh]
+    if len(small_regions) == 0:
+        return mask, False
+    fill_labels = [0] + small_regions
+    if not correct_holes:
+        fill_labels = [i for i in range(n_labels) if i not in fill_labels]
+        # If every region is below threshold, keep largest
+        if len(fill_labels) == 0:
+            fill_labels = [int(np.argmax(sizes)) + 1]
+    mask = np.isin(regions, fill_labels)
+    return mask, True
+
+def show_anns(anns):
+    if len(anns) == 0:
+        return
+    sorted_anns = sorted(anns, key=(lambda x: x['area']), reverse=True)
+    ax = plt.gca()
+    ax.set_autoscale_on(False)
+    polygons = []
+    color = []
+    for ann in sorted_anns:
+        m = ann['segmentation']
+        img = np.ones((m.shape[0], m.shape[1], 3))
+        color_mask = np.random.random((1, 3)).tolist()[0]
+        for i in range(3):
+            img[:,:,i] = color_mask[i]
+        ax.imshow(np.dstack((img, m*0.35)))
--- a/mm_agents/task_adapter/seem/tasks/interactive_seem_m2m_auto.py
+++ b/mm_agents/task_adapter/seem/tasks/interactive_seem_m2m_auto.py
@@ -0,0 +1,93 @@
+# --------------------------------------------------------
+# Semantic-SAM: Segment and Recognize Anything at Any Granularity
+# Copyright (c) 2023 Microsoft
+# Licensed under The MIT License [see LICENSE for details]
+# Written by Hao Zhang (hzhangcx@connect.ust.hk)
+# --------------------------------------------------------
+
+import torch
+import numpy as np
+from torchvision import transforms
+from task_adapter.utils.visualizer import Visualizer
+from typing import Tuple
+from PIL import Image
+from detectron2.data import MetadataCatalog
+import matplotlib.pyplot as plt
+import cv2
+import io
+from .automatic_mask_generator import SeemAutomaticMaskGenerator
+# Catalog entry for 'coco_2017_train_panoptic'; passed to the Visualizer in
+# interactive_seem_m2m_auto below.
+metadata = MetadataCatalog.get('coco_2017_train_panoptic')
+
+def interactive_seem_m2m_auto(model, image, text_size, label_mode='1', alpha=0.1, anno_mode=['Mask']):
+    t = []
+    t.append(transforms.Resize(int(text_size), interpolation=Image.BICUBIC))
+    transform1 = transforms.Compose(t)
+    image_ori = transform1(image)
+
+    image_ori = np.asarray(image_ori)
+    images = torch.from_numpy(image_ori.copy()).permute(2,0,1).cuda()
+
+    mask_generator = SeemAutomaticMaskGenerator(model)
+    outputs = mask_generator.generate(images)
+
+    from task_adapter.utils.visualizer import Visualizer
+    visual = Visualizer(image_ori, metadata=metadata)
+    sorted_anns = sorted(outputs, key=(lambda x: x['area']), reverse=True)
+    label = 1
+    for ann in sorted_anns:
+        mask = ann['segmentation']
+        color_mask = np.random.random((1, 3)).tolist()[0]
+        # color_mask = [int(c*255) for c in color_mask]
+        demo = visual.draw_binary_mask_with_number(mask, text=str(label), label_mode=label_mode, alpha=alpha, anno_mode=anno_mode)
+        label += 1
+    im = demo.get_image()
+
+    # fig=plt.figure(figsize=(10, 10))
+    # plt.imshow(image_ori)
+    # show_anns(outputs)
+    # fig.canvas.draw()
+    # im=Image.frombytes('RGB', fig.canvas.get_width_height(), fig.canvas.tostring_rgb())
+    return im
+
+
+def remove_small_regions(
+    mask: np.ndarray, area_thresh: float, mode: str
+) -> Tuple[np.ndarray, bool]:
+    """
+    Removes small disconnected regions and holes in a mask. Returns the
+    mask and an indicator of if the mask has been modified.
+    """
+    import cv2  # type: ignore
+
+    assert mode in ["holes", "islands"]
+    correct_holes = mode == "holes"
+    working_mask = (correct_holes ^ mask).astype(np.uint8)
+    n_labels, regions, stats, _ = cv2.connectedComponentsWithStats(working_mask, 8)
+    sizes = stats[:, -1][1:]  # Row 0 is background label
+    small_regions = [i + 1 for i, s in enumerate(sizes) if s < area_thresh]
+    if len(small_regions) == 0:
+        return mask, False
+    fill_labels = [0] + small_regions
+    if not correct_holes:
+        fill_labels = [i for i in range(n_labels) if i not in fill_labels]
+        # If every region is below threshold, keep largest
+        if len(fill_labels) == 0:
+            fill_labels = [int(np.argmax(sizes)) + 1]
+    mask = np.isin(regions, fill_labels)
+    return mask, True
+
+def show_anns(anns):
+    if len(anns) == 0:
+        return
+    sorted_anns = sorted(anns, key=(lambda x: x['area']), reverse=True)
+    ax = plt.gca()
+    ax.set_autoscale_on(False)
+    polygons = []
+    color = []
+    for ann in sorted_anns:
+        m = ann['segmentation']
+        img = np.ones((m.shape[0], m.shape[1], 3))
+        color_mask = np.random.random((1, 3)).tolist()[0]
+        for i in range(3):
+            img[:,:,i] = color_mask[i]
+        ax.imshow(np.dstack((img, m*0.35)))