Source code for chainercv.experimental.links.model.fcis.fcis

from __future__ import division

import chainer
import chainer.functions as F
import numpy as np

from chainercv.experimental.links.model.fcis.utils.mask_voting \
    import mask_voting
from chainercv.transforms.image.resize import resize


[docs]class FCIS(chainer.Chain):

    """Base class for FCIS.

    This is a base class for FCIS links supporting instance segmentation
    API [#]_. The following three stages constitute FCIS.

    1. **Feature extraction**: Images are taken and their \
        feature maps are calculated.
    2. **Region Proposal Networks**: Given the feature maps calculated in \
        the previous stage, produce set of RoIs around objects.
    3. **Localization, Segmentation and Classification Heads**: Using feature \
        maps that belong to the proposed RoIs, segment regions of the \
        objects, classify the categories of the objects in the RoIs and \
        improve localizations.

    Each stage is carried out by one of the callable
    :class:`chainer.Chain` objects :obj:`feature`, :obj:`rpn` and :obj:`head`.
    There are two functions :meth:`predict` and :meth:`__call__` to conduct
    instance segmentation.
    :meth:`predict` takes images and returns masks, object labels
    and their scores.
    :meth:`__call__` is provided for a scnerario when intermediate outputs
    are needed, for instance, for training and debugging.

    Links that support instance segmentation API have method :meth:`predict`
    with the same interface. Please refer to :meth:`predict` for further
    details.

    .. [#] Yi Li, Haozhi Qi, Jifeng Dai, Xiangyang Ji, Yichen Wei. \
    Fully Convolutional Instance-aware Semantic Segmentation. CVPR 2017.

    Args:
        extractor (callable Chain): A callable that takes a BCHW image
            array and returns feature maps.
        rpn (callable Chain): A callable that has the same interface as
            :class:`~chainercv.links.model.faster_rcnn.RegionProposalNetwork`.
            Please refer to the documentation found there.
        head (callable Chain): A callable that takes a BCHW array,
            RoIs and batch indices for RoIs.
            This returns class-agnostic segmentation scores, class-agnostic
            localization parameters, class scores, improved RoIs and batch
            indices for RoIs.
        mean (numpy.ndarray): A value to be subtracted from an image
            in :meth:`prepare`.
        min_size (int): A preprocessing parameter for :meth:`prepare`. Please
            refer to a docstring found for :meth:`prepare`.
        max_size (int): A preprocessing parameter for :meth:`prepare`.
        loc_normalize_mean (tuple of four floats): Mean values of
            localization estimates.
        loc_normalize_std (tupler of four floats): Standard deviation
            of localization estimates.

    """

    def __init__(
            self, extractor, rpn, head,
            mean, min_size, max_size,
            loc_normalize_mean, loc_normalize_std,
    ):
        super(FCIS, self).__init__()
        with self.init_scope():
            self.extractor = extractor
            self.rpn = rpn
            self.head = head

        self.mean = mean
        self.min_size = min_size
        self.max_size = max_size
        self.loc_normalize_mean = loc_normalize_mean
        self.loc_normalize_std = loc_normalize_std

        self.use_preset('visualize')

    @property
    def n_class(self):
        # Total number of classes including the background.
        return self.head.n_class

[docs]    def __call__(self, x, scale=1.):
        """Forward FCIS.

        Scaling paramter :obj:`scale` is used by RPN to determine the
        threshold to select small objects, which are going to be
        rejected irrespective of their confidence scores.

        Here are notations used.

        * :math:`N` is the number of batch size
        * :math:`R'` is the total number of RoIs produced across batches. \
            Given :math:`R_i` proposed RoIs from the :math:`i` th image, \
            :math:`R' = \\sum _{i=1} ^ N R_i`.
        * :math:`L` is the number of classes excluding the background.
        * :math:`RH` is the height of pooled image by Position Sensitive \
            ROI pooling.
        * :math:`RW` is the height of pooled image by Position Sensitive \
            ROI pooling.

        Classes are ordered by the background, the first class, ..., and
        the :math:`L` th class.

        Args:
            x (~chainer.Variable): 4D image variable.
            scale (float): Amount of scaling applied to the raw image
                during preprocessing.

        Returns:
            Variable, Variable, Variable, array, array:
            Returns tuple of five values listed below.

            * **roi_ag_seg_scores**: Class-agnostic clipped mask scores for \
                the proposed ROIs. Its shape is :math:`(R', 2, RH, RW)`
            * **ag_locs**: Class-agnostic offsets and scalings for \
                the proposed RoIs.  Its shape is :math:`(R', 2, 4)`.
            * **roi_cls_scores**: Class predictions for the proposed RoIs. \
                Its shape is :math:`(R', L + 1)`.
            * **rois**: RoIs proposed by RPN. Its shape is \
                :math:`(R', 4)`.
            * **roi_indices**: Batch indices of RoIs. Its shape is \
                :math:`(R',)`.

        """
        img_size = x.shape[2:]

        # Feature Extractor
        rpn_features, roi_features = self.extractor(x)
        rpn_locs, rpn_scores, rois, roi_indices, anchor = self.rpn(
            rpn_features, img_size, scale)
        roi_ag_seg_scores, roi_ag_locs, roi_cls_scores, rois, roi_indices = \
            self.head(roi_features, rois, roi_indices, img_size)
        return roi_ag_seg_scores, roi_ag_locs, roi_cls_scores, \
            rois, roi_indices

[docs]    def prepare(self, img):
        """Preprocess an image for feature extraction.

        The length of the shorter edge is scaled to :obj:`self.min_size`.
        After the scaling, if the length of the longer edge is longer than
        :obj:`self.max_size`, the image is scaled to fit the longer edge
        to :obj:`self.max_size`.

        After resizing the image, the image is subtracted by a mean image value
        :obj:`self.mean`.

        Args:
            img (~numpy.ndarray): An image. This is in CHW and RGB format.
                The range of its value is :math:`[0, 255]`.

        Returns:
            ~numpy.ndarray:
            A preprocessed image.

        """
        _, H, W = img.shape

        scale = self.min_size / min(H, W)

        if scale * max(H, W) > self.max_size:
            scale = self.max_size / max(H, W)

        img = resize(img, (int(H * scale), int(W * scale)))
        img = (img - self.mean).astype(np.float32, copy=False)
        return img

[docs]    def use_preset(self, preset):
        """Use the given preset during prediction.

        This method changes values of :obj:`self.nms_thresh`,
        :obj:`self.score_thresh`, :obj:`self.mask_merge_thresh`,
        :obj:`self.binary_thresh`, :obj:`self.binary_thresh` and
        :obj:`self.min_drop_size`. These values are a threshold value
        used for non maximum suppression, a threshold value
        to discard low confidence proposals in :meth:`predict`,
        a threshold value to merge mask in :meth:`predict`,
        a threshold value to binalize segmentation scores in :meth:`predict`,
        a limit number of predicted masks in one image and
        a threshold value to discard small bounding boxes respectively.

        If the attributes need to be changed to something
        other than the values provided in the presets, please modify
        them by directly accessing the public attributes.

        Args:
            preset ({'visualize', 'evaluate'): A string to determine the
                preset to use.

        """
        if preset == 'visualize':
            self.nms_thresh = 0.3
            self.score_thresh = 0.7
            self.mask_merge_thresh = 0.5
            self.binary_thresh = 0.4
            self.limit = 100
            self.min_drop_size = 16
        elif preset == 'evaluate':
            self.nms_thresh = 0.3
            self.score_thresh = 1e-3
            self.mask_merge_thresh = 0.5
            self.binary_thresh = 0.4
            self.limit = 100
            self.min_drop_size = 16
        else:
            raise ValueError('preset must be visualize or evaluate')

[docs]    def predict(self, imgs):
        """Segment object instances from images.

        This method predicts instance-aware object regions for each image.

        Args:
            imgs (iterable of numpy.ndarray): Arrays holding images of shape
                :math:`(B, C, H, W)`.  All images are in CHW and RGB format
                and the range of their value is :math:`[0, 255]`.

        Returns:
           tuple of lists:
           This method returns a tuple of three lists,
           :obj:`(masks, labels, scores)`.

           * **masks**: A list of boolean arrays of shape :math:`(R, H, W)`, \
               where :math:`R` is the number of masks in a image. \
               Each pixel holds value if it is inside the object inside or not.
           * **labels** : A list of integer arrays of shape :math:`(R,)`. \
               Each value indicates the class of the masks. \
               Values are in range :math:`[0, L - 1]`, where :math:`L` is the \
               number of the foreground classes.
           * **scores** : A list of float arrays of shape :math:`(R,)`. \
               Each value indicates how confident the prediction is.

        """

        prepared_imgs = []
        sizes = []
        for img in imgs:
            size = img.shape[1:]
            img = self.prepare(img.astype(np.float32))
            prepared_imgs.append(img)
            sizes.append(size)

        masks = []
        labels = []
        scores = []

        for img, size in zip(prepared_imgs, sizes):
            with chainer.using_config('train', False), \
                    chainer.function.no_backprop_mode():
                # inference
                img_var = chainer.Variable(self.xp.array(img[None]))
                scale = img_var.shape[3] / size[1]
                roi_ag_seg_scores, _, roi_cls_scores, bboxes, _ = \
                    self.__call__(img_var, scale)

            # We are assuming that batch size is 1.
            roi_ag_seg_score = chainer.cuda.to_cpu(roi_ag_seg_scores.array)
            roi_cls_score = chainer.cuda.to_cpu(roi_cls_scores.array)
            bbox = chainer.cuda.to_cpu(bboxes)

            # filter bounding boxes with min_size
            height = bbox[:, 2] - bbox[:, 0]
            width = bbox[:, 3] - bbox[:, 1]
            keep_indices = np.where(
                (height >= self.min_drop_size) &
                (width >= self.min_drop_size))[0]
            roi_ag_seg_score = roi_ag_seg_score[keep_indices, :, :]
            roi_cls_score = roi_cls_score[keep_indices]
            bbox = bbox[keep_indices, :]

            # scale bbox
            bbox = bbox / scale

            # shape: (n_rois, 4)
            bbox[:, 0::2] = self.xp.clip(bbox[:, 0::2], 0, size[0])
            bbox[:, 1::2] = self.xp.clip(bbox[:, 1::2], 0, size[1])

            # shape: (n_roi, roi_size, roi_size)
            roi_seg_prob = F.softmax(roi_ag_seg_score).array[:, 1]
            roi_cls_prob = F.softmax(roi_cls_score).array

            roi_seg_prob, bbox, label, roi_cls_prob = mask_voting(
                roi_seg_prob, bbox, roi_cls_prob, size,
                self.score_thresh, self.nms_thresh,
                self.mask_merge_thresh, self.binary_thresh,
                limit=self.limit, bg_label=0)

            mask = np.zeros(
                (len(roi_seg_prob), size[0], size[1]), dtype=np.bool)
            for i, (roi_seg_pb, bb) in enumerate(zip(roi_seg_prob, bbox)):
                bb = np.round(bb).astype(np.int32)
                y_min, x_min, y_max, x_max = bb
                roi_msk_pb = resize(
                    roi_seg_pb.astype(np.float32)[None],
                    (y_max - y_min, x_max - x_min))
                roi_msk = (roi_msk_pb > self.binary_thresh)[0]
                mask[i, y_min:y_max, x_min:x_max] = roi_msk

            masks.append(mask)
            labels.append(label)
            scores.append(roi_cls_prob)

        return masks, labels, scores