Source code for chainercv.links.model.faster_rcnn.faster_rcnn

# Modified work:
# --------------------------------------------------------
# Copyright (c) 2017 Preferred Networks, Inc.
# --------------------------------------------------------
# Original works by:
# --------------------------------------------------------
# Faster R-CNN implementation by Chainer
# Copyright (c) 2016 Shunta Saito
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
# Faster R-CNN
# Copyright (c) 2015 Microsoft
# Licensed under The MIT License [see LICENSE for details]
# Written by Ross Girshick and Sean Bell
# --------------------------------------------------------

from __future__ import division

import numpy as np

import chainer
from chainer.backends import cuda
import chainer.functions as F
from chainercv.links.model.faster_rcnn.utils.loc2bbox import loc2bbox
from chainercv.utils import non_maximum_suppression

from chainercv.transforms.image.resize import resize

class FasterRCNN(chainer.Chain):

    """Base class for Faster R-CNN.

    This is a base class for Faster R-CNN links supporting object detection
    API [#]_. The following three stages constitute Faster R-CNN.

    1. **Feature extraction**: Images are taken and their \
        feature maps are calculated.
    2. **Region Proposal Networks**: Given the feature maps calculated in \
        the previous stage, produce set of RoIs around objects.
    3. **Localization and Classification Heads**: Using feature maps that \
        belong to the proposed RoIs, classify the categories of the objects \
        in the RoIs and improve localizations.

    Each stage is carried out by one of the callable
    :class:`chainer.Chain` objects :obj:`extractor`, :obj:`rpn` and
    :obj:`head`.

    There are two functions :meth:`predict` and :meth:`__call__` to conduct
    object detection.
    :meth:`predict` takes images and returns bounding boxes that are converted
    to image coordinates. This will be useful for a scenario when
    Faster R-CNN is treated as a black box function, for instance.
    :meth:`__call__` is provided for a scenario when intermediate outputs
    are needed, for instance, for training and debugging.

    Links that support object detection API have method :meth:`predict` with
    the same interface. Please refer to :meth:`predict` for
    further details.

    .. [#] Shaoqing Ren, Kaiming He, Ross Girshick, Jian Sun. \
    Faster R-CNN: Towards Real-Time Object Detection with \
    Region Proposal Networks. NIPS 2015.

    Args:
        extractor (callable Chain): A callable that takes a BCHW image
            array and returns feature maps.
        rpn (callable Chain): A callable that has the same interface as
            :class:`~chainercv.links.model.faster_rcnn.RegionProposalNetwork`.
            Please refer to the documentation found there.
        head (callable Chain): A callable that takes
            a BCHW array, RoIs and batch indices for RoIs. This returns class
            dependent localization parameters and class scores.
        mean (numpy.ndarray): A value to be subtracted from an image
            in :meth:`prepare`.
        min_size (int): A preprocessing parameter for :meth:`prepare`. Please
            refer to a docstring found for :meth:`prepare`.
        max_size (int): A preprocessing parameter for :meth:`prepare`.
        loc_normalize_mean (tuple of four floats): Mean values of
            localization estimates.
        loc_normalize_std (tuple of four floats): Standard deviation
            of localization estimates.

    """

    def __init__(
            self, extractor, rpn, head,
            mean,
            min_size=600,
            max_size=1000,
            loc_normalize_mean=(0., 0., 0., 0.),
            loc_normalize_std=(0.1, 0.1, 0.2, 0.2),
    ):
        super(FasterRCNN, self).__init__()
        # Only the three sub-networks are child links; they are assigned
        # inside ``init_scope`` so that Chainer registers them.
        with self.init_scope():
            self.extractor = extractor
            self.rpn = rpn
            self.head = head

        # Plain (non-link) configuration used by prepare() and predict().
        self.mean = mean
        self.min_size = min_size
        self.max_size = max_size
        self.loc_normalize_mean = loc_normalize_mean
        self.loc_normalize_std = loc_normalize_std

        # Sets the initial ``nms_thresh`` and ``score_thresh``.
        self.use_preset('visualize')

    @property
    def n_class(self):
        # Total number of classes including the background.
        return self.head.n_class

    def __call__(self, x, scale=1.):
        """Forward Faster R-CNN.

        Scaling parameter :obj:`scale` is used by RPN to determine the
        threshold to select small objects, which are going to be
        rejected irrespective of their confidence scores.

        Here are notations used.

        * :math:`N` is the number of batch size
        * :math:`R'` is the total number of RoIs produced across batches. \
            Given :math:`R_i` proposed RoIs from the :math:`i` th image, \
            :math:`R' = \\sum _{i=1} ^ N R_i`.
        * :math:`L` is the number of classes excluding the background.

        Classes are ordered by the background, the first class, ..., and
        the :math:`L` th class.

        Args:
            x (~chainer.Variable): 4D image variable.
            scale (float): Amount of scaling applied to the raw image
                during preprocessing.

        Returns:
            Variable, Variable, array, array:
            Returns tuple of four values listed below.

            * **roi_cls_locs**: Offsets and scalings for the proposed RoIs. \
                Its shape is :math:`(R', (L + 1) \\times 4)`.
            * **roi_scores**: Class predictions for the proposed RoIs. \
                Its shape is :math:`(R', L + 1)`.
            * **rois**: RoIs proposed by RPN. Its shape is \
                :math:`(R', 4)`.
            * **roi_indices**: Batch indices of RoIs. Its shape is \
                :math:`(R',)`.

        """
        img_size = x.shape[2:]

        h = self.extractor(x)
        # The RPN also returns its own locs, scores and anchors; only the
        # proposed RoIs and their batch indices are consumed here.
        rpn_locs, rpn_scores, rois, roi_indices, anchor =\
            self.rpn(h, img_size, scale)
        roi_cls_locs, roi_scores = self.head(
            h, rois, roi_indices)
        return roi_cls_locs, roi_scores, rois, roi_indices

    def use_preset(self, preset):
        """Use the given preset during prediction.

        This method changes values of :obj:`self.nms_thresh` and
        :obj:`self.score_thresh`. These values are a threshold value
        used for non maximum suppression and a threshold value
        to discard low confidence proposals in :meth:`predict`,
        respectively.

        If the attributes need to be changed to something
        other than the values provided in the presets, please modify
        them by directly accessing the public attributes.

        Args:
            preset ({'visualize', 'evaluate'}): A string to determine the
                preset to use.

        Raises:
            ValueError: If :obj:`preset` is neither ``'visualize'`` nor
                ``'evaluate'``.

        """
        if preset == 'visualize':
            self.nms_thresh = 0.3
            self.score_thresh = 0.7
        elif preset == 'evaluate':
            self.nms_thresh = 0.3
            self.score_thresh = 0.05
        else:
            raise ValueError('preset must be visualize or evaluate')

    def prepare(self, img):
        """Preprocess an image for feature extraction.

        The length of the shorter edge is scaled to :obj:`self.min_size`.
        After the scaling, if the length of the longer edge is longer than
        :obj:`self.max_size`, the image is scaled to fit the longer edge
        to :obj:`self.max_size`.

        After resizing the image, the image is subtracted by a mean image
        value :obj:`self.mean`.

        Args:
            img (~numpy.ndarray): An image. This is in CHW and RGB format.
                The range of its value is :math:`[0, 255]`.

        Returns:
            ~numpy.ndarray:
            A preprocessed image.

        """
        _, H, W = img.shape

        # Scale the shorter edge to ``min_size`` unless that would push the
        # longer edge past ``max_size``; in that case fit the longer edge.
        scale = self.min_size / min(H, W)
        if scale * max(H, W) > self.max_size:
            scale = self.max_size / max(H, W)

        img = resize(img, (int(H * scale), int(W * scale)))

        img = (img - self.mean).astype(np.float32, copy=False)
        return img

    def _suppress(self, raw_cls_bbox, raw_prob):
        """Threshold scores and run per-class non maximum suppression.

        Args:
            raw_cls_bbox (numpy.ndarray): Class dependent bounding boxes.
                It is reshaped to :obj:`(-1, self.n_class, 4)` here.
            raw_prob (numpy.ndarray): Class probabilities, indexed as
                :obj:`raw_prob[:, class_id]`.

        Returns:
            tuple of numpy.ndarray:
            :obj:`(bbox, label, prob)` with dtypes float32, int32 and
            float32. Labels are zero-based foreground class indices,
            i.e. in :obj:`[0, self.n_class - 2]`.

        """
        n_class = self.n_class
        if n_class < 2:
            # Without any foreground class there is nothing to detect;
            # np.concatenate would raise on the empty lists below.
            return (np.zeros((0, 4), dtype=np.float32),
                    np.zeros((0,), dtype=np.int32),
                    np.zeros((0,), dtype=np.float32))

        # The reshape does not depend on the class, so do it only once.
        cls_bbox = raw_cls_bbox.reshape((-1, n_class, 4))

        bbox = []
        label = []
        prob = []
        # skip cls_id = 0 because it is the background class
        for cls_id in range(1, n_class):
            cls_bbox_l = cls_bbox[:, cls_id, :]
            prob_l = raw_prob[:, cls_id]

            # Discard low confidence boxes before suppression.
            mask = prob_l > self.score_thresh
            cls_bbox_l = cls_bbox_l[mask]
            prob_l = prob_l[mask]

            keep = non_maximum_suppression(
                cls_bbox_l, self.nms_thresh, prob_l)
            bbox.append(cls_bbox_l[keep])
            # The labels are in [0, self.n_class - 2].
            label.append(np.full((len(keep),), cls_id - 1, dtype=np.int32))
            prob.append(prob_l[keep])

        bbox = np.concatenate(bbox, axis=0).astype(np.float32)
        label = np.concatenate(label, axis=0).astype(np.int32)
        prob = np.concatenate(prob, axis=0).astype(np.float32)
        return bbox, label, prob

    def predict(self, imgs):
        """Detect objects from images.

        This method predicts objects for each image.

        Args:
            imgs (iterable of numpy.ndarray): Arrays holding images.
                All images are in CHW and RGB format
                and the range of their value is :math:`[0, 255]`.

        Returns:
           tuple of lists:
           This method returns a tuple of three lists,
           :obj:`(bboxes, labels, scores)`.

           * **bboxes**: A list of float arrays of shape :math:`(R, 4)`, \
               where :math:`R` is the number of bounding boxes in a image. \
               Each bounding box is organized by \
               :math:`(y_{min}, x_{min}, y_{max}, x_{max})` \
               in the second axis.
           * **labels** : A list of integer arrays of shape :math:`(R,)`. \
               Each value indicates the class of the bounding box. \
               Values are in range :math:`[0, L - 1]`, where :math:`L` is the \
               number of the foreground classes.
           * **scores** : A list of float arrays of shape :math:`(R,)`. \
               Each value indicates how confident the prediction is.

        """
        prepared_imgs = []
        sizes = []
        for img in imgs:
            # Remember the original (H, W) to map boxes back afterwards.
            size = img.shape[1:]
            img = self.prepare(img.astype(np.float32))
            prepared_imgs.append(img)
            sizes.append(size)

        # The de-normalization constants are the same for every image.
        n_class = self.n_class
        mean = self.xp.tile(self.xp.asarray(self.loc_normalize_mean),
                            n_class)
        std = self.xp.tile(self.xp.asarray(self.loc_normalize_std),
                           n_class)

        bboxes = []
        labels = []
        scores = []
        for img, size in zip(prepared_imgs, sizes):
            with chainer.using_config('train', False), \
                    chainer.function.no_backprop_mode():
                img_var = chainer.Variable(self.xp.asarray(img[None]))
                # Ratio between the preprocessed and the original width.
                scale = img_var.shape[3] / size[1]
                roi_cls_locs, roi_scores, rois, _ = self.__call__(
                    img_var, scale=scale)
            # We are assuming that batch size is 1.
            roi_cls_loc = roi_cls_locs.array
            roi_score = roi_scores.array
            roi = rois / scale

            # Convert predictions to bounding boxes in image coordinates.
            # Bounding boxes are scaled to the scale of the input images.
            roi_cls_loc = (roi_cls_loc * std + mean).astype(np.float32)
            roi_cls_loc = roi_cls_loc.reshape((-1, n_class, 4))
            roi = self.xp.broadcast_to(roi[:, None], roi_cls_loc.shape)
            cls_bbox = loc2bbox(roi.reshape((-1, 4)),
                                roi_cls_loc.reshape((-1, 4)))
            cls_bbox = cls_bbox.reshape((-1, n_class * 4))
            # clip bounding box
            cls_bbox[:, 0::2] = self.xp.clip(cls_bbox[:, 0::2], 0, size[0])
            cls_bbox[:, 1::2] = self.xp.clip(cls_bbox[:, 1::2], 0, size[1])

            cls_prob = F.softmax(roi_score).array

            # Suppression runs with NumPy, so move the arrays to the host.
            raw_cls_bbox = cuda.to_cpu(cls_bbox)
            raw_prob = cuda.to_cpu(cls_prob)

            bbox, label, prob = self._suppress(raw_cls_bbox, raw_prob)
            bboxes.append(bbox)
            labels.append(label)
            scores.append(prob)

        return bboxes, labels, scores