# Source code for chainercv.experimental.links.model.pspnet.pspnet

from __future__ import division

from math import ceil
import numpy as np

import chainer
import chainer.functions as F
import chainer.links as L

from chainercv.experimental.links.model.pspnet.transforms import \
    convolution_crop
from chainercv.links import Conv2DBNActiv
from chainercv.links.model.resnet import ResBlock
from chainercv.links import PickableSequentialChain
from chainercv import transforms
from chainercv import utils

class PyramidPoolingModule(chainer.ChainList):

    """Pool a feature map at several grid sizes and concatenate the results.

    For every entry of :obj:`pyramids` the input is average-pooled, passed
    through its own :class:`Conv2DBNActiv` branch, resized back to the
    spatial size of the input and finally concatenated with the input along
    the channel axis.

    Each branch emits :obj:`in_channels // len(pyramids)` channels, so the
    output has
    :obj:`in_channels + len(pyramids) * (in_channels // len(pyramids))`
    channels.

    Args:
        in_channels (int): The number of channels of the input feature map.
        feat_size (tuple): :obj:`(height, width)` of the input feature map.
            It is divided by each pyramid level to get the pooling window.
        pyramids (list of int): Divisors applied to :obj:`feat_size`; one
            branch is created per entry.
        initialW (callable): Initializer forwarded to every branch.
        bn_kwargs (dict): Keyword arguments forwarded to every branch for
            its batch normalization. :obj:`None` is treated as an empty
            dictionary.

    """

    def __init__(self, in_channels, feat_size, pyramids,
                 initialW=None, bn_kwargs=None):
        if bn_kwargs is None:
            # The rest of this file uses ``{}`` to mean "no extra options".
            bn_kwargs = {}
        out_channels = in_channels // len(pyramids)
        # One branch per pyramid level, so that the number of links always
        # matches ``self.ksizes`` in ``__call__``.
        # NOTE(review): positional arguments ``1, 1, 0, 1`` are presumably
        # ksize, stride, pad and dilate -- confirm against Conv2DBNActiv.
        branches = [
            Conv2DBNActiv(
                in_channels, out_channels, 1, 1, 0, 1, initialW=initialW,
                bn_kwargs=bn_kwargs)
            for _ in pyramids]
        super(PyramidPoolingModule, self).__init__(*branches)
        kh = feat_size[0] // np.array(pyramids)
        kw = feat_size[1] // np.array(pyramids)
        # (height, width) pooling window for each pyramid level.
        self.ksizes = list(zip(kh, kw))

    def __call__(self, x):
        """Apply every pyramid branch to :obj:`x` and concatenate.

        Args:
            x: A feature map whose last two axes are height and width.

        Returns:
            The input concatenated along axis 1 with the resized output of
            every branch.

        """
        ys = [x]
        H, W = x.shape[2:]
        for f, ksize in zip(self, self.ksizes):
            # The window is used as both kernel size and stride.
            y = F.average_pooling_2d(x, ksize, ksize)
            y = f(y)
            y = F.resize_images(y, (H, W))
            # Without this the branches would be computed and discarded.
            ys.append(y)
        return F.concat(ys, axis=1)

class DilatedResNet(PickableSequentialChain):

    """ResNet-style feature extractor with dilated later stages.

    The network is a three-convolution stem, a max pooling and four
    :class:`ResBlock` stages registered as :obj:`res2` to :obj:`res5`.

    Args:
        n_layer (int): Key into :obj:`_blocks` selecting the number of
            units in each residual stage. A :class:`KeyError` is raised for
            a depth that is not listed there.
        initialW (callable): Initializer forwarded to every convolution.
        bn_kwargs (dict): Keyword arguments forwarded to every layer for its
            batch normalization. :obj:`None` is treated as an empty
            dictionary.

    """

    # Number of units in res2, res3, res4 and res5 for each supported depth.
    _blocks = {
        101: [3, 4, 23, 3],
    }

    def __init__(self, n_layer, initialW, bn_kwargs=None):
        if bn_kwargs is None:
            # The rest of this file uses ``{}`` to mean "no extra options".
            bn_kwargs = {}
        n_block = self._blocks[n_layer]
        super(DilatedResNet, self).__init__()
        with self.init_scope():
            # NOTE(review): the trailing positional arguments of
            # Conv2DBNActiv are presumably ksize, stride, pad and dilate --
            # confirm against its signature.
            self.conv1_1 = Conv2DBNActiv(
                None, 64, 3, 2, 1, 1,
                initialW=initialW, bn_kwargs=bn_kwargs)
            self.conv1_2 = Conv2DBNActiv(
                64, 64, 3, 1, 1, 1, initialW=initialW, bn_kwargs=bn_kwargs)
            self.conv1_3 = Conv2DBNActiv(
                64, 128, 3, 1, 1, 1, initialW=initialW, bn_kwargs=bn_kwargs)
            self.pool1 = lambda x: F.max_pooling_2d(
                x, ksize=3, stride=2, pad=1)
            # NOTE(review): the last two positional arguments of ResBlock
            # are presumably stride and dilate; if so, res4 and res5 keep
            # the resolution and use dilation 2 and 4 -- confirm.
            self.res2 = ResBlock(
                n_block[0], 128, 64, 256, 1, 1,
                initialW=initialW, bn_kwargs=bn_kwargs, stride_first=False)
            self.res3 = ResBlock(
                n_block[1], 256, 128, 512, 2, 1,
                initialW=initialW, bn_kwargs=bn_kwargs, stride_first=False)
            self.res4 = ResBlock(
                n_block[2], 512, 256, 1024, 1, 2,
                initialW=initialW, bn_kwargs=bn_kwargs, stride_first=False)
            self.res5 = ResBlock(
                n_block[3], 1024, 512, 2048, 1, 4,
                initialW=initialW, bn_kwargs=bn_kwargs, stride_first=False)

class PSPNet(chainer.Chain):

    """Pyramid Scene Parsing Network.

    This is a PSPNet [#]_ model for semantic segmentation.
    It is based on an existing reference implementation (the hyperlink
    target is missing from this copy of the source).

    .. [#] Hengshuang Zhao, Jianping Shi, Xiaojuan Qi, Xiaogang Wang,
        Jiaya Jia "Pyramid Scene Parsing Network",
        CVPR, 2017

    Args:
        extractor (chainer.Chain): A feature extractor. Calling it must
            return two feature maps; only the second one is used.
        n_class (int): The number of channels in the last convolution layer.
        input_size (tuple): The size of the input.
            This value is :math:`(height, width)`. A single number is
            expanded to a square size.
        initialW (callable): Initializer for the weights of convolution
            kernels.
        bn_kwargs (dict): Keyword arguments passed to initialize
            :class:`chainer.links.BatchNormalization`. If a ChainerMN
            communicator
            (:class:`~chainermn.communicators.CommunicatorBase`) is given
            with the key :obj:`comm`,
            :class:`~chainermn.links.MultiNodeBatchNormalization` will be
            used for the batch normalization. Otherwise,
            :class:`~chainer.links.BatchNormalization` will be used.
            :obj:`None` is treated as an empty dictionary.

    """

    def __init__(self, extractor, n_class, input_size,
                 initialW=None, bn_kwargs=None):
        super(PSPNet, self).__init__()
        if bn_kwargs is None:
            # The rest of this file uses ``{}`` to mean "no extra options".
            bn_kwargs = {}
        pyramids = [6, 3, 2, 1]

        if not isinstance(input_size, (list, tuple)):
            input_size = (int(input_size), int(input_size))

        # When set to a sequence of floats, ``predict`` averages the scores
        # obtained at each of these scales.
        self.scales = None
        # Per-channel value subtracted from the image, shaped (3, 1, 1) so
        # that it broadcasts over a CHW image.
        self.mean = np.array(
            [123.68, 116.779, 103.939], dtype=np.float32)[:, None, None]
        self.input_size = input_size

        # The pyramid pooling module is sized for a feature map that is one
        # eighth of the input resolution.
        feat_size = (input_size[0] // 8, input_size[1] // 8)
        with self.init_scope():
            self.extractor = extractor
            self.ppm = PyramidPoolingModule(
                2048, feat_size, pyramids,
                initialW=initialW, bn_kwargs=bn_kwargs)
            # The head's batch normalization takes the same options as
            # every other layer, as the class docstring describes.
            self.head_conv1 = Conv2DBNActiv(
                4096, 512, 3, 1, 1, initialW=initialW, bn_kwargs=bn_kwargs)
            self.head_conv2 = L.Convolution2D(
                512, n_class, 1, 1, 0, False, initialW)

    @property
    def n_class(self):
        """The number of output channels of the last convolution."""
        return self.head_conv2.out_channels

    def __call__(self, x):
        """Compute class scores at the resolution of the input batch.

        Args:
            x: A batch of images whose last two axes are height and width.

        Returns:
            The output of the last convolution resized to the spatial size
            of :obj:`x`.

        """
        _, res5 = self.extractor(x)
        h = self.ppm(res5)
        h = self.head_conv1(h)
        h = self.head_conv2(h)
        h = F.resize_images(h, x.shape[2:])
        return h

    def _tile_predict(self, img):
        """Predict class scores for one CHW image.

        If the longer side of the image exceeds :obj:`max(self.input_size)`,
        the image is cut into overlapping crops and the scores of the crops
        are averaged where they overlap. Otherwise the image is fitted into
        :obj:`self.input_size` with :func:`transforms.resize_contain`.
        In both cases the scores of the horizontally flipped input are
        averaged in.

        Args:
            img: An image in CHW format.

        Returns:
            An array of scores with the height and width of :obj:`img`.

        """
        if self.mean is not None:
            img = img - self.mean
        ori_H, ori_W = img.shape[1:]
        long_size = max(ori_H, ori_W)

        if long_size > max(self.input_size):
            # Neighbouring crops are two thirds of a crop apart.
            stride_rate = 2 / 3
            stride = (int(ceil(self.input_size[0] * stride_rate)),
                      int(ceil(self.input_size[1] * stride_rate)))

            imgs, param = convolution_crop(
                img, self.input_size, stride, return_param=True)

            counts = self.xp.zeros((1, ori_H, ori_W), dtype=np.float32)
            preds = self.xp.zeros(
                (1, self.n_class, ori_H, ori_W), dtype=np.float32)
            crop_slices = zip(
                param['y_slices'], param['x_slices'],
                param['crop_y_slices'], param['crop_x_slices'])
            for i, slices in enumerate(crop_slices):
                y_slice, x_slice, crop_y_slice, crop_x_slice = slices
                img_i = imgs[i:i + 1]

                scores_i = self._predict(img_i)
                # Flip horizontally flipped score maps again
                flipped_scores_i = self._predict(
                    img_i[:, :, :, ::-1])[:, :, :, ::-1]

                preds[0, :, y_slice, x_slice] += \
                    scores_i[0, :, crop_y_slice, crop_x_slice]
                preds[0, :, y_slice, x_slice] += \
                    flipped_scores_i[0, :, crop_y_slice, crop_x_slice]
                # Two predictions (plain and flipped) were accumulated.
                counts[0, y_slice, x_slice] += 2

            scores = preds / counts[:, None]
        else:
            img, param = transforms.resize_contain(
                img, self.input_size, return_param=True)
            preds1 = self._predict(img[np.newaxis])
            preds2 = self._predict(img[np.newaxis, :, :, ::-1])
            preds = (preds1 + preds2[:, :, :, ::-1]) / 2

            # Keep only the region that holds the scaled image.
            y_start = param['y_offset']
            y_end = y_start + param['scaled_size'][0]
            x_start = param['x_offset']
            x_end = x_start + param['scaled_size'][1]
            scores = preds[:, :, y_start:y_end, x_start:x_end]
        scores = F.resize_images(scores, (ori_H, ori_W))[0].array
        return scores

    def _predict(self, imgs):
        """Return softmax scores for a batch of images in test mode."""
        xs = chainer.Variable(self.xp.asarray(imgs))
        with chainer.using_config('train', False):
            scores = F.softmax(self(xs)).array
        return scores

    def predict(self, imgs):
        """Conduct semantic segmentation from images.

        Args:
            imgs (iterable of numpy.ndarray): Arrays holding images.
                All images are in CHW and RGB format
                and the range of their values are :math:`[0, 255]`.

        Returns:
            list of numpy.ndarray:

            List of integer labels predicted from each image in the input
            list.

        """
        labels = []
        for img in imgs:
            with chainer.using_config('train', False), \
                    chainer.function.no_backprop_mode():
                if self.scales is not None:
                    scores = _multiscale_predict(
                        self._tile_predict, img, self.scales)
                else:
                    scores = self._tile_predict(img)
            labels.append(chainer.backends.cuda.to_cpu(
                self.xp.argmax(scores, axis=0).astype(np.int32)))
        return labels

class PSPNetResNet101(PSPNet):

    """PSPNet with Dilated ResNet101 as the feature extractor.

    .. seealso::
        :class:`chainercv.experimental.links.model.pspnet.PSPNet`

    Args:
        n_class (int): The number of channels in the last convolution layer.
        pretrained_model (string): The weight file to be loaded.
            This can take :obj:`'cityscapes'`, `filepath` or :obj:`None`.
            The default value is :obj:`None`.

            * :obj:`'cityscapes'`: Load weights trained on train split of
              Cityscapes dataset.
              The weight file is downloaded and cached automatically.
              :obj:`n_class` must be :obj:`19` or :obj:`None`.
            * `filepath`: A path of npz file. In this case, :obj:`n_class`
              must be specified properly.
            * :obj:`None`: Do not load weights.
        input_size (tuple): The size of the input.
            This value is :math:`(height, width)`.
        initialW (callable): Initializer for the weights of
            convolution kernels. :class:`chainer.initializers.HeNormal` is
            used when this is :obj:`None`.
        comm (chainermn.communicator): If a ChainerMN communicator is
            given, it will be used for distributed batch normalization
            during training. If None, all batch normalization links will
            not share the input vectors among GPUs before calculating mean
            and variance. The original PSPNet implementation uses
            distributed batch normalization.

    """

    _models = {
        'cityscapes': {
            'param': {'n_class': 19, 'input_size': (713, 713)},
            # NOTE(review): the first literal is empty in this copy of the
            # source, so the value is only a file name; the host part of the
            # URL appears to be missing -- confirm against upstream.
            'url': ''
            'pspnet_resnet101_cityscapes_converted_2018_05_22.npz'
        }
    }

    def __init__(self, n_class=None, pretrained_model=None,
                 input_size=None, initialW=None, comm=None):
        param, path = utils.prepare_pretrained_model(
            {'n_class': n_class, 'input_size': input_size},
            pretrained_model, self._models,
            {'input_size': (713, 713)})

        bn_kwargs = {'comm': comm} if comm is not None else {}
        if initialW is None:
            initialW = chainer.initializers.HeNormal()

        extractor = DilatedResNet(101, initialW, bn_kwargs)
        # PSPNet.__call__ unpacks two feature maps and uses the second one.
        extractor.pick = ('res4', 'res5')

        super(PSPNetResNet101, self).__init__(
            extractor, param['n_class'], param['input_size'],
            initialW, bn_kwargs)

        if path:
            chainer.serializers.load_npz(path, self)

def _multiscale_predict(predict_method, img, scales):
    """Average the scores predicted from an image at several scales.

    Args:
        predict_method (callable): A function that takes a CHW image and
            returns scores whose height and width equal those of the image
            it was given.
        img: An image in CHW format.
        scales (iterable of float): Factors applied to the height and width
            of :obj:`img` before prediction. A scale of :obj:`1.0` uses the
            image as is.

    Returns:
        The mean over all scales of the scores, each brought back to the
        height and width of :obj:`img`. The shape is :math:`(C, H, W)`.

    """
    orig_H, orig_W = img.shape[1:]
    scores = []
    orig_img = img
    for scale in scales:
        img = orig_img.copy()
        if scale != 1.0:
            img = transforms.resize(
                img, (int(orig_H * scale), int(orig_W * scale)))
        # This method should return scores
        y = predict_method(img)[None]
        assert y.shape[2:] == img.shape[1:]

        if scale != 1.0:
            # Bring the scores back to the original resolution so that all
            # scales can be stacked.
            y = F.resize_images(y, (orig_H, orig_W)).array
        scores.append(y)
    xp = chainer.backends.cuda.get_array_module(scores[0])
    scores = xp.stack(scores)
    return scores.mean(0)[0]  # (C, H, W)