Source code for chainercv.links.model.ssd.ssd_vgg16

from __future__ import division

import numpy as np

import chainer
import chainer.functions as F
from chainer import initializers
import chainer.links as L

from chainercv.links.model.ssd import Multibox
from chainercv.links.model.ssd import Normalize
from chainercv.links.model.ssd import SSD
from chainercv import utils


# RGB, (C, 1, 1) format
_imagenet_mean = np.array((123, 117, 104)).reshape((-1, 1, 1))


[docs]class VGG16(chainer.Chain): """An extended VGG-16 model for SSD300 and SSD512. This is an extended VGG-16 model proposed in [#]_. The differences from original VGG-16 [#]_ are shown below. * :obj:`conv5_1`, :obj:`conv5_2` and :obj:`conv5_3` are changed from \ :class:`~chainer.links.Convolution2d` to \ :class:`~chainer.links.DilatedConvolution2d`. * :class:`~chainercv.links.model.ssd.Normalize` is \ inserted after :obj:`conv4_3`. * The parameters of max pooling after :obj:`conv5_3` are changed. * :obj:`fc6` and :obj:`fc7` are converted to :obj:`conv6` and :obj:`conv7`. .. [#] Wei Liu, Dragomir Anguelov, Dumitru Erhan, Christian Szegedy, Scott Reed, Cheng-Yang Fu, Alexander C. Berg. SSD: Single Shot MultiBox Detector. ECCV 2016. .. [#] Karen Simonyan, Andrew Zisserman. Very Deep Convolutional Networks for Large-Scale Image Recognition. ICLR 2015. """ def __init__(self): super(VGG16, self).__init__() with self.init_scope(): self.conv1_1 = L.Convolution2D(64, 3, pad=1) self.conv1_2 = L.Convolution2D(64, 3, pad=1) self.conv2_1 = L.Convolution2D(128, 3, pad=1) self.conv2_2 = L.Convolution2D(128, 3, pad=1) self.conv3_1 = L.Convolution2D(256, 3, pad=1) self.conv3_2 = L.Convolution2D(256, 3, pad=1) self.conv3_3 = L.Convolution2D(256, 3, pad=1) self.conv4_1 = L.Convolution2D(512, 3, pad=1) self.conv4_2 = L.Convolution2D(512, 3, pad=1) self.conv4_3 = L.Convolution2D(512, 3, pad=1) self.norm4 = Normalize(512, initial=initializers.Constant(20)) self.conv5_1 = L.DilatedConvolution2D(512, 3, pad=1) self.conv5_2 = L.DilatedConvolution2D(512, 3, pad=1) self.conv5_3 = L.DilatedConvolution2D(512, 3, pad=1) self.conv6 = L.DilatedConvolution2D(1024, 3, pad=6, dilate=6) self.conv7 = L.Convolution2D(1024, 1)
[docs] def __call__(self, x): ys = [] h = F.relu(self.conv1_1(x)) h = F.relu(self.conv1_2(h)) h = F.max_pooling_2d(h, 2) h = F.relu(self.conv2_1(h)) h = F.relu(self.conv2_2(h)) h = F.max_pooling_2d(h, 2) h = F.relu(self.conv3_1(h)) h = F.relu(self.conv3_2(h)) h = F.relu(self.conv3_3(h)) h = F.max_pooling_2d(h, 2) h = F.relu(self.conv4_1(h)) h = F.relu(self.conv4_2(h)) h = F.relu(self.conv4_3(h)) ys.append(self.norm4(h)) h = F.max_pooling_2d(h, 2) h = F.relu(self.conv5_1(h)) h = F.relu(self.conv5_2(h)) h = F.relu(self.conv5_3(h)) h = F.max_pooling_2d(h, 3, stride=1, pad=1) h = F.relu(self.conv6(h)) h = F.relu(self.conv7(h)) ys.append(h) return ys
[docs]class VGG16Extractor300(VGG16): """A VGG-16 based feature extractor for SSD300. This is a feature extractor for :class:`~chainercv.links.model.ssd.SSD300`. This extractor is based on :class:`~chainercv.links.model.ssd.VGG16`. """ insize = 300 grids = (38, 19, 10, 5, 3, 1) def __init__(self): init = { 'initialW': initializers.LeCunUniform(), 'initial_bias': initializers.Zero(), } super(VGG16Extractor300, self).__init__() with self.init_scope(): self.conv8_1 = L.Convolution2D(256, 1, **init) self.conv8_2 = L.Convolution2D(512, 3, stride=2, pad=1, **init) self.conv9_1 = L.Convolution2D(128, 1, **init) self.conv9_2 = L.Convolution2D(256, 3, stride=2, pad=1, **init) self.conv10_1 = L.Convolution2D(128, 1, **init) self.conv10_2 = L.Convolution2D(256, 3, **init) self.conv11_1 = L.Convolution2D(128, 1, **init) self.conv11_2 = L.Convolution2D(256, 3, **init)
[docs] def __call__(self, x): """Compute feature maps from a batch of images. This method extracts feature maps from :obj:`conv4_3`, :obj:`conv7`, :obj:`conv8_2`, :obj:`conv9_2`, :obj:`conv10_2`, and :obj:`conv11_2`. Args: x (ndarray): An array holding a batch of images. The images should be resized to :math:`300\\times 300`. Returns: list of Variable: Each variable contains a feature map. """ ys = super(VGG16Extractor300, self).__call__(x) for i in range(8, 11 + 1): h = ys[-1] h = F.relu(self['conv{:d}_1'.format(i)](h)) h = F.relu(self['conv{:d}_2'.format(i)](h)) ys.append(h) return ys
[docs]class VGG16Extractor512(VGG16): """A VGG-16 based feature extractor for SSD512. This is a feature extractor for :class:`~chainercv.links.model.ssd.SSD512`. This extractor is based on :class:`~chainercv.links.model.ssd.VGG16`. """ insize = 512 grids = (64, 32, 16, 8, 4, 2, 1) def __init__(self): init = { 'initialW': initializers.LeCunUniform(), 'initial_bias': initializers.Zero(), } super(VGG16Extractor512, self).__init__() with self.init_scope(): self.conv8_1 = L.Convolution2D(256, 1, **init) self.conv8_2 = L.Convolution2D(512, 3, stride=2, pad=1, **init) self.conv9_1 = L.Convolution2D(128, 1, **init) self.conv9_2 = L.Convolution2D(256, 3, stride=2, pad=1, **init) self.conv10_1 = L.Convolution2D(128, 1, **init) self.conv10_2 = L.Convolution2D(256, 3, stride=2, pad=1, **init) self.conv11_1 = L.Convolution2D(128, 1, **init) self.conv11_2 = L.Convolution2D(256, 3, stride=2, pad=1, **init) self.conv12_1 = L.Convolution2D(128, 1, **init) self.conv12_2 = L.Convolution2D(256, 4, pad=1, **init)
[docs] def __call__(self, x): """Compute feature maps from a batch of images. This method extracts feature maps from :obj:`conv4_3`, :obj:`conv7`, :obj:`conv8_2`, :obj:`conv9_2`, :obj:`conv10_2`, :obj:`conv11_2`, and :obj:`conv12_2`. Args: x (ndarray): An array holding a batch of images. The images should be resized to :math:`512\\times 512`. Returns: list of Variable: Each variable contains a feature map. """ ys = super(VGG16Extractor512, self).__call__(x) for i in range(8, 12 + 1): h = ys[-1] h = F.relu(self['conv{:d}_1'.format(i)](h)) h = F.relu(self['conv{:d}_2'.format(i)](h)) ys.append(h) return ys
[docs]class SSD300(SSD): """Single Shot Multibox Detector with 300x300 inputs. This is a model of Single Shot Multibox Detector [#]_. This model uses :class:`~chainercv.links.model.ssd.VGG16Extractor300` as its feature extractor. .. [#] Wei Liu, Dragomir Anguelov, Dumitru Erhan, Christian Szegedy, Scott Reed, Cheng-Yang Fu, Alexander C. Berg. SSD: Single Shot MultiBox Detector. ECCV 2016. Args: n_fg_class (int): The number of classes excluding the background. pretrained_model (string): The weight file to be loaded. This can take :obj:`'voc0712'`, `filepath` or :obj:`None`. The default value is :obj:`None`. * :obj:`'voc0712'`: Load weights trained on trainval split of \ PASCAL VOC 2007 and 2012. \ The weight file is downloaded and cached automatically. \ :obj:`n_fg_class` must be :obj:`20` or :obj:`None`. \ These weights were converted from the Caffe model provided by \ `the original implementation \ <https://github.com/weiliu89/caffe/tree/ssd>`_. \ The conversion code is `chainercv/examples/ssd/caffe2npz.py`. * :obj:`'imagenet'`: Load weights of VGG-16 trained on ImageNet. \ The weight file is downloaded and cached automatically. \ This option initializes weights partially and the rests are \ initialized randomly. In this case, :obj:`n_fg_class` \ can be set to any number. * `filepath`: A path of npz file. In this case, :obj:`n_fg_class` \ must be specified properly. * :obj:`None`: Do not load weights. """ _models = { 'voc0712': { 'param': {'n_fg_class': 20}, 'url': 'https://chainercv-models.preferred.jp/' 'ssd300_voc0712_converted_2017_06_06.npz', 'cv2': True }, 'imagenet': { 'url': 'https://chainercv-models.preferred.jp/' 'ssd_vgg16_imagenet_converted_2017_06_09.npz', 'cv2': True }, } def __init__(self, n_fg_class=None, pretrained_model=None): param, path = utils.prepare_pretrained_model( {'n_fg_class': n_fg_class}, pretrained_model, self._models) super(SSD300, self).__init__( extractor=VGG16Extractor300(), multibox=Multibox( n_class=param['n_fg_class'] + 1, aspect_ratios=((2,), (2, 3), (2, 3), (2, 3), (2,), (2,))), steps=(8, 16, 32, 64, 100, 300), sizes=(30, 60, 111, 162, 213, 264, 315), mean=_imagenet_mean) if path: chainer.serializers.load_npz(path, self, strict=False)
[docs]class SSD512(SSD): """Single Shot Multibox Detector with 512x512 inputs. This is a model of Single Shot Multibox Detector [#]_. This model uses :class:`~chainercv.links.model.ssd.VGG16Extractor512` as its feature extractor. .. [#] Wei Liu, Dragomir Anguelov, Dumitru Erhan, Christian Szegedy, Scott Reed, Cheng-Yang Fu, Alexander C. Berg. SSD: Single Shot MultiBox Detector. ECCV 2016. Args: n_fg_class (int): The number of classes excluding the background. pretrained_model (string): The weight file to be loaded. This can take :obj:`'voc0712'`, `filepath` or :obj:`None`. The default value is :obj:`None`. * :obj:`'voc0712'`: Load weights trained on trainval split of \ PASCAL VOC 2007 and 2012. \ The weight file is downloaded and cached automatically. \ :obj:`n_fg_class` must be :obj:`20` or :obj:`None`. \ These weights were converted from the Caffe model provided by \ `the original implementation \ <https://github.com/weiliu89/caffe/tree/ssd>`_. \ The conversion code is `chainercv/examples/ssd/caffe2npz.py`. * :obj:`'imagenet'`: Load weights of VGG-16 trained on ImageNet. \ The weight file is downloaded and cached automatically. \ This option initializes weights partially and the rests are \ initialized randomly. In this case, :obj:`n_fg_class` \ can be set to any number. * `filepath`: A path of npz file. In this case, :obj:`n_fg_class` \ must be specified properly. * :obj:`None`: Do not load weights. """ _models = { 'voc0712': { 'param': {'n_fg_class': 20}, 'url': 'https://chainercv-models.preferred.jp/' 'ssd512_voc0712_converted_2017_06_06.npz', 'cv2': True }, 'imagenet': { 'url': 'https://chainercv-models.preferred.jp/' 'ssd_vgg16_imagenet_converted_2017_06_09.npz', 'cv2': True }, } def __init__(self, n_fg_class=None, pretrained_model=None, use_pretrained_class_weights=True): param, path = utils.prepare_pretrained_model( {'n_fg_class': n_fg_class}, pretrained_model, self._models) super(SSD512, self).__init__( extractor=VGG16Extractor512(), multibox=Multibox( n_class=param['n_fg_class'] + 1, aspect_ratios=( (2,), (2, 3), (2, 3), (2, 3), (2, 3), (2,), (2,))), steps=(8, 16, 32, 64, 128, 256, 512), sizes=(35.84, 76.8, 153.6, 230.4, 307.2, 384.0, 460.8, 537.6), mean=_imagenet_mean) if path: chainer.serializers.load_npz(path, self, strict=False)