Source code for chainercv.links.model.yolo.yolo_v3

from __future__ import division

import itertools
import numpy as np

import chainer
from chainer.backends import cuda
import chainer.functions as F
from chainer.links import Convolution2D

from chainercv.links import Conv2DBNActiv
from chainercv import utils

from chainercv.links.model.yolo.yolo_base import YOLOBase


def _leaky_relu(x):
    """Apply leaky ReLU to ``x`` with the slope fixed at 0.1.

    Used as the ``activ`` callable of the ``Conv2DBNActiv`` links built in
    this module.
    """
    activated = F.leaky_relu(x, slope=0.1)
    return activated


def _upsample(x):
    """Unpool ``x`` with a kernel size of 2 and ``cover_all`` disabled."""
    upsampled = F.unpooling_2d(x, 2, cover_all=False)
    return upsampled


class ResidualBlock(chainer.ChainList):
    """ChainList with a residual connection.

    The child links are applied one after another and the block's input is
    added to the output of the last link.

    Args:
        *links: Links to be chained inside the block, in application order.

    """

    def __init__(self, *links):
        super(ResidualBlock, self).__init__(*links)

    def __call__(self, x):
        """Apply the child links sequentially and add the input back.

        Args:
            x: The input of the block. It is passed to the first child link
                and also used as the residual term.

        Returns:
            The output of the last child link plus :obj:`x`.

        """
        h = x
        for link in self:
            h = link(h)
        # Add out of place. An augmented ``h += x`` would modify the
        # caller's array whenever ``h`` still aliases ``x`` (a block without
        # child links) or whenever a link hands back a raw array.
        return h + x
class Darknet53Extractor(chainer.ChainList):
    """A Darknet53 based feature extractor for YOLOv3.

    This is a feature extractor for
    :class:`~chainercv.links.model.yolo.YOLOv3`

    """

    insize = 416
    grids = (13, 26, 52)

    def __init__(self):
        super(Darknet53Extractor, self).__init__()

        # Darknet53: a stem convolution, then five stages. Every stage is a
        # stride-2 convolution followed by a run of residual blocks.
        self.append(Conv2DBNActiv(32, 3, pad=1, activ=_leaky_relu))
        for stage, n_residual in enumerate((1, 2, 8, 8, 4)):
            narrow = 32 << stage
            wide = 32 << (stage + 1)
            self.append(Conv2DBNActiv(
                wide, 3, stride=2, pad=1, activ=_leaky_relu))
            for _ in range(n_residual):
                bottleneck = Conv2DBNActiv(narrow, 1, activ=_leaky_relu)
                expand = Conv2DBNActiv(wide, 3, pad=1, activ=_leaky_relu)
                self.append(ResidualBlock(bottleneck, expand))

        # additional links
        for scale, width in enumerate((512, 256, 128)):
            if scale > 0:
                # Extra 1x1 convolution; its output is upsampled and
                # concatenated with a stored feature map in __call__.
                self.append(Conv2DBNActiv(width, 1, activ=_leaky_relu))
            # Alternating 1x1 / 3x3 convolutions, closed by a 1x1.
            for _ in range(2):
                self.append(Conv2DBNActiv(width, 1, activ=_leaky_relu))
                self.append(Conv2DBNActiv(
                    width * 2, 3, pad=1, activ=_leaky_relu))
            self.append(Conv2DBNActiv(width, 1, activ=_leaky_relu))

    def __call__(self, x):
        """Compute feature maps from a batch of images.

        This method extracts feature maps from 3 layers.

        Args:
            x (ndarray): An array holding a batch of images.
                The images should be resized to :math:`416\\times 416`.

        Returns:
            list of Variable:
            Each variable contains a feature map.

        """
        # Positions in this ChainList at which something besides the plain
        # forward pass happens.
        output_at = frozenset((33, 39, 45))
        store_at = frozenset((14, 23))
        merge_at = frozenset((34, 40))

        feature_maps = []
        stored = []
        feature = x
        for index, link in enumerate(self):
            feature = link(feature)
            if index in output_at:
                feature_maps.append(feature)
            elif index in store_at:
                stored.append(feature)
            elif index in merge_at:
                # Upsample and join with the most recently stored map.
                feature = F.concat((_upsample(feature), stored.pop()))
        return feature_maps
class YOLOv3(YOLOBase):
    """YOLOv3.

    This is a model of YOLOv3 [#]_.
    This model uses :class:`~chainercv.links.model.yolo.Darknet53Extractor` as
    its feature extractor.

    .. [#] Joseph Redmon, Ali Farhadi.
       YOLOv3: An Incremental Improvement. arXiv 2018.

    Args:
       n_fg_class (int): The number of classes excluding the background.
       pretrained_model (string): The weight file to be loaded.
           This can take :obj:`'voc0712'`, `filepath` or :obj:`None`.
           The default value is :obj:`None`.

            * :obj:`'voc0712'`: Load weights trained on trainval split of \
                PASCAL VOC 2007 and 2012. \
                The weight file is downloaded and cached automatically. \
                :obj:`n_fg_class` must be :obj:`20` or :obj:`None`. \
                These weights were converted from the darknet model. \
                The conversion code is \
                `chainercv/examples/yolo/darknet2npz.py`.
            * `filepath`: A path of npz file. In this case, :obj:`n_fg_class` \
                must be specified properly.
            * :obj:`None`: Do not load weights.

    """

    _models = {
        'voc0712': {
            'param': {'n_fg_class': 20},
            'url': 'https://chainercv-models.preferred.jp/'
            'yolo_v3_voc0712_converted_2018_05_01.npz',
            'cv2': True
        },
    }

    # Anchor sizes, unpacked as (h, w) in __init__. The k-th group is paired
    # with the k-th entry of the extractor's ``grids``.
    _anchors = (
        ((90, 116), (198, 156), (326, 373)),
        ((61, 30), (45, 62), (119, 59)),
        ((13, 10), (30, 16), (23, 33)))

    def __init__(self, n_fg_class=None, pretrained_model=None):
        super(YOLOv3, self).__init__()

        param, path = utils.prepare_pretrained_model(
            {'n_fg_class': n_fg_class}, pretrained_model, self._models)

        self.n_fg_class = param['n_fg_class']
        self.use_preset('visualize')

        with self.init_scope():
            self.extractor = Darknet53Extractor()
            self.subnet = chainer.ChainList()

        # One prediction head per feature map. Each head emits, for every
        # anchor, 4 localization values, 1 objectness value and
        # n_fg_class class values.
        for i, n in enumerate((512, 256, 128)):
            self.subnet.append(chainer.Sequential(
                Conv2DBNActiv(n * 2, 3, pad=1, activ=_leaky_relu),
                Convolution2D(
                    len(self._anchors[i]) * (4 + 1 + self.n_fg_class), 1)))

        # Default boxes are stored as (v, u, h, w): grid cell indices plus
        # anchor size. ``step`` holds, per default box, the size of one grid
        # cell in input pixels.
        default_bbox = []
        step = []
        for k, grid in enumerate(self.extractor.grids):
            for v, u in itertools.product(range(grid), repeat=2):
                for h, w in self._anchors[k]:
                    default_bbox.append((v, u, h, w))
                    step.append(self.insize / grid)
        self._default_bbox = np.array(default_bbox, dtype=np.float32)
        self._step = np.array(step, dtype=np.float32)

        if path:
            chainer.serializers.load_npz(path, self, strict=False)

    def to_cpu(self):
        """Move the model and its default box arrays to the CPU."""
        super(YOLOv3, self).to_cpu()
        # These are plain attributes, so they are moved explicitly here.
        self._default_bbox = cuda.to_cpu(self._default_bbox)
        self._step = cuda.to_cpu(self._step)

    def to_gpu(self, device=None):
        """Move the model and its default box arrays to a GPU.

        Args:
            device: Target device specifier. It is forwarded as-is to the
                parent class and to :func:`cuda.to_gpu`.

        """
        super(YOLOv3, self).to_gpu(device)
        # These are plain attributes, so they are moved explicitly here.
        self._default_bbox = cuda.to_gpu(self._default_bbox, device)
        self._step = cuda.to_gpu(self._step, device)

    def __call__(self, x):
        """Compute localization, objectness, and classification from a batch of images.

        This method computes three variables, :obj:`locs`, :obj:`objs`,
        and :obj:`confs`.
        :meth:`self._decode` converts these variables to bounding box
        coordinates and confidence scores.
        These variables are also used in training YOLOv3.

        Args:
            x (chainer.Variable): A variable holding a batch of images.

        Returns:
            tuple of chainer.Variable:
            This method returns three variables, :obj:`locs`,
            :obj:`objs`, and :obj:`confs`.

            * **locs**: A variable of float arrays of shape \
                :math:`(B, K, 4)`, \
                where :math:`B` is the number of samples in the batch and \
                :math:`K` is the number of default bounding boxes.
            * **objs**: A variable of float arrays of shape \
                :math:`(B, K)`.
            * **confs**: A variable of float arrays of shape \
                :math:`(B, K, n\\_fg\\_class)`.
        """

        ys = []
        for i, h in enumerate(self.extractor(x)):
            h = self.subnet[i](h)
            # Move channels last, then split them into one row per
            # (cell, anchor) pair.
            h = F.transpose(h, (0, 2, 3, 1))
            h = F.reshape(h, (h.shape[0], -1, 4 + 1 + self.n_fg_class))
            ys.append(h)
        y = F.concat(ys)
        locs = y[:, :, :4]
        objs = y[:, :, 4]
        confs = y[:, :, 5:]
        return locs, objs, confs

    def _decode(self, loc, obj, conf):
        """Convert raw predictions of one image into boxes, labels and scores.

        ``self.score_thresh`` and ``self.nms_thresh`` are read here but not
        assigned in this class; presumably they are set by ``use_preset``
        in the base class (TODO confirm).

        Args:
            loc (array): Localization values, indexed as ``(K, 4)``.
            obj (array): Objectness values, indexed as ``(K,)``.
            conf (array): Per-class values, indexed as ``(K, n_fg_class)``.

        Returns:
            tuple of arrays:
            ``(bbox, label, score)`` with dtypes float32, int32 and float32.
            Results are grouped by class in increasing label order. Each
            box holds two corner points, presumably
            ``(y_min, x_min, y_max, x_max)`` given the ``(v, u, h, w)``
            layout of the default boxes.

        """
        raw_bbox = self._default_bbox.copy()
        # Center: cell index plus a sigmoid offset, scaled to input pixels.
        raw_bbox[:, :2] += 1 / (1 + self.xp.exp(-loc[:, :2]))
        raw_bbox[:, :2] *= self._step[:, None]
        # Size: anchor size scaled by the exponential of the prediction.
        raw_bbox[:, 2:] *= self.xp.exp(loc[:, 2:])
        # Turn (center, size) into (min corner, max corner).
        raw_bbox[:, :2] -= raw_bbox[:, 2:] / 2
        raw_bbox[:, 2:] += raw_bbox[:, :2]

        # Sigmoid on objectness and class values; the score is their product.
        obj = 1 / (1 + self.xp.exp(-obj))
        conf = 1 / (1 + self.xp.exp(-conf))
        raw_score = obj[:, None] * conf

        bbox = []
        label = []
        score = []
        for class_id in range(self.n_fg_class):
            bbox_l = raw_bbox
            score_l = raw_score[:, class_id]

            # Drop low-scoring boxes before running NMS for this class.
            mask = score_l >= self.score_thresh
            bbox_l = bbox_l[mask]
            score_l = score_l[mask]

            indices = utils.non_maximum_suppression(
                bbox_l, self.nms_thresh, score_l)
            bbox_l = bbox_l[indices]
            score_l = score_l[indices]

            bbox.append(bbox_l)
            label.append(self.xp.array((class_id,) * len(bbox_l)))
            score.append(score_l)

        bbox = self.xp.vstack(bbox).astype(np.float32)
        label = self.xp.hstack(label).astype(np.int32)
        score = self.xp.hstack(score).astype(np.float32)

        return bbox, label, score