from __future__ import division

import itertools
import numpy as np

import chainer
from chainer.backends import cuda
import chainer.functions as F
from chainer.links import Convolution2D

from chainercv.links import Conv2DBNActiv
from chainercv import utils

from chainercv.links.model.yolo.yolo_base import YOLOBase

def _leaky_relu(x):
    """Apply leaky ReLU with a negative-side slope of 0.1.

    Used as the ``activ`` callable of the convolution blocks in this file.
    """
    activated = F.leaky_relu(x, slope=0.1)
    return activated

def _maxpool(x):
    """Apply 2D max pooling with a window size of 2.

    All other pooling options are left at Chainer's defaults.
    """
    window = 2
    return F.max_pooling_2d(x, window)

def _reorg(x):
    """Rearrange a feature map into a quarter of the area and 4x channels.

    The input of shape ``(n, c, h, w)`` is reshaped, permuted and reshaped
    again so that the result has shape ``(n, c * 4, h // 2, w // 2)``.
    The exact reshape/transpose order is significant and must not be
    altered.

    Assumes ``c`` is divisible by 4 and ``h``, ``w`` are even; otherwise
    the element counts of the reshapes do not match.
    """
    batch, channels, height, width = x.shape

    split_shape = (batch, channels // 4, height, 2, width, 2)
    merged_shape = (batch, channels * 4, height // 2, width // 2)

    y = F.reshape(x, split_shape)
    y = F.transpose(y, (0, 3, 5, 1, 2, 4))
    return F.reshape(y, merged_shape)

class Darknet19Extractor(chainer.ChainList):
    """A Darknet19 based feature extractor for YOLOv2.

    This is a feature extractor for
    :class:`~chainercv.links.model.yolo.YOLOv2`.

    The chain holds 22 links: 18 Darknet19 convolution blocks (grouped as
    1, 1, 3, 3, 5 and 5 blocks) followed by 4 additional blocks.

    Attributes:
        insize (int): The input image size this extractor is designed for.
        grid (int): The size of the output grid used by
            :class:`~chainercv.links.model.yolo.YOLOv2` to lay out its
            default bounding boxes.

    """

    insize = 416
    grid = 13

    def __init__(self):
        super(Darknet19Extractor, self).__init__()

        # Darknet19
        # Group k alternates 3x3 blocks with (32 << k) channels and
        # 1x1 blocks with half of that, (32 << (k - 1)), channels.
        for k, n_conv in enumerate((1, 1, 3, 3, 5, 5)):
            for i in range(n_conv):
                if i % 2 == 0:
                    self.append(
                        Conv2DBNActiv(32 << k, 3, pad=1, activ=_leaky_relu))
                else:
                    self.append(
                        Conv2DBNActiv(32 << (k - 1), 1, activ=_leaky_relu))

        # additional links
        self.append(Conv2DBNActiv(1024, 3, pad=1, activ=_leaky_relu))
        self.append(Conv2DBNActiv(1024, 3, pad=1, activ=_leaky_relu))
        self.append(Conv2DBNActiv(64, 1, activ=_leaky_relu))
        self.append(Conv2DBNActiv(1024, 3, pad=1, activ=_leaky_relu))

    def __call__(self, x):
        """Compute a feature map from a batch of images.

        Args:
            x (ndarray): An array holding a batch of images.
                The images should be resized to :math:`416\\times 416`.

        Returns:
            Variable:
            The feature map produced by the last link of the chain.

        """

        h = x
        for i, link in enumerate(self):
            h = link(h)
            if i == 12:
                # Keep the output of the fifth group (before it is pooled)
                # for the passthrough branch.
                skip = h
            elif i == 19:
                # Continue with the saved passthrough feature and park the
                # main-branch feature until the concatenation below.
                h, skip = skip, h
            elif i == 20:
                # Link 20 has reduced the passthrough feature to 64
                # channels; reorganize it to the main branch's resolution
                # and join both branches along the channel axis.
                h = F.concat((_reorg(h), skip))

            # Indices 0, 1, 4, 7 and 12 are the last links of the first
            # five Darknet19 groups; each is followed by max pooling.
            if i in {0, 1, 4, 7, 12}:
                h = _maxpool(h)
        return h
class YOLOv2(YOLOBase):
    """YOLOv2.

    This is a model of YOLOv2 [#]_.
    This model uses :class:`~chainercv.links.model.yolo.Darknet19Extractor`
    as its feature extractor.

    .. [#] Joseph Redmon, Ali Farhadi.
       YOLO9000: Better, Faster, Stronger. CVPR 2017.

    Args:
        n_fg_class (int): The number of classes excluding the background.
        pretrained_model (string): The weight file to be loaded.
            This can take :obj:`'voc0712'`, `filepath` or :obj:`None`.
            The default value is :obj:`None`.

            * :obj:`'voc0712'`: Load weights trained on trainval split of
              PASCAL VOC 2007 and 2012.
              The weight file is downloaded and cached automatically.
              :obj:`n_fg_class` must be :obj:`20` or :obj:`None`.
              These weights were converted from the darknet model
              provided by the original implementation.
              The conversion code is `chainercv/examples/yolo/`.
            * `filepath`: A path of npz file. In this case,
              :obj:`n_fg_class` must be specified properly.
            * :obj:`None`: Do not load weights.

    """

    _models = {
        'voc0712': {
            'param': {'n_fg_class': 20},
            # NOTE(review): the first literal below is empty, so this URL
            # has no scheme/host prefix. It looks like the prefix was lost;
            # confirm the intended download location before relying on it.
            'url': ''
            'yolo_v2_voc0712_converted_2018_05_03.npz',
            'cv2': True
        },
    }

    # Anchor sizes as (height, width) pairs, matching how they are unpacked
    # in __init__. They are scaled by insize / grid in _decode, so they are
    # expressed in grid-cell units.
    _anchors = (
        (1.73145, 1.3221),
        (4.00944, 3.19275),
        (8.09892, 5.05587),
        (4.84053, 9.47112),
        (10.0071, 11.2364))

    def __init__(self, n_fg_class=None, pretrained_model=None):
        super(YOLOv2, self).__init__()

        param, path = utils.prepare_pretrained_model(
            {'n_fg_class': n_fg_class}, pretrained_model, self._models)

        self.n_fg_class = param['n_fg_class']
        self.use_preset('visualize')

        with self.init_scope():
            self.extractor = Darknet19Extractor()
            # One (4 box offsets + 1 objectness + n_fg_class scores) vector
            # per anchor and grid cell.
            self.subnet = Convolution2D(
                len(self._anchors) * (4 + 1 + self.n_fg_class), 1)

        # One default box (v, u, h, w) per grid cell and anchor, ordered
        # cell-major so that it lines up with the reshape in __call__.
        default_bbox = []
        for v, u in itertools.product(range(self.extractor.grid), repeat=2):
            for h, w in self._anchors:
                default_bbox.append((v, u, h, w))
        self._default_bbox = np.array(default_bbox, dtype=np.float32)

        if path:
            chainer.serializers.load_npz(path, self, strict=False)

    def to_cpu(self):
        """Move the model and its default bounding boxes to CPU.

        Returns:
            This link itself, matching the contract of
            :meth:`chainer.Link.to_cpu` so that calls can be chained.

        """
        super(YOLOv2, self).to_cpu()
        # _default_bbox is a plain array, not a registered parameter, so it
        # has to be transferred explicitly.
        self._default_bbox = cuda.to_cpu(self._default_bbox)
        return self

    def to_gpu(self, device=None):
        """Move the model and its default bounding boxes to a GPU.

        Args:
            device: The target device specifier. It is forwarded as is to
                :meth:`chainer.Link.to_gpu` and :func:`cuda.to_gpu`.

        Returns:
            This link itself, matching the contract of
            :meth:`chainer.Link.to_gpu` so that calls can be chained.

        """
        super(YOLOv2, self).to_gpu(device)
        # _default_bbox is a plain array, not a registered parameter, so it
        # has to be transferred explicitly.
        self._default_bbox = cuda.to_gpu(self._default_bbox, device)
        return self

    def __call__(self, x):
        """Compute localization, objectness, and classification from images.

        This method computes three variables, :obj:`locs`, :obj:`objs`,
        and :obj:`confs`.
        :meth:`self._decode` converts these variables to bounding box
        coordinates and confidence scores.
        These variables are also used in training YOLOv2.

        Args:
            x (chainer.Variable): A variable holding a batch of images.

        Returns:
            tuple of chainer.Variable:
            This method returns three variables, :obj:`locs`,
            :obj:`objs`, and :obj:`confs`.

            * **locs**: A variable of float arrays of shape
              :math:`(B, K, 4)`,
              where :math:`B` is the number of samples in the batch and
              :math:`K` is the number of default bounding boxes.
            * **objs**: A variable of float arrays of shape
              :math:`(B, K)`.
            * **confs**: A variable of float arrays of shape
              :math:`(B, K, n\\_fg\\_class)`.

        """

        h = self.subnet(self.extractor(x))
        # Move channels last so that each (cell, anchor) pair becomes one
        # row of 4 + 1 + n_fg_class values.
        h = F.transpose(h, (0, 2, 3, 1))
        h = F.reshape(h, (h.shape[0], -1, 4 + 1 + self.n_fg_class))

        locs = h[:, :, :4]
        objs = h[:, :, 4]
        confs = h[:, :, 5:]

        return locs, objs, confs

    def _decode(self, loc, obj, conf):
        """Convert raw outputs for one image into boxes, labels and scores.

        Args:
            loc (array): Box offsets of shape :math:`(K, 4)`.
            obj (array): Objectness logits of shape :math:`(K,)`.
            conf (array): Class logits of shape :math:`(K, n\\_fg\\_class)`.

        Returns:
            tuple of arrays:
            :obj:`(bbox, label, score)` with dtypes :obj:`float32`,
            :obj:`int32` and :obj:`float32`. Boxes are given as two corner
            points in input image pixels; presumably
            :math:`(y_{min}, x_{min}, y_{max}, x_{max})` -- confirm against
            the base class. Results are grouped by class in ascending
            label order.

        """
        raw_bbox = self._default_bbox.copy()
        # Center: cell index plus a sigmoid offset inside the cell.
        raw_bbox[:, :2] += 1 / (1 + self.xp.exp(-loc[:, :2]))
        # Size: anchor size scaled exponentially.
        raw_bbox[:, 2:] *= self.xp.exp(loc[:, 2:])
        # (center, size) -> (min corner, max corner).
        raw_bbox[:, :2] -= raw_bbox[:, 2:] / 2
        raw_bbox[:, 2:] += raw_bbox[:, :2]
        # Grid units -> input image pixels.
        raw_bbox *= self.insize / self.extractor.grid

        obj = 1 / (1 + self.xp.exp(-obj))
        # Softmax over classes. Subtracting the row maximum leaves the
        # result mathematically unchanged but keeps exp() from overflowing
        # to inf (and the division from producing NaN) for large logits.
        conf = self.xp.exp(conf - conf.max(axis=1, keepdims=True))
        conf /= conf.sum(axis=1, keepdims=True)
        raw_score = obj[:, None] * conf

        bbox = []
        label = []
        score = []
        for cls in range(self.n_fg_class):
            bbox_l = raw_bbox
            score_l = raw_score[:, cls]

            mask = score_l >= self.score_thresh
            bbox_l = bbox_l[mask]
            score_l = score_l[mask]

            indices = utils.non_maximum_suppression(
                bbox_l, self.nms_thresh, score_l)
            bbox_l = bbox_l[indices]
            score_l = score_l[indices]

            bbox.append(bbox_l)
            label.append(
                self.xp.full((len(bbox_l),), cls, dtype=np.int32))
            score.append(score_l)

        bbox = self.xp.vstack(bbox).astype(np.float32)
        label = self.xp.hstack(label).astype(np.int32)
        score = self.xp.hstack(score).astype(np.float32)

        return bbox, label, score