from __future__ import division
import itertools
import numpy as np
import chainer
from chainer.backends import cuda
import chainer.functions as F
from chainer.links import Convolution2D
from chainercv.links import Conv2DBNActiv
from chainercv import utils
from chainercv.links.model.yolo.yolo_base import YOLOBase
def _leaky_relu(x):
    """Apply the leaky ReLU activation with the fixed slope 0.1 used by YOLOv2."""
    activated = F.leaky_relu(x, slope=0.1)
    return activated
def _maxpool(x):
    """Apply 2x2 max pooling (stride defaults to the window size)."""
    pooled = F.max_pooling_2d(x, 2)
    return pooled
def _reorg(x):
    """Space-to-depth shuffle used by YOLOv2's passthrough connection.

    Halves each spatial dimension and quadruples the channel count,
    i.e. maps ``(n, c, h, w)`` to ``(n, 4 * c, h // 2, w // 2)``.  The
    exact reshape/transpose ordering is kept as in the converted
    darknet weights, so it must not be rearranged.
    """
    n, c, h, w = x.shape
    split = F.reshape(x, (n, c // 4, h, 2, w, 2))
    shuffled = F.transpose(split, (0, 3, 5, 1, 2, 4))
    return F.reshape(shuffled, (n, 4 * c, h // 2, w // 2))
class YOLOv2(YOLOBase):
    """YOLOv2.

    This is a model of YOLOv2 [#]_.
    This model uses :class:`~chainercv.links.model.yolo.Darknet19Extractor` as
    its feature extractor.

    .. [#] Joseph Redmon, Ali Farhadi.
       YOLO9000: Better, Faster, Stronger. CVPR 2017.

    Args:
        n_fg_class (int): The number of classes excluding the background.
        pretrained_model (string): The weight file to be loaded.
            This can take :obj:`'voc0712'`, `filepath` or :obj:`None`.
            The default value is :obj:`None`.

            * :obj:`'voc0712'`: Load weights trained on trainval split of \
                PASCAL VOC 2007 and 2012. \
                The weight file is downloaded and cached automatically. \
                :obj:`n_fg_class` must be :obj:`20` or :obj:`None`. \
                These weights were converted from the darknet model \
                provided by `the original implementation \
                <https://pjreddie.com/darknet/yolov2/>`_. \
                The conversion code is \
                `chainercv/examples/yolo/darknet2npz.py`.
            * `filepath`: A path of npz file. In this case, :obj:`n_fg_class` \
                must be specified properly.
            * :obj:`None`: Do not load weights.

    """

    _models = {
        'voc0712': {
            'param': {'n_fg_class': 20},
            'url': 'https://chainercv-models.preferred.jp/'
            'yolo_v2_voc0712_converted_2018_05_03.npz',
            'cv2': True
        },
    }

    # Prior box sizes (height, width) measured in output-grid cells,
    # taken from the converted darknet VOC configuration.
    _anchors = (
        (1.73145, 1.3221),
        (4.00944, 3.19275),
        (8.09892, 5.05587),
        (4.84053, 9.47112),
        (10.0071, 11.2364))

    def __init__(self, n_fg_class=None, pretrained_model=None):
        super(YOLOv2, self).__init__()

        param, path = utils.prepare_pretrained_model(
            {'n_fg_class': n_fg_class}, pretrained_model, self._models)

        self.n_fg_class = param['n_fg_class']
        self.use_preset('visualize')

        with self.init_scope():
            self.extractor = Darknet19Extractor()
            # 1x1 convolution predicting, for each anchor, 4 localization
            # offsets, 1 objectness score, and n_fg_class class scores.
            self.subnet = Convolution2D(
                len(self._anchors) * (4 + 1 + self.n_fg_class), 1)

        # Enumerate (cell_y, cell_x, anchor_h, anchor_w) for every grid cell
        # and every anchor.  The ordering (cells outer, anchors inner) must
        # match the channel layout produced by the subnet.
        default_bbox = []
        for v, u in itertools.product(range(self.extractor.grid), repeat=2):
            for h, w in self._anchors:
                default_bbox.append((v, u, h, w))
        self._default_bbox = np.array(default_bbox, dtype=np.float32)

        if path:
            # strict=False because converted weight files may lack some
            # entries present in this model definition.
            chainer.serializers.load_npz(path, self, strict=False)

    def to_cpu(self):
        super(YOLOv2, self).to_cpu()
        # Keep the default boxes on the same device as the parameters.
        self._default_bbox = cuda.to_cpu(self._default_bbox)

    def to_gpu(self, device=None):
        super(YOLOv2, self).to_gpu(device)
        # Keep the default boxes on the same device as the parameters.
        self._default_bbox = cuda.to_gpu(self._default_bbox, device)

    def __call__(self, x):
        """Compute localization, objectness, and classification from a batch of images.

        This method computes three variables, :obj:`locs`, :obj:`objs`,
        and :obj:`confs`.
        :meth:`self._decode` converts these variables to bounding box
        coordinates and confidence scores.
        These variables are also used in training YOLOv2.

        Args:
            x (chainer.Variable): A variable holding a batch of images.

        Returns:
            tuple of chainer.Variable:
            This method returns three variables, :obj:`locs`,
            :obj:`objs`, and :obj:`confs`.

            * **locs**: A variable of float arrays of shape \
                :math:`(B, K, 4)`, \
                where :math:`B` is the number of samples in the batch and \
                :math:`K` is the number of default bounding boxes.
            * **objs**: A variable of float arrays of shape \
                :math:`(B, K)`.
            * **confs**: A variable of float arrays of shape \
                :math:`(B, K, n\_fg\_class)`.
        """
        h = self.subnet(self.extractor(x))
        # (B, C, H, W) -> (B, H, W, C) -> (B, K, 4 + 1 + n_fg_class) so
        # each row holds the predictions for one default box.
        h = F.transpose(h, (0, 2, 3, 1))
        h = F.reshape(h, (h.shape[0], -1, 4 + 1 + self.n_fg_class))
        locs = h[:, :, :4]
        objs = h[:, :, 4]
        confs = h[:, :, 5:]
        return locs, objs, confs

    def _decode(self, loc, obj, conf):
        # Decode raw network outputs for ONE sample into final detections.
        # Center offsets go through a sigmoid relative to the grid cell;
        # sizes scale the anchor priors exponentially.
        raw_bbox = self._default_bbox.copy()
        raw_bbox[:, :2] += 1 / (1 + self.xp.exp(-loc[:, :2]))
        raw_bbox[:, 2:] *= self.xp.exp(loc[:, 2:])
        # (center, size) -> (y_min, x_min, y_max, x_max), then scale from
        # grid-cell units to input-image pixels.
        raw_bbox[:, :2] -= raw_bbox[:, 2:] / 2
        raw_bbox[:, 2:] += raw_bbox[:, :2]
        raw_bbox *= self.insize / self.extractor.grid

        # Sigmoid objectness times softmax class probabilities gives the
        # per-class detection score.
        obj = 1 / (1 + self.xp.exp(-obj))
        conf = self.xp.exp(conf)
        conf /= conf.sum(axis=1, keepdims=True)
        raw_score = obj[:, None] * conf

        bbox = []
        label = []
        score = []
        for lb in range(self.n_fg_class):
            bbox_l = raw_bbox
            score_l = raw_score[:, lb]

            # Drop low-scoring boxes before the (more expensive) NMS.
            mask = score_l >= self.score_thresh
            bbox_l = bbox_l[mask]
            score_l = score_l[mask]

            indices = utils.non_maximum_suppression(
                bbox_l, self.nms_thresh, score_l)
            bbox_l = bbox_l[indices]
            score_l = score_l[indices]

            bbox.append(bbox_l)
            label.append(self.xp.array((lb,) * len(bbox_l)))
            score.append(score_l)

        bbox = self.xp.vstack(bbox).astype(np.float32)
        label = self.xp.hstack(label).astype(np.int32)
        score = self.xp.hstack(score).astype(np.float32)
        return bbox, label, score