1"""Transforms for RCNN series."""
2# pylint: disable=not-callable
3from __future__ import absolute_import
4
5import copy
6from random import randint
7
8import mxnet as mx
9
10from .. import bbox as tbbox
11from .. import image as timage
12from .. import mask as tmask
13
14__all__ = ['transform_test', 'load_test',
15           'FasterRCNNDefaultTrainTransform', 'FasterRCNNDefaultValTransform',
16           'MaskRCNNDefaultTrainTransform', 'MaskRCNNDefaultValTransform']


def transform_test(imgs, short=600, max_size=1000, mean=(0.485, 0.456, 0.406),
                   std=(0.229, 0.224, 0.225)):
    """A utility function that transforms images into tensors for network input
    by applying normalization. It supports a single NDArray or an iterable of
    NDArrays.

    Parameters
    ----------
    imgs : NDArray or iterable of NDArray
        Image(s) to be transformed.
    short : int, optional, default is 600
        Resize image short side to this `short` and keep aspect ratio.
    max_size : int, optional, default is 1000
        Maximum length of the longer side to fit the image.
        This limits the input image shape to avoid processing overly large images.
    mean : iterable of float
        Mean pixel values.
    std : iterable of float
        Standard deviations of pixel values.

    Returns
    -------
    (mxnet.NDArray, numpy.ndarray) or list of such tuples
        A (1, 3, H, W) mxnet NDArray as input to the network, and a numpy ndarray
        of the original un-normalized color image for display.
        If multiple images are supplied, two lists are returned. You can use
        ``zip()`` to combine them.
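
    Examples
    --------
    A minimal usage sketch; ``'street.jpg'`` is a placeholder file name and this
    module is assumed to be importable as ``gluoncv.data.transforms.presets.rcnn``:

    >>> import mxnet as mx
    >>> from gluoncv.data.transforms.presets.rcnn import transform_test
    >>> img = mx.image.imread('street.jpg')  # HWC uint8 color image
    >>> x, orig_img = transform_test(img, short=600, max_size=1000)
    >>> # x is a normalized (1, 3, H, W) tensor; orig_img is kept for display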

    """
    if isinstance(imgs, mx.nd.NDArray):
        imgs = [imgs]
    for im in imgs:
        assert isinstance(im, mx.nd.NDArray), "Expect NDArray, got {}".format(type(im))

    tensors = []
    origs = []
    for img in imgs:
        img = timage.resize_short_within(img, short, max_size)
        orig_img = img.asnumpy().astype('uint8')
        img = mx.nd.image.to_tensor(img)
        img = mx.nd.image.normalize(img, mean=mean, std=std)
        tensors.append(img.expand_dims(0))
        origs.append(orig_img)
    if len(tensors) == 1:
        return tensors[0], origs[0]
    return tensors, origs


def load_test(filenames, short=600, max_size=1000, mean=(0.485, 0.456, 0.406),
              std=(0.229, 0.224, 0.225)):
    """A utility function that loads images and transforms them into tensors
    by applying normalization. It supports a single filename or a list of
    filenames.

    Parameters
    ----------
    filenames : str or list of str
        Image filename(s) to be loaded.
    short : int, optional, default is 600
        Resize image short side to this `short` and keep aspect ratio.
    max_size : int, optional, default is 1000
        Maximum length of the longer side to fit the image.
        This limits the input image shape to avoid processing overly large images.
    mean : iterable of float
        Mean pixel values.
    std : iterable of float
        Standard deviations of pixel values.

    Returns
    -------
    (mxnet.NDArray, numpy.ndarray) or list of such tuples
        A (1, 3, H, W) mxnet NDArray as input to the network, and a numpy ndarray
        of the original un-normalized color image for display.
        If multiple image names are supplied, two lists are returned. You can use
        ``zip()`` to combine them.
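
    Examples
    --------
    A minimal usage sketch, assuming ``'street.jpg'`` exists and a pretrained
    detector is available from ``gluoncv.model_zoo``:

    >>> from gluoncv import model_zoo
    >>> from gluoncv.data.transforms.presets.rcnn import load_test
    >>> net = model_zoo.get_model('faster_rcnn_resnet50_v1b_voc', pretrained=True)
    >>> x, orig_img = load_test('street.jpg')
    >>> ids, scores, bboxes = net(x)  # raw detection outputs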

    """
    if isinstance(filenames, str):
        filenames = [filenames]
    imgs = [mx.image.imread(f) for f in filenames]
    return transform_test(imgs, short, max_size, mean, std)


class FasterRCNNDefaultTrainTransform(object):
    """Default Faster-RCNN training transform.

    Parameters
    ----------
    short : int/tuple, default is 600
        Resize image shorter side to ``short``.
        If a tuple is given, the shorter side is resized randomly within the given range.
    max_size : int, default is 1000
        Make sure image longer side is smaller than ``max_size``.
    net : mxnet.gluon.HybridBlock, optional
        The faster-rcnn network.

        .. hint::

            If net is ``None``, the transformation will not generate training targets.
            Otherwise it will generate training targets to accelerate the training phase,
            since some of the workload is pushed to CPU workers instead of GPUs.

    mean : array-like of size 3
        Mean pixel values to be subtracted from image tensor. Default is [0.485, 0.456, 0.406].
    std : array-like of size 3
        Standard deviation values by which the image tensor is divided.
        Default is [0.229, 0.224, 0.225].
    box_norm : array-like of size 4, default is (1., 1., 1., 1.)
        Std values by which encoded box values are divided.
    num_sample : int, default is 256
        Number of samples for RPN targets.
    pos_iou_thresh : float, default is 0.7
        Anchors with IOU larger than ``pos_iou_thresh`` are regarded as positive samples.
    neg_iou_thresh : float, default is 0.3
        Anchors with IOU smaller than ``neg_iou_thresh`` are regarded as negative samples.
        Anchors with IOU in between ``neg_iou_thresh`` and ``pos_iou_thresh`` are
        ignored.
    pos_ratio : float, default is 0.5
        ``pos_ratio`` defines how many positive samples (``pos_ratio * num_sample``) are
        to be sampled.
    flip_p : float, default is 0.5
        Probability of a random horizontal flip. Set it to 0 to disable flipping
        or to 1 to always flip.
    ashape : int, default is 128
        Defines the shape of pre-generated anchors for target generation.
    multi_stage : boolean, default is False
        Whether the network outputs multi-stage features.
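
    Examples
    --------
    A minimal sketch, assuming the network comes from ``gluoncv.model_zoo``
    and the dataset yields ``(image, label)`` pairs:

    >>> from gluoncv import model_zoo
    >>> net = model_zoo.get_model('faster_rcnn_resnet50_v1b_voc', pretrained_base=False)
    >>> net.initialize()
    >>> train_transform = FasterRCNNDefaultTrainTransform(short=600, max_size=1000, net=net)
    >>> # with net given, RPN targets are generated inside the transform:
    >>> # img, bbox, cls_targets, box_targets, box_masks = train_transform(src, label)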
    """

    def __init__(self, short=600, max_size=1000, net=None, mean=(0.485, 0.456, 0.406),
                 std=(0.229, 0.224, 0.225), box_norm=(1., 1., 1., 1.),
                 num_sample=256, pos_iou_thresh=0.7, neg_iou_thresh=0.3,
                 pos_ratio=0.5, flip_p=0.5, ashape=128, multi_stage=False, **kwargs):
        self._short = short
        self._max_size = max_size
        self._mean = mean
        self._std = std
        self._box_norm = box_norm
        self._anchors = None
        self._multi_stage = multi_stage
        self._random_resize = isinstance(self._short, (tuple, list))
        self._num_sample = num_sample
        self._pos_iou_thresh = pos_iou_thresh
        self._neg_iou_thresh = neg_iou_thresh
        self._pos_ratio = pos_ratio
        self._flip_p = flip_p
        self._internal_target_generator = None
        self._net_none = False
        self._kwargs = kwargs
        if net is None:
            self._net_none = True
            return

        # use fake data to generate fixed anchors for target generation
        anchors = []  # [P2, P3, P4, P5]
        # in case the network has been reset_ctx to gpu, move the anchor
        # generator to cpu for this one-off forward pass, then restore it
        anchor_generator = net.rpn.anchor_generator
        old_ctx = list(anchor_generator.collect_params().values())[0].list_ctx()
        anchor_generator.collect_params().reset_ctx(mx.cpu())
        if self._multi_stage:
            for ag in anchor_generator:
                anchor = ag(mx.nd.zeros((1, 3, ashape, ashape))).reshape((1, 1, ashape, ashape, -1))
                ashape = max(ashape // 2, 16)
                anchors.append(anchor)
        else:
            anchors = anchor_generator(
                mx.nd.zeros((1, 3, ashape, ashape))).reshape((1, 1, ashape, ashape, -1))
        self._anchors = anchors
        anchor_generator.collect_params().reset_ctx(old_ctx)
        # record feature extractor for infer_shape
        if not hasattr(net, 'features'):
            raise ValueError("Cannot find features in network, is it a Faster-RCNN network?")
        self._feat_sym = net.features(mx.sym.var(name='data'))

    @property
    def _target_generator(self):
        if self._internal_target_generator is None:
            if self._net_none:
                return None
            from ....model_zoo.rcnn.rpn.rpn_target import RPNTargetGenerator
            self._internal_target_generator = RPNTargetGenerator(
                num_sample=self._num_sample, pos_iou_thresh=self._pos_iou_thresh,
                neg_iou_thresh=self._neg_iou_thresh, pos_ratio=self._pos_ratio,
                stds=self._box_norm, **self._kwargs)
        return self._internal_target_generator

    def __call__(self, src, label):
        """Apply transform to training image/label."""
        # resize shorter side but keep in max_size
        h, w, _ = src.shape
        if self._random_resize:
            short = randint(self._short[0], self._short[1])
        else:
            short = self._short
        img = timage.resize_short_within(src, short, self._max_size, interp=1)
        bbox = tbbox.resize(label, (w, h), (img.shape[1], img.shape[0]))

        # random horizontal flip
        h, w, _ = img.shape
        img, flips = timage.random_flip(img, px=self._flip_p)
        bbox = tbbox.flip(bbox, (w, h), flip_x=flips[0])

        # to tensor
        img = mx.nd.image.to_tensor(img)
        img = mx.nd.image.normalize(img, mean=self._mean, std=self._std)

        if self._anchors is None:
            return img, bbox.astype(img.dtype)

        # generate RPN target so cpu workers can help reduce the workload
        gt_bboxes = mx.nd.array(bbox[:, :4])
        if self._multi_stage:
            anchor_targets = []
            oshapes = []
            cls_targets, box_targets, box_masks = [], [], []
            for anchor, feat_sym in zip(self._anchors, self._feat_sym):
                # infer the feature map shape for this image size and slice
                # the pre-generated anchors down to it
                oshape = feat_sym.infer_shape(data=(1, 3, img.shape[1], img.shape[2]))[1][0]
                anchor = anchor[:, :, :oshape[2], :oshape[3], :]
                oshapes.append(anchor.shape)
                anchor_targets.append(anchor.reshape((-1, 4)))
            anchor_targets = mx.nd.concat(*anchor_targets, dim=0)
            cls_target, box_target, box_mask = self._target_generator(
                gt_bboxes, anchor_targets, img.shape[2], img.shape[1])
            # split the flat targets back into per-level feature maps
            start_ind = 0
            for oshape in oshapes:
                size = oshape[2] * oshape[3] * (oshape[4] // 4)
                lvl_cls_target = cls_target[start_ind:start_ind + size] \
                    .reshape(oshape[2], oshape[3], -1)
                lvl_box_target = box_target[start_ind:start_ind + size] \
                    .reshape(oshape[2], oshape[3], -1)
                lvl_box_mask = box_mask[start_ind:start_ind + size] \
                    .reshape(oshape[2], oshape[3], -1)
                start_ind += size
                cls_targets.append(lvl_cls_target)
                box_targets.append(lvl_box_target)
                box_masks.append(lvl_box_mask)
        else:
            oshape = self._feat_sym.infer_shape(data=(1, 3, img.shape[1], img.shape[2]))[1][0]
            anchor = self._anchors[:, :, :oshape[2], :oshape[3], :]
            oshape = anchor.shape
            cls_target, box_target, box_mask = self._target_generator(
                gt_bboxes, anchor.reshape((-1, 4)), img.shape[2], img.shape[1])
            size = oshape[2] * oshape[3] * (oshape[4] // 4)
            cls_targets = [cls_target[0:size].reshape(oshape[2], oshape[3], -1)]
            box_targets = [box_target[0:size].reshape(oshape[2], oshape[3], -1)]
            box_masks = [box_mask[0:size].reshape(oshape[2], oshape[3], -1)]
        return img, bbox.astype(img.dtype), cls_targets, box_targets, box_masks


class FasterRCNNDefaultValTransform(object):
    """Default Faster-RCNN validation transform.

    Parameters
    ----------
    short : int, default is 600
        Resize image shorter side to ``short``.
    max_size : int, default is 1000
        Make sure image longer side is smaller than ``max_size``.
    mean : array-like of size 3
        Mean pixel values to be subtracted from image tensor. Default is [0.485, 0.456, 0.406].
    std : array-like of size 3
        Standard deviation values by which the image tensor is divided.
        Default is [0.229, 0.224, 0.225].
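
    Examples
    --------
    A minimal sketch; the transform is applied per sample, e.g. through a
    dataset's ``transform`` method:

    >>> val_transform = FasterRCNNDefaultValTransform(short=600, max_size=1000)
    >>> # img, bbox, im_scale = val_transform(src, label)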

    """

    def __init__(self, short=600, max_size=1000,
                 mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)):
        self._mean = mean
        self._std = std
        self._short = short
        self._max_size = max_size

    def __call__(self, src, label):
        """Apply transform to validation image/label."""
        # resize shorter side but keep in max_size
        h, w, _ = src.shape
        img = timage.resize_short_within(src, self._short, self._max_size, interp=1)
        # rescale ground-truth to the resized image and record the scaling ratio
        bbox = tbbox.resize(label, (w, h), (img.shape[1], img.shape[0]))
        im_scale = h / float(img.shape[0])

        img = mx.nd.image.to_tensor(img)
        img = mx.nd.image.normalize(img, mean=self._mean, std=self._std)
        return img, bbox.astype('float32'), mx.nd.array([im_scale])


class MaskRCNNDefaultTrainTransform(object):
    """Default Mask RCNN training transform.

    Parameters
    ----------
    short : int/tuple, default is 600
        Resize image shorter side to ``short``.
        If a tuple is given, the shorter side is resized randomly within the given range.
    max_size : int, default is 1000
        Make sure image longer side is smaller than ``max_size``.
    net : mxnet.gluon.HybridBlock, optional
        The Mask R-CNN network.

        .. hint::

            If net is ``None``, the transformation will not generate training targets.
            Otherwise it will generate training targets to accelerate the training phase,
            since some of the workload is pushed to CPU workers instead of GPUs.

    mean : array-like of size 3
        Mean pixel values to be subtracted from image tensor. Default is [0.485, 0.456, 0.406].
    std : array-like of size 3
        Standard deviation values by which the image tensor is divided.
        Default is [0.229, 0.224, 0.225].
    box_norm : array-like of size 4, default is (1., 1., 1., 1.)
        Std values by which encoded box values are divided.
    num_sample : int, default is 256
        Number of samples for RPN targets.
    pos_iou_thresh : float, default is 0.7
        Anchors with IOU larger than ``pos_iou_thresh`` are regarded as positive samples.
    neg_iou_thresh : float, default is 0.3
        Anchors with IOU smaller than ``neg_iou_thresh`` are regarded as negative samples.
        Anchors with IOU in between ``neg_iou_thresh`` and ``pos_iou_thresh`` are
        ignored.
    pos_ratio : float, default is 0.5
        ``pos_ratio`` defines how many positive samples (``pos_ratio * num_sample``) are
        to be sampled.
    ashape : int, default is 128
        Defines the shape of pre-generated anchors for target generation.
    multi_stage : boolean, default is False
        Whether the network outputs multi-stage features.
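
    Examples
    --------
    A minimal sketch, assuming the network comes from ``gluoncv.model_zoo``
    and the dataset yields ``(image, label, segm)`` triples (e.g. COCO
    instance segmentation):

    >>> from gluoncv import model_zoo
    >>> net = model_zoo.get_model('mask_rcnn_resnet50_v1b_coco', pretrained_base=False)
    >>> net.initialize()
    >>> train_transform = MaskRCNNDefaultTrainTransform(short=800, max_size=1333, net=net)
    >>> # img, bbox, cls_targets, box_targets, box_masks, masks = train_transform(src, label, segm)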
    """

    def __init__(self, short=600, max_size=1000, net=None, mean=(0.485, 0.456, 0.406),
                 std=(0.229, 0.224, 0.225), box_norm=(1., 1., 1., 1.),
                 num_sample=256, pos_iou_thresh=0.7, neg_iou_thresh=0.3,
                 pos_ratio=0.5, ashape=128, multi_stage=False, **kwargs):
        self._short = short
        self._max_size = max_size
        self._mean = mean
        self._std = std
        self._box_norm = box_norm
        self._anchors = None
        self._multi_stage = multi_stage
        self._random_resize = isinstance(self._short, (tuple, list))
        self._num_sample = num_sample
        self._pos_iou_thresh = pos_iou_thresh
        self._neg_iou_thresh = neg_iou_thresh
        self._pos_ratio = pos_ratio
        self._internal_target_generator = None
        self._net_none = False
        self._kwargs = kwargs
        if net is None:
            self._net_none = True
            return

        # use fake data to generate fixed anchors for target generation
        anchors = []  # [P2, P3, P4, P5]
        # deepcopy in case the network has been reset_ctx to gpu
        anchor_generator = copy.deepcopy(net.rpn.anchor_generator)
        anchor_generator.collect_params().reset_ctx(None)
        if self._multi_stage:
            for ag in anchor_generator:
                anchor = ag(mx.nd.zeros((1, 3, ashape, ashape))).reshape((1, 1, ashape, ashape, -1))
                ashape = max(ashape // 2, 16)
                anchors.append(anchor)
        else:
            anchors = anchor_generator(
                mx.nd.zeros((1, 3, ashape, ashape))).reshape((1, 1, ashape, ashape, -1))
        self._anchors = anchors
        # record feature extractor for infer_shape
        if not hasattr(net, 'features'):
            raise ValueError("Cannot find features in network, is it a Mask RCNN network?")
        self._feat_sym = net.features(mx.sym.var(name='data'))

    @property
    def _target_generator(self):
        if self._internal_target_generator is None:
            if self._net_none:
                return None
            from ....model_zoo.rcnn.rpn.rpn_target import RPNTargetGenerator
            self._internal_target_generator = RPNTargetGenerator(
                num_sample=self._num_sample, pos_iou_thresh=self._pos_iou_thresh,
                neg_iou_thresh=self._neg_iou_thresh, pos_ratio=self._pos_ratio,
                stds=self._box_norm, **self._kwargs)
        return self._internal_target_generator

    def __call__(self, src, label, segm):
        """Apply transform to training image/label."""
        # resize shorter side but keep in max_size
        h, w, _ = src.shape
        if self._random_resize:
            short = randint(self._short[0], self._short[1])
        else:
            short = self._short
        img = timage.resize_short_within(src, short, self._max_size, interp=1)
        bbox = tbbox.resize(label, (w, h), (img.shape[1], img.shape[0]))
        segm = [tmask.resize(polys, (w, h), (img.shape[1], img.shape[0])) for polys in segm]

        # random horizontal flip
        h, w, _ = img.shape
        img, flips = timage.random_flip(img, px=0.5)
        bbox = tbbox.flip(bbox, (w, h), flip_x=flips[0])
        segm = [tmask.flip(polys, (w, h), flip_x=flips[0]) for polys in segm]

        # gt_masks (n, im_height, im_width) of uint8 -> float32 (cannot take uint8)
        masks = [mx.nd.array(tmask.to_mask(polys, (w, h))) for polys in segm]
        # n * (im_height, im_width) -> (n, im_height, im_width)
        masks = mx.nd.stack(*masks, axis=0)

        # to tensor
        img = mx.nd.image.to_tensor(img)
        img = mx.nd.image.normalize(img, mean=self._mean, std=self._std)

        if self._anchors is None:
            return img, bbox.astype(img.dtype), masks

        # generate RPN target so cpu workers can help reduce the workload
        gt_bboxes = mx.nd.array(bbox[:, :4])
        if self._multi_stage:
            anchor_targets = []
            oshapes = []
            cls_targets, box_targets, box_masks = [], [], []
            for anchor, feat_sym in zip(self._anchors, self._feat_sym):
                # infer the feature map shape for this image size and slice
                # the pre-generated anchors down to it
                oshape = feat_sym.infer_shape(data=(1, 3, img.shape[1], img.shape[2]))[1][0]
                anchor = anchor[:, :, :oshape[2], :oshape[3], :]
                oshapes.append(anchor.shape)
                anchor_targets.append(anchor.reshape((-1, 4)))
            anchor_targets = mx.nd.concat(*anchor_targets, dim=0)
            cls_target, box_target, box_mask = self._target_generator(
                gt_bboxes, anchor_targets, img.shape[2], img.shape[1])
            # split the flat targets back into per-level feature maps
            start_ind = 0
            for oshape in oshapes:
                size = oshape[2] * oshape[3] * (oshape[4] // 4)
                lvl_cls_target = cls_target[start_ind:start_ind + size] \
                    .reshape(oshape[2], oshape[3], -1)
                lvl_box_target = box_target[start_ind:start_ind + size] \
                    .reshape(oshape[2], oshape[3], -1)
                lvl_box_mask = box_mask[start_ind:start_ind + size] \
                    .reshape(oshape[2], oshape[3], -1)
                start_ind += size
                cls_targets.append(lvl_cls_target)
                box_targets.append(lvl_box_target)
                box_masks.append(lvl_box_mask)
        else:
            oshape = self._feat_sym.infer_shape(data=(1, 3, img.shape[1], img.shape[2]))[1][0]
            anchor = self._anchors[:, :, :oshape[2], :oshape[3], :]
            oshape = anchor.shape
            cls_target, box_target, box_mask = self._target_generator(
                gt_bboxes, anchor.reshape((-1, 4)), img.shape[2], img.shape[1])
            size = oshape[2] * oshape[3] * (oshape[4] // 4)
            cls_targets = [cls_target[0:size].reshape(oshape[2], oshape[3], -1)]
            box_targets = [box_target[0:size].reshape(oshape[2], oshape[3], -1)]
            box_masks = [box_mask[0:size].reshape(oshape[2], oshape[3], -1)]
        return img, bbox.astype(img.dtype), cls_targets, box_targets, box_masks, masks


class MaskRCNNDefaultValTransform(object):
    """Default Mask RCNN validation transform.

    Parameters
    ----------
    short : int, default is 600
        Resize image shorter side to ``short``.
    max_size : int, default is 1000
        Make sure image longer side is smaller than ``max_size``.
    mean : array-like of size 3
        Mean pixel values to be subtracted from image tensor. Default is [0.485, 0.456, 0.406].
    std : array-like of size 3
        Standard deviation values by which the image tensor is divided.
        Default is [0.229, 0.224, 0.225].
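
    Examples
    --------
    A minimal sketch; the returned array holds the resized height, the resized
    width and the scaling ratio:

    >>> val_transform = MaskRCNNDefaultValTransform(short=800, max_size=1333)
    >>> # img, im_info = val_transform(src, label, mask)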

    """

    def __init__(self, short=600, max_size=1000,
                 mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)):
        self._mean = mean
        self._std = std
        self._short = short
        self._max_size = max_size

    def __call__(self, src, label, mask):
        """Apply transform to validation image/label."""
        # resize shorter side but keep in max_size
        h, _, _ = src.shape
        img = timage.resize_short_within(src, self._short, self._max_size, interp=1)
        # no scaling of ground-truth; return the image scaling ratio instead
        im_scale = float(img.shape[0]) / h

        img = mx.nd.image.to_tensor(img)
        img = mx.nd.image.normalize(img, mean=self._mean, std=self._std)
        return img, mx.nd.array([img.shape[-2], img.shape[-1], im_scale])