"""Transforms for RCNN series."""
# pylint: disable=not-callable
from __future__ import absolute_import

import copy
from random import randint

import mxnet as mx

from .. import bbox as tbbox
from .. import image as timage
from .. import mask as tmask

__all__ = ['transform_test', 'load_test',
           'FasterRCNNDefaultTrainTransform', 'FasterRCNNDefaultValTransform',
           'MaskRCNNDefaultTrainTransform', 'MaskRCNNDefaultValTransform']


def _pregenerate_anchors(anchor_generator, ashape, multi_stage):
    """Run the anchor generator(s) on dummy input to obtain fixed anchors.

    Parameters
    ----------
    anchor_generator : callable or iterable of callable
        The RPN anchor generator. When ``multi_stage`` is True it is iterated and
        every element is called once; otherwise it is called directly.
    ashape : int
        Spatial size of the dummy ``(1, 3, ashape, ashape)`` input. For multi stage
        generators it is halved after every stage, but never drops below 16.
    multi_stage : bool
        Whether ``anchor_generator`` holds one generator per feature stage.

    Returns
    -------
    mxnet.NDArray or list of mxnet.NDArray
        Anchors reshaped to ``(1, 1, ashape, ashape, -1)``; a list with one entry per
        stage when ``multi_stage`` is True.

    """
    if not multi_stage:
        fake_data = mx.nd.zeros((1, 3, ashape, ashape))
        return anchor_generator(fake_data).reshape((1, 1, ashape, ashape, -1))

    anchors = []  # one entry per stage, e.g. [P2, P3, P4, P5]
    for stage_generator in anchor_generator:
        fake_data = mx.nd.zeros((1, 3, ashape, ashape))
        anchors.append(stage_generator(fake_data).reshape((1, 1, ashape, ashape, -1)))
        ashape = max(ashape // 2, 16)
    return anchors


def _generate_rpn_targets(target_generator, anchors, feat_sym, multi_stage, gt_bboxes, img):
    """Generate per-stage RPN targets for one transformed image.

    Parameters
    ----------
    target_generator : callable
        Called as ``target_generator(gt_bboxes, anchors, width, height)`` and expected
        to return ``(cls_target, box_target, box_mask)``.
    anchors : mxnet.NDArray or list of mxnet.NDArray
        Pre-generated anchors as produced by ``_pregenerate_anchors``.
    feat_sym : mxnet.Symbol or iterable of mxnet.Symbol
        Feature extractor symbol(s), used only to infer the feature map shape(s).
    multi_stage : bool
        Whether ``anchors`` and ``feat_sym`` hold one entry per feature stage.
    gt_bboxes : mxnet.NDArray
        Ground-truth box coordinates.
    img : mxnet.NDArray
        Image tensor; ``img.shape[1]`` and ``img.shape[2]`` are used as height and width.

    Returns
    -------
    tuple of list
        ``(cls_targets, box_targets, box_masks)``. Each list has one NDArray per stage,
        reshaped to ``(feat_height, feat_width, -1)``.

    """
    data_shape = (1, 3, img.shape[1], img.shape[2])
    if multi_stage:
        stages = zip(anchors, feat_sym)
    else:
        stages = [(anchors, feat_sym)]

    anchor_shapes = []
    flat_anchors = []
    for anchor, sym in stages:
        # crop the pre-generated anchors to the actual feature map size
        feat_shape = sym.infer_shape(data=data_shape)[1][0]
        anchor = anchor[:, :, :feat_shape[2], :feat_shape[3], :]
        anchor_shapes.append(anchor.shape)
        flat_anchors.append(anchor.reshape((-1, 4)))

    if multi_stage:
        all_anchors = mx.nd.concat(*flat_anchors, dim=0)
    else:
        all_anchors = flat_anchors[0]
    cls_target, box_target, box_mask = target_generator(
        gt_bboxes, all_anchors, img.shape[2], img.shape[1])

    # split the flat targets back into one chunk per stage
    cls_targets, box_targets, box_masks = [], [], []
    start = 0
    for ashape in anchor_shapes:
        stop = start + ashape[2] * ashape[3] * (ashape[4] // 4)
        cls_targets.append(cls_target[start:stop].reshape(ashape[2], ashape[3], -1))
        box_targets.append(box_target[start:stop].reshape(ashape[2], ashape[3], -1))
        box_masks.append(box_mask[start:stop].reshape(ashape[2], ashape[3], -1))
        start = stop
    return cls_targets, box_targets, box_masks


def transform_test(imgs, short=600, max_size=1000, mean=(0.485, 0.456, 0.406),
                   std=(0.229, 0.224, 0.225)):
    """A util function to transform all images to tensors as network input by applying
    normalizations. This function supports 1 NDArray or an iterable of NDArrays.

    Parameters
    ----------
    imgs : NDArray or iterable of NDArray
        Image(s) to be transformed.
    short : int, optional, default is 600
        Resize image short side to this `short` and keep aspect ratio.
    max_size : int, optional, default is 1000
        Maximum longer side length to fit image.
        This is to limit the input image shape, avoid processing too large image.
    mean : iterable of float
        Mean pixel values.
    std : iterable of float
        Standard deviations of pixel values.

    Returns
    -------
    (mxnet.NDArray, numpy.ndarray) or (list of mxnet.NDArray, list of numpy.ndarray)
        A (1, 3, H, W) mxnet NDArray as input to network, and a numpy ndarray as
        original un-normalized color image for display.
        If exactly one image is supplied the pair is returned directly, otherwise
        two lists are returned. You can use ``zip()`` to collapse them.

    """
    if isinstance(imgs, mx.nd.NDArray):
        imgs = [imgs]
    else:
        # materialize first: a one-shot iterable (e.g. a generator) would otherwise be
        # exhausted by the validation pass and silently produce no output
        imgs = list(imgs)
    for im in imgs:
        assert isinstance(im, mx.nd.NDArray), "Expect NDArray, got {}".format(type(im))

    tensors = []
    origs = []
    for img in imgs:
        img = timage.resize_short_within(img, short, max_size)
        # keep the resized but un-normalized image for display
        orig_img = img.asnumpy().astype('uint8')
        img = mx.nd.image.to_tensor(img)
        img = mx.nd.image.normalize(img, mean=mean, std=std)
        tensors.append(img.expand_dims(0))
        origs.append(orig_img)
    if len(tensors) == 1:
        return tensors[0], origs[0]
    return tensors, origs


def load_test(filenames, short=600, max_size=1000, mean=(0.485, 0.456, 0.406),
              std=(0.229, 0.224, 0.225)):
    """A util function to load all images, transform them to tensor by applying
    normalizations. This function supports 1 filename or a list of filenames.

    Parameters
    ----------
    filenames : str or list of str
        Image filename(s) to be loaded.
    short : int, optional, default is 600
        Resize image short side to this `short` and keep aspect ratio.
    max_size : int, optional, default is 1000
        Maximum longer side length to fit image.
        This is to limit the input image shape, avoid processing too large image.
    mean : iterable of float
        Mean pixel values.
    std : iterable of float
        Standard deviations of pixel values.

    Returns
    -------
    (mxnet.NDArray, numpy.ndarray) or (list of mxnet.NDArray, list of numpy.ndarray)
        A (1, 3, H, W) mxnet NDArray as input to network, and a numpy ndarray as
        original un-normalized color image for display.
        If multiple image names are supplied, return two lists. You can use
        ``zip()`` to collapse them.

    """
    if isinstance(filenames, str):
        filenames = [filenames]
    imgs = [mx.image.imread(f) for f in filenames]
    return transform_test(imgs, short, max_size, mean, std)


class FasterRCNNDefaultTrainTransform(object):
    """Default Faster-RCNN training transform.

    Parameters
    ----------
    short : int/tuple, default is 600
        Resize image shorter side to ``short``.
        Resize the shorter side of the image randomly within the given range, if it is a tuple.
    max_size : int, default is 1000
        Make sure image longer side is smaller than ``max_size``.
    net : mxnet.gluon.HybridBlock, optional
        The faster-rcnn network.

        .. hint::

            If net is ``None``, the transformation will not generate training targets.
            Otherwise it will generate training targets to accelerate the training phase
            since we push some workload to CPU workers instead of GPUs.

    mean : array-like of size 3
        Mean pixel values to be subtracted from image tensor. Default is [0.485, 0.456, 0.406].
    std : array-like of size 3
        Standard deviation to be divided from image. Default is [0.229, 0.224, 0.225].
    box_norm : array-like of size 4, default is (1., 1., 1., 1.)
        Std value to be divided from encoded values.
    num_sample : int, default is 256
        Number of samples for RPN targets.
    pos_iou_thresh : float, default is 0.7
        Anchors larger than ``pos_iou_thresh`` is regarded as positive samples.
    neg_iou_thresh : float, default is 0.3
        Anchors smaller than ``neg_iou_thresh`` is regarded as negative samples.
        Anchors with IOU in between ``pos_iou_thresh`` and ``neg_iou_thresh`` are
        ignored.
    pos_ratio : float, default is 0.5
        ``pos_ratio`` defines how many positive samples (``pos_ratio * num_sample``) is
        to be sampled.
    flip_p : float, default is 0.5
        Probability to flip horizontally, by default is 0.5 for random horizontal flip.
        You may set it to 0 to disable random flip or 1 to force flip.
    ashape : int, default is 128
        Defines shape of pre generated anchors for target generation
    multi_stage : boolean, default is False
        Whether the network output multi stage features.
    **kwargs
        Extra keyword arguments forwarded to ``RPNTargetGenerator``.
    """

    def __init__(self, short=600, max_size=1000, net=None, mean=(0.485, 0.456, 0.406),
                 std=(0.229, 0.224, 0.225), box_norm=(1., 1., 1., 1.),
                 num_sample=256, pos_iou_thresh=0.7, neg_iou_thresh=0.3,
                 pos_ratio=0.5, flip_p=0.5, ashape=128, multi_stage=False, **kwargs):
        self._short = short
        self._max_size = max_size
        self._mean = mean
        self._std = std
        self._box_norm = box_norm
        self._anchors = None
        self._multi_stage = multi_stage
        self._random_resize = isinstance(self._short, (tuple, list))
        self._num_sample = num_sample
        self._pos_iou_thresh = pos_iou_thresh
        self._neg_iou_thresh = neg_iou_thresh
        self._pos_ratio = pos_ratio
        self._flip_p = flip_p
        self._internal_target_generator = None
        self._net_none = False
        self._kwargs = kwargs
        if net is None:
            self._net_none = True
            return

        # use fake data to generate fixed anchors for target generation
        # in case network has reset_ctx to gpu: the network's own anchor generator is
        # moved to cpu temporarily, so its context must be restored even on failure
        anchor_generator = net.rpn.anchor_generator
        old_ctx = list(anchor_generator.collect_params().values())[0].list_ctx()
        anchor_generator.collect_params().reset_ctx(mx.cpu())
        try:
            self._anchors = _pregenerate_anchors(anchor_generator, ashape, self._multi_stage)
        finally:
            anchor_generator.collect_params().reset_ctx(old_ctx)
        # record feature extractor for infer_shape
        if not hasattr(net, 'features'):
            raise ValueError("Cannot find features in network, it is a Faster-RCNN network?")
        self._feat_sym = net.features(mx.sym.var(name='data'))

    @property
    def _target_generator(self):
        """Lazily created ``RPNTargetGenerator``, or ``None`` when no network was given."""
        if self._internal_target_generator is None:
            if self._net_none:
                return None
            # imported lazily, at first use
            from ....model_zoo.rcnn.rpn.rpn_target import RPNTargetGenerator
            self._internal_target_generator = RPNTargetGenerator(
                num_sample=self._num_sample, pos_iou_thresh=self._pos_iou_thresh,
                neg_iou_thresh=self._neg_iou_thresh, pos_ratio=self._pos_ratio,
                stds=self._box_norm, **self._kwargs)
        return self._internal_target_generator

    def __call__(self, src, label):
        """Apply transform to training image/label.

        Returns ``(img, bbox)`` when no network was given, otherwise
        ``(img, bbox, cls_targets, box_targets, box_masks)`` with per-stage target lists.
        """
        # resize shorter side but keep in max_size
        h, w, _ = src.shape
        if self._random_resize:
            short = randint(self._short[0], self._short[1])
        else:
            short = self._short
        img = timage.resize_short_within(src, short, self._max_size, interp=1)
        bbox = tbbox.resize(label, (w, h), (img.shape[1], img.shape[0]))

        # random horizontal flip
        h, w, _ = img.shape
        img, flips = timage.random_flip(img, px=self._flip_p)
        bbox = tbbox.flip(bbox, (w, h), flip_x=flips[0])

        # to tensor
        img = mx.nd.image.to_tensor(img)
        img = mx.nd.image.normalize(img, mean=self._mean, std=self._std)

        if self._anchors is None:
            return img, bbox.astype(img.dtype)

        # generate RPN target so cpu workers can help reduce the workload
        gt_bboxes = mx.nd.array(bbox[:, :4])
        cls_targets, box_targets, box_masks = _generate_rpn_targets(
            self._target_generator, self._anchors, self._feat_sym, self._multi_stage,
            gt_bboxes, img)
        return img, bbox.astype(img.dtype), cls_targets, box_targets, box_masks


class FasterRCNNDefaultValTransform(object):
    """Default Faster-RCNN validation transform.

    Parameters
    ----------
    short : int, default is 600
        Resize image shorter side to ``short``.
    max_size : int, default is 1000
        Make sure image longer side is smaller than ``max_size``.
    mean : array-like of size 3
        Mean pixel values to be subtracted from image tensor. Default is [0.485, 0.456, 0.406].
    std : array-like of size 3
        Standard deviation to be divided from image. Default is [0.229, 0.224, 0.225].

    """

    def __init__(self, short=600, max_size=1000,
                 mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)):
        self._mean = mean
        self._std = std
        self._short = short
        self._max_size = max_size

    def __call__(self, src, label):
        """Apply transform to validation image/label.

        Returns ``(img, bbox, im_scale)`` where ``bbox`` is cast to float32 and
        ``im_scale`` is a 1-element NDArray holding original height / resized height.
        """
        # resize shorter side but keep in max_size
        h, w, _ = src.shape
        img = timage.resize_short_within(src, self._short, self._max_size, interp=1)
        # boxes are resized to match the resized image; the ratio of original to
        # resized height is returned alongside
        bbox = tbbox.resize(label, (w, h), (img.shape[1], img.shape[0]))
        im_scale = h / float(img.shape[0])

        img = mx.nd.image.to_tensor(img)
        img = mx.nd.image.normalize(img, mean=self._mean, std=self._std)
        return img, bbox.astype('float32'), mx.nd.array([im_scale])


class MaskRCNNDefaultTrainTransform(object):
    """Default Mask RCNN training transform.

    Parameters
    ----------
    short : int/tuple, default is 600
        Resize image shorter side to ``short``.
        Resize the shorter side of the image randomly within the given range, if it is a tuple.
    max_size : int, default is 1000
        Make sure image longer side is smaller than ``max_size``.
    net : mxnet.gluon.HybridBlock, optional
        The Mask R-CNN network.

        .. hint::

            If net is ``None``, the transformation will not generate training targets.
            Otherwise it will generate training targets to accelerate the training phase
            since we push some workload to CPU workers instead of GPUs.

    mean : array-like of size 3
        Mean pixel values to be subtracted from image tensor. Default is [0.485, 0.456, 0.406].
    std : array-like of size 3
        Standard deviation to be divided from image. Default is [0.229, 0.224, 0.225].
    box_norm : array-like of size 4, default is (1., 1., 1., 1.)
        Std value to be divided from encoded values.
    num_sample : int, default is 256
        Number of samples for RPN targets.
    pos_iou_thresh : float, default is 0.7
        Anchors larger than ``pos_iou_thresh`` is regarded as positive samples.
    neg_iou_thresh : float, default is 0.3
        Anchors smaller than ``neg_iou_thresh`` is regarded as negative samples.
        Anchors with IOU in between ``pos_iou_thresh`` and ``neg_iou_thresh`` are
        ignored.
    pos_ratio : float, default is 0.5
        ``pos_ratio`` defines how many positive samples (``pos_ratio * num_sample``) is
        to be sampled.
    ashape : int, default is 128
        Defines shape of pre generated anchors for target generation
    multi_stage : boolean, default is False
        Whether the network output multi stage features.
    flip_p : float, default is 0.5
        Probability to flip horizontally, by default is 0.5 for random horizontal flip.
        You may set it to 0 to disable random flip or 1 to force flip.
    **kwargs
        Extra keyword arguments forwarded to ``RPNTargetGenerator``.
    """

    def __init__(self, short=600, max_size=1000, net=None, mean=(0.485, 0.456, 0.406),
                 std=(0.229, 0.224, 0.225), box_norm=(1., 1., 1., 1.),
                 num_sample=256, pos_iou_thresh=0.7, neg_iou_thresh=0.3,
                 pos_ratio=0.5, ashape=128, multi_stage=False, flip_p=0.5, **kwargs):
        self._short = short
        self._max_size = max_size
        self._mean = mean
        self._std = std
        self._box_norm = box_norm
        self._anchors = None
        self._multi_stage = multi_stage
        self._random_resize = isinstance(self._short, (tuple, list))
        self._num_sample = num_sample
        self._pos_iou_thresh = pos_iou_thresh
        self._neg_iou_thresh = neg_iou_thresh
        self._pos_ratio = pos_ratio
        self._flip_p = flip_p
        self._internal_target_generator = None
        self._net_none = False
        self._kwargs = kwargs
        if net is None:
            self._net_none = True
            return

        # use fake data to generate fixed anchors for target generation
        # in case network has reset_ctx to gpu: work on a private copy so the
        # network's own anchor generator is left untouched
        anchor_generator = copy.deepcopy(net.rpn.anchor_generator)
        anchor_generator.collect_params().reset_ctx(None)
        self._anchors = _pregenerate_anchors(anchor_generator, ashape, self._multi_stage)
        # record feature extractor for infer_shape
        if not hasattr(net, 'features'):
            raise ValueError("Cannot find features in network, it is a Mask RCNN network?")
        self._feat_sym = net.features(mx.sym.var(name='data'))

    @property
    def _target_generator(self):
        """Lazily created ``RPNTargetGenerator``, or ``None`` when no network was given."""
        if self._internal_target_generator is None:
            if self._net_none:
                return None
            # imported lazily, at first use
            from ....model_zoo.rcnn.rpn.rpn_target import RPNTargetGenerator
            self._internal_target_generator = RPNTargetGenerator(
                num_sample=self._num_sample, pos_iou_thresh=self._pos_iou_thresh,
                neg_iou_thresh=self._neg_iou_thresh, pos_ratio=self._pos_ratio,
                stds=self._box_norm, **self._kwargs)
        return self._internal_target_generator

    def __call__(self, src, label, segm):
        """Apply transform to training image/label.

        Returns ``(img, bbox, masks)`` when no network was given, otherwise
        ``(img, bbox, cls_targets, box_targets, box_masks, masks)`` with per-stage
        target lists.
        """
        # resize shorter side but keep in max_size
        h, w, _ = src.shape
        if self._random_resize:
            short = randint(self._short[0], self._short[1])
        else:
            short = self._short
        img = timage.resize_short_within(src, short, self._max_size, interp=1)
        bbox = tbbox.resize(label, (w, h), (img.shape[1], img.shape[0]))
        segm = [tmask.resize(polys, (w, h), (img.shape[1], img.shape[0])) for polys in segm]

        # random horizontal flip
        h, w, _ = img.shape
        img, flips = timage.random_flip(img, px=self._flip_p)
        bbox = tbbox.flip(bbox, (w, h), flip_x=flips[0])
        segm = [tmask.flip(polys, (w, h), flip_x=flips[0]) for polys in segm]

        # gt_masks (n, im_height, im_width) of uint8 -> float32 (cannot take uint8)
        masks = [mx.nd.array(tmask.to_mask(polys, (w, h))) for polys in segm]
        # n * (im_height, im_width) -> (n, im_height, im_width)
        masks = mx.nd.stack(*masks, axis=0)

        # to tensor
        img = mx.nd.image.to_tensor(img)
        img = mx.nd.image.normalize(img, mean=self._mean, std=self._std)

        if self._anchors is None:
            return img, bbox.astype(img.dtype), masks

        # generate RPN target so cpu workers can help reduce the workload
        gt_bboxes = mx.nd.array(bbox[:, :4])
        cls_targets, box_targets, box_masks = _generate_rpn_targets(
            self._target_generator, self._anchors, self._feat_sym, self._multi_stage,
            gt_bboxes, img)
        return img, bbox.astype(img.dtype), cls_targets, box_targets, box_masks, masks


class MaskRCNNDefaultValTransform(object):
    """Default Mask RCNN validation transform.

    Parameters
    ----------
    short : int, default is 600
        Resize image shorter side to ``short``.
    max_size : int, default is 1000
        Make sure image longer side is smaller than ``max_size``.
    mean : array-like of size 3
        Mean pixel values to be subtracted from image tensor. Default is [0.485, 0.456, 0.406].
    std : array-like of size 3
        Standard deviation to be divided from image. Default is [0.229, 0.224, 0.225].

    """

    def __init__(self, short=600, max_size=1000,
                 mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)):
        self._mean = mean
        self._std = std
        self._short = short
        self._max_size = max_size

    def __call__(self, src, label, mask):
        """Apply transform to validation image/label.

        ``label`` and ``mask`` are accepted but not used. Returns ``(img, info)`` where
        ``info`` is an NDArray ``[height, width, im_scale]`` of the transformed image and
        ``im_scale`` is resized height / original height.
        """
        # resize shorter side but keep in max_size
        h, _, _ = src.shape
        img = timage.resize_short_within(src, self._short, self._max_size, interp=1)
        # no scaling ground-truth, return image scaling ratio instead
        im_scale = float(img.shape[0]) / h

        img = mx.nd.image.to_tensor(img)
        img = mx.nd.image.normalize(img, mean=self._mean, std=self._std)
        return img, mx.nd.array([img.shape[-2], img.shape[-1], im_scale])