1from __future__ import absolute_import, print_function, division 2from six import integer_types 3 4import theano 5from theano import Apply, Op 6 7from theano.compile import optdb 8from theano.gof import LocalOptGroup, ParamsType 9from theano.scalar import bool as bool_t 10from theano.tensor.basic import as_tensor_variable 11from theano.tensor.opt import in2out 12 13from .basic_ops import (GpuArrayType, CGpuKernelBase, 14 as_gpuarray_variable, gpu_contiguous, infer_context_name, gpuarray_helper_inc_dir) 15from .opt_util import inplace_allocempty 16 17try: 18 import pygpu 19 from pygpu import blas 20except ImportError as e: 21 # To make sure theano is importable 22 pass 23 24 25class BlasOp(Op): 26 def c_headers(self): 27 return ['<blas_api.h>', '<numpy_compat.h>', '<gpuarray_helper.h>'] 28 29 def c_header_dirs(self): 30 return [pygpu.get_include(), gpuarray_helper_inc_dir()] 31 32 def c_init_code(self): 33 return ['import_pygpu__blas();'] 34 35 36class GpuGemv(BlasOp): 37 """ 38 Gemv on the GPU. 39 40 """ 41 params_type = ParamsType(inplace=bool_t) 42 __props__ = ('inplace',) 43 44 def __init__(self, inplace=False): 45 self.inplace = inplace 46 if self.inplace: 47 self.destroy_map = {0: [0]} 48 49 def make_node(self, y, alpha, A, x, beta): 50 ctx_name = infer_context_name(y, A, x) 51 A = as_gpuarray_variable(A, ctx_name) 52 x = as_gpuarray_variable(x, ctx_name) 53 y = as_gpuarray_variable(y, ctx_name) 54 alpha = as_tensor_variable(alpha) 55 beta = as_tensor_variable(beta) 56 57 assert alpha.ndim == 0 58 assert beta.ndim == 0 59 assert A.ndim == 2 60 assert x.ndim == 1 61 assert y.ndim == 1 62 assert A.dtype == x.dtype == y.dtype 63 64 # float16 not supported 65 expected = A.dtype 66 assert theano.scalar.upcast(alpha.dtype, 67 beta.dtype, expected) == expected 68 alpha = alpha.astype(expected) 69 beta = beta.astype(expected) 70 return Apply(self, [y, alpha, A, x, beta], [y.type()]) 71 72 def perform(self, node, inputs, out_storage, params): 73 y, alpha, A, x, beta = inputs 74 inplace = params.inplace 75 if inplace and y.strides[0] < 0: 76 inplace = False 77 if A.shape[1] == 0: 78 out_storage[0][0] = pygpu.zeros(y.shape, dtype=y.dtype, 79 context=y.context) 80 else: 81 out_storage[0][0] = blas.gemv(alpha, A, x, beta, y, 82 overwrite_y=inplace) 83 84 def c_code(self, node, name, inp, out, sub): 85 vars = dict(out=out[0], y=inp[0], alpha=inp[1], A=inp[2], x=inp[3], 86 beta=inp[4], fail=sub['fail'], name=name, 87 params=sub['params']) 88 code = """ 89 if (!%(params)s->inplace || %(y)s->ga.strides[0] <= 0) { 90 %(out)s = theano_try_copy(%(out)s, %(y)s); 91 if (%(out)s == NULL) { 92 %(fail)s 93 } 94 } else { 95 Py_XDECREF(%(out)s); 96 %(out)s = %(y)s; 97 Py_INCREF(%(out)s); 98 } 99 """ % vars 100 # in case of possible speed up using blas dot, 101 # temporary hack A to 1D for vector-vector dot 102 code += """ 103 if (PyGpuArray_DIM(%(A)s, 1) == 0) { 104 int code; 105 code = GpuArray_memset(&%(out)s->ga, 0); 106 if (code != GA_NO_ERROR) { 107 PyErr_SetString(PyExc_RuntimeError, "Memset failed"); 108 %(fail)s 109 } 110 } else if ( PyGpuArray_DIM(%(A)s, 0) == 1 111 &&((dtype_%(alpha)s*)PyArray_DATA(%(alpha)s))[0] == (dtype_%(alpha)s)1. 112 &&((dtype_%(beta)s*)PyArray_DATA(%(beta)s))[0] == (dtype_%(beta)s)0. 113 ) { 114 %(out)s->ga.nd = 0; 115 %(A)s->ga.nd = 1; 116 %(A)s->ga.dimensions[0] = %(A)s->ga.dimensions[1]; 117 ssize_t a_stride0 = %(A)s->ga.strides[0]; 118 %(A)s->ga.strides[0] = %(A)s->ga.strides[1]; 119 if (pygpu_blas_rdot(%(x)s, %(A)s, %(out)s, 0) == -1) { 120 %(fail)s 121 } 122 %(A)s->ga.strides[0] = a_stride0; 123 %(out)s->ga.nd = 1; 124 %(A)s->ga.nd = 2; 125 %(A)s->ga.dimensions[0] = 1; 126 } else if ( 127 pygpu_blas_rgemv(cb_no_trans, 128 ((dtype_%(alpha)s *)PyArray_DATA(%(alpha)s))[0], 129 %(A)s, %(x)s, 130 ((dtype_%(beta)s *)PyArray_DATA(%(beta)s))[0], 131 %(out)s, 0) == -1) { 132 %(fail)s 133 } 134 """ % vars 135 136 return code 137 138 def c_code_cache_version(self): 139 return (10,) 140 141gpugemv_no_inplace = GpuGemv(inplace=False) 142gpugemv_inplace = GpuGemv(inplace=True) 143 144 145class GpuGemm(BlasOp): 146 """ 147 Gemm on the GPU. 148 149 """ 150 params_type = ParamsType(inplace=bool_t) 151 __props__ = ('inplace',) 152 _f16_ok = True 153 154 def __init__(self, inplace=False): 155 self.inplace = inplace 156 if self.inplace: 157 self.destroy_map = {0: [0]} 158 159 def make_node(self, C, alpha, A, B, beta): 160 ctx_name = infer_context_name(C, A, B) 161 A = as_gpuarray_variable(A, ctx_name) 162 B = as_gpuarray_variable(B, ctx_name) 163 C = as_gpuarray_variable(C, ctx_name) 164 alpha = as_tensor_variable(alpha) 165 beta = as_tensor_variable(beta) 166 167 if not (A.dtype == B.dtype == C.dtype): 168 raise TypeError(theano.tensor.blas.Gemm.E_mixed, 169 (A.dtype, B.dtype, C.dtype, 170 alpha.dtype, beta.dtype)) 171 if not A.dtype.startswith('float'): 172 raise TypeError(theano.tensor.blas.Gemm.E_float, (A.dtype)) 173 174 if A.dtype == 'float16': 175 expected = 'float32' 176 else: 177 expected = A.dtype 178 assert theano.scalar.upcast(alpha.dtype, 179 beta.dtype, expected) == expected 180 alpha = alpha.astype(expected) 181 beta = beta.astype(expected) 182 183 assert alpha.ndim == 0 184 assert beta.ndim == 0 185 assert A.ndim == 2 186 assert B.ndim == 2 187 assert C.ndim == 2 188 return Apply(self, [C, alpha, A, B, beta], [C.type()]) 189 190 def perform(self, node, inputs, outputs, params): 191 C, alpha, A, B, beta = inputs 192 inplace = params.inplace 193 if inplace and not C.flags.forc: 194 inplace = False 195 outputs[0][0] = blas.gemm(alpha, A, B, beta, C, 196 overwrite_c=inplace) 197 198 def c_code(self, node, name, inp, out, sub): 199 vars = dict(out=out[0], C=inp[0], alpha=inp[1], A=inp[2], B=inp[3], 200 beta=inp[4], fail=sub['fail'], name=name, 201 params=sub['params']) 202 code = """ 203 if (!%(params)s->inplace || !GpuArray_ISONESEGMENT(&%(C)s->ga)) { 204 %(out)s = theano_try_copy(%(out)s, %(C)s); 205 if (%(out)s == NULL) { 206 %(fail)s 207 } 208 } else { 209 Py_XDECREF(%(out)s); 210 %(out)s = %(C)s; 211 Py_INCREF(%(out)s); 212 } 213 if (pygpu_blas_rgemm(cb_no_trans, cb_no_trans, 214 ((dtype_%(alpha)s *)PyArray_DATA(%(alpha)s))[0], 215 %(A)s, %(B)s, 216 ((dtype_%(beta)s *)PyArray_DATA(%(beta)s))[0], 217 %(out)s, 0) == -1) { 218 %(fail)s 219 } 220 """ % vars 221 222 return code 223 224 def c_code_cache_version(self): 225 return (7,) 226 227gpugemm_no_inplace = GpuGemm(inplace=False) 228gpugemm_inplace = GpuGemm(inplace=True) 229 230 231class GpuGer(BlasOp): 232 """ 233 Ger on the GPU. 234 235 """ 236 params_type = ParamsType(inplace=bool_t) 237 __props__ = ('inplace',) 238 239 def __init__(self, inplace=False): 240 self.inplace = inplace 241 if self.inplace: 242 self.destroy_map = {0: [0]} 243 244 def make_node(self, A, alpha, x, y): 245 ctx_name = infer_context_name(A, x, y) 246 A = as_gpuarray_variable(A, ctx_name) 247 x = as_gpuarray_variable(x, ctx_name) 248 y = as_gpuarray_variable(y, ctx_name) 249 alpha = as_tensor_variable(alpha) 250 if not(A.dtype == x.dtype == y.dtype): 251 raise TypeError('ger requires matching dtypes', 252 (A.dtype, alpha.dtype, x.dtype, y.dtype)) 253 254 assert theano.scalar.upcast(alpha.dtype, A.dtype) == A.dtype 255 alpha = alpha.astype(A.dtype) 256 assert alpha.ndim == 0 257 assert A.ndim == 2 258 assert x.ndim == 1 259 assert y.ndim == 1 260 return Apply(self, [A, alpha, x, y], [A.type()]) 261 262 def perform(self, node, inp, out, params): 263 A, alpha, x, y = inp 264 inplace = params.inplace 265 if inplace and not A.flags.forc: 266 inplace = False 267 out[0][0] = blas.ger(alpha, x, y, A, 268 overwrite_a=inplace) 269 270 def c_code(self, node, name, inp, out, sub): 271 vars = dict(out=out[0], A=inp[0], alpha=inp[1], x=inp[2], y=inp[3], 272 fail=sub['fail'], name=name, params=sub['params']) 273 code = """ 274 if (!%(params)s->inplace || !GpuArray_ISONESEGMENT(&%(A)s->ga)) { 275 %(out)s = theano_try_copy(%(out)s, %(A)s); 276 if (%(out)s == NULL) { 277 %(fail)s 278 } 279 } else { 280 Py_XDECREF(%(out)s); 281 %(out)s = %(A)s; 282 Py_INCREF(%(out)s); 283 } 284 if (pygpu_blas_rger(((dtype_%(alpha)s *)PyArray_DATA(%(alpha)s))[0], 285 %(x)s, %(y)s, %(out)s, 0) == -1) { 286 %(fail)s 287 } 288 """ % vars 289 290 return code 291 292 def c_code_cache_version(self): 293 return (5,) 294 295 296gpuger_no_inplace = GpuGer(inplace=False) 297gpuger_inplace = GpuGer(inplace=True) 298 299 300class GpuDot22(BlasOp): 301 """ 302 Dot22 on the GPU. 303 304 """ 305 _f16_ok = True 306 __props__ = () 307 308 def make_node(self, x, y): 309 ctx_name = infer_context_name(x, y) 310 x = as_gpuarray_variable(x, ctx_name) 311 y = as_gpuarray_variable(y, ctx_name) 312 assert x.ndim == 2 313 assert y.ndim == 2 314 assert x.dtype == y.dtype 315 otype = x.type.clone( 316 broadcastable=(x.type.broadcastable[0], y.type.broadcastable[1])) 317 return Apply(self, [x, y], [otype()]) 318 319 def perform(self, node, inputs, outputs): 320 x, y = inputs 321 322 out = pygpu.empty((x.shape[0], y.shape[1]), dtype=x.dtype, 323 context=x.context) 324 outputs[0][0] = blas.gemm(1., x, y, 0., out, 325 overwrite_c=True) 326 327 def c_code(self, node, name, inputs, outputs, sub): 328 dtype = node.inputs[0].dtype 329 typecode = pygpu.gpuarray.dtype_to_typecode(dtype) 330 vars = dict(A=inputs[0], B=inputs[1], dtype=dtype, out=outputs[0], 331 typecode=typecode, 332 fail=sub['fail'], name=name) 333 code = """ 334 double one = 1.; 335 double zero = 0.; 336 337 size_t dims[] = {0, 0}; 338 dims[0] = PyGpuArray_DIMS(%(A)s)[0]; 339 dims[1] = PyGpuArray_DIMS(%(B)s)[1]; 340 341 if (theano_prep_output(&%(out)s, 2, dims, %(typecode)s, GA_C_ORDER, 342 %(A)s->context)) { 343 %(fail)s 344 } 345 346 if (pygpu_blas_rgemm(cb_no_trans, cb_no_trans, 347 one, 348 %(A)s, %(B)s, 349 zero, 350 %(out)s, 0) == -1) { 351 %(fail)s 352 } 353 """ % vars 354 355 return code 356 357 def c_code_cache_version(self): 358 return (5,) 359 360gpu_dot22 = GpuDot22() 361 362 363class GpuGemmBatch(BlasOp): 364 params_type = ParamsType(inplace=bool_t) 365 __props__ = ('inplace',) 366 _f16_ok = True 367 368 def __init__(self, inplace=False): 369 self.inplace = inplace 370 if self.inplace: 371 self.destroy_map = {0: [0]} 372 373 def make_node(self, C, alpha, A, B, beta): 374 ctx_name = infer_context_name(C, A, B) 375 A = as_gpuarray_variable(A, ctx_name) 376 B = as_gpuarray_variable(B, ctx_name) 377 C = as_gpuarray_variable(C, ctx_name) 378 alpha = as_tensor_variable(alpha) 379 if alpha.dtype == 'float16': 380 alpha = alpha.astype('float32') 381 beta = as_tensor_variable(beta) 382 if beta.dtype == 'float16': 383 beta = beta.astype('float32') 384 assert alpha.ndim == 0 385 assert beta.ndim == 0 386 assert A.ndim == 3 387 assert B.ndim == 3 388 assert C.ndim == 3 389 assert A.dtype == B.dtype == C.dtype 390 if A.dtype in ('float32', 'float64'): 391 assert A.dtype == alpha.dtype == beta.dtype 392 else: 393 assert 'float32' == alpha.dtype == beta.dtype 394 return Apply(self, [C, alpha, A, B, beta], [C.type()]) 395 396 def c_headers(self): 397 return super(GpuGemmBatch, self).c_headers() + ['<gpuarray/blas.h>'] 398 399 def c_code(self, node, name, inp, out, sub): 400 vars = dict(out=out[0], C=inp[0], alpha=inp[1], A=inp[2], B=inp[3], 401 beta=inp[4], params=sub['params'], 402 fail=sub['fail'], name=name) 403 code = """ 404 int err; 405 if (%(params)s->inplace){ 406 if (!GpuArray_ISONESEGMENT(&%(C)s->ga)) { 407 %(out)s = theano_try_copy(%(out)s, %(C)s); 408 if (%(out)s == NULL) { 409 %(fail)s 410 } 411 } else { 412 Py_XDECREF(%(out)s); 413 %(out)s = %(C)s; 414 Py_INCREF(%(out)s); 415 } 416 } else { 417 %(out)s = theano_try_copy(%(out)s, %(C)s); 418 if (%(out)s == NULL) { 419 %(fail)s 420 } 421 } 422 err = GpuArray_rgemmBatch_3d( 423 cb_no_trans, cb_no_trans, 424 ((dtype_%(alpha)s *)PyArray_DATA(%(alpha)s))[0], 425 &%(A)s->ga, &%(B)s->ga, 426 ((dtype_%(beta)s *)PyArray_DATA(%(beta)s))[0], 427 &%(out)s->ga, 0); 428 if (err != GA_NO_ERROR) { 429 PyErr_Format(PyExc_RuntimeError, 430 "%%s", GpuArray_error(&%(A)s->ga, err)); 431 %(fail)s; 432 } 433 """ % vars 434 435 return code 436 437 def c_code_cache_version(self): 438 return (4,) 439 440gpugemmbatch_no_inplace = GpuGemmBatch(inplace=False) 441gpugemmbatch_inplace = GpuGemmBatch(inplace=True) 442 443 444class BaseGpuCorrMM(CGpuKernelBase): 445 """ 446 Base class for `GpuCorrMM`, `GpuCorrMM_gradWeights` and 447 `GpuCorrMM_gradInputs`. Cannot be used directly. 448 449 Parameters 450 ---------- 451 border_mode : {'valid', 'full', 'half'} 452 Additionally, the padding size could be directly specified by an integer, 453 a pair of integers, or two pairs of integers. 454 subsample 455 Perform subsampling of the output (default: (1, 1)). 456 filter_dilation 457 Perform subsampling of the input, also known as dilation (default: (1, 1)). 458 num_groups : 459 Divides the image, kernel and output tensors into num_groups 460 separate groups. Each which carry out convolutions separately (default : 1). 461 unshared 462 Perform unshared correlation (default: False) 463 """ 464 check_broadcast = False 465 __props__ = ('border_mode', 'subsample', 'filter_dilation', 'num_groups', 'unshared') 466 _f16_ok = True 467 468 def __init__(self, border_mode="valid", subsample=(1, 1), 469 filter_dilation=(1, 1), num_groups=1, unshared=False): 470 if isinstance(border_mode, integer_types): 471 if border_mode < 0: 472 raise ValueError( 473 'invalid border_mode {}, which must be a ' 474 'non-negative integer'.format(border_mode)) 475 border_mode = ((border_mode, border_mode),) * 2 476 elif isinstance(border_mode, tuple): 477 if len(border_mode) != 2: 478 raise ValueError( 479 'invalid border_mode {} which must be a ' 480 'tuple of length 2'.format(border_mode)) 481 border = () 482 for mode in border_mode: 483 if isinstance(mode, tuple) and len(mode) == 2 and \ 484 min(mode) >= 0: 485 border += ((int(mode[0]), int(mode[1])),) 486 elif mode >= 0: 487 border += ((int(mode), int(mode)),) 488 else: 489 raise ValueError( 490 'invalid border mode {}. The tuple can only contain ' 491 'integers or tuples of length 2'.format(border_mode)) 492 border_mode = border 493 elif border_mode not in ('valid', 'full', 'half'): 494 raise ValueError( 495 'invalid border_mode {}, which must be either ' 496 '"valid", "full", "half", an integer or a tuple ' 497 'of length 2'.format(border_mode)) 498 self.border_mode = border_mode 499 if len(subsample) != 2: 500 raise ValueError("subsample must have two elements") 501 if len(filter_dilation) != 2: 502 raise ValueError("filter_dilation must have two elements") 503 self.subsample = tuple(subsample) 504 self.filter_dilation = tuple(filter_dilation) 505 if num_groups < 1: 506 raise ValueError("Number of groups should be greater than 0") 507 self.num_groups = num_groups 508 CGpuKernelBase.__init__(self, ['c_code/corr_gemm.c']) 509 self.unshared = unshared 510 511 @property 512 def pad(self): 513 if self.border_mode != 'valid': 514 return self.border_mode 515 return ((0, 0),) * 2 516 517 def __str__(self): 518 return '%s{%s, %s, %s, %s, %s}' % ( 519 self.__class__.__name__, 520 self.border_mode, 521 str(self.subsample), 522 str(self.filter_dilation), 523 str(self.num_groups), 524 str(self.unshared)) 525 526 def __setstate__(self, d): 527 self.__dict__.update(d) 528 if not hasattr(self, 'num_groups'): 529 self.num_groups = 1 530 531 def flops(self, inp, outp): 532 """ 533 Useful with the hack in profilemode to print the MFlops. 534 535 """ 536 # if the output shape is correct, then this gives the correct 537 # flops for any direction, sampling, padding, and border mode 538 inputs, filters = inp 539 outputs, = outp 540 assert inputs[1] == (filters[1] * self.num_groups) 541 # nb mul and add by output pixel 542 flops = filters[2] * filters[3] * 2 543 # nb flops by output image 544 flops *= outputs[2] * outputs[3] 545 # nb patch multiplied 546 flops *= inputs[1] * filters[0] * inputs[0] / self.num_groups 547 return flops 548 549 def c_headers(self): 550 return ["<gpuarray/array.h>", "<gpuarray/blas.h>", "gpuarray_helper.h"] 551 552 def c_header_dirs(self): 553 return [gpuarray_helper_inc_dir()] 554 555 def c_code_cache_version(self): 556 # Raise this whenever modifying the C code (including the file). 557 return (12,) 558 559 def c_code_helper(self, bottom, weights, top, direction, sub, height=None, width=None): 560 """ 561 This generates the C code for GpuCorrMM (direction="forward"), 562 GpuCorrMM_gradWeights (direction="backprop weights"), and 563 GpuCorrMM_gradInputs (direction="backprop inputs"). 564 Depending on the direction, one of bottom, weights, top will 565 receive the output, while the other two serve as inputs. 566 567 Parameters 568 ---------- 569 bottom 570 Variable name of the input images in the forward pass, 571 or the gradient of the input images in backprop wrt. inputs 572 weights 573 Variable name of the filters in the forward pass, 574 or the gradient of the filters in backprop wrt. weights 575 top 576 Variable name of the output images / feature maps in the 577 forward pass, or the gradient of the outputs in the backprop passes 578 direction : {'forward', 'backprop weights', 'backprop inputs'} 579 "forward" to correlate bottom with weights and store results in top, 580 "backprop weights" to do a valid convolution of bottom with top 581 (swapping the first two dimensions) and store results in weights, 582 and "backprop inputs" to do a full convolution of top with weights 583 (swapping the first two dimensions) and store results in bottom. 584 sub 585 Dictionary of substitutions useable to help generating the C code. 586 height 587 Required if self.subsample[0] != 1, a variable giving the height of 588 the filters for direction="backprop weights" or the height of the 589 input images for direction="backprop inputs". 590 Required if self.border_mode == 'half', a variable giving the height 591 of the filters for direction="backprop weights". 592 Not required otherwise, but if a value is given this will be checked. 593 width 594 Required if self.subsample[1] != 1, a variable giving the width of 595 the filters for direction="backprop weights" or the width of the 596 input images for direction="backprop inputs". 597 Required if self.border_mode == 'half', a variable giving the width 598 of the filters for direction="backprop weights". 599 Not required otherwise, but if a value is given this will be checked. 600 601 """ 602 dH, dW = self.subsample 603 dilH, dilW = self.filter_dilation 604 numgroups = self.num_groups 605 unshared = int(self.unshared) 606 if self.border_mode == "half": 607 padH_l = padH_r = padW_l = padW_r = -1 608 elif self.border_mode == "full": 609 padH_l = padH_r = padW_l = padW_r = -2 610 elif isinstance(self.border_mode, tuple): 611 (padH_l, padH_r), (padW_l, padW_r) = self.border_mode 612 else: 613 assert self.border_mode == "valid" 614 padH_l = padH_r = padW_l = padW_r = 0 615 if direction == "forward": 616 direction = 0 617 out = top 618 elif direction == "backprop weights": 619 direction = 1 620 out = weights 621 elif direction == "backprop inputs": 622 direction = 2 623 out = bottom 624 else: 625 raise ValueError("direction must be one of 'forward', " 626 "'backprop weights', 'backprop inputs'") 627 # When subsampling, we cannot unambiguously infer the height and width 628 # of bottom and weights from top, so we require them to be given. 629 # Similarly, when pad="half", we cannot infer the weight size. 630 if height: 631 height = '(*(npy_int*)(PyArray_DATA(%s)))' % height 632 else: 633 if ((direction != 0) and (dH != 1)) or ((direction == 1) and (padH_l == -1 or padH_r == -1)): 634 raise ValueError("height must be given for backprop with vertical sampling or pad='half'") 635 height = '-1' 636 if width: 637 width = '(*(npy_int*)(PyArray_DATA(%s)))' % width 638 else: 639 if ((direction != 0) and (dW != 1)) or ((direction == 1) and (padW_l == -1 or padW_r == -1)): 640 raise ValueError("width must be given for backprop with horizontal sampling or pad='half'") 641 width = '-1' 642 643 sub = sub.copy() 644 sub.update(locals()) 645 646 return """ 647 // Mandatory args 648 int direction = %(direction)s; // forward, bprop weights, bprop inputs 649 650 // Optional args 651 size_t dH = %(dH)s; 652 size_t dW = %(dW)s; 653 size_t dilH = %(dilH)s; 654 size_t dilW = %(dilW)s; 655 int padH_l = %(padH_l)s; 656 int padH_r = %(padH_r)s; 657 int padW_l = %(padW_l)s; 658 int padW_r = %(padW_r)s; 659 int numgroups = %(numgroups)s; 660 int unshared = %(unshared)s; 661 662 PyGpuArrayObject * bottom = %(bottom)s; 663 PyGpuArrayObject * weights = %(weights)s; 664 PyGpuArrayObject * top = %(top)s; 665 PyGpuArrayObject * out2 = NULL; 666 667 int wdim, odim; 668 wdim = unshared ? 6 : 4; 669 odim = 4; //Can be set to 6 later for unshared backprop wrt weights 670 671 // Obtain or infer kernel width and height 672 // (we need to know it early to be able to handle auto-padding) 673 size_t kH, kW, dil_kH, dil_kW; 674 if (direction != 1) { 675 // weight is an input variable, we can just read its shape 676 kH = PyGpuArray_DIMS(weights)[wdim-2]; 677 kW = PyGpuArray_DIMS(weights)[wdim-1]; 678 } 679 else { 680 if (%(height)s != -1) { 681 // kernel height is specified (perhaps vertical subsampling or half padding) 682 kH = %(height)s; 683 } 684 else if (padH_l == -2 || padH_r == -2) { 685 // vertical full padding, we can infer the kernel height 686 kH = (2 - PyGpuArray_DIMS(bottom)[2] + (PyGpuArray_DIMS(top)[2] - 1) * dH - 1) / dilH + 1; 687 } 688 else { 689 // explicit padding, we can infer the kernel height 690 kH = (PyGpuArray_DIMS(bottom)[2] + padH_l + padH_r - (PyGpuArray_DIMS(top)[2] - 1) * dH - 1) / dilH + 1 ; 691 } 692 if (%(width)s != -1) { 693 kW = %(width)s; 694 } 695 else if (padW_l == -2 || padW_r == -2) { 696 kW = (2 - PyGpuArray_DIMS(bottom)[3] + (PyGpuArray_DIMS(top)[3] - 1) * dW - 1) / dilW + 1; 697 } 698 else { 699 kW = (PyGpuArray_DIMS(bottom)[3] + padW_l + padW_r - (PyGpuArray_DIMS(top)[3] - 1) * dW - 1) / dilW + 1; 700 } 701 } 702 703 // Implicit dilated kernel size 704 dil_kH = (kH - 1) * dilH + 1; 705 dil_kW = (kW - 1) * dilW + 1; 706 707 // Auto-padding if requested 708 if (padH_l == -1 || padH_r == -1) { // vertical half padding 709 padH_l = padH_r = dil_kH / 2; 710 } 711 else if (padH_l == -2 || padH_r == -2) { // vertical full padding 712 padH_l = padH_r = dil_kH - 1; 713 } 714 else if (padH_l < 0 || padH_r < 0) { 715 PyErr_SetString(PyExc_ValueError, "BaseGpuCorrMM: padH must be >= -2"); 716 %(fail)s 717 } 718 if (padW_l == -1 || padW_r == -1) { // horizontal half padding 719 padW_l = padW_r = dil_kW / 2; 720 } 721 else if (padW_l == -2 || padW_r == -2) { // horizontal full padding 722 padW_l = padW_r = dil_kW - 1; 723 } 724 else if (padW_l < 0 || padW_r < 0) { 725 PyErr_SetString(PyExc_ValueError, "BaseGpuCorrMM: padW must be >= -2"); 726 %(fail)s 727 } 728 729 // Infer output shape and type 730 // The inferred shape can be negative. 731 long long out_dim[6]; 732 size_t out_dim_size[6]; 733 out_dim[4] = out_dim[5] = 0; //Only used for unshared backprop wrt weights 734 out_dim_size[4] = out_dim_size[5] = 0; //Same 735 int out_typecode; 736 PyGpuContextObject *out_context; 737 switch(direction) { 738 case 0: // forward pass 739 // output is top: (batchsize, num_filters, height, width) 740 // height and width: top = (bottom + pad_l + pad_r - ((weight-1)*dil + 1)) / sample + 1 741 out_dim[0] = PyGpuArray_DIMS(bottom)[0]; 742 out_dim[1] = PyGpuArray_DIMS(weights)[0]; 743 out_dim[2] = (PyGpuArray_DIMS(bottom)[2] + padH_l + padH_r - ((PyGpuArray_DIMS(weights)[wdim-2]-1)*dilH + 1)) / dH + 1; 744 out_dim[3] = (PyGpuArray_DIMS(bottom)[3] + padW_l + padW_r - ((PyGpuArray_DIMS(weights)[wdim-1]-1)*dilW + 1)) / dW + 1; 745 out_typecode = bottom->ga.typecode; 746 out_context = bottom->context; 747 if (out_dim[0] < 0 || out_dim[1] < 0 || out_dim[2] <= 0 || out_dim[3] <= 0) 748 { 749 if (unshared) { 750 PyErr_Format(PyExc_ValueError, 751 "GpuCorrMM: impossible output shape\\n" 752 " bottom shape: %%ld x %%ld x %%ld x %%ld\\n" 753 " weights shape: %%ld x %%ld x %%ld x %%ld x %%ld x %%ld\\n" 754 " top shape: %%ld x %%ld x %%ld x %%ld\\n", 755 PyGpuArray_DIMS(bottom)[0], PyGpuArray_DIMS(bottom)[1], 756 PyGpuArray_DIMS(bottom)[2], PyGpuArray_DIMS(bottom)[3], 757 PyGpuArray_DIMS(weights)[0], PyGpuArray_DIMS(weights)[1], 758 PyGpuArray_DIMS(weights)[2], PyGpuArray_DIMS(weights)[3], 759 PyGpuArray_DIMS(weights)[4], PyGpuArray_DIMS(weights)[5], 760 out_dim[0], out_dim[1], out_dim[2], out_dim[3]); 761 %(fail)s 762 } 763 else { 764 PyErr_Format(PyExc_ValueError, 765 "GpuCorrMM: impossible output shape\\n" 766 " bottom shape: %%ld x %%ld x %%ld x %%ld\\n" 767 " weights shape: %%ld x %%ld x %%ld x %%ld\\n" 768 " top shape: %%ld x %%ld x %%ld x %%ld\\n", 769 PyGpuArray_DIMS(bottom)[0], PyGpuArray_DIMS(bottom)[1], 770 PyGpuArray_DIMS(bottom)[2], PyGpuArray_DIMS(bottom)[3], 771 PyGpuArray_DIMS(weights)[0], PyGpuArray_DIMS(weights)[1], 772 PyGpuArray_DIMS(weights)[2], PyGpuArray_DIMS(weights)[3], 773 out_dim[0], out_dim[1], out_dim[2], out_dim[3]); 774 %(fail)s 775 } 776 } 777 break; 778 case 1: // backprop wrt. weights 779 // output is weights: (num_filters, num_channels, height, width) or 780 // (num_filters, top_height, top_width, num_channels, height, width) -> for unshared 781 // height and width: weights = (bottom + 2*pad - (top - 1) * sample - 1) / dil + 1 782 out_dim[0] = PyGpuArray_DIMS(top)[1]; 783 if (unshared){ 784 odim = 6; 785 out_dim[1] = PyGpuArray_DIMS(top)[2]; 786 out_dim[2] = PyGpuArray_DIMS(top)[3]; 787 } 788 out_dim[wdim-3] = PyGpuArray_DIMS(bottom)[1] / numgroups; 789 out_dim[wdim-2] = kH; // already inferred further above 790 out_dim[wdim-1] = kW; // how convenient 791 out_typecode = top->ga.typecode; 792 out_context = top->context; 793 if (unshared) { 794 if (out_dim[0] < 0 || out_dim[1] <= 0 || out_dim[2] <= 0 || out_dim[3] < 0 795 || out_dim[4] <= 0 || out_dim[5] <= 0){ 796 PyErr_Format(PyExc_ValueError, 797 "GpuCorrMM backprop wrt. weights: impossible output shape\\n" 798 " bottom shape: %%ld x %%ld x %%ld x %%ld\\n" 799 " weights shape: %%ld x %%ld x %%ld x %%ld x %%ld x %%ld\\n" 800 " top shape: %%ld x %%ld x %%ld x %%ld\\n", 801 PyGpuArray_DIMS(bottom)[0], PyGpuArray_DIMS(bottom)[1], 802 PyGpuArray_DIMS(bottom)[2], PyGpuArray_DIMS(bottom)[3], 803 out_dim[0], out_dim[1], out_dim[2], out_dim[3], 804 out_dim[4], out_dim[5], 805 PyGpuArray_DIMS(top)[0], PyGpuArray_DIMS(top)[1], 806 PyGpuArray_DIMS(top)[2], PyGpuArray_DIMS(top)[3]); 807 %(fail)s 808 } 809 } 810 else { 811 if (out_dim[0] < 0 || out_dim[1] < 0 || out_dim[2] <= 0 || out_dim[3] <= 0) 812 { 813 PyErr_Format(PyExc_ValueError, 814 "GpuCorrMM backprop wrt. weights: impossible output shape\\n" 815 " bottom shape: %%ld x %%ld x %%ld x %%ld\\n" 816 " weights shape: %%ld x %%ld x %%ld x %%ld\\n" 817 " top shape: %%ld x %%ld x %%ld x %%ld\\n", 818 PyGpuArray_DIMS(bottom)[0], PyGpuArray_DIMS(bottom)[1], 819 PyGpuArray_DIMS(bottom)[2], PyGpuArray_DIMS(bottom)[3], 820 out_dim[0], out_dim[1], out_dim[2], out_dim[3], 821 PyGpuArray_DIMS(top)[0], PyGpuArray_DIMS(top)[1], 822 PyGpuArray_DIMS(top)[2], PyGpuArray_DIMS(top)[3]); 823 %(fail)s 824 } 825 } 826 break; 827 case 2: // backprop wrt. inputs 828 // output is bottom: (batchsize, num_channels, height, width) 829 // height and width: bottom = (top - 1) * sample + (weights-1)*dil + 1 - 2*pad 830 out_dim[0] = PyGpuArray_DIMS(top)[0]; 831 out_dim[1] = PyGpuArray_DIMS(weights)[wdim-3] * numgroups; 832 out_dim[2] = (%(height)s != -1) ? %(height)s : (PyGpuArray_DIMS(top)[2] - 1) * dH + (PyGpuArray_DIMS(weights)[wdim-2]-1)*dilH + 1 - padH_l - padH_r; 833 out_dim[3] = (%(width)s != -1) ? %(width)s : (PyGpuArray_DIMS(top)[3] - 1) * dW + (PyGpuArray_DIMS(weights)[wdim-1]-1)*dilW + 1 - padW_l - padW_r; 834 out_typecode = top->ga.typecode; 835 out_context = top->context; 836 if (unshared) { 837 if (out_dim[0] < 0 || out_dim[1] < 0 || out_dim[2] <= 0 || out_dim[3] <= 0) 838 { 839 PyErr_Format(PyExc_ValueError, 840 "GpuCorrMM backprop wrt. inputs: impossible output shape\\n" 841 " bottom shape: %%ld x %%ld x %%ld x %%ld\\n" 842 " weight shape: %%ld x %%ld x %%ld x %%ld x %%ld x %%ld\\n" 843 " top shape: %%ld x %%ld x %%ld x %%ld\\n", 844 out_dim[0], out_dim[1], out_dim[2], out_dim[3], 845 PyGpuArray_DIMS(weights)[0], PyGpuArray_DIMS(weights)[1], 846 PyGpuArray_DIMS(weights)[2], PyGpuArray_DIMS(weights)[3], 847 PyGpuArray_DIMS(weights)[4], PyGpuArray_DIMS(weights)[5], 848 PyGpuArray_DIMS(top)[0], PyGpuArray_DIMS(top)[1], 849 PyGpuArray_DIMS(top)[2], PyGpuArray_DIMS(top)[3]); 850 %(fail)s 851 } 852 } 853 else { 854 if (out_dim[0] < 0 || out_dim[1] < 0 || out_dim[2] <= 0 || out_dim[3] <= 0) 855 { 856 PyErr_Format(PyExc_ValueError, 857 "GpuCorrMM backprop wrt. inputs: impossible output shape\\n" 858 " bottom shape: %%ld x %%ld x %%ld x %%ld\\n" 859 " weight shape: %%ld x %%ld x %%ld x %%ld\\n" 860 " top shape: %%ld x %%ld x %%ld x %%ld\\n", 861 out_dim[0], out_dim[1], out_dim[2], out_dim[3], 862 PyGpuArray_DIMS(weights)[0], PyGpuArray_DIMS(weights)[1], 863 PyGpuArray_DIMS(weights)[2], PyGpuArray_DIMS(weights)[3], 864 PyGpuArray_DIMS(top)[0], PyGpuArray_DIMS(top)[1], 865 PyGpuArray_DIMS(top)[2], PyGpuArray_DIMS(top)[3]); 866 %(fail)s 867 } 868 } 869 break; 870 default: 871 PyErr_SetString(PyExc_ValueError, "BaseGpuCorrMM: direction must be 0, 1, or 2\\n"); 872 %(fail)s 873 } 874 875 out_dim_size[0] = (size_t)out_dim[0]; 876 out_dim_size[1] = (size_t)out_dim[1]; 877 out_dim_size[2] = (size_t)out_dim[2]; 878 out_dim_size[3] = (size_t)out_dim[3]; 879 880 if (odim == 6) { 881 out_dim_size[4] = (size_t)out_dim[4]; 882 out_dim_size[5] = (size_t)out_dim[5]; 883 } 884 885 // Prepare output array 886 if (theano_prep_output(&%(out)s, odim, out_dim_size, out_typecode, GA_C_ORDER, out_context) != 0) 887 { 888 if (odim == 4) { 889 PyErr_Format(PyExc_RuntimeError, 890 "BaseGpuCorrMM: Failed to allocate output of %%lld x %%lld x %%lld x %%lld", 891 out_dim[0], out_dim[1], out_dim[2], out_dim[3]); 892 } 893 if (odim == 6) { 894 PyErr_Format(PyExc_RuntimeError, 895 "BaseGpuCorrMM: Failed to allocate output of %%lld x %%lld x %%lld x %%lld %%lld %%lld", 896 out_dim[0], out_dim[1], out_dim[2], out_dim[3], out_dim[4], out_dim[5]); 897 } 898 %(fail)s 899 } 900 if (!GpuArray_IS_C_CONTIGUOUS(&%(out)s->ga)) { 901 PyErr_SetString(PyExc_ValueError, "Only contiguous outputs are supported."); 902 %(fail)s 903 } 904 905 // Call GPU code 906 out2 = corrMM(%(bottom)s, %(weights)s, %(top)s, direction, dH, dW, dilH, dilW, 907 padH_l, padH_r, padW_l, padW_r, numgroups, unshared); 908 if (out2==NULL){ 909 %(fail)s 910 } 911 assert (out2 == %(out)s); 912 913""" % sub 914 915 916class GpuCorrMM(BaseGpuCorrMM): 917 """ 918 GPU correlation implementation using Matrix Multiplication. 919 920 Parameters 921 ---------- 922 border_mode 923 The width of a border of implicit zeros to pad the 924 input with. Must be a tuple with 2 elements giving the numbers of rows 925 and columns to pad on each side, or a single integer to pad the same 926 on all sides, or a string shortcut setting the padding at runtime: 927 ``'valid'`` for ``(0, 0)`` (valid convolution, no padding), ``'full'`` 928 for ``(kernel_rows - 1, kernel_columns - 1)`` (full convolution), 929 ``'half'`` for ``(kernel_rows // 2, kernel_columns // 2)`` (same 930 convolution for odd-sized kernels). 931 If it is a tuple containing 2 pairs of integers, then these specify 932 the padding to be applied on each side ((left, right), (top, bottom)). 933 Otherwise, each width is applied twice, once per side (left and right, 934 top and bottom). 935 subsample 936 The subsample operation applied to each output image. 937 Should be a tuple with 2 elements. 938 `(sv, sh)` is equivalent to `GpuCorrMM(...)(...)[:,:,::sv, ::sh]`, 939 but faster. 940 Set to `(1, 1)` to disable subsampling. 941 filter_dilation 942 The filter dilation operation applied to each input image. 943 Should be a tuple with 2 elements. 944 Set to `(1, 1)` to disable filter dilation. 945 num_groups 946 The number of distinct groups the image and kernel must be 947 divided into. 948 should be an int 949 set to 1 to disable grouped convolution 950 unshared 951 Perform unshared correlation (default: False) 952 953 Notes 954 ----- 955 Currently, the Op requires the inputs, filters and outputs to be 956 C-contiguous. Use :func:`gpu_contiguous 957 <theano.gpuarray.basic_ops.gpu_contiguous>` on these arguments 958 if needed. 959 960 You can either enable the Theano flag `optimizer_including=conv_gemm` 961 to automatically replace all convolution operations with `GpuCorrMM` 962 or one of its gradients, or you can use it as a replacement for 963 :func:`conv2d <theano.tensor.nnet.conv.conv2d>`, called as 964 `GpuCorrMM(subsample=...)(image, filters)`. The latter is currently 965 faster, but note that it computes a correlation -- if you need to 966 compute a convolution, flip the filters as `filters[:,:,::-1,::-1]`. 967 968 """ 969 def __init__(self, border_mode="valid", 970 subsample=(1, 1), 971 filter_dilation=(1, 1), num_groups=1, unshared=False): 972 super(GpuCorrMM, self).__init__(border_mode, subsample, 973 filter_dilation, num_groups, unshared) 974 975 def make_node(self, img, kern): 976 ctx_name = infer_context_name(img, kern) 977 img = as_gpuarray_variable(img, ctx_name) 978 kern = as_gpuarray_variable(kern, ctx_name) 979 if img.type.ndim != 4: 980 raise TypeError('img must be 4D tensor') 981 if self.unshared: 982 if kern.type.ndim != 6: 983 raise TypeError('kern must be 6D tensor') 984 else: 985 if kern.type.ndim != 4: 986 raise TypeError('kern must be 4D tensor') 987 988 broadcastable = [img.type.broadcastable[0], kern.type.broadcastable[0], 989 False, False] 990 return Apply(self, [img, kern], [GpuArrayType(dtype=img.dtype, 991 context_name=ctx_name, 992 broadcastable=broadcastable)()]) 993 994 def c_code(self, node, nodename, inp, out_, sub): 995 bottom, weights = inp 996 top, = out_ 997 direction = "forward" 998 return super(GpuCorrMM, self).c_code_helper(bottom, weights, top, direction, sub) 999 1000 def grad(self, inp, grads): 1001 bottom, weights = inp 1002 top, = grads 1003 top = gpu_contiguous(top) 1004 d_bottom = GpuCorrMM_gradInputs(self.border_mode, 1005 self.subsample, 1006 self.filter_dilation, 1007 self.num_groups, 1008 self.unshared)( 1009 weights, top, bottom.shape[-2:]) 1010 d_weights = GpuCorrMM_gradWeights(self.border_mode, 1011 self.subsample, 1012 self.filter_dilation, 1013 self.num_groups, 1014 self.unshared)( 1015 bottom, top, weights.shape[-2:]) 1016 return d_bottom, d_weights 1017 1018 1019class GpuCorrMM_gradWeights(BaseGpuCorrMM): 1020 """ 1021 Gradient wrt. filters for `GpuCorrMM`. 1022 1023 Notes 1024 ----- 1025 You will not want to use this directly, but rely on Theano's automatic 1026 differentiation or graph optimization to use it as needed. 1027 1028 """ 1029 1030 def __init__(self, border_mode="valid", 1031 subsample=(1, 1), 1032 filter_dilation=(1, 1), 1033 num_groups=1, 1034 unshared=False): 1035 super(GpuCorrMM_gradWeights, self).__init__(border_mode, 1036 subsample, 1037 filter_dilation, num_groups, 1038 unshared) 1039 1040 def make_node(self, img, topgrad, shape=None): 1041 ctx_name = infer_context_name(img, topgrad) 1042 img = as_gpuarray_variable(img, ctx_name) 1043 topgrad = as_gpuarray_variable(topgrad, ctx_name) 1044 if img.type.ndim != 4: 1045 raise TypeError('img must be 4D tensor') 1046 if topgrad.type.ndim != 4: 1047 raise TypeError('topgrad must be 4D tensor') 1048 if shape is None: 1049 if self.subsample != (1, 1) or self.border_mode == "half": 1050 raise ValueError('shape must be given if subsample != (1, 1)' 1051 ' or border_mode == "half"') 1052 height_width = [] 1053 else: 1054 height_width = [shape[0], shape[1]] 1055 assert shape[0].ndim == 0 1056 assert shape[1].ndim == 0 1057 1058 if self.unshared: 1059 broadcastable = [topgrad.type.broadcastable[1], False, False, 1060 img.type.broadcastable[1], False, False] 1061 else: 1062 broadcastable = [topgrad.type.broadcastable[1], img.type.broadcastable[1], 1063 False, False] 1064 return Apply(self, [img, topgrad] + height_width, [GpuArrayType(dtype=img.dtype, 1065 context_name=ctx_name, 1066 broadcastable=broadcastable)()]) 1067 1068 def c_code(self, node, nodename, inp, out_, sub): 1069 bottom, top = inp[:2] 1070 height, width = inp[2:] or (None, None) 1071 weights, = out_ 1072 direction = "backprop weights" 1073 return super(GpuCorrMM_gradWeights, self).c_code_helper(bottom, weights, top, direction, sub, height, width) 1074 1075 def grad(self, inp, grads): 1076 bottom, top = inp[:2] 1077 weights, = grads 1078 weights = gpu_contiguous(weights) 1079 d_bottom = GpuCorrMM_gradInputs(self.border_mode, 1080 self.subsample, 1081 self.filter_dilation, 1082 self.num_groups, 1083 self.unshared)(weights, 1084 top, 1085 bottom.shape[-2:]) 1086 d_top = GpuCorrMM( 1087 self.border_mode, self.subsample, self.filter_dilation, self.num_groups, self.unshared)(bottom, weights) 1088 d_height_width = ( 1089 theano.gradient.DisconnectedType()(), 1090 ) * 2 if len(inp) == 4 else () 1091 return (d_bottom, d_top) + d_height_width 1092 1093 def connection_pattern(self, node): 1094 if node.nin == 2: 1095 return [[1], [1]] 1096 else: 1097 return [[1], [1], [0], [0]] # no connection to height, width 1098 1099 1100class GpuCorrMM_gradInputs(BaseGpuCorrMM): 1101 """ 1102 Gradient wrt. inputs for `GpuCorrMM`. 1103 1104 Notes 1105 ----- 1106 You will not want to use this directly, but rely on Theano's automatic 1107 differentiation or graph optimization to use it as needed. 1108 1109 """ 1110 1111 def __init__(self, border_mode="valid", 1112 subsample=(1, 1), 1113 filter_dilation=(1, 1), 1114 num_groups=1, 1115 unshared=False): 1116 super(GpuCorrMM_gradInputs, self).__init__(border_mode, subsample, 1117 filter_dilation, num_groups, 1118 unshared) 1119 1120 def make_node(self, kern, topgrad, shape=None): 1121 ctx_name = infer_context_name(kern, topgrad) 1122 kern = as_gpuarray_variable(kern, ctx_name) 1123 topgrad = as_gpuarray_variable(topgrad, ctx_name) 1124 if self.unshared: 1125 if kern.type.ndim != 6: 1126 raise TypeError('kern must be 6D tensor') 1127 else: 1128 if kern.type.ndim != 4: 1129 raise TypeError('kern must be 4D tensor') 1130 if topgrad.type.ndim != 4: 1131 raise TypeError('topgrad must be 4D tensor') 1132 if shape is None: 1133 if self.subsample != (1, 1): 1134 raise ValueError('shape must be given if subsample != (1, 1)') 1135 height_width = [] 1136 else: 1137 height_width = [shape[0], shape[1]] 1138 assert shape[0].ndim == 0 1139 assert shape[1].ndim == 0 1140 1141 if self.num_groups > 1: 1142 broadcastable = [topgrad.type.broadcastable[0], False, 1143 False, False] 1144 else: 1145 broadcastable = [topgrad.type.broadcastable[0], kern.type.broadcastable[-3], 1146 False, False] 1147 return Apply(self, [kern, topgrad] + height_width, [GpuArrayType(dtype=topgrad.dtype, 1148 context_name=ctx_name, 1149 broadcastable=broadcastable)()]) 1150 1151 def c_code(self, node, nodename, inp, out_, sub): 1152 weights, top = inp[:2] 1153 height, width = inp[2:] or (None, None) 1154 bottom, = out_ 1155 direction = "backprop inputs" 1156 return super(GpuCorrMM_gradInputs, self).c_code_helper(bottom, weights, top, direction, sub, height, width) 1157 1158 def grad(self, inp, grads): 1159 weights, top = inp[:2] 1160 bottom, = grads 1161 bottom = gpu_contiguous(bottom) 1162 d_weights = GpuCorrMM_gradWeights(self.border_mode, 1163 self.subsample, 1164 self.filter_dilation, 1165 self.num_groups, 1166 self.unshared)(bottom, 1167 top, 1168 weights.shape[-2:]) 1169 d_top = GpuCorrMM(self.border_mode, 1170 self.subsample, 1171 self.filter_dilation, 1172 self.num_groups, 1173 self.unshared)(bottom, weights) 1174 d_height_width = ( 1175 theano.gradient.DisconnectedType()(), 1176 ) * 2 if len(inp) == 4 else () 1177 return (d_weights, d_top) + d_height_width 1178 1179 def connection_pattern(self, node): 1180 if node.nin == 2: 1181 return [[1], [1]] 1182 else: 1183 return [[1], [1], [0], [0]] # no connection to height, width 1184 1185 1186class BaseGpuCorr3dMM(CGpuKernelBase): 1187 """ 1188 Base class for `GpuCorr3dMM`, `GpuCorr3dMM_gradWeights` and 1189 `GpuCorr3dMM_gradInputs`. Cannot be used directly. 1190 1191 Parameters 1192 ---------- 1193 border_mode : {'valid', 'full', 'half'} 1194 Additionally, the padding size could be directly specified by an integer 1195 or a pair of integers 1196 subsample 1197 Perform subsampling of the output (default: (1, 1, 1)). 1198 filter_dilation 1199 Perform subsampling of the input, also known as dilation (default: (1, 1, 1)). 1200 num_groups : 1201 Divides the image, kernel and output tensors into num_groups 1202 separate groups. Each which carry out convolutions separately (default : 1). 1203 1204 """ 1205 check_broadcast = False 1206 __props__ = ('border_mode', 'subsample', 'filter_dilation', 'num_groups') 1207 _f16_ok = True 1208 1209 def __init__(self, border_mode="valid", subsample=(1, 1, 1), 1210 filter_dilation=(1, 1, 1), num_groups=1): 1211 if isinstance(border_mode, integer_types): 1212 border_mode = (border_mode, border_mode, border_mode) 1213 if isinstance(border_mode, tuple): 1214 pad_h, pad_w, pad_d = map(int, border_mode) 1215 border_mode = (pad_h, pad_w, pad_d) 1216 if not ((isinstance(border_mode, tuple) and min(border_mode) >= 0) or 1217 border_mode in ('valid', 'full', 'half')): 1218 raise ValueError( 1219 'invalid border_mode {}, which must be either ' 1220 '"valid", "full", "half", an integer or a tuple of' 1221 ' three integers'.format(border_mode)) 1222 self.border_mode = border_mode 1223 if len(subsample) != 3: 1224 raise ValueError("subsample must have three elements") 1225 if len(filter_dilation) != 3: 1226 raise ValueError("filter_dilation must have three elements") 1227 self.subsample = tuple(subsample) 1228 self.filter_dilation = tuple(filter_dilation) 1229 if num_groups < 1: 1230 raise ValueError("Number of groups should be greater than 0") 1231 self.num_groups = num_groups 1232 CGpuKernelBase.__init__(self, ['c_code/corr3d_gemm.c']) 1233 1234 @property 1235 def pad(self): 1236 if self.border_mode != 'valid': 1237 return self.border_mode 1238 return (0, 0, 0) 1239 1240 def __str__(self): 1241 return '%s{%s, %s, %s, %s}' % ( 1242 self.__class__.__name__, 1243 self.border_mode, 1244 str(self.subsample), 1245 str(self.filter_dilation), 1246 str(self.num_groups)) 1247 1248 def __setstate__(self, d): 1249 self.__dict__.update(d) 1250 if not hasattr(self, 'num_groups'): 1251 self.num_groups = 1 1252 1253 def flops(self, inp, outp): 1254 """ 1255 Useful with the hack in profilemode to print the MFlops. 1256 1257 """ 1258 # if the output shape is correct, then this gives the correct 1259 # flops for any direction, sampling, padding, and border mode 1260 inputs, filters = inp 1261 outputs, = outp 1262 assert inputs[1] == (filters[1] * self.num_groups) 1263 # nb mul and add by output pixel 1264 flops = filters[2] * filters[3] * filters[4] * 2 1265 # nb flops by output image 1266 flops *= outputs[2] * outputs[3] * outputs[4] 1267 # nb patch multiplied 1268 flops *= inputs[1] * filters[0] * inputs[0] / self.num_groups 1269 return flops 1270 1271 def c_headers(self): 1272 return ["<gpuarray/array.h>", "<gpuarray/blas.h>", "gpuarray_helper.h"] 1273 1274 def c_header_dirs(self): 1275 return [gpuarray_helper_inc_dir()] 1276 1277 def c_code_cache_version(self): 1278 # raise this whenever modifying the code below. 1279 return (8,) 1280 1281 def c_code_helper(self, bottom, weights, top, direction, sub, 1282 height=None, width=None, depth=None): 1283 """ 1284 This generates the C code for GpuCorr3dMM (direction="forward"), 1285 GpuCorr3dMM_gradWeights (direction="backprop weights"), and 1286 GpuCorr3dMM_gradInputs (direction="backprop inputs"). 1287 Depending on the direction, one of bottom, weights, top will 1288 receive the output, while the other two serve as inputs. 1289 1290 Parameters 1291 ---------- 1292 bottom 1293 Variable name of the input images in the forward pass, 1294 or the gradient of the input images in backprop wrt. inputs 1295 weights 1296 Variable name of the filters in the forward pass, 1297 or the gradient of the filters in backprop wrt. weights 1298 top 1299 Variable name of the output images / feature maps in the 1300 forward pass, or the gradient of the outputs in the backprop passes 1301 direction : {'forward', 'backprop weights', 'backprop inputs'} 1302 "forward" to correlate bottom with weights and store results in top, 1303 "backprop weights" to do a valid convolution of bottom with top 1304 (swapping the first two dimensions) and store results in weights, 1305 and "backprop inputs" to do a full convolution of top with weights 1306 (swapping the first two dimensions) and store results in bottom. 1307 sub 1308 Dictionary of substitutions useable to help generating the C code. 1309 height 1310 Required if self.subsample[0] != 1, a variable giving the height of 1311 the filters for direction="backprop weights" or the height of the 1312 input images for direction="backprop inputs". 1313 Required if self.border_mode == 'half', a variable giving the height 1314 of the filters for direction="backprop weights". 1315 Not required otherwise, but if a value is given this will be checked. 1316 width 1317 Required if self.subsample[1] != 1, a variable giving the width of 1318 the filters for direction="backprop weights" or the width of the 1319 input images for direction="backprop inputs". 1320 Required if self.border_mode == 'half', a variable giving the width 1321 of the filters for direction="backprop weights". 1322 Not required otherwise, but if a value is given this will be checked. 1323 depth 1324 Required if self.subsample[2] != 1, a variable giving the depth of 1325 the filters for direction="backprop weights" or the depth of the 1326 input images for direction="backprop inputs". 1327 Required if self.border_mode == 'half', a variable giving the depth 1328 of the filters for direction="backprop weights". 1329 Not required otherwise, but if a value is given this will be checked. 1330 1331 """ 1332 dH, dW, dD = self.subsample 1333 dilH, dilW, dilD = self.filter_dilation 1334 numgroups = self.num_groups 1335 if self.border_mode == "half": 1336 padH = padW = padD = -1 1337 elif self.border_mode == "full": 1338 padH = padW = padD = -2 1339 elif isinstance(self.border_mode, tuple): 1340 padH, padW, padD = self.border_mode 1341 else: 1342 assert self.border_mode == "valid" 1343 padH = padW = padD = 0 1344 if direction == "forward": 1345 direction = 0 1346 out = top 1347 elif direction == "backprop weights": 1348 direction = 1 1349 out = weights 1350 elif direction == "backprop inputs": 1351 direction = 2 1352 out = bottom 1353 else: 1354 raise ValueError("direction must be one of 'forward', " 1355 "'backprop weights', 'backprop inputs'") 1356 # When subsampling, we cannot unambiguously infer the height and width 1357 # of bottom and weights from top, so we require them to be given. 1358 # Similarly, when pad="half", we cannot infer the weight size. 1359 if height: 1360 height = '(*(npy_int*)(PyArray_DATA(%s)))' % height 1361 else: 1362 if ((direction != 0) and (dH != 1)) or ((direction == 1) and (padH == -1)): 1363 raise ValueError("height must be given for backprop with vertical sampling or pad='half'") 1364 height = '-1' 1365 if width: 1366 width = '(*(npy_int*)(PyArray_DATA(%s)))' % width 1367 else: 1368 if ((direction != 0) and (dW != 1)) or ((direction == 1) and (padW == -1)): 1369 raise ValueError("width must be given for backprop with horizontal sampling or pad='half'") 1370 width = '-1' 1371 if depth: 1372 depth = '(*(npy_int*)(PyArray_DATA(%s)))' % depth 1373 else: 1374 if ((direction != 0) and (dD != 1)) or ((direction == 1) and (padD == -1)): 1375 raise ValueError("depth must be given for backprop with horizontal sampling or pad='half'") 1376 depth = '-1' 1377 1378 sub = sub.copy() 1379 sub.update(locals()) 1380 1381 return """ 1382 // Mandatory args 1383 int direction = %(direction)s; // forward, bprop weights, bprop inputs 1384 1385 // Optional args 1386 size_t dH = %(dH)s; 1387 size_t dW = %(dW)s; 1388 size_t dD = %(dD)s; 1389 size_t dilH = %(dilH)s; 1390 size_t dilW = %(dilW)s; 1391 size_t dilD = %(dilD)s; 1392 int padH = %(padH)s; 1393 int padW = %(padW)s; 1394 int padD = %(padD)s; 1395 int numgroups = %(numgroups)s; 1396 1397 PyGpuArrayObject * bottom = %(bottom)s; 1398 PyGpuArrayObject * weights = %(weights)s; 1399 PyGpuArrayObject * top = %(top)s; 1400 PyGpuArrayObject * out2 = NULL; 1401 1402 // Obtain or infer kernel height, width and depth 1403 // (we need to know it early to be able to handle auto-padding) 1404 size_t kH, kW, kD, dil_kH, dil_kW, dil_kD; 1405 if (direction != 1) { 1406 // weight is an input variable, we can just read its shape 1407 kH = PyGpuArray_DIMS(weights)[2]; 1408 kW = PyGpuArray_DIMS(weights)[3]; 1409 kD = PyGpuArray_DIMS(weights)[4]; 1410 } 1411 else { 1412 if (%(height)s != -1) { 1413 // kernel height is specified (perhaps vertical subsampling or half padding) 1414 kH = %(height)s; 1415 } 1416 else if (padH == -2) { 1417 // vertical full padding, we can infer the kernel height 1418 kH = (2 - PyGpuArray_DIMS(bottom)[2] + (PyGpuArray_DIMS(top)[2] - 1) * dH - 1) / dilH + 1; 1419 } 1420 else { 1421 // explicit padding, we can infer the kernel height 1422 kH = (PyGpuArray_DIMS(bottom)[2] + 2*padH - (PyGpuArray_DIMS(top)[2] - 1) * dH - 1) / dilH + 1 ; 1423 } 1424 if (%(width)s != -1) { 1425 kW = %(width)s; 1426 } 1427 else if (padW == -2) { 1428 kW = (2 - PyGpuArray_DIMS(bottom)[3] + (PyGpuArray_DIMS(top)[3] - 1) * dW - 1) / dilW + 1; 1429 } 1430 else { 1431 kW = (PyGpuArray_DIMS(bottom)[3] + 2*padW - (PyGpuArray_DIMS(top)[3] - 1) * dW - 1) / dilW + 1; 1432 } 1433 if (%(depth)s != -1) { 1434 kD = %(depth)s; 1435 } 1436 else if (padD == -2) { 1437 kD = (2 - PyGpuArray_DIMS(bottom)[4] + (PyGpuArray_DIMS(top)[4] - 1) * dD - 1) / dilD + 1; 1438 } 1439 else { 1440 kD = (PyGpuArray_DIMS(bottom)[4] + 2*padD - (PyGpuArray_DIMS(top)[4] - 1) * dD - 1) / dilD + 1; 1441 } 1442 } 1443 1444 // Implicit dilated kernel size 1445 dil_kH = (kH - 1) * dilH + 1; 1446 dil_kW = (kW - 1) * dilW + 1; 1447 dil_kD = (kD - 1) * dilD + 1; 1448 1449 // Auto-padding if requested 1450 if (padH == -1) { // vertical half padding 1451 padH = dil_kH / 2; 1452 } 1453 else if (padH == -2) { // vertical full padding 1454 padH = dil_kH - 1; 1455 } 1456 else if (padH < 0) { 1457 PyErr_SetString(PyExc_ValueError, "BaseGpuCorr3dMM: padH must be >= -2"); 1458 %(fail)s 1459 } 1460 if (padW == -1) { // horizontal half padding 1461 padW = dil_kW / 2; 1462 } 1463 else if (padW == -2) { // horizontal full padding 1464 padW = dil_kW - 1; 1465 } 1466 else if (padW < 0) { 1467 PyErr_SetString(PyExc_ValueError, "BaseGpuCorr3dMM: padW must be >= -2"); 1468 %(fail)s 1469 } 1470 if (padD == -1) { // depth half padding 1471 padD = dil_kD / 2; 1472 } 1473 else if (padD == -2) { // depth full padding 1474 padD = dil_kD - 1; 1475 } 1476 else if (padD < 0) { 1477 PyErr_SetString(PyExc_ValueError, "BaseGpuCorr3dMM: padD must be >= -2"); 1478 %(fail)s 1479 } 1480 1481 // Infer output shape and type 1482 // The inferred shape can be negative. 1483 long long out_dim[5]; 1484 size_t out_dim_size[5]; 1485 int out_typecode; 1486 PyGpuContextObject *out_context; 1487 switch(direction) { 1488 case 0: // forward pass 1489 // output is top: (batchsize, num_filters, height, width, depth) 1490 // height, width and depth: top = (bottom + 2*pad - ((weight-1)*dil + 1)) / sample + 1 1491 out_dim[0] = PyGpuArray_DIMS(bottom)[0]; 1492 out_dim[1] = PyGpuArray_DIMS(weights)[0]; 1493 out_dim[2] = (PyGpuArray_DIMS(bottom)[2] + 2*padH - ((PyGpuArray_DIMS(weights)[2]-1)*dilH + 1)) / dH + 1; 1494 out_dim[3] = (PyGpuArray_DIMS(bottom)[3] + 2*padW - ((PyGpuArray_DIMS(weights)[3]-1)*dilW + 1)) / dW + 1; 1495 out_dim[4] = (PyGpuArray_DIMS(bottom)[4] + 2*padD - ((PyGpuArray_DIMS(weights)[4]-1)*dilD + 1)) / dD + 1; 1496 out_typecode = bottom->ga.typecode; 1497 out_context = bottom->context; 1498 if (out_dim[0] < 0 || out_dim[1] < 0 || out_dim[2] <= 0 || out_dim[3] <= 0 || out_dim[4] <= 0) 1499 { 1500 PyErr_Format(PyExc_ValueError, 1501 "GpuCorr3dMM: impossible output shape\\n" 1502 " bottom shape: %%ld x %%ld x %%ld x %%ld x %%ld\\n" 1503 " weights shape: %%ld x %%ld x %%ld x %%ld x %%ld\\n" 1504 " top shape: %%ld x %%ld x %%ld x %%ld x %%ld\\n", 1505 PyGpuArray_DIMS(bottom)[0], PyGpuArray_DIMS(bottom)[1], 1506 PyGpuArray_DIMS(bottom)[2], PyGpuArray_DIMS(bottom)[3], 1507 PyGpuArray_DIMS(bottom)[4], 1508 PyGpuArray_DIMS(weights)[0], PyGpuArray_DIMS(weights)[1], 1509 PyGpuArray_DIMS(weights)[2], PyGpuArray_DIMS(weights)[3], 1510 PyGpuArray_DIMS(weights)[4], 1511 out_dim[0], out_dim[1], out_dim[2], out_dim[3], out_dim[4]); 1512 %(fail)s 1513 } 1514 break; 1515 case 1: // backprop wrt. weights 1516 // output is weights: (num_filters, num_channels, height, width, depth) 1517 // height, width and depth: weights = (bottom + 2*pad - (top - 1) * sample - 1) / dil + 1 1518 out_dim[0] = PyGpuArray_DIMS(top)[1]; 1519 out_dim[1] = PyGpuArray_DIMS(bottom)[1] / numgroups; 1520 out_dim[2] = kH; // already inferred further above 1521 out_dim[3] = kW; // how convenient 1522 out_dim[4] = kD; 1523 out_typecode = top->ga.typecode; 1524 out_context = top->context; 1525 if (out_dim[0] < 0 || out_dim[1] < 0 || out_dim[2] <= 0 || out_dim[3] <= 0 || out_dim[4] <= 0) 1526 { 1527 PyErr_Format(PyExc_ValueError, 1528 "GpuCorr3dMM backprop wrt. weights: impossible output shape\\n" 1529 " bottom shape: %%ld x %%ld x %%ld x %%ld x %%ld\\n" 1530 " weights shape: %%ld x %%ld x %%ld x %%ld x %%ld\\n" 1531 " top shape: %%ld x %%ld x %%ld x %%ld x %%ld\\n", 1532 PyGpuArray_DIMS(bottom)[0], PyGpuArray_DIMS(bottom)[1], 1533 PyGpuArray_DIMS(bottom)[2], PyGpuArray_DIMS(bottom)[3], 1534 PyGpuArray_DIMS(bottom)[4], 1535 out_dim[0], out_dim[1], out_dim[2], out_dim[3], out_dim[4], 1536 PyGpuArray_DIMS(top)[0], PyGpuArray_DIMS(top)[1], 1537 PyGpuArray_DIMS(top)[2], PyGpuArray_DIMS(top)[3], 1538 PyGpuArray_DIMS(top)[4]); 1539 %(fail)s 1540 } 1541 break; 1542 case 2: // backprop wrt. inputs 1543 // output is bottom: (batchsize, num_channels, height, width, depth) 1544 // height, width and depth: bottom = (top - 1) * sample + (weights-1)*dil + 1 - 2*pad 1545 out_dim[0] = PyGpuArray_DIMS(top)[0]; 1546 out_dim[1] = PyGpuArray_DIMS(weights)[1] * numgroups; 1547 out_dim[2] = (%(height)s != -1) ? %(height)s : (PyGpuArray_DIMS(top)[2] - 1) * dH + (PyGpuArray_DIMS(weights)[2]-1)*dilH + 1 - 2*padH; 1548 out_dim[3] = (%(width)s != -1) ? %(width)s : (PyGpuArray_DIMS(top)[3] - 1) * dW + (PyGpuArray_DIMS(weights)[3]-1)*dilW + 1 - 2*padW; 1549 out_dim[4] = (%(depth)s != -1) ? %(depth)s : (PyGpuArray_DIMS(top)[4] - 1) * dD + (PyGpuArray_DIMS(weights)[4]-1)*dilD + 1 - 2*padD; 1550 out_typecode = top->ga.typecode; 1551 out_context = top->context; 1552 if (out_dim[0] < 0 || out_dim[1] < 0 || out_dim[2] <= 0 || out_dim[3] <= 0 || out_dim[4] <= 0) 1553 { 1554 PyErr_Format(PyExc_ValueError, 1555 "GpuCorr3dMM backprop wrt. inputs: impossible output shape\\n" 1556 " bottom shape: %%ld x %%ld x %%ld x %%ld x %%ld\\n" 1557 " weights shape: %%ld x %%ld x %%ld x %%ld x %%ld\\n" 1558 " top shape: %%ld x %%ld x %%ld x %%ld x %%ld\\n", 1559 out_dim[0], out_dim[1], out_dim[2], out_dim[3], out_dim[4], 1560 PyGpuArray_DIMS(weights)[0], PyGpuArray_DIMS(weights)[1], 1561 PyGpuArray_DIMS(weights)[2], PyGpuArray_DIMS(weights)[3], 1562 PyGpuArray_DIMS(weights)[4], 1563 PyGpuArray_DIMS(top)[0], PyGpuArray_DIMS(top)[1], 1564 PyGpuArray_DIMS(top)[2], PyGpuArray_DIMS(top)[3], 1565 PyGpuArray_DIMS(top)[4]); 1566 %(fail)s 1567 } 1568 break; 1569 default: 1570 PyErr_SetString(PyExc_ValueError, "BaseGpuCorr3dMM: direction must be 0, 1, or 2\\n"); 1571 %(fail)s 1572 } 1573 1574 out_dim_size[0] = (size_t)out_dim[0]; 1575 out_dim_size[1] = (size_t)out_dim[1]; 1576 out_dim_size[2] = (size_t)out_dim[2]; 1577 out_dim_size[3] = (size_t)out_dim[3]; 1578 out_dim_size[4] = (size_t)out_dim[4]; 1579 1580 // Prepare output array 1581 if (theano_prep_output(&%(out)s, 5, out_dim_size, out_typecode, GA_C_ORDER, out_context) != 0) 1582 { 1583 PyErr_Format(PyExc_RuntimeError, 1584 "BaseGpuCorrMM: Failed to allocate output of %%lld x %%lld x %%lld x %%lld x %%lld", 1585 out_dim[0], out_dim[1], out_dim[2], out_dim[3], out_dim[4]); 1586 %(fail)s 1587 } 1588 if (!GpuArray_IS_C_CONTIGUOUS(&%(out)s->ga)) { 1589 PyErr_SetString(PyExc_ValueError, "Only contiguous outputs are supported."); 1590 %(fail)s 1591 } 1592 1593 // Call GPU code 1594 out2 = corr3dMM(%(bottom)s, %(weights)s, %(top)s, direction, 1595 dH, dW, dD, dilH, dilW, dilD, padH, padW, padD, numgroups); 1596 if (out2==NULL){ 1597 %(fail)s 1598 } 1599 assert (out2 == %(out)s); 1600 1601""" % sub 1602 1603 1604class GpuCorr3dMM(BaseGpuCorr3dMM): 1605 """ 1606 GPU correlation implementation using Matrix Multiplication. 1607 1608 Parameters 1609 ---------- 1610 border_mode 1611 The width of a border of implicit zeros to pad the 1612 input with. Must be a tuple with 3 elements giving the width of 1613 the padding on each side, or a single integer to pad the same 1614 on all sides, or a string shortcut setting the padding at runtime: 1615 ``'valid'`` for ``(0, 0, 0)`` (valid convolution, no padding), ``'full'`` 1616 for ``(kernel_rows - 1, kernel_columns - 1, kernel_depth - 1)`` 1617 (full convolution), ``'half'`` for ``(kernel_rows // 2, 1618 kernel_columns // 2, kernel_depth // 2)`` (same convolution for 1619 odd-sized kernels). Note that the three widths are each 1620 applied twice, once per side (left and right, top and bottom, front 1621 and back). 1622 subsample 1623 The subsample operation applied to each output image. Should be a tuple 1624 with 3 elements. `(sv, sh, sl)` is equivalent to 1625 `GpuCorrMM(...)(...)[:,:,::sv, ::sh, ::sl]`, but faster. 1626 Set to `(1, 1, 1)` to disable subsampling. 1627 filter_dilation 1628 The filter dilation operation applied to each input image. 1629 Should be a tuple with 3 elements. 1630 Set to `(1, 1, 1)` to disable filter dilation. 1631 num_groups 1632 The number of distinct groups the image and kernel must be 1633 divided into. 1634 should be an int 1635 set to 1 to disable grouped convolution 1636 1637 Notes 1638 ----- 1639 Currently, the Op requires the inputs, filters and outputs to be 1640 C-contiguous. Use :func:`gpu_contiguous 1641 <theano.gpuarray.basic_ops.gpu_contiguous>` on these arguments 1642 if needed. 1643 1644 You can either enable the Theano flag `optimizer_including=conv_gemm` 1645 to automatically replace all convolution operations with `GpuCorr3dMM` 1646 or one of its gradients, or you can use it as a replacement for 1647 :func:`conv2d <theano.tensor.nnet.conv.conv2d>`, called as 1648 `GpuCorr3dMM(subsample=...)(image, filters)`. The latter is currently 1649 faster, but note that it computes a correlation -- if you need to 1650 compute a convolution, flip the filters as `filters[:,:,::-1,::-1,::-1]`. 1651 1652 """ 1653 def __init__(self, border_mode="valid", 1654 subsample=(1, 1, 1), 1655 filter_dilation=(1, 1, 1), 1656 num_groups=1): 1657 super(GpuCorr3dMM, self).__init__(border_mode, subsample, 1658 filter_dilation, num_groups) 1659 1660 def make_node(self, img, kern): 1661 ctx_name = infer_context_name(img, kern) 1662 img = as_gpuarray_variable(img, ctx_name) 1663 kern = as_gpuarray_variable(kern, ctx_name) 1664 if img.type.ndim != 5: 1665 raise TypeError('img must be 5D tensor') 1666 if kern.type.ndim != 5: 1667 raise TypeError('kern must be 5D tensor') 1668 1669 broadcastable = [img.type.broadcastable[0], kern.type.broadcastable[0], 1670 False, False, False] 1671 return Apply(self, [img, kern], [GpuArrayType(dtype=img.dtype, 1672 context_name=ctx_name, 1673 broadcastable=broadcastable)()]) 1674 1675 def c_code(self, node, nodename, inp, out_, sub): 1676 bottom, weights = inp 1677 top, = out_ 1678 direction = "forward" 1679 return super(GpuCorr3dMM, self).c_code_helper(bottom, weights, top, direction, sub) 1680 1681 def grad(self, inp, grads): 1682 bottom, weights = inp 1683 top, = grads 1684 top = gpu_contiguous(top) 1685 d_bottom = GpuCorr3dMM_gradInputs(self.border_mode, 1686 self.subsample, 1687 self.filter_dilation, 1688 self.num_groups)( 1689 weights, top, bottom.shape[-3:]) 1690 d_weights = GpuCorr3dMM_gradWeights(self.border_mode, 1691 self.subsample, 1692 self.filter_dilation, 1693 self.num_groups)( 1694 bottom, top, weights.shape[-3:]) 1695 return d_bottom, d_weights 1696 1697 1698class GpuCorr3dMM_gradWeights(BaseGpuCorr3dMM): 1699 """ 1700 Gradient wrt. filters for `GpuCorr3dMM`. 1701 1702 Notes 1703 ----- 1704 You will not want to use this directly, but rely on Theano's automatic 1705 differentiation or graph optimization to use it as needed. 1706 1707 """ 1708 1709 def __init__(self, border_mode="valid", 1710 subsample=(1, 1, 1), 1711 filter_dilation=(1, 1, 1), 1712 num_groups=1): 1713 super(GpuCorr3dMM_gradWeights, self).__init__(border_mode, 1714 subsample, 1715 filter_dilation, 1716 num_groups) 1717 1718 def make_node(self, img, topgrad, shape=None): 1719 ctx_name = infer_context_name(img, topgrad) 1720 img = as_gpuarray_variable(img, ctx_name) 1721 topgrad = as_gpuarray_variable(topgrad, ctx_name) 1722 if img.type.ndim != 5: 1723 raise TypeError('img must be 5D tensor') 1724 if topgrad.type.ndim != 5: 1725 raise TypeError('topgrad must be 5D tensor') 1726 if shape is None: 1727 if self.subsample != (1, 1, 1) or self.border_mode == "half": 1728 raise ValueError('shape must be given if subsample != (1, 1, 1)' 1729 ' or border_mode == "half"') 1730 height_width_depth = [] 1731 else: 1732 height_width_depth = [shape[0], shape[1], shape[2]] 1733 assert shape[0].ndim == 0 1734 assert shape[1].ndim == 0 1735 assert shape[2].ndim == 0 1736 1737 broadcastable = [topgrad.type.broadcastable[1], img.type.broadcastable[1], 1738 False, False, False] 1739 return Apply(self, [img, topgrad] + height_width_depth, 1740 [GpuArrayType(dtype=img.dtype, 1741 context_name=ctx_name, 1742 broadcastable=broadcastable)()]) 1743 1744 def c_code(self, node, nodename, inp, out_, sub): 1745 bottom, top = inp[:2] 1746 height, width, depth = inp[2:] or (None, None, None) 1747 weights, = out_ 1748 direction = "backprop weights" 1749 return super(GpuCorr3dMM_gradWeights, self).c_code_helper(bottom, weights, top, direction, sub, height, width, depth) 1750 1751 def grad(self, inp, grads): 1752 bottom, top = inp[:2] 1753 weights, = grads 1754 weights = gpu_contiguous(weights) 1755 d_bottom = GpuCorr3dMM_gradInputs(self.border_mode, 1756 self.subsample, 1757 self.filter_dilation, 1758 self.num_groups)(weights, 1759 top, 1760 bottom.shape[-3:]) 1761 d_top = GpuCorr3dMM( 1762 self.border_mode, self.subsample, self.filter_dilation, 1763 self.num_groups)(bottom, weights) 1764 d_height_width_depth = (theano.gradient.DisconnectedType()(),)\ 1765 * 3 if len(inp) == 5 else () 1766 return (d_bottom, d_top) + d_height_width_depth 1767 1768 def connection_pattern(self, node): 1769 if node.nin == 2: 1770 return [[1], [1]] 1771 else: 1772 return [[1], [1], [0], [0], [0]] # no connection to height, width, depth 1773 1774 1775class GpuCorr3dMM_gradInputs(BaseGpuCorr3dMM): 1776 """ 1777 Gradient wrt. inputs for `GpuCorr3dMM`. 1778 1779 Notes 1780 ----- 1781 You will not want to use this directly, but rely on Theano's automatic 1782 differentiation or graph optimization to use it as needed. 1783 1784 """ 1785 1786 def __init__(self, border_mode="valid", 1787 subsample=(1, 1, 1), 1788 filter_dilation=(1, 1, 1), 1789 num_groups=1): 1790 super(GpuCorr3dMM_gradInputs, self).__init__(border_mode, subsample, 1791 filter_dilation, num_groups) 1792 1793 def make_node(self, kern, topgrad, shape=None): 1794 ctx_name = infer_context_name(kern, topgrad) 1795 kern = as_gpuarray_variable(kern, ctx_name) 1796 topgrad = as_gpuarray_variable(topgrad, ctx_name) 1797 if kern.type.ndim != 5: 1798 raise TypeError('kern must be 5D tensor') 1799 if topgrad.type.ndim != 5: 1800 raise TypeError('topgrad must be 5D tensor') 1801 if shape is None: 1802 if self.subsample != (1, 1, 1): 1803 raise ValueError('shape must be given if subsample != (1, 1, 1)') 1804 height_width_depth = [] 1805 else: 1806 height_width_depth = [shape[0], shape[1], shape[2]] 1807 assert shape[0].ndim == 0 1808 assert shape[1].ndim == 0 1809 assert shape[2].ndim == 0 1810 1811 if self.num_groups > 1: 1812 broadcastable = [topgrad.type.broadcastable[0], False, 1813 False, False, False] 1814 else: 1815 broadcastable = [topgrad.type.broadcastable[0], kern.type.broadcastable[-4], 1816 False, False, False] 1817 return Apply(self, [kern, topgrad] + height_width_depth, 1818 [GpuArrayType(dtype=topgrad.dtype, 1819 context_name=ctx_name, 1820 broadcastable=broadcastable)()]) 1821 1822 def c_code(self, node, nodename, inp, out_, sub): 1823 weights, top = inp[:2] 1824 height, width, depth = inp[2:] or (None, None, None) 1825 bottom, = out_ 1826 direction = "backprop inputs" 1827 return super(GpuCorr3dMM_gradInputs, self).c_code_helper(bottom, weights, top, direction, sub, height, width, depth) 1828 1829 def grad(self, inp, grads): 1830 weights, top = inp[:2] 1831 bottom, = grads 1832 bottom = gpu_contiguous(bottom) 1833 d_weights = GpuCorr3dMM_gradWeights(self.border_mode, 1834 self.subsample, 1835 self.filter_dilation, 1836 self.num_groups)(bottom, 1837 top, 1838 weights.shape[-3:]) 1839 d_top = GpuCorr3dMM(self.border_mode, 1840 self.subsample, 1841 self.filter_dilation, 1842 self.num_groups)(bottom, weights) 1843 d_height_width_depth = (theano.gradient.DisconnectedType()(),)\ 1844 * 3 if len(inp) == 5 else () 1845 return (d_weights, d_top) + d_height_width_depth 1846 1847 def connection_pattern(self, node): 1848 if node.nin == 2: 1849 return [[1], [1]] 1850 else: 1851 return [[1], [1], [0], [0], [0]] # no connection to height, width, depth 1852 1853 1854@inplace_allocempty(GpuGemv, 0) 1855def local_inplace_gpuagemv(node, inputs): 1856 return [gpugemv_inplace(*inputs)] 1857 1858 1859@inplace_allocempty(GpuGemm, 0) 1860def local_inplace_gpuagemm(node, inputs): 1861 return [gpugemm_inplace(*inputs)] 1862 1863 1864@inplace_allocempty(GpuGer, 0) 1865def local_inplace_gpuager(node, inputs): 1866 return [gpuger_inplace(*inputs)] 1867 1868 1869@inplace_allocempty(GpuGemmBatch, 0) 1870def local_inplace_gpuagemmbatch(node, inputs): 1871 return [gpugemmbatch_inplace(*inputs)] 1872 1873gpuablas_opt_inplace = in2out(LocalOptGroup(local_inplace_gpuagemv, 1874 local_inplace_gpuagemm, 1875 local_inplace_gpuager, 1876 local_inplace_gpuagemmbatch), 1877 name='gpuablas_opt_inplace') 1878 1879optdb.register('InplaceGpuaBlasOpt', 1880 gpuablas_opt_inplace, 1881 70.0, 'fast_run', 'inplace', 'gpuarray') 1882