1from __future__ import absolute_import, print_function, division 2import ctypes 3import os 4import sys 5import warnings 6 7import numpy as np 8from six import integer_types 9from six.moves import reduce 10 11import theano 12from theano import Op, Apply, tensor, config, Variable 13from theano.scalar import (as_scalar, constant, Log, get_scalar_type, 14 int32 as int_t, bool as bool_t, uint32 as uint32_t) 15from theano.tensor import as_tensor_variable, Argmax 16from theano.tensor.extra_ops import cpu_contiguous 17from theano.gradient import DisconnectedType, grad_not_implemented 18from theano.gof import Optimizer, local_optimizer, COp, ParamsType, EnumList 19from theano.gof.cmodule import GCC_compiler 20from theano.gof.type import CDataType, Generic 21from theano.gof.opt import inherit_stack_trace 22from theano.tensor.opt import Assert 23from theano.compile import optdb 24from theano.compile.ops import shape_i, shape_i_op 25from theano.tensor.nnet import LogSoftmax, SoftmaxGrad 26from theano.tensor.nnet.abstract_conv import (AbstractConv2d, 27 AbstractConv2d_gradWeights, 28 AbstractConv2d_gradInputs, 29 AbstractConv3d, 30 AbstractConv3d_gradWeights, 31 AbstractConv3d_gradInputs, 32 get_conv_output_shape, 33 assert_conv_shape) 34from theano.tensor.signal.pool import ( 35 Pool, MaxPoolGrad, AveragePoolGrad) 36from . import pygpu, cudnn_defs 37from .type import (get_context, gpu_context_type, list_contexts, 38 GpuArraySharedVariable) 39from .basic_ops import (as_gpuarray_variable, infer_context_name, gpuarray_helper_inc_dir, 40 gpu_contiguous, GpuAllocEmpty, 41 empty_like, GpuArrayType, HostFromGpu) 42from .elemwise import GpuElemwise, GpuCAReduceCuda 43from .reduction import GpuMaxAndArgmax 44 45# These don't exist in gpuarray 46# GpuDownsampleFactorMax, GpuDownsampleFactorMaxGrad 47from .nnet import GpuSoftmax 48from .opt import (gpu_seqopt, register_opt, pool_db, pool_db2, 49 op_lifter, register_opt2, register_inplace) 50 51from .opt_util import alpha_merge, output_merge, inplace_allocempty, pad_dims, unpad_dims 52 53from theano.configdefaults import SUPPORTED_DNN_CONV_ALGO_RUNTIME 54import theano.pathparse 55 56DNN_CONV_ALGO_CHOOSE_ONCE = ['guess_once', 'time_once'] 57DNN_CONV_ALGO_CHOOSE_TIME = ['time_once', 'time_on_shape_change'] 58 59try: 60 from pygpu import gpuarray 61except ImportError: 62 pass 63 64# Update these names when new versions of cudnn are supported. 65WIN32_CUDNN_NAMES = ['cudnn64_7.dll', 'cudnn64_6.dll', 'cudnn64_5.dll'] 66 67if sys.platform == 'win32': 68 theano.pathparse.PathParser(theano.config.dnn.bin_path) 69 70 71def _load_lib(name): 72 try: 73 return ctypes.cdll.LoadLibrary(name) 74 except OSError: 75 return None 76 77 78def _dnn_lib(): 79 if _dnn_lib.handle is None: 80 import ctypes.util 81 82 if config.dnn.bin_path != "": 83 if sys.platform == 'darwin': 84 dnn_handle = _load_lib(os.path.join(config.dnn.bin_path, 'libcudnn.dylib')) 85 elif sys.platform == 'win32': 86 for name in WIN32_CUDNN_NAMES: 87 dnn_handle = _load_lib(os.path.join(config.dnn.bin_path, name)) 88 if dnn_handle is not None: 89 break 90 else: 91 dnn_handle = _load_lib(os.path.join(config.dnn.bin_path, 'libcudnn.so')) 92 else: 93 lib_name = ctypes.util.find_library('cudnn') 94 if lib_name is None and sys.platform == 'win32': 95 for name in WIN32_CUDNN_NAMES: 96 lib_name = ctypes.util.find_library(name) 97 if lib_name: 98 break 99 if lib_name is None: 100 raise RuntimeError( 101 'Could not find cudnn library (looked for v5* to v7*).' 102 ' Check your cudnn installation. Maybe using the Theano' 103 ' flag dnn.base_path can help you. Current value "%s"' % 104 config.dnn.base_path) 105 else: 106 dnn_handle = ctypes.cdll.LoadLibrary(lib_name) 107 if dnn_handle is None: 108 raise RuntimeError('Could not load cudnn library. Check your cudnn' 109 ' installation. Maybe using the Theano' 110 ' flag dnn.base_path can help you. Current value "%s"' % 111 config.dnn.base_path) 112 _dnn_lib.handle = dnn_handle 113 cudnn = _dnn_lib.handle 114 cudnn.cudnnCreate.argtypes = [ctypes.POINTER(ctypes.c_void_p)] 115 cudnn.cudnnCreate.restype = ctypes.c_int 116 cudnn.cudnnDestroy.argtypes = [ctypes.c_void_p] 117 cudnn.cudnnDestroy.restype = ctypes.c_int 118 return _dnn_lib.handle 119 120_dnn_lib.handle = None 121 122 123def _make_handle(ctx): 124 cudnn = _dnn_lib() 125 handle = ctypes.c_void_p() 126 with ctx: 127 err = cudnn.cudnnCreate(ctypes.byref(handle)) 128 if err != 0: 129 raise RuntimeError("Error creating cudnn handle. " 130 "This can be a sign of a too old driver.", err) 131 return handle 132 133 134def _dnn_check_compile(): 135 preambule = """ 136#include <stdio.h> 137#include <cudnn.h> 138#include <cudnn_helper.h> 139""" 140 141 # No need for the context in here since we won't execute that code 142 body = """ 143cudnnHandle_t _handle = NULL; 144cudnnStatus_t err; 145if ((err = cudnnCreate(&_handle)) != CUDNN_STATUS_SUCCESS) { 146 fprintf(stderr, "could not create cuDNN handle: %s", 147 cudnnGetErrorString(err)); 148 return 1; 149} 150""" 151 152 path_wrapper = "\"" if os.name == 'nt' else "" 153 params = ["-l", "cudnn"] 154 params.extend(['-I%s%s%s' % (path_wrapper, gpuarray_helper_inc_dir(), path_wrapper)]) 155 if config.dnn.include_path: 156 params.extend(['-I%s%s%s' % (path_wrapper, config.dnn.include_path, path_wrapper)]) 157 if config.cuda.include_path: 158 params.extend(['-I%s%s%s' % (path_wrapper, config.cuda.include_path, path_wrapper)]) 159 if config.dnn.library_path: 160 params.extend(['-L%s%s%s' % (path_wrapper, config.dnn.library_path, path_wrapper)]) 161 # Do not run here the test program. It would run on the 162 # default gpu, not the one selected by the user. If mixed 163 # GPU are installed or if the GPUs are configured in 164 # exclusive mode, this cause bad detection. 165 166 # NB: GCC_compiler.try_flags() may return just a boolean instead of a tuple (avail, out, here). 167 compiler_res = GCC_compiler.try_flags( 168 params, preambule=preambule, body=body, 169 try_run=False, output=True) 170 171 avail, out, err = compiler_res if isinstance(compiler_res, tuple) else (compiler_res, None, None) 172 173 if not avail: 174 return False, ("cannot compile with cuDNN. " 175 "We got this error:\n" + str(err)) 176 return True, None 177 178 179def _dnn_check_version(): 180 v = version() 181 if v < 5000: 182 return False, "cuDNN version is too old. Update to v5* or higher, was %d." % v 183 if v >= 7200: 184 warnings.warn("Your cuDNN version is more recent than " 185 "Theano. If you encounter problems, try " 186 "updating Theano or downgrading cuDNN to " 187 "a version >= v5 and <= v7.") 188 return True, None 189 190 191def dnn_present(): 192 if dnn_present.avail is not None: 193 return dnn_present.avail 194 if config.dnn.enabled == "False": 195 dnn_present.msg = "Disabled by dnn.enabled flag" 196 dnn_present.avail = False 197 return False 198 199 if pygpu is None: 200 dnn_present.msg = "PyGPU not available" 201 dnn_present.avail = False 202 return False 203 204 if config.dnn.enabled == "no_check": 205 dnn_present.avail, dnn_present.msg = True, "presence check disabled by dnn.enabled flag" 206 else: 207 dnn_present.avail, dnn_present.msg = _dnn_check_compile() 208 if dnn_present.avail: 209 dnn_present.avail, dnn_present.msg = _dnn_check_version() 210 if not dnn_present.avail: 211 return False 212 213 return dnn_present.avail 214 215dnn_present.avail = None 216dnn_present.msg = None 217 218 219def dnn_available(context_name): 220 if not dnn_present(): 221 dnn_available.msg = dnn_present.msg 222 return False 223 224 ctx = get_context(context_name) 225 226 if not ctx.kind == b'cuda': 227 dnn_available.msg = "Not on a CUDA device." 228 return False 229 230 # This is a hack because bin_id is in the from of 231 # "<something>_<major><minor>" for cuda devices. 232 if int(ctx.bin_id[-2:]) < 30: 233 dnn_available.msg = "Device not supported" 234 return False 235 236 # On V100, cuDNN lower then 7002 don't raise error but 237 # takes hours to load or execute! So raise a good user error. 238 if version() < 7002: 239 if int(ctx.bin_id[-2:]) >= 70: 240 dnn_available.msg = "Use cuDNN 7.0.2 or higher for Volta." 241 return False 242 return True 243 244dnn_available.msg = None 245 246 247def CUDNNDataType(name, freefunc=None): 248 cargs = [] 249 if config.dnn.bin_path and sys.platform != 'win32': 250 cargs.append('-Wl,-rpath,' + config.dnn.bin_path) 251 252 return CDataType(name, freefunc, 253 headers=['cudnn.h'], 254 header_dirs=[config.dnn.include_path, 255 config.cuda.include_path], 256 libraries=['cudnn'], 257 lib_dirs=[config.dnn.library_path], 258 compile_args=cargs, 259 version=version(raises=False)) 260 261 262class DnnVersion(Op): 263 __props__ = () 264 265 def c_headers(self): 266 return ['cudnn.h'] 267 268 def c_header_dirs(self): 269 return [config.dnn.include_path, config.cuda.include_path] 270 271 def c_libraries(self): 272 return ['cudnn'] 273 274 def c_lib_dirs(self): 275 return [config.dnn.library_path] 276 277 def c_compile_args(self): 278 if config.dnn.bin_path and sys.platform != 'win32': 279 return ['-Wl,-rpath,' + config.dnn.bin_path] 280 return [] 281 282 def c_support_code(self): 283 return """ 284#if PY_MAJOR_VERSION >= 3 285#define PyInt_FromLong PyLong_FromLong 286#endif 287""" 288 289 def make_node(self): 290 return Apply(self, [], [Generic()()]) 291 292 def c_code(self, node, name, inputs, outputs, sub): 293 o = outputs[0] 294 return """ 295 %(o)s = PyTuple_Pack(2, PyInt_FromLong(CUDNN_VERSION), PyInt_FromLong(cudnnGetVersion())); 296 """ % locals() 297 298 def do_constant_folding(self, node): 299 # Needed as we do not want to cache this information. 300 return False 301 302 def c_code_cache_version(self): 303 # Not needed, but make it clear that we do not want to cache this. 304 return None 305 306 307def version(raises=True): 308 """Return the current cuDNN version we link with. 309 310 This also does a check that the header version matches the runtime version. 311 312 :raises: If True, raise an exception if cuDNN is not present. 313 Otherwise, return -1. 314 315 It always raise an RuntimeError if the header and library version 316 are not the same. 317 318 """ 319 if not dnn_present(): 320 if raises: 321 raise RuntimeError( 322 "We can't determine the cudnn version as it is not available", 323 dnn_available.msg) 324 else: 325 return -1 326 327 if version.v is None: 328 f = theano.function([], DnnVersion()(), 329 theano.Mode(optimizer=None), 330 profile=False) 331 v = f() 332 if v[0] != v[1]: 333 raise RuntimeError("Mixed dnn version. The header is version %s " 334 "while the library is version %s." % v) 335 version.v = v[1] 336 return version.v 337version.v = None 338 339handle_type = CUDNNDataType('cudnnHandle_t', 'cudnnDestroy') 340 341# Get cuDNN definitions to be used. 342cudnn = cudnn_defs.get_definitions(version(raises=False)) 343 344 345def get_precision(precision, inputs, for_grad=False): 346 common_dtype = theano.scalar.upcast(*[i.dtype for i in inputs]) 347 if not common_dtype.startswith('float'): 348 raise TypeError("cuDNN convolution only works on real numbers") 349 350 if precision is None: 351 precision = theano.config.dnn.conv.precision 352 if precision == 'as_input' or precision == 'as_input_f32': 353 if common_dtype == 'float16' and precision == 'as_input_f32': 354 precision = 'float32' 355 else: 356 precision = common_dtype 357 if for_grad and precision == 'float16': 358 raise TypeError("Float16 precision is disabled for cuDNN backward convolutions due to computation errors.") 359 return precision, common_dtype 360 361 362class DnnBase(COp): 363 364 """ 365 Creates a handle for cudnn and pulls in the cudnn libraries and headers. 366 367 """ 368 # dnn does not know about broadcasting, so we do not need to assert 369 # the input broadcasting pattern. 370 check_broadcast = False 371 params_type = handle_type 372 373 def dnn_context(self, node): 374 return node.outputs[0].type.context_name 375 376 def get_params(self, node): 377 ctx_name = self.dnn_context(node) 378 ctx = get_context(ctx_name) 379 if not hasattr(ctx, 'cudnn_handle_param'): 380 ptr = ctx.cudnn_handle.value 381 res = handle_type.make_value(ptr) 382 ctx.cudnn_handle_param = res 383 if isinstance(self.params_type, ParamsType): 384 if not self.params_type.has_type(handle_type): 385 raise TypeError('DnnBase: params_type must take into account the cuDNN handle type.') 386 handle_field = self.params_type.get_field(handle_type) 387 return self.params_type.get_params(self, **{handle_field: ctx.cudnn_handle_param}) 388 return ctx.cudnn_handle_param 389 390 def __init__(self, files=None, c_func=None): 391 if files is None: 392 files = [] 393 COp.__init__(self, ["c_code/dnn_base.c"] + files, c_func) 394 395 def c_headers(self): 396 return ['gpuarray/types.h', 'gpuarray/array.h', 'gpuarray/kernel.h', 397 'gpuarray/util.h', 'gpuarray/ext_cuda.h', 'gpuarray_api.h', 398 'numpy_compat.h', 'cudnn.h', 'cudnn_helper.h', 399 'gpuarray_helper.h'] 400 401 def c_header_dirs(self): 402 return [gpuarray_helper_inc_dir(), pygpu.get_include(), 403 config.dnn.include_path, config.cuda.include_path] 404 405 def c_libraries(self): 406 return ['cudnn', 'gpuarray'] 407 408 def c_lib_dirs(self): 409 return [config.dnn.library_path] 410 411 def c_compile_args(self): 412 if config.dnn.bin_path and sys.platform != 'win32': 413 return ['-Wl,-rpath,' + config.dnn.bin_path] 414 return [] 415 416 def c_code_cache_version(self): 417 return (super(DnnBase, self).c_code_cache_version(), version(), 4) 418 419 420class GpuDnnConvDesc(COp): 421 422 """ 423 This Op builds a convolution descriptor for use in the other convolution 424 operations. 425 426 See the doc of :func:`dnn_conv` for a description of the parameters 427 428 """ 429 430 __props__ = ('border_mode', 'subsample', 'dilation', 'conv_mode', 431 'precision', 'num_groups') 432 params_type = ParamsType(pad0=int_t, pad1=int_t, pad2=int_t, 433 sub0=int_t, sub1=int_t, sub2=int_t, 434 dil0=int_t, dil1=int_t, dil2=int_t, 435 nb_dims=int_t, 436 bmode=EnumList(('BORDER_MODE_FULL', 'full'), 437 ('BORDER_MODE_VALID', 'valid'), 438 ('BORDER_MODE_HALF', 'half')), 439 conv_mode=cudnn.cudnnConvolutionMode_t, 440 precision=cudnn.cudnnDataType_t, 441 num_groups=int_t) 442 443 def c_headers(self): 444 return ['cudnn.h', 'cudnn_helper.h'] 445 446 def c_header_dirs(self): 447 return [gpuarray_helper_inc_dir(), config.dnn.include_path, 448 config.cuda.include_path] 449 450 def c_libraries(self): 451 return ['cudnn'] 452 453 def c_lib_dirs(self): 454 return [config.dnn.library_path] 455 456 def c_compile_args(self): 457 if config.dnn.bin_path and sys.platform != 'win32': 458 return ['-Wl,-rpath,' + config.dnn.bin_path] 459 return [] 460 461 def do_constant_folding(self, node): 462 return False 463 464 def __init__(self, border_mode, subsample=(1, 1), dilation=(1, 1), conv_mode='conv', 465 precision="float32", num_groups=1): 466 COp.__init__(self, ["c_code/conv_desc.c"], "APPLY_SPECIFIC(conv_desc)") 467 468 if version() < 6000 and any([d != 1 for d in dilation]): 469 raise RuntimeError("Dilation > 1 not supported for cuDNN version < 6.") 470 471 if isinstance(border_mode, integer_types): 472 border_mode = (border_mode,) * len(subsample) 473 if isinstance(border_mode, tuple): 474 assert len(border_mode) == len(subsample) 475 border_mode = tuple(map(int, border_mode)) 476 if not ((isinstance(border_mode, tuple) and min(border_mode) >= 0) or 477 border_mode in ('valid', 'full', 'half')): 478 raise ValueError( 479 'invalid border_mode {}, which must be either ' 480 '"valid", "full", "half", an integer or a pair of' 481 ' integers'.format(border_mode)) 482 self.border_mode = border_mode 483 assert len(subsample) in (2, 3) 484 self.subsample = subsample 485 assert cudnn.cudnnConvolutionMode_t.has_alias(conv_mode) 486 self.conv_mode = conv_mode 487 self.num_groups = num_groups 488 489 assert len(dilation) == len(subsample) 490 self.dilation = dilation 491 492 assert cudnn.cudnnDataType_t.has_alias(precision) 493 self.precision = precision 494 495 def make_node(self, kern_shape): 496 kern_shape = as_tensor_variable(kern_shape) 497 if kern_shape.type.ndim != 1 or kern_shape.dtype not in theano.tensor.basic.int_dtypes: 498 raise TypeError('kern must be an int64 1D shape tensor') 499 kern_shape = theano.tensor.basic.cast(kern_shape, 'int64') 500 501 node = Apply(self, [kern_shape], 502 [CUDNNDataType("cudnnConvolutionDescriptor_t", 503 freefunc="cudnnDestroyConvolutionDescriptor")()]) 504 # DebugMode cannot compare the values of CDataType variables, so by 505 # default it returns False all the time. To prevent DebugMode from 506 # complaining because of the MergeOptimizer, we make this variable 507 # always compare to True. 508 out = node.outputs[0] 509 out.tag.values_eq_approx = tensor.type.values_eq_approx_always_true 510 return node 511 512 bmode = property(lambda self: 'valid' if isinstance(self.border_mode, tuple) else self.border_mode) 513 pad0 = property(lambda self: self.border_mode[0] if isinstance(self.border_mode, tuple) else 0) 514 pad1 = property(lambda self: self.border_mode[1] if isinstance(self.border_mode, tuple) else 0) 515 pad2 = property(lambda self: self.border_mode[2] if (isinstance(self.border_mode, tuple) and 516 len(self.border_mode) > 2) else 0) 517 sub0 = property(lambda self: self.subsample[0]) 518 sub1 = property(lambda self: self.subsample[1]) 519 sub2 = property(lambda self: self.subsample[2] if len(self.subsample) > 2 else 0) 520 dil0 = property(lambda self: self.dilation[0]) 521 dil1 = property(lambda self: self.dilation[1]) 522 dil2 = property(lambda self: self.dilation[2] if len(self.dilation) > 2 else 0) 523 nb_dims = property(lambda self: len(self.subsample)) 524 525 def c_code_cache_version(self): 526 return (super(GpuDnnConvDesc, self).c_code_cache_version(), version()) 527 528 def __setstate__(self, d): 529 self.__dict__.update(d) 530 if not hasattr(self, "dilation"): 531 self.dilation = (1,) * len(self.subsample) 532 if not hasattr(self, "num_groups"): 533 self.num_groups = 1 534 535 536# scalar constants 537_zero = constant(np.asarray(0.0, dtype='float64')) 538_one = constant(np.asarray(1.0, dtype='float64')) 539 540 541def ensure_dt(val, default, name, dtype): 542 if dtype == 'float16': 543 dtype = 'float32' 544 if val is None: 545 val = default.clone() 546 if not isinstance(val, Variable): 547 val = constant(val) 548 if hasattr(val, 'ndim') and val.ndim == 0: 549 val = as_scalar(val) 550 if not isinstance(val.type, theano.scalar.Scalar): 551 raise TypeError("%s: expected a scalar value" % (name,)) 552 if not val.type.dtype == dtype: 553 val = val.astype(dtype) 554 return val 555 556 557class GpuDnnConv(DnnBase): 558 559 """ 560 The forward convolution. 561 562 Parameters 563 ---------- 564 image 565 kernel 566 descr : 567 The convolution descriptor. 568 algo : {'small', 'none', 'large', 'fft', 'fft_tiling', 'winograd', 'guess_once', 569 'guess_on_shape_change', 'time_once', 'time_on_shape_change'} 570 Default is the value of :attr:`config.dnn.conv.algo_fwd`. 571 num_groups : 572 Divides the image, kernel and output tensors into num_groups 573 separate groups. Each which carry out convolutions separately 574 575 """ 576 _f16_ok = True 577 __props__ = ('algo', 'inplace', 'num_groups') 578 579 check_input = False 580 params_type = ParamsType(conv_algo=cudnn.cudnnConvolutionFwdAlgo_t, 581 choose_algo=bool_t, choose_once=bool_t, choose_time=bool_t, 582 inplace=bool_t, 583 handle=handle_type, 584 num_groups=int_t) 585 586 def __init__(self, algo=None, inplace=False, num_groups=1): 587 DnnBase.__init__(self, ["c_code/dnn_conv_base.c", "c_code/dnn_fwd.c"], 588 "APPLY_SPECIFIC(conv_fwd)") 589 590 if algo is None: 591 algo = config.dnn.conv.algo_fwd 592 self.algo = algo 593 594 self.inplace = bool(inplace) 595 if self.inplace: 596 self.destroy_map = {0: [2]} 597 598 assert cudnn.cudnnConvolutionFwdAlgo_t.has_alias(self.algo) or self.algo in SUPPORTED_DNN_CONV_ALGO_RUNTIME 599 600 self.conv_algo = cudnn.cudnnConvolutionFwdAlgo_t.CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM 601 if self.algo not in SUPPORTED_DNN_CONV_ALGO_RUNTIME: 602 self.conv_algo = self.algo 603 self.choose_algo = self.algo in SUPPORTED_DNN_CONV_ALGO_RUNTIME 604 self.choose_once = self.algo in DNN_CONV_ALGO_CHOOSE_ONCE 605 self.choose_time = self.algo in DNN_CONV_ALGO_CHOOSE_TIME 606 self.num_groups = num_groups 607 608 def __setstate__(self, d): 609 self.__dict__.update(d) 610 if not hasattr(self, 'algo'): 611 if hasattr(self, 'workmem'): 612 self.algo = self.workmem 613 else: 614 self.algo = config.dnn.conv.algo_fwd 615 if not hasattr(self, 'inplace'): 616 self.inplace = False 617 if not hasattr(self, 'num_groups'): 618 self.num_groups = 1 619 620 def make_node(self, img, kern, output, desc, alpha=None, beta=None): 621 ctx_name = infer_context_name(img, kern, output) 622 img = as_gpuarray_variable(img, ctx_name) 623 kern = as_gpuarray_variable(kern, ctx_name) 624 output = as_gpuarray_variable(output, ctx_name) 625 626 if img.type.ndim not in (4, 5): 627 raise TypeError('img must be 4D or 5D tensor') 628 if kern.type.ndim not in (4, 5): 629 raise TypeError('kern must be 4D or 5D tensor') 630 if output.type.ndim not in (4, 5): 631 raise TypeError('output must be a 4D or 5D tensor') 632 633 if (img.type.ndim != kern.type.ndim or 634 img.type.ndim != output.type.ndim): 635 raise TypeError("The number of dimensions of " 636 "img, kern and output must match") 637 638 if img.type.ndim == 5 and self.algo not in (cudnn.conv3d_fwd_algorithms + 639 SUPPORTED_DNN_CONV_ALGO_RUNTIME): 640 raise ValueError("convolution algo %s can't be used for " 641 "3d convolutions", (self.algo,)) 642 643 if (not isinstance(desc.type, CDataType) or 644 desc.type.ctype != 'cudnnConvolutionDescriptor_t'): 645 raise TypeError('desc must be cudnnConvolutionDescriptor_t') 646 647 alpha = ensure_dt(alpha, _one, 'alpha', img.dtype) 648 beta = ensure_dt(beta, _zero, 'beta', img.dtype) 649 650 return Apply(self, [img, kern, output, desc, alpha, beta], 651 [output.type()]) 652 653 def grad(self, inp, grads): 654 img, kerns, output, desc, alpha, beta = inp 655 top, = grads 656 657 top = gpu_contiguous(top) 658 659 d_img = GpuDnnConvGradI(num_groups=self.num_groups)(kerns, top, empty_like(img), desc) 660 d_kerns = GpuDnnConvGradW(num_groups=self.num_groups)(img, top, empty_like(kerns), desc) 661 d_alpha = grad_not_implemented(self, 4, alpha) 662 d_beta = grad_not_implemented(self, 5, beta) 663 664 return [d_img * alpha, d_kerns * alpha, top * beta, 665 DisconnectedType()(), d_alpha, d_beta] 666 667 def connection_pattern(self, node): 668 # not connected to desc 669 return [[1], [1], [1], [0], [1], [1]] 670 671 @staticmethod 672 def get_out_shape(ishape, kshape, border_mode, subsample, dilation): 673 """ 674 This function computes the output shape for a convolution with 675 the specified parameters. `ishape` and `kshape` can be symbolic 676 or scalar. 677 678 """ 679 680 # if ishape and/or kshape are not tuples or list, but rather symbolic 681 # vectors, turn them into lists of symbolic scalars. 682 if not isinstance(ishape, (list, tuple)): 683 ishape = [ishape[i] for i in range(len(subsample) + 2)] 684 if not isinstance(kshape, (list, tuple)): 685 kshape = [kshape[i] for i in range(len(subsample) + 2)] 686 687 return get_conv_output_shape( 688 ishape, 689 kshape, 690 border_mode, 691 subsample, 692 dilation) 693 694 def infer_shape(self, node, shape): 695 return [shape[2]] 696 697 698class GpuDnnConvGradW(DnnBase): 699 700 """ 701 The convolution gradient with respect to the weights. 702 703 Parameters 704 ---------- 705 image 706 kernel 707 descr : 708 The convolution descriptor. 709 algo : {'none', 'deterministic', 'fft', 'small', 'guess_once', 710 'guess_on_shape_change', 'time_once', 'time_on_shape_change'} 711 Default is the value of :attr:`config.dnn.conv.algo_bwd_filter`. 712 num_groups : 713 Divides the image, kernel and output tensors into num_groups 714 separate groups. Each which carry out convolutions separately 715 716 """ 717 _f16_ok = True 718 __props__ = ('algo', 'inplace', 'num_groups') 719 720 check_input = False 721 params_type = ParamsType(conv_algo=cudnn.cudnnConvolutionBwdFilterAlgo_t, 722 choose_algo=bool_t, choose_once=bool_t, choose_time=bool_t, 723 inplace=bool_t, 724 handle=handle_type, 725 num_groups=int_t) 726 727 def __init__(self, inplace=False, algo=None, num_groups=1): 728 DnnBase.__init__(self, ["c_code/dnn_conv_base.c", "c_code/dnn_gw.c"], 729 "APPLY_SPECIFIC(conv_gw)") 730 self.inplace = bool(inplace) 731 if self.inplace: 732 self.destroy_map = {0: [2]} 733 if algo is None: 734 algo = config.dnn.conv.algo_bwd_filter 735 self.algo = algo 736 737 assert cudnn.cudnnConvolutionBwdFilterAlgo_t.has_alias(self.algo) or self.algo in SUPPORTED_DNN_CONV_ALGO_RUNTIME 738 739 self.conv_algo = cudnn.cudnnConvolutionBwdFilterAlgo_t.CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0 740 if self.algo not in SUPPORTED_DNN_CONV_ALGO_RUNTIME: 741 self.conv_algo = self.algo 742 self.choose_algo = self.algo in SUPPORTED_DNN_CONV_ALGO_RUNTIME 743 self.choose_once = self.algo in DNN_CONV_ALGO_CHOOSE_ONCE 744 self.choose_time = self.algo in DNN_CONV_ALGO_CHOOSE_TIME 745 self.num_groups = num_groups 746 747 def __setstate__(self, d): 748 self.__dict__.update(d) 749 if not hasattr(self, 'inplace'): 750 self.inplace = False 751 if not hasattr(self, 'algo'): 752 self.algo = config.dnn.conv.algo_bwd_filter 753 if not hasattr(self, 'num_groups'): 754 self.num_groups = 1 755 756 def grad(self, inp, grads): 757 img, top, output, desc, alpha, beta = inp 758 kerns, = grads 759 760 kerns = gpu_contiguous(kerns) 761 762 d_img = GpuDnnConvGradI(num_groups=self.num_groups)(kerns, top, empty_like(img), desc) 763 d_top = GpuDnnConv(num_groups=self.num_groups)(img, kerns, empty_like(top), desc) 764 d_alpha = grad_not_implemented(self, 4, alpha) 765 d_beta = grad_not_implemented(self, 5, beta) 766 767 return (d_img * alpha, d_top * alpha, kerns * beta, 768 DisconnectedType()(), d_alpha, d_beta) 769 770 def connection_pattern(self, node): 771 # not connected to desc 772 return [[1], [1], [1], [0], [1], [1]] 773 774 def op_may_fail_with_subsample(self, img, desc): 775 return (version() < 6000 and 776 img.type.dtype == 'float32' and 777 img.type.ndim == 5 and 778 self.algo != 'none' and 779 desc.owner.op.subsample != (1, 1, 1)) 780 781 def op_may_fail_with_beta(self, img, beta): 782 return (version() < 6000 and 783 img.type.dtype == 'float32' and 784 self.algo not in ('none', 'deterministic', 'fft', 'small') and 785 beta is not None and 786 theano.tensor.extract_constant(beta) != 1) 787 788 def make_node(self, img, topgrad, output, desc, alpha=None, beta=None): 789 if self.op_may_fail_with_subsample(img, desc): 790 warnings.warn('cuDNN backward filter operation for 3D convolutions may produce bad results ' 791 'with certain cuDNN algorithms depending on the compute capability of your GPU ' 792 'if subsample is not (1, 1, 1). If you encounter problems, consider ' 793 'setting the theano flag "dnn.conv.algo_bwd_filter" to "none".') 794 if self.op_may_fail_with_beta(img, beta): 795 warnings.warn('cuDNN backward filter operation for convolutions may produce bad results ' 796 'with certain cuDNN algorithms depending on the compute capability of your GPU ' 797 'if beta != 1. If you encounter problems, consider ' 798 'setting the theano flag "dnn.conv.algo_bwd_filter" to ' 799 '"none", "deterministic", "fft", or "small".') 800 ctx_name = infer_context_name(img, topgrad, output) 801 img = as_gpuarray_variable(img, ctx_name) 802 topgrad = as_gpuarray_variable(topgrad, ctx_name) 803 output = as_gpuarray_variable(output, ctx_name) 804 if img.type.ndim not in (4, 5): 805 raise TypeError('img must be 4D or 5D tensor') 806 if topgrad.type.ndim not in (4, 5): 807 raise TypeError('topgrad must be 4D or 5D tensor') 808 if output.type.ndim not in (4, 5): 809 raise TypeError('output must be 4D or 5D tensor') 810 811 if (img.type.ndim != topgrad.type.ndim or 812 img.type.ndim != output.type.ndim): 813 raise TypeError("The number of dimensions of " 814 "img, topgrad and output must match") 815 816 if img.type.ndim == 5 and self.algo not in (cudnn.conv3d_bwd_filter_algorithms + 817 SUPPORTED_DNN_CONV_ALGO_RUNTIME): 818 raise ValueError("convolution algo %s can't be used for " 819 "3d convolutions", (self.algo,)) 820 821 if (not isinstance(desc.type, CDataType) or 822 desc.type.ctype != 'cudnnConvolutionDescriptor_t'): 823 raise TypeError('desc must be cudnnConvolutionDescriptor_t') 824 825 alpha = ensure_dt(alpha, _one, 'alpha', img.dtype) 826 beta = ensure_dt(beta, _zero, 'beta', img.dtype) 827 828 return Apply(self, [img, topgrad, output, desc, alpha, beta], 829 [output.type()]) 830 831 def infer_shape(self, node, shape): 832 return [shape[2]] 833 834 835class GpuDnnConvGradI(DnnBase): 836 """ 837 The convolution gradient with respect to the inputs. 838 839 Parameters 840 ---------- 841 image 842 kernel 843 descr 844 The convolution descriptor. 845 algo : {'none', 'deterministic', 'fft', 'fft_tiling', 'winograd', 'guess_once', 846 'guess_on_shape_change', 'time_once', 'time_on_shape_change'} 847 Default is the value of :attr:`config.dnn.conv.algo_bwd_data`. 848 num_groups : 849 Divides the image, kernel and output tensors into num_groups 850 separate groups. Each which carry out convolutions separately 851 852 """ 853 _f16_ok = True 854 __props__ = ('algo', 'inplace', 'num_groups') 855 856 check_input = False 857 params_type = ParamsType(conv_algo=cudnn.cudnnConvolutionBwdDataAlgo_t, 858 choose_algo=bool_t, choose_once=bool_t, choose_time=bool_t, 859 inplace=bool_t, 860 handle=handle_type, 861 num_groups=int_t) 862 863 def __init__(self, inplace=False, algo=None, num_groups=1): 864 DnnBase.__init__(self, ["c_code/dnn_conv_base.c", "c_code/dnn_gi.c"], 865 "APPLY_SPECIFIC(conv_gi)") 866 self.inplace = bool(inplace) 867 if self.inplace: 868 self.destroy_map = {0: [2]} 869 if algo is None: 870 algo = config.dnn.conv.algo_bwd_data 871 self.algo = algo 872 assert cudnn.cudnnConvolutionBwdDataAlgo_t.has_alias(self.algo) or self.algo in SUPPORTED_DNN_CONV_ALGO_RUNTIME 873 874 self.conv_algo = cudnn.cudnnConvolutionBwdDataAlgo_t.CUDNN_CONVOLUTION_BWD_DATA_ALGO_0 875 if self.algo not in SUPPORTED_DNN_CONV_ALGO_RUNTIME: 876 self.conv_algo = self.algo 877 self.choose_algo = self.algo in SUPPORTED_DNN_CONV_ALGO_RUNTIME 878 self.choose_once = self.algo in DNN_CONV_ALGO_CHOOSE_ONCE 879 self.choose_time = self.algo in DNN_CONV_ALGO_CHOOSE_TIME 880 self.num_groups = num_groups 881 882 def __setstate__(self, d): 883 self.__dict__.update(d) 884 if not hasattr(self, 'algo'): 885 self.algo = config.dnn.conv.algo_bwd_data 886 if not hasattr(self, 'inplace'): 887 self.inplace = False 888 if not hasattr(self, 'num_groups'): 889 self.num_groups = 1 890 891 def grad(self, inp, grads): 892 kerns, top, output, desc, alpha, beta = inp 893 img, = grads 894 895 img = gpu_contiguous(img) 896 897 d_kerns = GpuDnnConvGradW(num_groups=self.num_groups)(img, top, empty_like(kerns), desc) 898 d_top = GpuDnnConv(num_groups=self.num_groups)(img, kerns, empty_like(top), desc) 899 d_alpha = grad_not_implemented(self, 4, alpha) 900 d_beta = grad_not_implemented(self, 5, beta) 901 902 return (d_kerns * alpha, d_top * alpha, img * beta, 903 DisconnectedType()(), d_alpha, d_beta) 904 905 def connection_pattern(self, node): 906 # not connected to desc 907 return [[1], [1], [1], [0], [1], [1]] 908 909 def make_node(self, kern, topgrad, output, desc, alpha=None, beta=None): 910 ctx_name = infer_context_name(kern, topgrad, output) 911 kern = as_gpuarray_variable(kern, ctx_name) 912 topgrad = as_gpuarray_variable(topgrad, ctx_name) 913 output = as_gpuarray_variable(output, ctx_name) 914 if kern.type.ndim not in (4, 5): 915 raise TypeError('kern must be 4D or 5D tensor') 916 if topgrad.type.ndim not in (4, 5): 917 raise TypeError('topgrad must be 4D or 5D tensor') 918 if output.type.ndim not in (4, 5): 919 raise TypeError('output must be 4D or 5D tensor') 920 921 if (kern.type.ndim != topgrad.type.ndim or 922 kern.type.ndim != output.type.ndim): 923 raise TypeError("The number of dimensions of " 924 "kern, topgrad and output must match") 925 926 if kern.type.ndim == 5 and self.algo not in (cudnn.conv3d_bwd_data_algorithms + 927 SUPPORTED_DNN_CONV_ALGO_RUNTIME): 928 raise ValueError("convolution algo %s can't be used for " 929 "3d convolutions", (self.algo,)) 930 931 if (not isinstance(desc.type, CDataType) or 932 desc.type.ctype != 'cudnnConvolutionDescriptor_t'): 933 raise TypeError('desc must be cudnnConvolutionDescriptor_t') 934 935 alpha = ensure_dt(alpha, _one, 'alpha', kern.dtype) 936 beta = ensure_dt(beta, _zero, 'beta', kern.dtype) 937 938 return Apply(self, [kern, topgrad, output, desc, alpha, beta], 939 [output.type()]) 940 941 def infer_shape(self, node, shape): 942 return [shape[2]] 943 944 945# These internal implementations for dnn_conv, dnn_gradweight and dnn_gradinput 946# support alpha, beta and out as parameters. Public interfaces follow without 947# underscore prefix. 948 949def _dnn_conv(img, kerns, alpha=1, beta=0, out=None, border_mode='valid', subsample=(1, 1), dilation=(1, 1), 950 conv_mode='conv', algo=None, precision=None, num_groups=1): 951 ctx_name = infer_context_name(img, kerns) 952 953 img = as_gpuarray_variable(img, ctx_name) 954 kerns = as_gpuarray_variable(kerns, ctx_name) 955 956 precision, dt = get_precision(precision, [img, kerns]) 957 958 img = gpu_contiguous(img.astype(dt)) 959 kerns = gpu_contiguous(kerns.astype(dt)) 960 961 desc = GpuDnnConvDesc(border_mode=border_mode, subsample=subsample, dilation=dilation, 962 conv_mode=conv_mode, precision=precision, num_groups=num_groups)(kerns.shape) 963 desc_op = desc.owner.op 964 # We can use Shape_i and bypass the infer_shape here as this is on 965 # the input of node and it will always be present. 966 ishape = [shape_i_op(i)(img) for i in range(img.ndim)] 967 kshape = [shape_i_op(i)(kerns) for i in range(kerns.ndim)] 968 out_shp = get_conv_output_shape(ishape, kshape, desc_op.border_mode, desc_op.subsample, filter_dilation=dilation) 969 out_shp = assert_conv_shape(out_shp) 970 if beta == 0: 971 real_out = GpuAllocEmpty(dtype=img.dtype, context_name=ctx_name)(*out_shp) 972 else: 973 assert out is not None 974 out = gpu_contiguous(as_gpuarray_variable(out, ctx_name)) 975 check = Assert('GpuDnnConv: given output (for beta not null) does not have expected shape') 976 real_out = check(out, theano.tensor.all(theano.tensor.eq(out.shape, out_shp))) 977 return GpuDnnConv(algo=algo, num_groups=num_groups)(img, kerns, real_out, desc, alpha, beta) 978 979 980def _dnn_gradweight(img, topgrad, kerns_shp, alpha=1, beta=0, out=None, border_mode='valid', subsample=(1, 1), 981 dilation=(1, 1), conv_mode='conv', algo=None, precision=None, num_groups=1): 982 ctx_name = infer_context_name(img, topgrad) 983 984 img = as_gpuarray_variable(img, ctx_name) 985 topgrad = as_gpuarray_variable(topgrad, ctx_name) 986 kerns_shp = theano.tensor.as_tensor_variable(kerns_shp) 987 988 precision, dt = get_precision(precision, [img, topgrad], for_grad=True) 989 990 img = gpu_contiguous(img.astype(dt)) 991 topgrad = gpu_contiguous(topgrad.astype(dt)) 992 993 desc = GpuDnnConvDesc(border_mode=border_mode, subsample=subsample, dilation=dilation, 994 conv_mode=conv_mode, precision=precision, num_groups=num_groups)(kerns_shp) 995 if beta == 0: 996 real_out = GpuAllocEmpty(dtype=img.dtype, context_name=ctx_name)(*kerns_shp) 997 else: 998 assert out is not None 999 out = gpu_contiguous(as_gpuarray_variable(out, ctx_name)) 1000 check = Assert('GpuDnnConvGradW: given output (for beta not null) does not have expected shape') 1001 real_out = check(out, theano.tensor.all(theano.tensor.eq(out.shape, kerns_shp))) 1002 return GpuDnnConvGradW(algo=algo, num_groups=num_groups)(img, topgrad, real_out, desc, alpha, beta) 1003 1004 1005def _dnn_gradinput(kerns, topgrad, img_shp, alpha=1, beta=0, out=None, border_mode='valid', subsample=(1, 1), 1006 dilation=(1, 1), conv_mode='conv', algo=None, precision=None, num_groups=1): 1007 ctx_name = infer_context_name(kerns, topgrad) 1008 1009 kerns = as_gpuarray_variable(kerns, ctx_name) 1010 topgrad = as_gpuarray_variable(topgrad, ctx_name) 1011 img_shp = theano.tensor.as_tensor_variable(img_shp) 1012 1013 precision, dt = get_precision(precision, [kerns, topgrad], for_grad=True) 1014 1015 kerns = gpu_contiguous(kerns.astype(dt)) 1016 topgrad = gpu_contiguous(topgrad.astype(dt)) 1017 1018 desc = GpuDnnConvDesc(border_mode=border_mode, subsample=subsample, dilation=dilation, 1019 conv_mode=conv_mode, precision=precision, num_groups=num_groups)(kerns.shape) 1020 if beta == 0: 1021 real_out = GpuAllocEmpty(dtype=kerns.dtype, context_name=ctx_name)(*img_shp) 1022 else: 1023 assert out is not None 1024 out = gpu_contiguous(as_gpuarray_variable(out, ctx_name)) 1025 check = Assert('GpuDnnConvGradI: given output (for beta not null) does not have expected shape') 1026 real_out = check(out, theano.tensor.all(theano.tensor.eq(out.shape, img_shp))) 1027 return GpuDnnConvGradI(algo=algo, num_groups=num_groups)(kerns, topgrad, real_out, desc, alpha, beta) 1028 1029 1030def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1), dilation=(1, 1), 1031 conv_mode='conv', direction_hint=None, workmem=None, 1032 algo=None, precision=None, num_groups=1): 1033 """ 1034 GPU convolution using cuDNN from NVIDIA. 1035 1036 The memory layout to use is 'bc01', that is 'batch', 'channel', 1037 'first dim', 'second dim' in that order. 1038 1039 Parameters 1040 ---------- 1041 img 1042 Images to do the convolution over. 1043 kerns 1044 Convolution filters. 1045 border_mode 1046 One of 'valid', 'full', 'half'; additionally, the padding size 1047 could be directly specified by an integer or a pair of integers. 1048 subsample 1049 Perform subsampling of the output (default: (1, 1)). 1050 dilation 1051 Filter dilation factor. A dilation factor of d is equivalent to a 1052 convolution with d - 1 zeros inserted between neighboring filter 1053 values. 1054 conv_mode 1055 Perform convolution (kernels flipped) or cross-correlation. 1056 One of 'conv', 'cross' (default: 'conv'). 1057 direction_hint 1058 Used by graph optimizers to change algorithm choice. 1059 By default, GpuDnnConv will be used to carry out the convolution. 1060 If border_mode is 'valid', subsample is (1, 1), dilation is (1, 1), and 1061 direction_hint is 'bprop weights', it will use GpuDnnConvGradW. 1062 If border_mode is 'full', subsample is (1, 1), dilation is (1, 1), and 1063 direction_hint is *not* 'forward!', it will use GpuDnnConvGradI. 1064 This parameter is used internally by graph optimizers and may be 1065 removed at any time without a deprecation period. You have been warned. 1066 algo : {'none', 'small', 'large', 'fft', 'guess_once', 'guess_on_shape_change', 'time_once', 'time_on_shape_change'} 1067 Convolution implementation to use. Some of its values may 1068 require certain versions of cuDNN to be installed. Default is 1069 the value of :attr:`config.dnn.conv.algo_fwd`. 1070 precision : {'as_input_f32', 'as_input', 'float16', 'float32', 'float64'} 1071 Description of the dtype in which the computation of the convolution 1072 should be done. Possible values are 'as_input', 'float16', 'float32' 1073 and 'float64'. Default is the value of 1074 :attr:`config.dnn.conv.precision`. 1075 num_groups : 1076 Divides the image, kernel and output tensors into num_groups 1077 separate groups. Each which carry out convolutions separately 1078 1079 1080 .. warning:: The cuDNN library only works with GPUs that have a compute 1081 capability of 3.0 or higher. This means that older GPUs will not 1082 work with this Op. 1083 1084 """ 1085 1086 if workmem is not None: 1087 if algo is not None: 1088 raise ValueError("You can't use both algo and workmem") 1089 warnings.warn("workmem is deprecated, use algo instead", stacklevel=2) 1090 algo = workmem 1091 fgraph = getattr(img, 'fgraph', None) or getattr(kerns, 'fgraph', None) 1092 ctx_name = infer_context_name(img, kerns) 1093 if (border_mode == 'valid' and subsample == (1, 1) and dilation == (1, 1) and 1094 direction_hint == 'bprop weights' and num_groups == 1): 1095 # Special case: We are asked to use GpuDnnConvGradW. We need to set 1096 # up a suitable 'fake' convolution to compute the gradient for. 1097 img = gpu_contiguous(img.dimshuffle(1, 0, 2, 3)) 1098 if conv_mode == 'conv': 1099 # We need to flip manually. These 'kerns' are not the kernels 1100 # that would be flipped by conv_mode='conv' in GpuDnnConvGradW. 1101 kerns = kerns[:, :, ::-1, ::-1] 1102 kerns = gpu_contiguous(kerns.dimshuffle(1, 0, 2, 3)) 1103 out_shp = (shape_i(kerns, 1, fgraph), 1104 shape_i(img, 1, fgraph), 1105 shape_i(img, 2, fgraph) - shape_i(kerns, 2, fgraph) + 1, 1106 shape_i(img, 3, fgraph) - shape_i(kerns, 3, fgraph) + 1) 1107 out_shp = assert_conv_shape(out_shp) 1108 out = GpuAllocEmpty(dtype=img.dtype, context_name=ctx_name)(*out_shp) 1109 precision, _ = get_precision(precision, [img, kerns], for_grad=True) 1110 desc = GpuDnnConvDesc(border_mode='valid', subsample=(1, 1), dilation=(1, 1), 1111 num_groups=num_groups, 1112 conv_mode='cross', precision=precision)(out.shape) 1113 conv = GpuDnnConvGradW(num_groups=num_groups)(img, kerns, out, desc) 1114 return as_gpuarray_variable(conv.dimshuffle(1, 0, 2, 3), ctx_name) 1115 1116 elif (border_mode == 'full' and subsample == (1, 1) and 1117 direction_hint != 'forward!' and num_groups == 1): 1118 # Special case: We can be faster by using GpuDnnConvGradI to compute 1119 # the full convolution as the backward pass of a valid convolution. 1120 # We just need to set up a suitable 'fake' valid convolution. 1121 img = gpu_contiguous(img) # cudnn v2 rc3 need contiguous data 1122 kerns = gpu_contiguous(kerns.dimshuffle(1, 0, 2, 3)) 1123 conv_mode = 'cross' if conv_mode == 'conv' else 'conv' 1124 out_shp = (shape_i(img, 0, fgraph), 1125 shape_i(kerns, 1, fgraph), 1126 shape_i(img, 2, fgraph) + (shape_i(kerns, 2, fgraph) - 1) * dilation[0], 1127 shape_i(img, 3, fgraph) + (shape_i(kerns, 3, fgraph) - 1) * dilation[1]) 1128 out_shp = assert_conv_shape(out_shp) 1129 out = GpuAllocEmpty(dtype=img.dtype, context_name=ctx_name)(*out_shp) 1130 precision, _ = get_precision(precision, [img, kerns], for_grad=True) 1131 desc = GpuDnnConvDesc(border_mode='valid', subsample=(1, 1), dilation=dilation, 1132 num_groups=num_groups, 1133 conv_mode=conv_mode, precision=precision)(kerns.shape) 1134 return GpuDnnConvGradI(num_groups=num_groups)(kerns, img, out, desc) 1135 1136 # Standard case: We use GpuDnnConv with suitable padding. 1137 return _dnn_conv(img, kerns, algo=algo, border_mode=border_mode, subsample=subsample, dilation=dilation, 1138 conv_mode=conv_mode, precision=precision, num_groups=num_groups) 1139 1140 1141def dnn_conv3d(img, kerns, border_mode='valid', subsample=(1, 1, 1), dilation=(1, 1, 1), 1142 conv_mode='conv', direction_hint=None, 1143 algo=None, precision=None, num_groups=1): 1144 """ 1145 GPU convolution using cuDNN from NVIDIA. 1146 1147 The memory layout to use is 'bc012', that is 'batch', 'channel', 1148 'first dim', 'second dim', 'third dim' in that order. 1149 1150 Parameters 1151 ---------- 1152 img 1153 Images to do the convolution over. 1154 kerns 1155 Convolution filters. 1156 border_mode 1157 One of 'valid', 'full', 'half'; additionally, the padding size 1158 could be directly specified by an integer or a pair of integers. 1159 subsample 1160 Perform subsampling of the output (default: (1, 1, 1)). 1161 dilation 1162 Filter dilation factor. A dilation factor of d is equivalent to a 1163 convolution with d - 1 zeros inserted between neighboring filter 1164 values. 1165 conv_mode 1166 Perform convolution (kernels flipped) or cross-correlation. 1167 One of 'conv', 'cross' (default: 'conv'). 1168 direction_hint 1169 Used by graph optimizers to change algorithm choice. 1170 By default, GpuDnnConv will be used to carry out the convolution. 1171 If border_mode is 'valid', subsample is (1, 1, 1), dilation is 1172 (1, 1, 1), and direction_hint is 'bprop weights', it will use 1173 GpuDnnConvGradW. 1174 If border_mode is 'full', subsample is (1, 1, 1), dilation is 1175 (1, 1, 1), and direction_hint is *not* 'forward!', it will use 1176 GpuDnnConvGradI. 1177 This parameter is used internally by graph optimizers and may be 1178 removed at any time without a deprecation period. You have been warned. 1179 algo : convolution implementation to use. Only 'none' is implemented 1180 for the conv3d. Default is the value of :attr:`config.dnn.conv.algo_fwd`. 1181 precision : {'as_input_f32', 'as_input', 'float16', 'float32', 'float64'} 1182 Description of the dtype in which the computation of the convolution 1183 should be done. Possible values are 'as_input', 'float16', 'float32' 1184 and 'float64'. Default is the value of 1185 :attr:`config.dnn.conv.precision`. 1186 num_groups : 1187 Divides the image, kernel and output tensors into num_groups 1188 separate groups. Each which carry out convolutions separately 1189 1190 1191 .. warning:: The cuDNN library only works with GPUs that have a compute 1192 capability of 3.0 or higher. This means that older GPUs will not 1193 work with this Op. 1194 1195 """ 1196 1197 fgraph = getattr(img, 'fgraph', None) or getattr(kerns, 'fgraph', None) 1198 ctx_name = infer_context_name(img, kerns) 1199 if (border_mode == 'valid' and subsample == (1, 1, 1) and dilation == (1, 1, 1) and 1200 direction_hint == 'bprop weights' and num_groups == 1): 1201 # Special case: We are asked to use GpuDnnConvGradW. We need to set 1202 # up a suitable 'fake' convolution to compute the gradient for. 1203 img = gpu_contiguous(img.dimshuffle(1, 0, 2, 3, 4)) 1204 if conv_mode == 'conv': 1205 # We need to flip manually. These 'kerns' are not the kernels 1206 # that would be flipped by conv_mode='conv' in GpuDnnConvGradW. 1207 kerns = kerns[:, :, ::-1, ::-1, ::-1] 1208 kerns = gpu_contiguous(kerns.dimshuffle(1, 0, 2, 3, 4)) 1209 out_shp = (shape_i(kerns, 1, fgraph), 1210 shape_i(img, 1, fgraph), 1211 shape_i(img, 2, fgraph) - shape_i(kerns, 2, fgraph) + 1, 1212 shape_i(img, 3, fgraph) - shape_i(kerns, 3, fgraph) + 1, 1213 shape_i(img, 4, fgraph) - shape_i(kerns, 4, fgraph) + 1) 1214 out_shp = assert_conv_shape(out_shp) 1215 out = GpuAllocEmpty(dtype=img.dtype, context_name=ctx_name)(*out_shp) 1216 precision, _ = get_precision(precision, [img, kerns], for_grad=True) 1217 desc = GpuDnnConvDesc(border_mode='valid', subsample=(1, 1, 1), dilation=(1, 1, 1), 1218 num_groups=num_groups, 1219 conv_mode='cross', precision=precision)(out.shape) 1220 conv = GpuDnnConvGradW(num_groups=num_groups)(img, kerns, out, desc) 1221 return as_gpuarray_variable(conv.dimshuffle(1, 0, 2, 3, 4), ctx_name) 1222 1223 elif (border_mode == 'full' and subsample == (1, 1, 1) and 1224 direction_hint != 'forward!' and num_groups == 1): 1225 # Special case: We can be faster by using GpuDnnConvGradI to compute 1226 # the full convolution as the backward pass of a valid convolution. 1227 # We just need to set up a suitable 'fake' valid convolution. 1228 img = gpu_contiguous(img) # cudnn v2 rc3 need contiguous data 1229 kerns = gpu_contiguous(kerns.dimshuffle(1, 0, 2, 3, 4)) 1230 conv_mode = 'cross' if conv_mode == 'conv' else 'conv' 1231 out_shp = (shape_i(img, 0, fgraph), 1232 shape_i(kerns, 1, fgraph), 1233 shape_i(img, 2, fgraph) + (shape_i(kerns, 2, fgraph) - 1) * dilation[0], 1234 shape_i(img, 3, fgraph) + (shape_i(kerns, 3, fgraph) - 1) * dilation[1], 1235 shape_i(img, 4, fgraph) + (shape_i(kerns, 4, fgraph) - 1) * dilation[2]) 1236 out_shp = assert_conv_shape(out_shp) 1237 out = GpuAllocEmpty(dtype=img.dtype, context_name=ctx_name)(*out_shp) 1238 precision, _ = get_precision(precision, [img, kerns], for_grad=True) 1239 desc = GpuDnnConvDesc(border_mode='valid', subsample=(1, 1, 1), dilation=dilation, 1240 num_groups=num_groups, 1241 conv_mode=conv_mode, precision=precision)(kerns.shape) 1242 return GpuDnnConvGradI(num_groups=num_groups)(kerns, img, out, desc) 1243 1244 # Standard case: We use GpuDnnConv with suitable padding. 1245 return _dnn_conv(img, kerns, algo=algo, border_mode=border_mode, subsample=subsample, dilation=dilation, 1246 conv_mode=conv_mode, precision=precision, num_groups=num_groups) 1247 1248 1249def dnn_gradweight(img, topgrad, kerns_shp, border_mode='valid', 1250 subsample=(1, 1), dilation=(1, 1), conv_mode='conv', 1251 precision=None, algo=None, num_groups=1): 1252 """ 1253 TODO: document this 1254 """ 1255 return _dnn_gradweight(img, topgrad, kerns_shp, border_mode=border_mode, subsample=subsample, dilation=dilation, 1256 conv_mode=conv_mode, algo=algo, precision=precision, num_groups=num_groups) 1257 1258 1259def dnn_gradweight3d(img, topgrad, kerns_shp, border_mode='valid', 1260 subsample=(1, 1, 1), dilation=(1, 1, 1), conv_mode='conv', 1261 precision=None, algo=None, num_groups=1): 1262 """ 1263 3d version of dnn_gradweight 1264 """ 1265 return dnn_gradweight(img, topgrad, kerns_shp, border_mode, 1266 subsample, dilation, conv_mode, precision, 1267 algo, num_groups) 1268 1269 1270def dnn_gradinput(kerns, topgrad, img_shp, border_mode='valid', 1271 subsample=(1, 1), dilation=(1, 1), conv_mode='conv', 1272 precision=None, algo=None, num_groups=1): 1273 """ 1274 TODO: document this 1275 """ 1276 return _dnn_gradinput(kerns, topgrad, img_shp, border_mode=border_mode, subsample=subsample, dilation=dilation, 1277 conv_mode=conv_mode, algo=algo, precision=precision, num_groups=num_groups) 1278 1279 1280def dnn_gradinput3d(kerns, topgrad, img_shp, border_mode='valid', 1281 subsample=(1, 1, 1), dilation=(1, 1, 1), conv_mode='conv', 1282 precision=None, algo=None, num_groups=1): 1283 """ 1284 3d version of `dnn_gradinput`. 1285 """ 1286 return dnn_gradinput(kerns, topgrad, img_shp, border_mode, subsample, 1287 dilation, conv_mode, precision, algo, 1288 num_groups) 1289 1290 1291class GpuDnnPoolDesc(Op): 1292 1293 """ 1294 This Op builds a pooling descriptor for use in the other 1295 pooling operations. 1296 1297 `ws`, `stride` and `pad` must have the same length. 1298 1299 Parameters 1300 ---------- 1301 ws : tuple 1302 Window size. 1303 stride : tuple 1304 (dx, dy) or (dx, dy, dz). 1305 mode : {'max', 'average_inc_pad', 'average_exc_pad'} 1306 The old deprecated name 'average' corresponds to 'average_inc_pad'. 1307 pad : tuple 1308 (padX, padY) or (padX, padY, padZ) 1309 1310 Note 1311 ---- 1312 Not used anymore. Only needed to reload old pickled files. 1313 """ 1314 1315 __props__ = ('ws', 'stride', 'mode', 'pad') 1316 1317 def c_headers(self): 1318 return ['cudnn.h', 'cudnn_helper.h'] 1319 1320 def c_header_dirs(self): 1321 return [gpuarray_helper_inc_dir(), config.dnn.include_path] 1322 1323 def c_libraries(self): 1324 return ['cudnn'] 1325 1326 def c_lib_dirs(self): 1327 return [config.dnn.library_path] 1328 1329 def do_constant_folding(self, node): 1330 return False 1331 1332 def __init__(self, ws=(1, 1), stride=(1, 1), mode='max', pad=(0, 0)): 1333 if mode == 'average': 1334 mode = 'average_inc_pad' 1335 assert mode in ('max', 'average_inc_pad', 'average_exc_pad') 1336 self.mode = mode 1337 1338 assert len(ws) == len(stride) and len(stride) == len(pad) 1339 assert len(ws) in (2, 3) 1340 self.ws = ws 1341 self.stride = stride 1342 self.pad = pad 1343 1344 def get_ndim(self): 1345 return len(self.ws) 1346 1347 def __setstate__(self, d): 1348 self.__dict__.update(d) 1349 if not hasattr(self, 'pad'): 1350 self.pad = (0, 0) 1351 1352 def make_node(self): 1353 node = Apply(self, [], 1354 [CUDNNDataType("cudnnPoolingDescriptor_t", 1355 freefunc="cudnnDestroyPoolingDescriptor")()]) 1356 # DebugMode cannot compare the values of CDataType variables, so by 1357 # default it returns False all the time. To prevent DebugMode from 1358 # complaining because of the MergeOptimizer, we make this variable 1359 # always compare to True. 1360 out = node.outputs[0] 1361 out.tag.values_eq_approx = tensor.type.values_eq_approx_always_true 1362 return node 1363 1364 def c_code(self, node, name, inputs, outputs, sub): 1365 desc, = outputs 1366 1367 if self.mode == 'max': 1368 mode_flag = 'CUDNN_POOLING_MAX' 1369 elif self.mode == "average_inc_pad": 1370 mode_flag = 'CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING' 1371 elif self.mode == "average_exc_pad": 1372 mode_flag = 'CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING' 1373 else: 1374 raise NotImplementedError("Unsupported pooling model.") 1375 1376 return """ 1377{ 1378 cudnnStatus_t err; 1379 1380 if ((err = cudnnCreatePoolingDescriptor(&%(desc)s)) != CUDNN_STATUS_SUCCESS) { 1381 PyErr_Format(PyExc_MemoryError, "could not allocate pooling " 1382 "descriptor: %%s", cudnnGetErrorString(err)); 1383 %(fail)s 1384 } 1385 1386 static const int win[%(nd)d] = {%(win)s}; 1387 static const int pad[%(nd)d] = {%(pad)s}; 1388 static const int str[%(nd)d] = {%(str)s}; 1389 1390 err = cudnnSetPoolingNdDescriptor(%(desc)s, %(mode_flag)s, CUDNN_PROPAGATE_NAN, %(nd)d, win, pad, str); 1391 1392 if (err != CUDNN_STATUS_SUCCESS) { 1393 PyErr_Format(PyExc_RuntimeError, "could not set op descriptor: %%s", 1394 cudnnGetErrorString(err)); 1395 %(fail)s 1396 } 1397} 1398""" % dict(name=name, desc=desc, mode_flag=mode_flag, fail=sub['fail'], 1399 nd=self.get_ndim(), win=', '.join(map(str, self.ws)), 1400 pad=', '.join(map(str, self.pad)), 1401 str=', '.join(map(str, self.stride))) 1402 1403 def c_code_cache_version(self): 1404 return (4, version()) 1405 1406 1407class GpuDnnPoolBase(DnnBase): 1408 1409 """ 1410 Abstract base class for GpuDnnPool and GpuDnnPoolGrad. 1411 1412 """ 1413 1414 # c_file and c_function must be defined in sub-classes. 1415 c_file = None 1416 c_function = None 1417 1418 _f16_ok = True 1419 __props__ = ('mode',) 1420 check_input = False 1421 params_type = ParamsType(mode=cudnn.cudnnPoolingMode_t, 1422 handle=handle_type) 1423 1424 def __init__(self, mode='max'): 1425 DnnBase.__init__(self, [self.c_file], self.c_function) 1426 if mode == 'average': 1427 mode = 'average_inc_pad' 1428 # Supported modes depend on runtime cuDNN version. 1429 assert cudnn.cudnnPoolingMode_t.has_alias(mode) 1430 self.mode = mode 1431 1432 1433class GpuDnnPool(GpuDnnPoolBase): 1434 1435 """ 1436 Parameters 1437 ---------- 1438 img : tensor 1439 The image 4d or 5d tensor. 1440 ws : tensor 1441 Window size. 1442 stride : tensor 1443 (dx, dy) or (dx, dy, dz). 1444 mode : {'max', 'average_inc_pad', 'average_exc_pad'} 1445 The old deprecated name 'average' corresponds to 'average_inc_pad'. 1446 pad : tensor 1447 (padX, padY) or (padX, padY, padZ) 1448 1449 """ 1450 c_file = "c_code/dnn_pool.c" 1451 c_function = "APPLY_SPECIFIC(dnn_pool)" 1452 1453 def make_node(self, img, ws, stride, pad): 1454 ctx_name = infer_context_name(img) 1455 img = as_gpuarray_variable(img, ctx_name) 1456 1457 ws = tensor.as_tensor_variable(ws) 1458 stride = tensor.as_tensor_variable(stride) 1459 pad = tensor.as_tensor_variable(pad) 1460 assert ws.type.ndim == stride.type.ndim and ws.type.ndim == pad.type.ndim 1461 assert ws.type.ndim == 1 1462 1463 return Apply(self, [img, ws, stride, pad], [img.type()]) 1464 1465 def infer_shape(self, node, shape): 1466 w = node.inputs[1] 1467 s = node.inputs[2] 1468 p = node.inputs[3] 1469 1470 res = [shape[0][0], shape[0][1], 1471 (shape[0][2] + 2 * p[0] - w[0]) // s[0] + 1, 1472 (shape[0][3] + 2 * p[1] - w[1]) // s[1] + 1 1473 ] 1474 if node.inputs[0].ndim == 5: 1475 res.append((shape[0][4] + 2 * p[2] - w[2]) // s[2] + 1) 1476 return [res] 1477 1478 def L_op(self, inp, outputs, grads): 1479 img, ws, stride, pad = inp 1480 grad, = grads 1481 1482 grad = gpu_contiguous(grad) 1483 1484 out, = outputs 1485 1486 g_out = GpuDnnPoolGrad(mode=self.mode)(img, out, grad, ws, stride, pad) 1487 1488 return g_out, theano.gradient.DisconnectedType()(), theano.gradient.DisconnectedType()(), theano.gradient.DisconnectedType()() 1489 1490 def connection_pattern(self, node): 1491 # not connected to parameters 1492 return [[1], [0], [0], [0]] 1493 1494 1495class GpuDnnPoolGrad(GpuDnnPoolBase): 1496 1497 """ 1498 The pooling gradient. 1499 1500 Parameters 1501 ---------- 1502 inp 1503 The input of the pooling. 1504 out 1505 The output of the pooling in the forward. 1506 out_grad 1507 Same size as out, but is the corresponding gradient information. 1508 ws : tensor variable 1509 Window size. 1510 stride : tensor variable 1511 (dx, dy) or (dx, dy, dz). 1512 mode : {'max', 'average_inc_pad', 'average_exc_pad'} 1513 The old deprecated name 'average' corresponds to 'average_inc_pad'. 1514 pad : tensor 1515 (padX, padY) or (padX, padY, padZ) 1516 1517 """ 1518 c_file = "c_code/dnn_pool_grad.c" 1519 c_function = "APPLY_SPECIFIC(dnn_pool_grad)" 1520 1521 def make_node(self, inp, out, out_grad, ws, stride, pad): 1522 ctx_name = infer_context_name(inp, out, out_grad) 1523 inp = as_gpuarray_variable(inp, ctx_name) 1524 assert (inp.ndim in [4, 5]) 1525 out_grad = as_gpuarray_variable(out_grad, ctx_name) 1526 assert (out_grad.ndim in [4, 5]) 1527 out = as_gpuarray_variable(out, ctx_name) 1528 assert(out.ndim in [4, 5]) 1529 1530 assert (out_grad.ndim == inp.ndim) 1531 assert (inp.ndim == out.ndim) 1532 1533 ws = tensor.as_tensor_variable(ws) 1534 stride = tensor.as_tensor_variable(stride) 1535 pad = tensor.as_tensor_variable(pad) 1536 assert ws.type.ndim == stride.type.ndim and ws.type.ndim == pad.type.ndim 1537 assert ws.type.ndim == 1 1538 1539 return Apply(self, [inp, out, out_grad, ws, stride, pad], [inp.type()]) 1540 1541 def infer_shape(self, node, shape): 1542 return [shape[0]] 1543 1544 1545def dnn_pool(img, ws, stride=None, mode='max', pad=None): 1546 """ 1547 GPU pooling using cuDNN from NVIDIA. 1548 1549 The memory layout to use is 'bc01', that is 'batch', 'channel', 1550 'first dim', 'second dim' in that order. 1551 1552 `ws`, `stride` and `pad` must have the same length. 1553 1554 Parameters 1555 ---------- 1556 img 1557 Images to do the pooling over. 1558 ws : tuple 1559 Subsampling window size. Should have 2 or 3 elements. 1560 stride : tuple 1561 Subsampling stride (default: (1, 1) or (1, 1, 1)). 1562 mode : {'max', 'average_inc_pad', 'average_exc_pad', 'sum', 'max_deterministic'} 1563 **NB**: 'max_deterministic' is supported since cuDNN v6. 1564 pad : tuple 1565 (padX, padY) or (padX, padY, padZ) 1566 default: (0, 0) or (0, 0, 0) 1567 1568 1569 .. warning:: The cuDNN library only works with GPU that have a compute 1570 capability of 3.0 or higher. This means that older GPU will not 1571 work with this Op. 1572 1573 Notes 1574 ----- 1575 This Op implements the ignore_border=True of max_pool_2d. 1576 1577 """ 1578 img = gpu_contiguous(img) 1579 if stride is None: 1580 stride = (1,) * len(ws) 1581 if pad is None: 1582 pad = (0,) * len(ws) 1583 if mode == "sum": 1584 ret = GpuDnnPool(mode="average_inc_pad")(img, ws, stride, pad) 1585 context_name = ret.type.context_name 1586 window_elem = theano.tensor.prod(ws).astype(ret.dtype) 1587 return as_gpuarray_variable(ret * window_elem, context_name) 1588 return GpuDnnPool(mode=mode)(img, ws, stride, pad) 1589 1590 1591class GpuDnnSoftmaxBase(DnnBase): 1592 1593 """ 1594 Op for the cuDNN Softmax. 1595 1596 Parameters 1597 ---------- 1598 algo : {'fast', 'accurate', 'log'} 1599 Indicating whether, respectively, computations should be optimized for 1600 speed, for accuracy, or if cuDNN should rather compute the log-softmax instead. 1601 mode : {'instance', 'channel'} 1602 Indicating whether the softmax should be computed per image across 'c01' 1603 or per spatial location '01' per image across 'c'. 1604 1605 """ 1606 1607 __props__ = ('mode', 'algo') 1608 # Neither inputs nor output types properties are used 1609 # neither in dnn_base.c nor in dnn_softmax*.c, 1610 # so we can disable input checking. 1611 check_input = False 1612 params_type = ParamsType(algo=cudnn.cudnnSoftmaxAlgorithm_t, 1613 mode=cudnn.cudnnSoftmaxMode_t, 1614 handle=handle_type) 1615 1616 def __init__(self, algo, mode): 1617 DnnBase.__init__(self, [self.file], self.c_func) 1618 1619 assert cudnn.cudnnSoftmaxAlgorithm_t.has_alias(algo) 1620 self.algo = algo 1621 1622 assert cudnn.cudnnSoftmaxMode_t.has_alias(mode) 1623 self.mode = mode 1624 1625 def infer_shape(self, node, shape): 1626 if self.direction == 'forward': 1627 return [shape[0]] 1628 else: 1629 return [shape[1]] 1630 1631 1632class GpuDnnSoftmax(GpuDnnSoftmaxBase): 1633 1634 """ 1635 Op for the cuDNN Softmax. 1636 1637 algo : {'fast', 'accurate', 'log'} 1638 Indicating whether, respectively, computations should be optimized for 1639 speed, for accuracy, or if cuDNN should rather compute the log-softmax instead. 1640 mode : {'instance', 'channel'} 1641 Indicating whether the softmax should be computed per image across 'c01' 1642 or per spatial location '01' per image across 'c'. 1643 1644 """ 1645 _f16_ok = True 1646 direction = "forward" 1647 file = "c_code/dnn_softmax.c" 1648 c_func = "APPLY_SPECIFIC(softmax)" 1649 1650 def make_node(self, x): 1651 x = as_gpuarray_variable(x, infer_context_name(x)) 1652 assert x.ndim == 4 1653 return Apply(self, [x], [x.type()]) 1654 1655 def L_op(self, inp, outputs, grads): 1656 x, = inp 1657 g_sm, = grads 1658 sm, = outputs 1659 return [GpuDnnSoftmaxGrad( 1660 self.algo, 1661 self.mode 1662 )(g_sm, sm)] 1663 1664 1665class GpuDnnSoftmaxGrad(GpuDnnSoftmaxBase): 1666 1667 """ 1668 Op for the cuDNN SoftmaxGrad. 1669 1670 Parameters 1671 ---------- 1672 algo 1673 'fast', 'accurate' or 'log' indicating whether, respectively, 1674 computations should be optimized for speed, for accuracy, or if cuDNN 1675 should rather compute the gradient of the log-softmax instead. 1676 mode 1677 'instance' or 'channel' indicating whether the softmax should 1678 be computed per image across 'c01' or per spatial location '01' per 1679 image across 'c'. 1680 1681 """ 1682 _f16_ok = True 1683 direction = 'backward' 1684 file = "c_code/dnn_softmax_grad.c" 1685 c_func = "APPLY_SPECIFIC(softmax_grad)" 1686 1687 def make_node(self, dy, sm): 1688 ctx_name = infer_context_name(dy, sm) 1689 dy = as_gpuarray_variable(dy, ctx_name) 1690 sm = as_gpuarray_variable(sm, ctx_name) 1691 assert dy.ndim == 4 1692 assert sm.ndim == 4 1693 return Apply(self, [dy, sm], [sm.type()]) 1694 1695 1696class GpuDnnReduction(DnnBase): 1697 check_input = False 1698 _f16_ok = True 1699 _cop_num_outputs = 2 1700 1701 __props__ = ('red_op', 'axis', 'acc_dtype', 'dtype', 'return_indices') 1702 1703 params_type = ParamsType(red_op=cudnn.cudnnReduceTensorOp_t, 1704 acc_dtype=cudnn.cudnnDataType_t, 1705 c_axis=uint32_t, 1706 handle=handle_type) 1707 1708 def __init__(self, red_op, axis, acc_dtype, dtype, return_indices): 1709 DnnBase.__init__(self, ['c_code/dnn_redux.c'], 'APPLY_SPECIFIC(dnn_redux)') 1710 assert cudnn.cudnnReduceTensorOp_t.has_alias(red_op) 1711 self.red_op = red_op 1712 assert acc_dtype in ['float16', 'float32', 'float64'] 1713 self.acc_dtype = acc_dtype 1714 assert dtype in ['float16', 'float32', 'float64'] 1715 self.dtype = dtype 1716 # 8 is the current limit for cudnn 1717 if axis is not None: 1718 if len(axis) > 8: 1719 raise ValueError('Too many axes to reduce on') 1720 if any(a >= 8 for a in axis): 1721 raise ValueError('Axes larger than 8 not supported') 1722 axis = tuple(axis) 1723 # c_axis is a bitfield (1 to reduce) 1724 self.c_axis = self._convert_axis(axis) 1725 # axis is a list of axes to reduce on 1726 self.axis = axis 1727 if return_indices and (red_op != 'maximum' and red_op != 'minimum'): 1728 raise ValueError("Can't request indices for something other than" 1729 " minimum or maximum") 1730 self.return_indices = return_indices 1731 1732 def _convert_axis(self, axis): 1733 if axis is None: 1734 return np.uint32(-1) 1735 else: 1736 return reduce(lambda a, b: a | b, map(lambda a: 1 << a, axis), 0) 1737 1738 def make_node(self, inp): 1739 ctx_name = infer_context_name(inp) 1740 inp = as_gpuarray_variable(inp, ctx_name) 1741 inp = gpu_contiguous(inp) 1742 if inp.ndim > 8: 1743 raise ValueError("cuDNN reduction doesn't support nd > 8") 1744 assert inp.dtype in ['float16', 'float32', 'float64'] 1745 1746 # These restrictions where guessed from vague clues since 1747 # there is no actual documentation on this 1748 if inp.dtype == 'float64': 1749 assert self.acc_dtype == 'float64' 1750 if inp.dtype == 'float32': 1751 assert self.acc_dtype == 'float32' 1752 if inp.dtype == 'float16': 1753 assert self.acc_dtype != 'float64' 1754 1755 bcast = [] 1756 for i in range(inp.ndim): 1757 if not (self.c_axis & (1 << i)): 1758 bcast.append(inp.broadcastable[i]) 1759 outs = [inp.type.clone(dtype=self.dtype, broadcastable=bcast)()] 1760 if self.return_indices: 1761 outs.append(GpuArrayType(dtype='uint32', broadcastable=bcast, 1762 context_name=ctx_name)()) 1763 1764 return Apply(self, [inp], outs) 1765 1766 1767class GpuDnnBatchNorm(DnnBase): 1768 """ 1769 Base Op for cuDNN Batch Normalization. 1770 1771 Parameters 1772 ---------- 1773 mode : {'per-activation', 'spatial'} 1774 Whether to normalize per activation (in this mode, bias and scale 1775 tensor dimensions are 1xCxHxW) or share normalization factors across 1776 spatial dimensions (in this mode, bias and scale tensor dimensions 1777 are 1xCx1x1). 1778 epsilon 1779 Epsilon value used in the batch normalization formula. Minimum allowed 1780 value is 1e-5 (imposed by cuDNN). 1781 running_average_factor : float 1782 Factor for updating the values or `running_mean` and `running_var`. 1783 If the factor is close to one, the running averages will update quickly, 1784 if the factor is close to zero it will update slowly. 1785 running_mean : tensor or None 1786 Previous value of the running mean. If this is given, the new value 1787 ``running_mean * (1 - r_a_factor) + batch mean * r_a_factor`` 1788 will be returned as one of the outputs of this function. 1789 `running_mean` and `running_var` should either both be given or 1790 both be None. 1791 running_var : tensor or None 1792 Previous value of the running variance. If this is given, the new value 1793 ``running_var * (1 - r_a_factor) + (m / (m - 1)) * batch var * r_a_factor`` 1794 will be returned as one of the outputs of this function, 1795 where `m` is the product of lengths of the averaged-over dimensions. 1796 `running_mean` and `running_var` should either both be given or 1797 both be None. 1798 """ 1799 1800 __props__ = ('mode', 'running_averages', 'inplace_running_mean', 1801 'inplace_running_var', 'inplace_output') 1802 _cop_num_inputs = 7 1803 _cop_num_outputs = 5 1804 check_input = False 1805 params_type = ParamsType(mode=cudnn.cudnnBatchNormMode_t, 1806 inplace_output=bool_t, 1807 inplace_running_mean=bool_t, 1808 inplace_running_var=bool_t, 1809 handle=handle_type) 1810 1811 def __init__(self, mode='per-activation', running_averages=False, 1812 inplace_running_mean=False, inplace_running_var=False, 1813 inplace_output=False): 1814 DnnBase.__init__(self, ['c_code/dnn_batchnorm_base.c', 'c_code/dnn_batchnorm.c'], 1815 'dnn_batchnorm_op') 1816 1817 assert cudnn.cudnnBatchNormMode_t.has_alias(mode) 1818 self.mode = mode 1819 self.running_averages = running_averages 1820 self.inplace_output = inplace_output 1821 self.inplace_running_mean = inplace_running_mean 1822 self.inplace_running_var = inplace_running_var 1823 self.destroy_map = {} 1824 if self.inplace_output: 1825 self.destroy_map[0] = [0] 1826 if self.running_averages and self.inplace_running_mean: 1827 self.destroy_map[3] = [5] 1828 if self.running_averages and self.inplace_running_var: 1829 self.destroy_map[4] = [6] 1830 1831 def __setstate__(self, d): 1832 self.__dict__.update(d) 1833 if not hasattr(self, 'running_average_factor'): 1834 self.running_average_factor = 0 1835 if not hasattr(self, 'running_averages'): 1836 self.running_averages = False 1837 if not (hasattr(self, 'inplace_running_mean') and 1838 hasattr(self, 'inplace_running_var') and 1839 hasattr(self, 'inplace_output')): 1840 self.inplace_running_mean = False 1841 self.inplace_running_var = False 1842 self.inplace_output = False 1843 self.destroy_map = {} 1844 1845 def infer_shape(self, node, shape): 1846 return [shape[0]] + [shape[1]] * (len(node.outputs) - 1) 1847 1848 def make_node(self, x, scale, bias, epsilon=1e-4, 1849 running_average_factor=0.1, 1850 running_mean=None, running_var=None): 1851 assert x.ndim == scale.ndim == bias.ndim 1852 assert x.ndim in (4, 5) 1853 assert self.running_averages == (running_mean is not None) == (running_var is not None) 1854 assert (running_mean is None or running_mean.ndim == x.ndim) 1855 assert (running_var is None or running_var.ndim == x.ndim) 1856 ctx_name = infer_context_name(x, scale, bias) 1857 x = as_gpuarray_variable(x, ctx_name) 1858 scale = as_gpuarray_variable(scale, ctx_name) 1859 bias = as_gpuarray_variable(bias, ctx_name) 1860 epsilon = as_scalar(epsilon).astype('float64') 1861 running_average_factor = as_scalar(running_average_factor).astype('float64') 1862 inputs = [x, scale, bias, epsilon, running_average_factor] 1863 output_types = [x.type(), scale.type(), scale.type()] 1864 if running_mean is not None and running_var is not None: 1865 inputs.append(as_gpuarray_variable(running_mean, ctx_name)) 1866 inputs.append(as_gpuarray_variable(running_var, ctx_name)) 1867 output_types.append(scale.type()) 1868 output_types.append(scale.type()) 1869 return Apply(self, inputs, output_types) 1870 1871 def L_op(self, inputs, outputs, grads): 1872 x, scale, bias, epsilon, running_average_factor = inputs[:5] 1873 dy = grads[0] 1874 _, x_mean, x_invstd = outputs[:3] 1875 disconnected_outputs = [ 1876 DisconnectedType()(), # epsilon 1877 DisconnectedType()()] # running_average_factor 1878 # Optional running_mean and running_var. 1879 for i in range(5, len(inputs)): 1880 disconnected_outputs.append(DisconnectedType()()) 1881 return GpuDnnBatchNormGrad(self.mode)( 1882 x, dy, scale, x_mean, x_invstd, epsilon) + disconnected_outputs 1883 1884 def connection_pattern(self, node): 1885 # Specificy that epsilon and running_average_factor are not connected to outputs. 1886 patterns = [[True, True, True], # x 1887 [True, True, True], # scale 1888 [True, True, True], # bias 1889 [False, False, False], # epsilon 1890 [False, False, False]] # running_average_factor 1891 # Optional running_mean and running_var are only 1892 # connected to their new values. 1893 for i in range(5, len(node.inputs)): 1894 patterns[0].append(True) 1895 for pattern in patterns[1:]: 1896 pattern.append(False) 1897 patterns.append([False] * (3 + i - 5) + [True]) 1898 return patterns 1899 1900 1901class GpuDnnBatchNormInference(DnnBase): 1902 """ 1903 Base Op for cuDNN Batch Normalization. 1904 1905 Parameters 1906 ---------- 1907 mode : {'per-activation', 'spatial'} 1908 Whether to normalize per activation (in this mode, bias and scale 1909 tensor dimensions are 1xCxHxW) or share normalization factors across 1910 spatial dimensions (in this mode, bias and scale tensor dimensions 1911 are 1xCx1x1). 1912 epsilon 1913 Epsilon value used in the batch normalization formula. Minimum allowed 1914 value is 1e-5 (imposed by cuDNN). 1915 """ 1916 1917 __props__ = ('mode', 'inplace') 1918 1919 check_input = False 1920 params_type = ParamsType(mode=cudnn.cudnnBatchNormMode_t, 1921 inplace=bool_t, 1922 handle=handle_type) 1923 1924 def __init__(self, mode='per-activation', inplace=False): 1925 DnnBase.__init__(self, ['c_code/dnn_batchnorm_base.c', 'c_code/dnn_batchnorm_inf.c'], 1926 'dnn_batchnorm_op') 1927 1928 assert cudnn.cudnnBatchNormMode_t.has_alias(mode) 1929 self.mode = mode 1930 self.inplace = bool(inplace) 1931 if self.inplace: 1932 self.destroy_map = {0: [0]} 1933 1934 def __setstate__(self, d): 1935 self.__dict__.update(d) 1936 if not hasattr(self, 'inplace'): 1937 self.inplace = False 1938 1939 def infer_shape(self, node, shape): 1940 return [shape[0]] 1941 1942 def make_node(self, x, scale, bias, estimated_mean, estimated_variance, epsilon=1e-4): 1943 ctx_name = infer_context_name(x, scale, bias, estimated_mean, 1944 estimated_variance) 1945 x = as_gpuarray_variable(x, ctx_name) 1946 scale = as_gpuarray_variable(scale, ctx_name) 1947 bias = as_gpuarray_variable(bias, ctx_name) 1948 estimated_mean = as_gpuarray_variable(estimated_mean, ctx_name) 1949 estimated_variance = as_gpuarray_variable(estimated_variance, ctx_name) 1950 epsilon = as_scalar(epsilon).astype('float64') 1951 assert x.ndim == scale.ndim == bias.ndim == estimated_mean.ndim == estimated_variance.ndim 1952 assert x.ndim in (4, 5) 1953 return Apply(self, [x, scale, bias, estimated_mean, estimated_variance, epsilon], [x.type()]) 1954 1955 def grad(self, inputs, grads): 1956 x, scale, bias, est_mean, est_var, epsilon = inputs 1957 dy = grads[0] 1958 1959 if self.mode == "per-activation": 1960 axes = (0,) 1961 elif self.mode == "spatial": 1962 axes = (0,) + tuple(range(2, x.ndim)) 1963 scale, bias, est_mean, est_var = (theano.tensor.addbroadcast(t, *axes) 1964 for t in (scale, bias, est_mean, est_var)) 1965 1966 # define helper expressions 1967 est_var_eps = est_var + epsilon 1968 est_std = theano.tensor.sqrt(est_var_eps) 1969 two = theano.tensor.constant(2.) 1970 1971 # define and return gradients 1972 dx = dy * (scale / est_std) 1973 dscale = (dy * (x - est_mean)).sum(axes, keepdims=True) / est_std 1974 dbias = dy.sum(axes, keepdims=True) 1975 dmean = -dy.sum(axes, keepdims=True) * (scale / est_std) 1976 dvar = -(dy * (x - est_mean)).sum(axes, keepdims=True) * (scale / (two * est_var_eps * est_std)) 1977 return [dx, dscale, dbias, dmean, dvar, DisconnectedType()()] 1978 1979 def connection_pattern(self, node): 1980 # Specificy that epsilon is not connected to outputs. 1981 return [[True], [True], [True], [True], [True], [False]] 1982 1983 1984class GpuDnnBatchNormGrad(DnnBase): 1985 __props__ = ('mode',) 1986 1987 check_input = False 1988 params_type = ParamsType(mode=cudnn.cudnnBatchNormMode_t, 1989 handle=handle_type) 1990 1991 def __init__(self, mode='per-activation'): 1992 DnnBase.__init__(self, ['c_code/dnn_batchnorm_base.c', 'c_code/dnn_batchnorm_grad.c'], 1993 'dnn_batchnorm_grad') 1994 1995 assert cudnn.cudnnBatchNormMode_t.has_alias(mode) 1996 self.mode = mode 1997 1998 def make_node(self, x, dy, scale, x_mean, x_invstd, epsilon=1e-4): 1999 ctx_name = infer_context_name(x, dy, scale, x_mean, x_invstd) 2000 x = as_gpuarray_variable(x, ctx_name) 2001 dy = as_gpuarray_variable(dy, ctx_name) 2002 scale = as_gpuarray_variable(scale, ctx_name) 2003 x_mean = as_gpuarray_variable(x_mean, ctx_name) 2004 x_invstd = as_gpuarray_variable(x_invstd, ctx_name) 2005 epsilon = as_scalar(epsilon).astype('float64') 2006 assert x.ndim == dy.ndim == scale.ndim == x_mean.ndim == x_invstd.ndim 2007 assert x.ndim in (4, 5) 2008 return Apply(self, [x, dy, scale, x_mean, x_invstd, epsilon], [x.type(), scale.type(), scale.type()]) 2009 2010 def infer_shape(self, node, shape): 2011 return [shape[0], shape[2], shape[2]] 2012 2013gpudata_type = CDataType('gpudata *', 'gpudata_release') 2014dropoutdesc_type = CUDNNDataType('cudnnDropoutDescriptor_t', 2015 'cudnnDestroyDropoutDescriptor') 2016 2017 2018class GpuDnnDropoutOp(DnnBase): 2019 __props__ = ('inplace',) 2020 2021 def __init__(self, inplace=False): 2022 DnnBase.__init__(self, ["c_code/dnn_dropout_fwd.c"], "dnn_dropout_fwd") 2023 self.inplace = inplace 2024 if self.inplace: 2025 self.destroy_map = {1: [2]} 2026 2027 def make_node(self, inp, descriptor, state): 2028 ctx_name = infer_context_name(inp) 2029 inp = as_gpuarray_variable(inp, ctx_name) 2030 return Apply(self, [inp, descriptor, state], 2031 [inp.type(), state.type(), gpudata_type()]) 2032 2033 def prepare_node(self, node, storage_map, compute_map, impl): 2034 assert self.inplace, "GpuDnnDropoutOp not inplace" 2035 2036 2037class _DropoutDescriptor(DnnBase): 2038 __props__ = ('context_name',) 2039 2040 def __init__(self, context_name): 2041 DnnBase.__init__(self, ["c_code/dnn_dropout_desc.c"], "dnn_dropout_desc") 2042 self.context_name = context_name 2043 2044 def dnn_context(self, node): 2045 return self.context_name 2046 2047 def do_constant_folding(self, node): 2048 return False 2049 2050 def make_node(self, dropout, seed, context_name): 2051 dropout = as_scalar(dropout).astype('float32') 2052 seed = as_scalar(seed).astype('uint64') 2053 2054 assert context_name == self.context_name 2055 # This is a dirty hack to pass the context because params is 2056 # occupied by the cudnn handle 2057 context = gpu_context_type.make_constant(get_context(context_name)) 2058 2059 return Apply(self, [dropout, seed, context], 2060 [dropoutdesc_type(), 2061 GpuArrayType('uint8', (False,), 2062 context_name=context_name)()]) 2063 2064 def c_code_cache_version_apply(self, node): 2065 # disable the cache since we can't pickle contexts 2066 return None 2067 2068 2069def _make_dropout_desc(dropout, seed, context_name): 2070 desc, states = theano.function( 2071 [], 2072 _DropoutDescriptor(context_name)(dropout, seed, context_name), 2073 theano.Mode(optimizer=None), 2074 profile=False)() 2075 return desc, states 2076 2077 2078def dropout(x, dropout=0.0, seed=4242): 2079 desc, states = _make_dropout_desc(dropout, seed, x.type.context_name) 2080 y, odesc = GpuDnnDropoutOp()(x, desc) 2081 return y, desc, odesc, states 2082 2083rnndesc_type = CUDNNDataType('cudnnRNNDescriptor_t', 2084 'cudnnDestroyRNNDescriptor') 2085 2086 2087def as_i32(v): 2088 return as_scalar(v).astype('int32') 2089 2090 2091class _RNNDescriptor(DnnBase): 2092 __props__ = ('context_name',) 2093 2094 def __init__(self, context_name): 2095 if version() < 5005: 2096 raise RuntimeError("cudnn RNN require cudnn v5 final or higher.") 2097 DnnBase.__init__(self, ["c_code/dnn_rnn_desc.c"], "dnn_rnn_desc") 2098 self.context_name = context_name 2099 2100 def dnn_context(self, node): 2101 return self.context_name 2102 2103 def do_constant_folding(self, node): 2104 return False 2105 2106 def make_node(self, hidden_size, num_layers, ddesc, input_mode, 2107 direction_mode, rnn_mode, dtype): 2108 2109 hidden_size = as_i32(hidden_size) 2110 num_layers = as_i32(num_layers) 2111 2112 if version() < 5005: 2113 raise RuntimeError("cudnn RNN require cudnn v5 final or higher.") 2114 2115 if input_mode == 'linear': 2116 input_mode = as_i32(0) 2117 elif input_mode == 'skip': 2118 input_mode = as_i32(1) 2119 else: 2120 raise ValueError("input_mode") 2121 2122 if direction_mode == 'unidirectional': 2123 direction_mode = as_i32(0) 2124 elif direction_mode == 'bidirectional': 2125 direction_mode = as_i32(1) 2126 else: 2127 raise ValueError("direction_mode") 2128 2129 if rnn_mode == 'rnn_relu': 2130 rnn_mode = as_i32(0) 2131 elif rnn_mode == 'rnn_tanh': 2132 rnn_mode = as_i32(1) 2133 elif rnn_mode == 'lstm': 2134 rnn_mode = as_i32(2) 2135 elif rnn_mode == 'gru': 2136 rnn_mode = as_i32(3) 2137 else: 2138 raise ValueError("rnn_mode") 2139 2140 dtype = as_i32(gpuarray.dtype_to_typecode(dtype)) 2141 2142 return Apply(self, [hidden_size, num_layers, 2143 dropoutdesc_type.make_constant(ddesc), 2144 input_mode, direction_mode, rnn_mode, dtype], 2145 [rnndesc_type()]) 2146 2147 2148def _make_rnn_desc(hidden_size, num_layers, ddesc, rnn_mode, 2149 input_mode, direction_mode, dtype, context_name): 2150 desc = theano.function( 2151 [], 2152 _RNNDescriptor(context_name)(hidden_size, num_layers, ddesc, 2153 input_mode, direction_mode, 2154 rnn_mode, dtype), 2155 theano.Mode(optimizer=None), 2156 profile=False)() 2157 return desc 2158 2159 2160class _RNNParamSize(DnnBase): 2161 __props__ = ('context_name',) 2162 2163 def __init__(self, context_name): 2164 DnnBase.__init__(self, ["c_code/dnn_rnn_paramsize.c"], 2165 "dnn_rnn_paramsize") 2166 self.context_name = context_name 2167 2168 def dnn_context(self, node): 2169 return self.context_name 2170 2171 def do_constant_folding(self, node): 2172 return False 2173 2174 def make_node(self, desc, input_size, typecode): 2175 input_size = as_tensor_variable(input_size).astype('uint64') 2176 typecode = as_i32(typecode) 2177 return Apply(self, [rnndesc_type.make_constant(desc), input_size, 2178 typecode], 2179 [get_scalar_type('uint64')()]) 2180 2181 2182def _get_param_size(desc, input_size, dtype, context_name): 2183 typecode = gpuarray.dtype_to_typecode(dtype) 2184 return theano.function( 2185 [], 2186 _RNNParamSize(context_name)(desc, input_size, typecode), 2187 theano.Mode(optimizer=None), 2188 profile=False)() 2189 2190 2191class _RNNSplitParams(DnnBase): 2192 __props__ = ('rnn_mode',) 2193 2194 def __init__(self, rnn_mode): 2195 DnnBase.__init__(self) 2196 self.rnn_mode = rnn_mode 2197 2198 def make_node(self, w, desc, layer, isize, typecode): 2199 w = as_gpuarray_variable(w, infer_context_name(w)) 2200 assert w.ndim == 1 2201 layer = as_scalar(layer).astype('int32') 2202 isize = as_tensor_variable(isize).astype('uint64') 2203 assert isize.ndim == 1 2204 typecode = as_scalar(typecode).astype('int32') 2205 _1d = GpuArrayType(w.type.dtype, [False], 2206 context_name=w.type.context_name) 2207 _2d = GpuArrayType(w.type.dtype, [False, False], 2208 context_name=w.type.context_name) 2209 outputs = [] 2210 if self.rnn_mode == 'rnn_relu' or self.rnn_mode == 'rnn_tanh': 2211 outputs.extend([_2d(), _1d()]) # input 2212 outputs.extend([_2d(), _1d()]) # recurrent 2213 elif self.rnn_mode == 'lstm': 2214 outputs.extend([_2d(), _1d()]) # input input 2215 outputs.extend([_2d(), _1d()]) # input forget 2216 outputs.extend([_2d(), _1d()]) # input newmem 2217 outputs.extend([_2d(), _1d()]) # input output 2218 outputs.extend([_2d(), _1d()]) # recur input 2219 outputs.extend([_2d(), _1d()]) # recur forget 2220 outputs.extend([_2d(), _1d()]) # recur newmem 2221 outputs.extend([_2d(), _1d()]) # recur output 2222 elif self.rnn_mode == 'gru': 2223 outputs.extend([_2d(), _1d()]) # input reset 2224 outputs.extend([_2d(), _1d()]) # input update 2225 outputs.extend([_2d(), _1d()]) # input newmem 2226 outputs.extend([_2d(), _1d()]) # recur reset 2227 outputs.extend([_2d(), _1d()]) # recur update 2228 outputs.extend([_2d(), _1d()]) # recur newmem 2229 2230 return Apply(self, [w, layer, rnndesc_type.make_constant(desc), 2231 isize, typecode], outputs) 2232 2233 def c_code(self, node, name, inputs, outputs, sub): 2234 kw = dict(fail=sub['fail'], w=inputs[0], layer=inputs[1], 2235 desc=inputs[2], isize=inputs[3], typecode=inputs[4], 2236 handle=sub['params']) 2237 code = """ 2238 cudnnTensorDescriptor_t xdesc; 2239 cudnnFilterDescriptor_t wdesc; 2240 cudnnFilterDescriptor_t odesc; 2241 size_t nshp[2]; 2242 void *w; 2243 void *o; 2244 ptrdiff_t off; 2245#if CUDNN_VERSION < 7100 2246 size_t bshp; 2247#endif 2248 cudnnStatus_t err; 2249 cudnnDataType_t dt; 2250 cudnnTensorFormat_t tf; 2251 int nd; 2252 int dims[3]; 2253 int strs[3]; 2254 2255 if (PyArray_DIM(%(isize)s, 0) != 2) { 2256 PyErr_SetString(PyExc_ValueError, "input_size should be of length two"); 2257 %(fail)s; 2258 } 2259 2260 switch (%(typecode)s) { 2261 case GA_FLOAT: 2262 dt = CUDNN_DATA_FLOAT; 2263 break; 2264 case GA_DOUBLE: 2265 dt = CUDNN_DATA_DOUBLE; 2266 break; 2267 case GA_HALF: 2268 dt = CUDNN_DATA_HALF; 2269 break; 2270 default: 2271 PyErr_SetString(PyExc_ValueError, "Unsupported data type"); 2272 %(fail)s; 2273 } 2274 2275 err = cudnnCreateTensorDescriptor(&xdesc); 2276 if (err != CUDNN_STATUS_SUCCESS) { 2277 PyErr_SetString(PyExc_RuntimeError, "Could not create xdesc"); 2278 %(fail)s; 2279 } 2280 2281 dims[0] = *(npy_uint64 *)PyArray_GETPTR1(%(isize)s, 0); 2282 dims[1] = *(npy_uint64 *)PyArray_GETPTR1(%(isize)s, 1); 2283 dims[2] = 1; 2284 strs[0] = dims[2] * dims[1]; 2285 strs[1] = dims[2]; 2286 strs[2] = 1; 2287 2288 err = cudnnSetTensorNdDescriptor(xdesc, dt, 3, dims, strs); 2289 if (err != CUDNN_STATUS_SUCCESS) { 2290 cudnnDestroyTensorDescriptor(xdesc); 2291 PyErr_Format(PyExc_RuntimeError, "Could not set xdesc: %%s", 2292 cudnnGetErrorString(err)); 2293 %(fail)s; 2294 } 2295 2296 if (c_make_filter(%(w)s, &wdesc)) { 2297 cudnnDestroyTensorDescriptor(xdesc); 2298 %(fail)s 2299 } 2300 2301 err = cudnnCreateFilterDescriptor(&odesc); 2302 if (err != CUDNN_STATUS_SUCCESS) { 2303 PyErr_SetString(PyExc_RuntimeError, "could not create odesc"); 2304 cudnnDestroyTensorDescriptor(xdesc); 2305 cudnnDestroyFilterDescriptor(wdesc); 2306 %(fail)s 2307 } 2308 2309 w = PyGpuArray_DEV_DATA(%(w)s); 2310 nshp[0] = PyGpuArray_DIM(%(w)s, 0); 2311 nshp[1] = 1; 2312 """ % kw 2313 2314 def get_params(id, m, b): 2315 kw2 = kw.copy() 2316 kw2['id'] = id 2317 kw2['m'] = m 2318 kw2['b'] = b 2319 return """ 2320 err = cudnnGetRNNLinLayerBiasParams(%(handle)s, %(desc)s, %(layer)s, xdesc, wdesc, w, %(id)s, odesc, &o); 2321 if (err != CUDNN_STATUS_SUCCESS) { 2322 cudnnDestroyTensorDescriptor(xdesc); 2323 cudnnDestroyFilterDescriptor(wdesc); 2324 cudnnDestroyFilterDescriptor(odesc); 2325 PyErr_SetString(PyExc_RuntimeError, "can't fetch bias for id %(id)s"); 2326 %(fail)s 2327 } 2328 off = (intptr_t)o - (intptr_t)w; 2329 assert(off >= 0 && "bias"); 2330 2331 err = cudnnGetFilterNdDescriptor(odesc, 3, &dt, &tf, &nd, dims); 2332 if (err != CUDNN_STATUS_SUCCESS) { 2333 cudnnDestroyTensorDescriptor(xdesc); 2334 cudnnDestroyFilterDescriptor(wdesc); 2335 cudnnDestroyFilterDescriptor(odesc); 2336 PyErr_SetString(PyExc_RuntimeError, "could not get bias shape for id %(id)s"); 2337 %(fail)s; 2338 } 2339 // We assume that the typecode matches 2340#if CUDNN_VERSION < 7100 2341 assert(dims[2] == 1 && "bias"); 2342 assert(dims[1] == 1 && "bias"); 2343 %(b)s = pygpu_view(%(w)s, Py_None); 2344 %(b)s->ga.offset += off; 2345 %(b)s->ga.dimensions[0] = dims[0]; 2346 bshp = dims[0]; 2347#else 2348 assert(dims[0] == 1 && "bias"); 2349 assert(dims[2] == 1 && "bias"); 2350 %(b)s = pygpu_view(%(w)s, Py_None); 2351 %(b)s->ga.offset += off; 2352 %(b)s->ga.dimensions[0] = dims[1]; 2353#endif 2354 GpuArray_fix_flags(&%(b)s->ga); 2355 2356 err = cudnnGetRNNLinLayerMatrixParams(%(handle)s, %(desc)s, %(layer)s, xdesc, wdesc, w, %(id)s, odesc, &o); 2357 if (err != CUDNN_STATUS_SUCCESS) { 2358 cudnnDestroyTensorDescriptor(xdesc); 2359 cudnnDestroyFilterDescriptor(wdesc); 2360 cudnnDestroyFilterDescriptor(odesc); 2361 PyErr_SetString(PyExc_RuntimeError, "can't fetch matrix for id %(id)s"); 2362 %(fail)s 2363 } 2364 off = (intptr_t)o - (intptr_t)w; 2365 assert(off >= 0 && "matrix"); 2366 2367 // This is 3d because of cudnn limitations. 2368 err = cudnnGetFilterNdDescriptor(odesc, 3, &dt, &tf, &nd, dims); 2369 if (err != CUDNN_STATUS_SUCCESS) { 2370 cudnnDestroyTensorDescriptor(xdesc); 2371 cudnnDestroyFilterDescriptor(wdesc); 2372 cudnnDestroyFilterDescriptor(odesc); 2373 PyErr_SetString(PyExc_RuntimeError, "could not get matrix shape for id %(id)s"); 2374 %(fail)s; 2375 } 2376 2377#if CUDNN_VERSION < 7100 2378 assert(dims[1] == 1 && "matrix"); 2379 assert(dims[2] == 1 && "matrix"); 2380 // We assume that the typecode matches 2381 %(m)s = pygpu_reshape(%(w)s, 2, nshp, GA_F_ORDER, 1, -1); 2382 %(m)s->ga.offset += off; 2383 assert(dims[0] %% bshp == 0); 2384 %(m)s->ga.dimensions[0] = dims[0] / bshp; 2385 %(m)s->ga.dimensions[1] = bshp; 2386#else 2387 assert(dims[0] == 1 && "matrix"); 2388 // We assume that the typecode matches 2389 %(m)s = pygpu_reshape(%(w)s, 2, nshp, GA_F_ORDER, 1, -1); 2390 %(m)s->ga.offset += off; 2391 %(m)s->ga.dimensions[1] = dims[1]; 2392 %(m)s->ga.dimensions[0] = dims[2]; 2393#endif 2394 %(m)s->ga.strides[1] = %(m)s->ga.dimensions[0] * gpuarray_get_elsize(%(m)s->ga.typecode); 2395 GpuArray_fix_flags(&%(m)s->ga); 2396 """ % kw2 2397 2398 for i in range(len(outputs) // 2): 2399 code += get_params(i, outputs[2 * i], outputs[(2 * i) + 1]) 2400 2401 code += """ 2402 cudnnDestroyTensorDescriptor(xdesc); 2403 cudnnDestroyFilterDescriptor(wdesc); 2404 cudnnDestroyFilterDescriptor(odesc); 2405 """ 2406 return code 2407 2408 def c_code_cache_version(self): 2409 return (5, version()) 2410 2411 2412def _split_rnn_params(w, desc, layer, input_size, dtype, rnn_mode): 2413 typecode = gpuarray.dtype_to_typecode(dtype) 2414 outs = _RNNSplitParams(rnn_mode)(w, desc, layer, input_size, typecode) 2415 outs = [theano.Out(o, borrow=True) for o in outs] 2416 return theano.function( 2417 [], outs, 2418 theano.Mode(optimizer=None), 2419 profile=False)() 2420 2421 2422class GpuDnnRNNOp(DnnBase): 2423 __props__ = () 2424 _cop_num_inputs = 6 2425 _cop_num_outputs = 4 2426 2427 def __init__(self, rnn_mode, direction_mode): 2428 DnnBase.__init__(self, ["c_code/dnn_rnn_fwd.c"], 'dnn_rnn_fwd') 2429 self.rnn_mode = rnn_mode 2430 if direction_mode == 'bidirectional': 2431 self.num_dirs = 2 2432 elif direction_mode == 'unidirectional': 2433 self.num_dirs = 1 2434 else: 2435 raise ValueError('direction_mode is invalid (got %s)' % (direction_mode,)) 2436 2437 def dnn_context(self, node): 2438 return node.outputs[1].type.context_name 2439 2440 def make_node(self, desc, w, x, hx, cx=None): 2441 if cx is None: 2442 context_name = infer_context_name(w, x, hx) 2443 else: 2444 context_name = infer_context_name(w, x, hx, cx) 2445 2446 w = as_gpuarray_variable(w, context_name) 2447 x = as_gpuarray_variable(x, context_name) 2448 hx = as_gpuarray_variable(hx, context_name) 2449 inputs = [desc, as_i32(self.num_dirs), w, x, hx] 2450 assert w.ndim == 1 2451 assert x.ndim == 3 # seqLength, minibatch, inputSize 2452 assert hx.ndim == 3 # numLayers, minibatch, hiddenSize * bidi 2453 2454 if self.rnn_mode == 'lstm': 2455 cx = as_gpuarray_variable(cx, context_name) 2456 assert cx.ndim == 3 # numLayers, minibatch, hiddenSize * bidi 2457 inputs.append(cx) 2458 2459 _3d = GpuArrayType(dtype=x.dtype, broadcastable=(False, False, False), 2460 context_name=context_name) 2461 reserve = gpudata_type() 2462 y = _3d() # seqLength, minibatch, hiddenSize * bidi 2463 hy = _3d() # numLayers, miniBatch, hiddenSize * bidi 2464 outputs = [reserve, y, hy] 2465 2466 if self.rnn_mode == 'lstm': 2467 cy = _3d() # numLayers, miniBatch, hiddenSize * bidi 2468 outputs.append(cy) 2469 2470 return Apply(self, inputs, outputs) 2471 2472 def L_op(self, inputs, outputs, output_grads): 2473 desc, numDirs, w, x, hx = inputs[:5] 2474 cx = inputs[5] if len(inputs) == 6 else None 2475 reserve, y, hy = outputs[:3] 2476 _, dy, dhy = output_grads[:3] 2477 dcy = output_grads[3] if len(output_grads) == 4 else None 2478 # Since the op return two outputs which contain essentially 2479 # the same information, the user will most likely only use one 2480 # of them. This leads to the situation that the other is 2481 # considered "disconnected" by theano in the gradient. 2482 # However we know that this isn't really the case so we fix it 2483 # here. 2484 2485 # If all the ys are disconnected, then you get a boring 2486 # gradient instead of an error. But in that case you 2487 # shouldn't call this method anyway. 2488 if isinstance(dy.type, DisconnectedType): 2489 dy = as_gpuarray_variable(y.zeros_like(), 2490 context_name=y.type.context_name) 2491 if isinstance(dhy.type, DisconnectedType): 2492 dhy = None 2493 if dcy and isinstance(dcy.type, DisconnectedType): 2494 dcy = None 2495 dinputs = GpuDnnRNNGradInputs(rnn_mode=self.rnn_mode, 2496 grad_h=(dhy is not None), 2497 grad_c=(dcy is not None))( 2498 desc, x, y, dy, dhy, dcy, w, hx, cx, reserve, return_list=True) 2499 reserve2, dx, dhx = dinputs[:3] 2500 dw = GpuDnnRNNGradWeights()( 2501 desc, x, hx, y, reserve2, w) 2502 res = [DisconnectedType()(), DisconnectedType()(), dw, dx, dhx] 2503 if cx is not None: 2504 res.append(dinputs[3]) # dcx 2505 return res 2506 2507 def connection_pattern(self, node): 2508 deconn = [[False] * len(node.outputs)] * 2 2509 conn = [[True] * len(node.outputs)] * (len(node.inputs) - 2) 2510 return deconn + conn 2511 2512 2513class GpuDnnRNNGradInputs(DnnBase): 2514 __props__ = ('rnn_mode', 'grad_c', 'grad_h') 2515 _cop_num_inputs = 10 2516 _cop_num_outputs = 4 2517 2518 def __init__(self, rnn_mode, grad_h, grad_c): 2519 DnnBase.__init__(self, ['c_code/dnn_rnn_gi.c'], 'dnn_rnn_gi') 2520 self.rnn_mode = rnn_mode 2521 self.grad_h = grad_h 2522 self.grad_c = grad_c 2523 if self.grad_c: 2524 assert self.rnn_mode == 'lstm' 2525 2526 def dnn_context(self, node): 2527 return node.outputs[1].type.context_name 2528 2529 def make_node(self, desc, x, y, dy, dhy, dcy, w, hx, cx, reserve): 2530 # We trust the callers here 2531 xshp = as_scalar(x.shape[2]).astype('uint64') 2532 inputs = [desc, xshp, y, dy, w, hx, reserve] 2533 outputs = [reserve.type(), x.type(), hx.type()] 2534 if self.rnn_mode == 'lstm': 2535 inputs.append(cx) 2536 outputs.append(cx.type()) 2537 if self.grad_h: 2538 inputs.append(dhy) 2539 if self.grad_c: 2540 inputs.append(dcy) 2541 2542 return Apply(self, inputs, outputs) 2543 2544 # We have special requirements so this is hooking into COp 2545 def format_c_function_args(self, inp, out): 2546 rinp = inp[:7] 2547 others = inp[7:] 2548 if self.rnn_mode == 'lstm': 2549 rinp.append(others.pop(0)) 2550 else: 2551 rinp.append('NULL') 2552 if self.grad_h: 2553 rinp.append(others.pop(0)) 2554 else: 2555 rinp.append('NULL') 2556 if self.grad_c: 2557 rinp.append(others.pop(0)) 2558 else: 2559 rinp.append('NULL') 2560 assert len(others) == 0 2561 return COp.format_c_function_args(self, rinp, out) 2562 2563 2564class GpuDnnRNNGradWeights(DnnBase): 2565 __props__ = () 2566 2567 def __init__(self): 2568 DnnBase.__init__(self, ['c_code/dnn_rnn_gw.c'], 'dnn_rnn_gw') 2569 2570 def make_node(self, desc, x, hx, y, reserve, w): 2571 # We trust the callers here 2572 wsize = as_scalar(w.shape[0]).astype('uint64') 2573 inputs = [desc, wsize, x, hx, y, reserve] 2574 outputs = [w.type()] 2575 return Apply(self, inputs, outputs) 2576 2577 2578class RNNBlock(object): 2579 """ 2580 An object that allow us to use CuDNN RNN implementation. 2581 TODO: make an example how to use. You can check Theano tests 2582 test_dnn_rnn_gru() and test_dnn_rnn_lstm() in the file 2583 theano/gpuarray/tests/test_dnn.py for now. 2584 2585 2586 Parameters 2587 ---------- 2588 dtype : data type of computation 2589 hidden_size : int 2590 hidden layer dimension. 2591 num_layers : int 2592 number of the recurrent layer you want to set. 2593 rnn_mode : {'rnn_relu', 'rnn_tanh', 'lstm', 'gru'} 2594 rnn_relu: A single-gate recurrent neural network with a ReLU activation function. 2595 2596 .. math:: 2597 2598 h_t=ReLU(W_ix_t+U_ih_{t-1}+b_{wi}+b_{Ri}) 2599 rnn_tanh: A single-gate recurrent neural network with a tanh activation function. 2600 2601 .. math:: 2602 2603 h_t=tanh(W_ix_t+U_ih_{t-1}+b_{wi}+b_{Ri}) 2604 2605 lstm: A four-gate Long Short-Term Memory network with no peephole connections. 2606 gru: A three-gate network consisting of Gated Recurrent Units. 2607 input_mode : {'linear', 'skip'} 2608 linear: input will be multiplied by a biased matrix 2609 skip: No operation is performed on the input. The size must match the hidden size. 2610 direction_mode : {'unidirectional', 'bidirectional'} 2611 unidirectional: The network operates recurrently from the first input to the last. 2612 bidirectional: The network operates from first to last then from last to first and concatenates the results at each layer. 2613 2614 """ 2615 2616 def __init__(self, dtype, hidden_size, num_layers, rnn_mode, 2617 input_mode='linear', direction_mode='unidirectional', 2618 context_name=None): 2619 # This is not supported for any value other than 0, so don't change it 2620 ddesc, states = _make_dropout_desc(0, 4242, context_name) 2621 self.ddesc = ddesc 2622 self.dstates = states 2623 self.desc = _make_rnn_desc(hidden_size, num_layers, 2624 ddesc, rnn_mode, input_mode, 2625 direction_mode, dtype, context_name) 2626 self.rnn_mode = rnn_mode 2627 self.direction_mode = direction_mode 2628 self.context_name = context_name 2629 self.dtype = dtype 2630 2631 def get_param_size(self, input_size): 2632 """ 2633 Get the size of the shared variable for the parameters of the RNN. 2634 2635 This will return a size (in items) necessary to store all the 2636 parameters for the RNN. You should allocate a variable of 2637 that size to store those parameters. The order and layout of 2638 the parameters is opaque. 2639 2640 Parameters 2641 ---------- 2642 input_size: (int, int) 2643 Size of the input blocks 2644 2645 """ 2646 bytesize = _get_param_size(self.desc, input_size, self.dtype, 2647 self.context_name) 2648 bytesize = int(bytesize) 2649 assert bytesize % np.dtype(self.dtype).itemsize == 0 2650 return bytesize // np.dtype(self.dtype).itemsize 2651 2652 def split_params(self, w, layer, input_size): 2653 """ 2654 Split the opaque parameter block into components. 2655 2656 Parameters 2657 ---------- 2658 w: GpuArraySharedVariable 2659 opaque parameter block 2660 layer: int 2661 ID of the layer 2662 input_size: (int, int) 2663 Size of the input blocks 2664 2665 """ 2666 if not isinstance(w, GpuArraySharedVariable): 2667 raise TypeError("split_params only works on gpuarray shared variables") 2668 return _split_rnn_params(w, self.desc, layer, input_size, self.dtype, self.rnn_mode) 2669 2670 def apply(self, w, x, hx, cx=None): 2671 """ 2672 Apply the RNN to some data 2673 2674 Parameters 2675 ---------- 2676 w: 2677 opaque parameter block 2678 x: 2679 input 2680 hx: 2681 initial hidden state 2682 cx: 2683 initial cell state (for LSTM) 2684 """ 2685 # Don't return the reserve as an output 2686 return GpuDnnRNNOp(self.rnn_mode, self.direction_mode)( 2687 rnndesc_type.make_constant(self.desc), 2688 w, x, hx, cx, return_list=True)[1:] 2689 2690 2691def dnn_batch_normalization_train(inputs, gamma, beta, mode='per-activation', 2692 epsilon=1e-4, running_average_factor=0.1, 2693 running_mean=None, running_var=None): 2694 """ 2695 Performs batch normalization of the given inputs, using the mean and 2696 variance of the inputs. 2697 2698 Parameters 2699 ---------- 2700 mode : {'per-activation', 'spatial'} 2701 Whether to normalize per activation or share normalization factors 2702 across spatial dimensions (i.e., all dimensions past the second). 2703 gamma : tensor 2704 Learnable scale factors. Must match the dimensionality of `inputs`, 2705 but have sizes of `1` for all axes normalized over (i.e., in the first 2706 dimension for ``mode='per-activation'`, and additionally in all 2707 dimensions past the second for ``mode='spatial'``). 2708 beta : tensor 2709 Learnable biases. Must match the tensor layout of `gamma`. 2710 epsilon : float 2711 Epsilon value used in the batch normalization formula. Minimum allowed 2712 value is 1e-5 (imposed by cuDNN). 2713 running_average_factor : float 2714 Factor for updating the values or `running_mean` and `running_var`. 2715 If the factor is close to one, the running averages will update quickly, 2716 if the factor is close to zero it will update slowly. 2717 running_mean : tensor or None 2718 Previous value of the running mean. If this is given, the new value 2719 ``running_mean * (1 - r_a_factor) + batch mean * r_a_factor`` 2720 will be returned as one of the outputs of this function. 2721 `running_mean` and `running_var` should either both be given or 2722 both be None. 2723 running_var : tensor or None 2724 Previous value of the running variance. If this is given, the new value 2725 ``running_var * (1 - r_a_factor) + (m / (m - 1)) * batch var * r_a_factor`` 2726 will be returned as one of the outputs of this function, 2727 where `m` is the product of lengths of the averaged-over dimensions. 2728 `running_mean` and `running_var` should either both be given or 2729 both be None. 2730 2731 Returns 2732 ------- 2733 out : tensor 2734 Batch-normalized inputs. 2735 mean : tensor 2736 Means of `inputs` across the normalization axes. 2737 invstd : tensor 2738 Inverse standard deviations of `inputs` across the normalization axes. 2739 new_running_mean : tensor 2740 New value of the running mean (only if both `running_mean` and 2741 `running_var` were given). 2742 new_running_var : tensor 2743 New value of the running variance (only if both `running_var` and 2744 `running_mean` were given). 2745 2746 Notes 2747 ----- 2748 Requires cuDNN 5 and Theano 0.9dev2 or more recent. 2749 2750 For 4d tensors, returned values are equivalent to: 2751 2752 .. code-block:: python 2753 2754 axes = 0 if mode == 'per-activation' else (0, 2, 3) 2755 mean = inputs.mean(axes, keepdims=True) 2756 var = inputs.var(axes, keepdims=True) 2757 invstd = T.inv(T.sqrt(var + epsilon)) 2758 out = (inputs - mean) * gamma * invstd + beta 2759 2760 m = T.cast(T.prod(inputs.shape) / T.prod(mean.shape), 'float32') 2761 running_mean = running_mean * (1 - running_average_factor) + \\ 2762 mean * running_average_factor 2763 running_var = running_var * (1 - running_average_factor) + \\ 2764 (m / (m - 1)) * var * running_average_factor 2765 2766 For 5d tensors, the axes are (0, 2, 3, 4). 2767 """ 2768 ndim = inputs.ndim 2769 if gamma.ndim != ndim or beta.ndim != ndim: 2770 raise ValueError("gamma and beta must be of the same dimensionality " 2771 "as inputs; got %d and %d instead of %d" % 2772 (gamma.ndim, beta.ndim, ndim)) 2773 if (running_mean is None) != (running_var is None): 2774 raise ValueError("running_mean and running_var must either both be " 2775 "given or both be None") 2776 if running_mean is not None and running_mean.ndim != ndim: 2777 raise ValueError("running_mean must be of the same dimensionality " 2778 "as inputs; got %d instead of %d" % 2779 (running_mean.ndim, ndim)) 2780 if running_var is not None and running_var.ndim != ndim: 2781 raise ValueError("running_var must be of the same dimensionality " 2782 "as inputs; got %d instead of %d" % 2783 (running_var.ndim, ndim)) 2784 if epsilon < 1e-5: 2785 raise ValueError("epsilon must be at least 1e-5, got %f" % epsilon) 2786 2787 running_averages = (running_mean is not None and running_var is not None) 2788 2789 if ndim < 4: 2790 inputs = theano.tensor.shape_padright(inputs, 4 - ndim) 2791 gamma = theano.tensor.shape_padright(gamma, 4 - ndim) 2792 beta = theano.tensor.shape_padright(beta, 4 - ndim) 2793 if running_averages: 2794 running_mean = theano.tensor.shape_padright(running_mean, 4 - ndim) 2795 running_var = theano.tensor.shape_padright(running_var, 4 - ndim) 2796 elif ndim > 5: 2797 inputs_shape = inputs.shape 2798 params_shape = gamma.shape 2799 inputs = theano.tensor.flatten(inputs, 5) 2800 gamma = theano.tensor.flatten(gamma, 5) 2801 beta = theano.tensor.flatten(beta, 5) 2802 if running_averages: 2803 running_mean = theano.tensor.flatten(running_mean, 5) 2804 running_var = theano.tensor.flatten(running_var, 5) 2805 2806 batchnorm_op = GpuDnnBatchNorm(mode=mode, running_averages=running_averages) 2807 if running_averages: 2808 out, mean, invstd, new_running_mean, new_running_var = batchnorm_op( 2809 gpu_contiguous(inputs), gpu_contiguous(gamma), 2810 gpu_contiguous(beta), epsilon=epsilon, 2811 running_average_factor=running_average_factor, 2812 running_mean=gpu_contiguous(running_mean), 2813 running_var=gpu_contiguous(running_var)) 2814 if new_running_mean.broadcastable != running_mean.broadcastable: 2815 new_running_mean = tensor.patternbroadcast(new_running_mean, running_mean.broadcastable) 2816 if new_running_var.broadcastable != running_var.broadcastable: 2817 new_running_var = tensor.patternbroadcast(new_running_var, running_var.broadcastable) 2818 result = (out, mean, invstd, new_running_mean, new_running_var) 2819 else: 2820 result = batchnorm_op(gpu_contiguous(inputs), gpu_contiguous(gamma), 2821 gpu_contiguous(beta), epsilon=epsilon) 2822 if ndim < 4: 2823 result = tuple(theano.tensor.flatten(r, ndim) for r in result) 2824 elif ndim > 5: 2825 result = (theano.tensor.reshape(result[0], inputs_shape),) + tuple( 2826 theano.tensor.reshape(r, params_shape) for r in result[1:]) 2827 return result 2828 2829 2830def dnn_batch_normalization_test(inputs, gamma, beta, mean, var, 2831 mode='per-activation', epsilon=1e-4): 2832 """ 2833 Performs batch normalization of the given inputs, using the given mean and 2834 variance. 2835 2836 Parameters 2837 ---------- 2838 mode : {'per-activation', 'spatial'} 2839 Whether to normalize per activation or share normalization factors 2840 across spatial dimensions (i.e., all dimensions past the second). 2841 gamma : tensor 2842 Scale factors. Must match the dimensionality of `inputs`, but have 2843 sizes of `1` for all axes normalized over (i.e., in the first dimension 2844 for ``mode='per-activation'`, and additionally in all dimensions past 2845 the second for ``mode='spatial'``). 2846 beta : tensor 2847 Biases. Must match the tensor layout of `gamma`. 2848 mean : tensor 2849 Means. Usually these are running averages computed during training. 2850 Must match the tensor layout of `gamma`. 2851 var : tensor 2852 Variances. Usually these are running averages computed during training. 2853 Must match the tensor layout of `gamma`. 2854 epsilon : float 2855 Epsilon value used in the batch normalization formula. Minimum allowed 2856 value is 1e-5 (imposed by cuDNN). 2857 2858 Returns 2859 ------- 2860 out : tensor 2861 Batch-normalized inputs. 2862 2863 Notes 2864 ----- 2865 Requires cuDNN 5 and Theano 0.9dev2 or more recent. 2866 2867 For 4d tensors, the returned value is equivalent to: 2868 2869 .. code-block:: python 2870 2871 axes = (0,) if mode == 'per-activation' else (0, 2, 3) 2872 gamma, beta, mean, var = (T.addbroadcast(t, *axes) 2873 for t in (gamma, beta, mean, var)) 2874 out = (inputs - mean) * gamma / T.sqrt(var + epsilon) + beta 2875 2876 For 5d tensors, the axes would be (0, 2, 3, 4). 2877 """ 2878 ndim = inputs.ndim 2879 if gamma.ndim != ndim or beta.ndim != ndim: 2880 raise ValueError("gamma and beta must be of the same dimensionality " 2881 "as inputs; got %d and %d instead of %d" % 2882 (gamma.ndim, beta.ndim, ndim)) 2883 if mean.ndim != ndim or var.ndim != ndim: 2884 raise ValueError("mean and var must be of the same dimensionality " 2885 "as inputs; got %d and %d instead of %d" % 2886 (mean.ndim, var.ndim, ndim)) 2887 if epsilon < 1e-5: 2888 raise ValueError("epsilon must be at least 1e-5, got %f" % epsilon) 2889 2890 if ndim < 4: 2891 inputs = theano.tensor.shape_padright(inputs, 4 - ndim) 2892 gamma = theano.tensor.shape_padright(gamma, 4 - ndim) 2893 beta = theano.tensor.shape_padright(beta, 4 - ndim) 2894 mean = theano.tensor.shape_padright(mean, 4 - ndim) 2895 var = theano.tensor.shape_padright(var, 4 - ndim) 2896 elif ndim > 5: 2897 inputs_shape = inputs.shape 2898 inputs = theano.tensor.flatten(inputs, 5) 2899 gamma = theano.tensor.flatten(gamma, 5) 2900 beta = theano.tensor.flatten(beta, 5) 2901 mean = theano.tensor.flatten(mean, 5) 2902 var = theano.tensor.flatten(var, 5) 2903 batchnorm_op = GpuDnnBatchNormInference(mode=mode) 2904 result = batchnorm_op(gpu_contiguous(inputs), gpu_contiguous(gamma), 2905 gpu_contiguous(beta), gpu_contiguous(mean), 2906 gpu_contiguous(var), epsilon=epsilon) 2907 if ndim < 4: 2908 result = theano.tensor.flatten(result, ndim) 2909 elif ndim > 5: 2910 result = theano.tensor.reshape(result, inputs_shape) 2911 return result 2912 2913 2914class GpuDnnTransformerGrid(DnnBase): 2915 """ 2916 Grid generator Op for cuDNN Spatial Transformer. 2917 """ 2918 __props__ = () 2919 _cop_num_inputs = 2 2920 _cop_num_outputs = 1 2921 _f16_ok = True 2922 check_input = False 2923 2924 def __init__(self): 2925 DnnBase.__init__(self, ["c_code/dnn_sptf_grid.c"], "APPLY_SPECIFIC(dnn_sptf_grid)") 2926 2927 def make_node(self, theta, out_dims): 2928 """ 2929 Create a grid generator node for a cuDNN Spatial Transformer 2930 2931 Parameters 2932 ---------- 2933 theta : tensor 2934 Affine transformation tensor containing one affine transformation 2935 matrix per image. ``theta`` is usually generated by the localization 2936 network. 2937 2938 out_dims : tuple 2939 Dimensions of the transformed inputs, containing four elements, and is given 2940 by (N, C, H, W), where N is the number of inputs, C the number of channels, 2941 H and W are the height and width of each input. 2942 """ 2943 context_name = infer_context_name(theta) 2944 2945 theta = gpu_contiguous(as_gpuarray_variable(theta, context_name)) 2946 assert theta.dtype in ('float16', 'float32', 'float64') 2947 assert theta.ndim == 3 2948 2949 out_dims = cpu_contiguous(as_tensor_variable(out_dims)) 2950 assert out_dims.dtype in theano.tensor.basic.integer_dtypes 2951 assert out_dims.ndim == 1 2952 # Ensure 64-bit ints are passed to the C code 2953 out_dims = theano.tensor.basic.cast(out_dims, 'int64') 2954 grid = GpuArrayType(dtype=theta.dtype, 2955 broadcastable=(theta.type.ndim + 1) * (False,), 2956 context_name=context_name)() 2957 2958 inputs = [theta, out_dims] 2959 outputs = [grid] 2960 return Apply(self, inputs, outputs) 2961 2962 def grad(self, inputs, grads): 2963 theta, out_dims = inputs 2964 dgrid = grads[0] 2965 2966 dtheta = GpuDnnTransformerGradT()(dgrid) 2967 return [dtheta, grad_not_implemented(self, 1, out_dims)] 2968 2969 2970class GpuDnnTransformerSampler(DnnBase): 2971 """ 2972 Grid sampler Op for cuDNN Spatial Transformer. 2973 """ 2974 __props__ = () 2975 _cop_num_inputs = 2 2976 _cop_num_outputs = 1 2977 _f16_ok = True 2978 check_input = False 2979 2980 def __init__(self): 2981 DnnBase.__init__(self, ["c_code/dnn_sptf_sampler.c"], "APPLY_SPECIFIC(dnn_sptf_sampler)") 2982 2983 def make_node(self, img, grid): 2984 """ 2985 Create a grid sampler node for a cuDNN Spatial Transformer 2986 2987 Parameters 2988 ---------- 2989 img : tensor 2990 Images from which the pixels will be sampled. The implementation 2991 assumes the tensor is in NCHW format, where N is the number of images, 2992 C is the number of color channels, H is the height of the inputs, and 2993 W is width of the inputs. 2994 2995 grid : GpuDnnTransformerGrid 2996 Grid that contains the coordinates of the pixels to be sampled from 2997 the inputs images. 2998 """ 2999 context_name = infer_context_name(img, grid) 3000 3001 img = gpu_contiguous(as_gpuarray_variable(img, context_name)) 3002 if img.type.ndim != 4: 3003 raise TypeError('img must be a 4D tensor') 3004 elif img.dtype not in ('float16', 'float32', 'float64'): 3005 raise TypeError('img type must be floating-point') 3006 3007 grid = gpu_contiguous(as_gpuarray_variable(grid, context_name)) 3008 if grid.type.ndim != 4: 3009 raise TypeError('grid must be a 4D tensor') 3010 elif grid.dtype not in ('float16', 'float32', 'float64'): 3011 raise TypeError('grid type must be floating-point') 3012 3013 out = GpuArrayType(dtype=img.dtype, 3014 broadcastable=img.type.ndim * (False,), 3015 context_name=context_name)() 3016 3017 inputs = [img, grid] 3018 outputs = [out] 3019 return Apply(self, inputs, outputs) 3020 3021 def grad(self, inputs, grads): 3022 img, grid = inputs 3023 dy = grads[0] 3024 3025 dimg, dgrid = GpuDnnTransformerGradI()(img, grid, dy) 3026 return [dimg, dgrid] 3027 3028 3029class GpuDnnTransformerGradI(DnnBase): 3030 """ 3031 Gradient of inputs Op for cuDNN Spatial Transformer. 3032 """ 3033 __props__ = () 3034 _cop_num_inputs = 3 3035 _cop_num_outputs = 2 3036 _f16_ok = True 3037 check_input = False 3038 3039 def __init__(self): 3040 DnnBase.__init__(self, ["c_code/dnn_sptf_gi.c"], "APPLY_SPECIFIC(dnn_sptf_gi)") 3041 3042 def make_node(self, img, grid, dy): 3043 context_name = infer_context_name(img, grid, dy) 3044 3045 img = as_gpuarray_variable(gpu_contiguous(img), context_name) 3046 if img.ndim != 4: 3047 raise TypeError('img must have 4 dimensions.') 3048 3049 grid = as_gpuarray_variable(gpu_contiguous(grid), context_name) 3050 if img.ndim != grid.ndim: 3051 raise TypeError('grid should have the same number of dimensions as img') 3052 3053 dy = as_gpuarray_variable(dy, context_name) 3054 if dy.ndim != 4: 3055 raise TypeError('dy must have 4 dimensions.') 3056 3057 dimg = img.type() 3058 dgrid = grid.type() 3059 3060 inputs = [img, grid, dy] 3061 outputs = [dimg, dgrid] 3062 3063 return Apply(self, inputs, outputs) 3064 3065 3066class GpuDnnTransformerGradT(DnnBase): 3067 """ 3068 Gradient of affine transformations Op for cuDNN Spatial Transformer. 3069 """ 3070 __props__ = () 3071 _cop_num_inputs = 1 3072 _cop_num_outputs = 1 3073 _f16_ok = True 3074 check_input = False 3075 3076 def __init__(self): 3077 DnnBase.__init__(self, ["c_code/dnn_sptf_gt.c"], "APPLY_SPECIFIC(dnn_sptf_gt)") 3078 3079 def make_node(self, dgrid): 3080 context_name = infer_context_name(dgrid) 3081 3082 dgrid = as_gpuarray_variable(dgrid, context_name) 3083 assert dgrid.dtype in ('float16', 'float32', 'float64') 3084 assert dgrid.ndim == 4 3085 3086 dtheta = GpuArrayType(dtype=dgrid.dtype, 3087 broadcastable=(dgrid.type.ndim - 1) * (False,), 3088 context_name=context_name)() 3089 inputs = [dgrid] 3090 outputs = [dtheta] 3091 3092 return Apply(self, inputs, outputs) 3093 3094 3095def dnn_spatialtf(img, theta, scale_width=1, scale_height=1): 3096 """ 3097 GPU spatial transformer using cuDNN from NVIDIA. 3098 3099 Parameters 3100 ---------- 3101 img : tensor 3102 Images to which the transformations will be applied. The implementation 3103 assumes the tensor is in NCHW format, where N is the number of images, 3104 C is the number of color channels, H is the height of the inputs, and 3105 W is width of the inputs. 3106 theta : tensor 3107 Affine transformation tensor containing one affine transformation 3108 matrix per image. ``theta`` is usually generated by the localization 3109 network. 3110 scale_height: float 3111 A float specifying the scaling factor for the height of the output 3112 image. A value of 1 will keep the original height of the input. Values 3113 larger than 1 will upsample the input. Values below 1 will downsample 3114 the input. 3115 scale_width: float 3116 A float specifying the scaling factor for the width of the output 3117 image. A value of 1 will keep the original width of the input. Values 3118 larger than 1 will upsample the input. Values below 1 will downsample 3119 the input. 3120 3121 Returns 3122 ------- 3123 out : tensor 3124 Transformed images with width and height properly scaled. 3125 3126 Notes 3127 ----- 3128 Currently, cuDNN only supports 2D transformations with 2x3 affine 3129 transformation matrices. 3130 3131 Bilinear interpolation is the only grid sampler method available. 3132 """ 3133 out_dims = (img.shape[0], img.shape[1], 3134 theano.tensor.ceil(img.shape[2] * scale_height), 3135 theano.tensor.ceil(img.shape[3] * scale_width)) 3136 out_dims = tuple([as_scalar(v).astype('int64') for v in out_dims]) 3137 # Setup spatial transformer 3138 grid = GpuDnnTransformerGrid()(theta, out_dims) 3139 sampler = GpuDnnTransformerSampler()(img, grid) 3140 return sampler 3141 3142 3143def local_abstractconv_cudnn_graph(op, context_name, inputs, outputs): 3144 if (not isinstance(op, (AbstractConv2d, 3145 AbstractConv2d_gradWeights, 3146 AbstractConv2d_gradInputs))): 3147 return 3148 3149 if version(raises=False) < 6000 and op.filter_dilation != (1, 1): 3150 return None 3151 3152 if op.unshared: 3153 return None 3154 3155 if isinstance(op.border_mode, tuple) and any(isinstance(p, tuple) for p in op.border_mode): 3156 # Asymmetric padding not yet supported 3157 return None 3158 3159 inp1 = inputs[0] 3160 inp2 = inputs[1] 3161 3162 if not dnn_available(inp1.type.context_name): 3163 return 3164 3165 if op.filter_flip: 3166 conv_mode = 'conv' 3167 else: 3168 conv_mode = 'cross' 3169 3170 if isinstance(op, AbstractConv2d): 3171 rval = dnn_conv(inp1, inp2, 3172 border_mode=op.border_mode, 3173 subsample=op.subsample, 3174 dilation=op.filter_dilation, 3175 direction_hint='forward!', 3176 conv_mode=conv_mode, 3177 num_groups=op.num_groups) 3178 elif isinstance(op, AbstractConv2d_gradWeights): 3179 shape = (inp2.shape[1], inp1.shape[1] // op.num_groups, 3180 inputs[2][0], inputs[2][1]) 3181 rval = dnn_gradweight(inp1, inp2, shape, 3182 border_mode=op.border_mode, 3183 subsample=op.subsample, 3184 dilation=op.filter_dilation, 3185 conv_mode=conv_mode, 3186 num_groups=op.num_groups) 3187 elif isinstance(op, AbstractConv2d_gradInputs): 3188 shape = (inp2.shape[0], inp1.shape[1] * op.num_groups, 3189 inputs[2][0], inputs[2][1]) 3190 rval = dnn_gradinput(inp1, inp2, shape, 3191 border_mode=op.border_mode, 3192 subsample=op.subsample, 3193 dilation=op.filter_dilation, 3194 conv_mode=conv_mode, 3195 num_groups=op.num_groups) 3196 return [rval] 3197 3198 3199def local_abstractconv3d_cudnn_graph(op, context_name, inputs, outputs): 3200 if (not isinstance(op, (AbstractConv3d, 3201 AbstractConv3d_gradWeights, 3202 AbstractConv3d_gradInputs))): 3203 return 3204 3205 if version(raises=False) < 6000 and op.filter_dilation != (1, 1, 1): 3206 return None 3207 3208 inp1 = inputs[0] 3209 inp2 = inputs[1] 3210 3211 if not dnn_available(inp1.type.context_name): 3212 return 3213 3214 if op.filter_flip: 3215 conv_mode = 'conv' 3216 else: 3217 conv_mode = 'cross' 3218 3219 if isinstance(op, AbstractConv3d): 3220 rval = dnn_conv3d(inp1, inp2, 3221 border_mode=op.border_mode, 3222 subsample=op.subsample, 3223 dilation=op.filter_dilation, 3224 direction_hint='forward!', 3225 conv_mode=conv_mode, 3226 num_groups=op.num_groups) 3227 elif isinstance(op, AbstractConv3d_gradWeights): 3228 shape = (inp2.shape[1], inp1.shape[1] // op.num_groups, 3229 inputs[2][0], inputs[2][1], inputs[2][2]) 3230 rval = dnn_gradweight3d(inp1, inp2, shape, 3231 border_mode=op.border_mode, 3232 subsample=op.subsample, 3233 dilation=op.filter_dilation, 3234 conv_mode=conv_mode, 3235 num_groups=op.num_groups) 3236 elif isinstance(op, AbstractConv3d_gradInputs): 3237 shape = (inp2.shape[0], inp1.shape[1] * op.num_groups, 3238 inputs[2][0], inputs[2][1], inputs[2][2]) 3239 rval = dnn_gradinput3d(inp1, inp2, shape, 3240 border_mode=op.border_mode, 3241 subsample=op.subsample, 3242 dilation=op.filter_dilation, 3243 conv_mode=conv_mode, 3244 num_groups=op.num_groups) 3245 return [rval] 3246 3247 3248@local_optimizer([AbstractConv2d, AbstractConv3d]) 3249def local_abstractconv_cudnn(node): 3250 ctx = infer_context_name(*node.inputs) 3251 if not isinstance(node.inputs[0].type, GpuArrayType): 3252 return 3253 if node.op.unshared: 3254 return None 3255 if isinstance(node.op.border_mode, tuple) and any(isinstance(p, tuple) for p in node.op.border_mode): 3256 # Asymmetric padding not yet supported 3257 return None 3258 if isinstance(node.op, AbstractConv2d): 3259 with inherit_stack_trace(node.outputs): 3260 return local_abstractconv_cudnn_graph(node.op, ctx, node.inputs, node.outputs) 3261 elif isinstance(node.op, AbstractConv3d): 3262 with inherit_stack_trace(node.outputs): 3263 return local_abstractconv3d_cudnn_graph(node.op, ctx, node.inputs, node.outputs) 3264 3265 3266@local_optimizer([AbstractConv2d, AbstractConv2d_gradWeights, AbstractConv2d_gradInputs]) 3267def local_abstractconv_cudnn_alt(node): 3268 if(not isinstance(node.op, (AbstractConv2d, AbstractConv2d_gradWeights, 3269 AbstractConv2d_gradInputs))): 3270 return 3271 3272 if version(raises=False) < 6000 and node.op.filter_dilation != (1, 1): 3273 return None 3274 if node.op.unshared: 3275 return None 3276 if isinstance(node.op.border_mode, tuple) and any(isinstance(p, tuple) for p in node.op.border_mode): 3277 # Asymmetric padding not yet supported 3278 return None 3279 inp1 = node.inputs[0] 3280 inp2 = node.inputs[1] 3281 3282 if not dnn_available(inp1.type.context_name): 3283 return 3284 3285 op = node.op 3286 border_mode = node.op.border_mode 3287 subsample = node.op.subsample 3288 filter_dilation = node.op.filter_dilation 3289 num_groups = node.op.num_groups 3290 precision, _ = get_precision(None, [inp1, inp2]) 3291 3292 if node.op.filter_flip: 3293 conv_mode = 'conv' 3294 else: 3295 conv_mode = 'cross' 3296 3297 if isinstance(op, AbstractConv2d): 3298 if border_mode == 'half' or subsample != (1, 1) or num_groups != 1: 3299 return None 3300 if border_mode == 'full': 3301 direction_hint = 'bprop inputs' 3302 elif border_mode == 'valid' and filter_dilation == (1, 1): 3303 direction_hint = 'bprop weights' 3304 else: 3305 return None 3306 3307 rval = dnn_conv(inp1, inp2, 3308 border_mode=border_mode, 3309 subsample=subsample, 3310 dilation=filter_dilation, 3311 direction_hint=direction_hint, 3312 conv_mode=conv_mode, 3313 num_groups=num_groups) 3314 3315 elif isinstance(op, AbstractConv2d_gradWeights): 3316 if(border_mode == 'valid' and subsample == (1, 1) and 3317 filter_dilation == (1, 1) and num_groups == 1): 3318 img = gpu_contiguous(inp1) 3319 topgrad = gpu_contiguous(inp2) 3320 ctx_name = infer_context_name(img, topgrad) 3321 img = gpu_contiguous(img.dimshuffle(1, 0, 2, 3)) 3322 topgrad = gpu_contiguous(topgrad.dimshuffle(1, 0, 2, 3)) 3323 ishape = [shape_i_op(i)(img) for i in range(img.ndim)] 3324 tshape = [shape_i_op(i)(topgrad) for i in range(topgrad.ndim)] 3325 out_shp = get_conv_output_shape(ishape, 3326 tshape, 3327 border_mode=border_mode, 3328 subsample=subsample, 3329 filter_dilation=filter_dilation) 3330 3331 out_shp = assert_conv_shape(out_shp) 3332 out = GpuAllocEmpty(dtype=img.dtype, context_name=ctx_name)(*out_shp) 3333 desc = GpuDnnConvDesc(border_mode=border_mode, 3334 subsample=subsample, 3335 dilation=filter_dilation, 3336 conv_mode='cross', 3337 precision=precision)(out.shape) 3338 3339 conv = GpuDnnConv(algo=None, num_groups=num_groups)(img, topgrad, out, desc) 3340 if conv_mode == 'conv': 3341 conv = conv[:, :, ::-1, ::-1] 3342 3343 rval = as_gpuarray_variable(conv.dimshuffle(1, 0, 2, 3), ctx_name) 3344 else: 3345 return None 3346 3347 elif isinstance(op, AbstractConv2d_gradInputs): 3348 if border_mode == 'valid' and subsample == (1, 1) and num_groups == 1: 3349 kerns = gpu_contiguous(inp1.dimshuffle(1, 0, 2, 3)) 3350 topgrad = gpu_contiguous(inp2) 3351 ctx_name = infer_context_name(kerns, topgrad) 3352 conv_mode = 'cross' if conv_mode == 'conv' else 'conv' 3353 desc = GpuDnnConvDesc(border_mode='full', 3354 subsample=subsample, 3355 dilation=filter_dilation, 3356 conv_mode=conv_mode, 3357 precision=precision)(kerns.shape) 3358 3359 tshape = [shape_i_op(i)(topgrad) for i in range(topgrad.ndim)] 3360 kshape = [shape_i_op(i)(kerns) for i in range(kerns.ndim)] 3361 shape = get_conv_output_shape(tshape, 3362 kshape, 3363 border_mode='full', 3364 subsample=subsample, 3365 filter_dilation=filter_dilation) 3366 3367 shape = assert_conv_shape(shape) 3368 out = GpuAllocEmpty(dtype=topgrad.dtype, context_name=ctx_name)(*shape) 3369 rval = GpuDnnConv(algo=None, num_groups=num_groups)(topgrad, kerns, out, desc) 3370 else: 3371 return None 3372 3373 return [rval] 3374 3375 3376@local_optimizer([AbstractConv3d, AbstractConv3d_gradWeights, AbstractConv3d_gradInputs]) 3377def local_abstractconv3d_cudnn_alt(node): 3378 if(not isinstance(node.op, (AbstractConv3d, 3379 AbstractConv3d_gradWeights, 3380 AbstractConv3d_gradInputs))): 3381 return 3382 3383 if version(raises=False) < 6000 and node.op.filter_dilation != (1, 1, 1): 3384 return None 3385 inp1 = node.inputs[0] 3386 inp2 = node.inputs[1] 3387 3388 if not dnn_available(inp1.type.context_name): 3389 return 3390 3391 op = node.op 3392 border_mode = node.op.border_mode 3393 subsample = node.op.subsample 3394 filter_dilation = node.op.filter_dilation 3395 num_groups = node.op.num_groups 3396 precision, _ = get_precision(None, [inp1, inp2]) 3397 3398 if node.op.filter_flip: 3399 conv_mode = 'conv' 3400 else: 3401 conv_mode = 'cross' 3402 3403 if isinstance(op, AbstractConv3d): 3404 if border_mode == 'half' or subsample != (1, 1, 1) or num_groups > 1: 3405 return None 3406 if border_mode == 'full': 3407 direction_hint = 'bprop inputs' 3408 elif border_mode == 'valid' and filter_dilation == (1, 1, 1): 3409 direction_hint = 'bprop weights' 3410 else: 3411 return None 3412 3413 rval = dnn_conv3d(inp1, inp2, 3414 border_mode=border_mode, 3415 subsample=subsample, 3416 dilation=filter_dilation, 3417 direction_hint=direction_hint, 3418 conv_mode=conv_mode) 3419 3420 elif isinstance(op, AbstractConv3d_gradWeights): 3421 if(border_mode == 'valid' and subsample == (1, 1, 1) and 3422 filter_dilation == (1, 1, 1) and num_groups == 1): 3423 img = gpu_contiguous(inp1) 3424 topgrad = gpu_contiguous(inp2) 3425 ctx_name = infer_context_name(img, topgrad) 3426 img = gpu_contiguous(img.dimshuffle(1, 0, 2, 3, 4)) 3427 topgrad = gpu_contiguous(topgrad.dimshuffle(1, 0, 2, 3, 4)) 3428 ishape = [shape_i_op(i)(img) for i in range(img.ndim)] 3429 tshape = [shape_i_op(i)(topgrad) for i in range(topgrad.ndim)] 3430 out_shp = get_conv_output_shape(ishape, 3431 tshape, 3432 border_mode=border_mode, 3433 subsample=subsample, 3434 filter_dilation=filter_dilation) 3435 3436 out_shp = assert_conv_shape(out_shp) 3437 out = GpuAllocEmpty(dtype=img.dtype, context_name=ctx_name)(*out_shp) 3438 desc = GpuDnnConvDesc(border_mode=border_mode, 3439 subsample=subsample, 3440 dilation=filter_dilation, 3441 conv_mode='cross', 3442 num_groups=num_groups, 3443 precision=precision)(out.shape) 3444 3445 conv = GpuDnnConv(algo=None, num_groups=num_groups)( 3446 img, topgrad, out, desc) 3447 if conv_mode == 'conv': 3448 conv = conv[:, :, ::-1, ::-1, ::-1] 3449 3450 rval = as_gpuarray_variable(conv.dimshuffle(1, 0, 2, 3, 4), ctx_name) 3451 else: 3452 return None 3453 3454 elif isinstance(op, AbstractConv3d_gradInputs): 3455 if border_mode == 'valid' and subsample == (1, 1, 1) and num_groups == 1: 3456 kerns = gpu_contiguous(inp1.dimshuffle(1, 0, 2, 3, 4)) 3457 topgrad = gpu_contiguous(inp2) 3458 ctx_name = infer_context_name(kerns, topgrad) 3459 conv_mode = 'cross' if conv_mode == 'conv' else 'conv' 3460 desc = GpuDnnConvDesc(border_mode='full', 3461 subsample=subsample, 3462 dilation=filter_dilation, 3463 conv_mode=conv_mode, 3464 num_groups=num_groups, 3465 precision=precision)(kerns.shape) 3466 3467 tshape = [shape_i_op(i)(topgrad) for i in range(topgrad.ndim)] 3468 kshape = [shape_i_op(i)(kerns) for i in range(kerns.ndim)] 3469 shape = get_conv_output_shape(tshape, 3470 kshape, 3471 border_mode='full', 3472 subsample=subsample, 3473 filter_dilation=filter_dilation) 3474 3475 shape = assert_conv_shape(shape) 3476 out = GpuAllocEmpty(dtype=topgrad.dtype, context_name=ctx_name)(*shape) 3477 rval = GpuDnnConv(algo=None, num_groups=num_groups)( 3478 topgrad, kerns, out, desc) 3479 else: 3480 return None 3481 3482 return [rval] 3483 3484 3485@local_optimizer([AbstractConv2d_gradWeights, AbstractConv3d_gradWeights]) 3486def local_abstractconv_gw_cudnn(node): 3487 ctx = infer_context_name(*node.inputs) 3488 if not isinstance(node.inputs[0].type, GpuArrayType): 3489 return 3490 if node.op.unshared: 3491 return None 3492 if isinstance(node.op.border_mode, tuple) and any(isinstance(p, tuple) for p in node.op.border_mode): 3493 # Asymmetric padding not yet supported 3494 return None 3495 if isinstance(node.op, AbstractConv2d_gradWeights): 3496 with inherit_stack_trace(node.outputs): 3497 return local_abstractconv_cudnn_graph(node.op, ctx, node.inputs, node.outputs) 3498 elif isinstance(node.op, AbstractConv3d_gradWeights): 3499 with inherit_stack_trace(node.outputs): 3500 return local_abstractconv3d_cudnn_graph(node.op, ctx, node.inputs, node.outputs) 3501 3502 3503@local_optimizer([AbstractConv2d_gradInputs, AbstractConv3d_gradInputs]) 3504def local_abstractconv_gi_cudnn(node): 3505 ctx = infer_context_name(*node.inputs) 3506 if not isinstance(node.inputs[0].type, GpuArrayType): 3507 return 3508 if node.op.unshared: 3509 return None 3510 if isinstance(node.op.border_mode, tuple) and any(isinstance(p, tuple) for p in node.op.border_mode): 3511 # Asymmetric padding not yet supported 3512 return None 3513 if isinstance(node.op, AbstractConv2d_gradInputs): 3514 with inherit_stack_trace(node.outputs): 3515 return local_abstractconv_cudnn_graph(node.op, ctx, node.inputs, node.outputs) 3516 elif isinstance(node.op, AbstractConv3d_gradInputs): 3517 with inherit_stack_trace(node.outputs): 3518 return local_abstractconv3d_cudnn_graph(node.op, ctx, node.inputs, node.outputs) 3519 3520 3521@inplace_allocempty(GpuDnnConv, 2) 3522def local_dnn_conv_inplace(node, inputs): 3523 return [GpuDnnConv(algo=node.op.algo, inplace=True, num_groups=node.op.num_groups)(*inputs)] 3524 3525 3526@inplace_allocempty(GpuDnnConvGradW, 2) 3527def local_dnn_convgw_inplace(node, inputs): 3528 return [GpuDnnConvGradW(algo=node.op.algo, inplace=True, num_groups=node.op.num_groups)(*inputs)] 3529 3530 3531@inplace_allocempty(GpuDnnConvGradI, 2) 3532def local_dnn_convgi_inplace(node, inputs): 3533 return [GpuDnnConvGradI(algo=node.op.algo, inplace=True, num_groups=node.op.num_groups)(*inputs)] 3534 3535optdb.register('local_dnna_conv_inplace', 3536 tensor.opt.in2out(local_dnn_conv_inplace, 3537 local_dnn_convgw_inplace, 3538 local_dnn_convgi_inplace, 3539 name="local_dnna_conv_inplace"), 3540 70.0, 'fast_run', 'inplace', 'gpuarray', 'cudnn') 3541 3542 3543@register_opt('cudnn') 3544@alpha_merge(GpuDnnConv, alpha_in=4, beta_in=5) 3545def local_dnn_conv_alpha_merge(node, *inputs): 3546 return [GpuDnnConv(algo=node.op.algo, num_groups=node.op.num_groups)(*inputs)] 3547 3548 3549@register_opt('cudnn') 3550@alpha_merge(GpuDnnConvGradW, alpha_in=4, beta_in=5) 3551def local_dnn_convw_alpha_merge(node, *inputs): 3552 return [GpuDnnConvGradW(algo=node.op.algo, num_groups=node.op.num_groups)(*inputs)] 3553 3554 3555@register_opt('cudnn') 3556@alpha_merge(GpuDnnConvGradI, alpha_in=4, beta_in=5) 3557def local_dnn_convi_alpha_merge(node, *inputs): 3558 return [GpuDnnConvGradI(algo=node.op.algo, num_groups=node.op.num_groups)(*inputs)] 3559 3560 3561@register_opt('cudnn') 3562@output_merge(GpuDnnConv, alpha_in=4, beta_in=5, out_in=2) 3563def local_dnn_conv_output_merge(node, *inputs): 3564 inputs = inputs[0:2] + (gpu_contiguous(inputs[2]),) + inputs[3:] 3565 return [GpuDnnConv(algo=node.op.algo, num_groups=node.op.num_groups)(*inputs)] 3566 3567 3568@register_opt('cudnn') 3569@output_merge(GpuDnnConvGradW, alpha_in=4, beta_in=5, out_in=2) 3570def local_dnn_convw_output_merge(node, *inputs): 3571 inputs = inputs[0:2] + (gpu_contiguous(inputs[2]),) + inputs[3:] 3572 return [GpuDnnConvGradW(algo=node.op.algo, num_groups=node.op.num_groups)(*inputs)] 3573 3574 3575@register_opt('cudnn') 3576@output_merge(GpuDnnConvGradI, alpha_in=4, beta_in=5, out_in=2) 3577def local_dnn_convi_output_merge(node, *inputs): 3578 inputs = inputs[0:2] + (gpu_contiguous(inputs[2]),) + inputs[3:] 3579 return [GpuDnnConvGradI(algo=node.op.algo, num_groups=node.op.num_groups)(*inputs)] 3580 3581 3582def local_gpua_pool_dnn_alternative(op, ctx_name, inputs, outputs): 3583 if not dnn_available(ctx_name): 3584 return 3585 if not op.ignore_border: 3586 return 3587 img, ws, stride, pad = inputs 3588 nd = op.ndim 3589 if nd not in (2, 3): 3590 return 3591 img = gpu_contiguous(as_gpuarray_variable(img, ctx_name)) 3592 mode = op.mode 3593 # dnn_pool expects exactly 2 non-pooling dimensions 3594 if img.ndim == nd + 2: 3595 return dnn_pool(img, ws, stride=stride, pad=pad, mode=mode) 3596 else: 3597 # reshape to 4D or 5D with 2 non-pooling dimensions 3598 img_padded = pad_dims(img, 2, nd) 3599 ret_padded = dnn_pool(img_padded, ws, stride=stride, pad=pad, mode=mode) 3600 return unpad_dims(ret_padded, img, 2, nd) 3601pool_db.register("local_gpua_pool_dnn_alternative", 3602 op_lifter([Pool])(local_gpua_pool_dnn_alternative), 3603 'gpuarray', 'fast_compile', 'fast_run', 'cudnn', 3604 position=0) 3605pool_db2.register("local_gpua_pool_dnn_alternative", 3606 local_optimizer([Pool])(local_gpua_pool_dnn_alternative), 3607 'gpuarray', 'fast_compile', 'fast_run', 'cudnn', 3608 position=0) 3609 3610 3611def local_gpua_pool_dnn_grad_stride(op, ctx_name, inputs, outputs): 3612 if not dnn_available(ctx_name): 3613 return 3614 if not op.ignore_border: 3615 return 3616 inp, out, out_grad, ws, stride, pad = inputs 3617 nd = op.ndim 3618 if nd not in (2, 3): 3619 return 3620 inp = gpu_contiguous(as_gpuarray_variable(inp, ctx_name)) 3621 out = gpu_contiguous(as_gpuarray_variable(out, ctx_name)) 3622 out_grad = gpu_contiguous(as_gpuarray_variable(out_grad, ctx_name)) 3623 mode = op.mode 3624 3625 # the GPU ops expect exactly 2 non-pooling dimensions 3626 if inp.ndim == nd + 2: 3627 return GpuDnnPoolGrad(mode=mode)(inp, 3628 out, 3629 out_grad, 3630 ws, 3631 stride, 3632 pad) 3633 else: 3634 # reshape to 4D or 5D with 2 non-pooling dimensions 3635 inp_padded = pad_dims(inp, 2, nd) 3636 out_padded = pad_dims(out, 2, nd) 3637 out_grad_padded = pad_dims(out_grad, 2, nd) 3638 ret_padded = GpuDnnPoolGrad(mode=mode)(inp_padded, 3639 out_padded, 3640 out_grad_padded, 3641 ws, 3642 stride, 3643 pad) 3644 return unpad_dims(ret_padded, inp, 2, nd) 3645pool_db.register("local_gpua_pool_dnn_grad_stride", 3646 op_lifter([MaxPoolGrad])(local_gpua_pool_dnn_grad_stride), 3647 'gpuarray', 'fast_compile', 'fast_run', 'cudnn', 3648 position=0) 3649pool_db2.register("local_gpua_pool_dnn_grad_stride", 3650 local_optimizer([MaxPoolGrad])(local_gpua_pool_dnn_grad_stride), 3651 'gpuarray', 'fast_compile', 'fast_run', 'cudnn', 3652 position=0) 3653 3654 3655def local_gpua_avg_pool_dnn_grad_stride(op, ctx_name, inputs, outputs): 3656 if not dnn_available(ctx_name): 3657 return 3658 if not op.ignore_border: 3659 return 3660 inp, out_grad, ws, stride, pad = inputs 3661 nd = op.ndim 3662 if nd not in (2, 3): 3663 return 3664 inp = gpu_contiguous(as_gpuarray_variable(inp, ctx_name)) 3665 out_grad = gpu_contiguous(as_gpuarray_variable(out_grad, ctx_name)) 3666 mode = op.mode 3667 3668 # the GPU ops expect exactly 2 non-pooling dimensions 3669 if inp.ndim == nd + 2: 3670 # We reuse out_grad because cuDNN does not use the value of the `out` 3671 # argument but still checks its shape for average pooling. This 3672 # has been observed in v2 and v3 as far as I know. 3673 return GpuDnnPoolGrad(mode=mode)(inp, out_grad, out_grad, ws, stride, pad) 3674 else: 3675 # reshape to 4D or 5D with 2 non-pooling dimensions 3676 inp_padded = pad_dims(inp, 2, nd) 3677 out_grad_padded = pad_dims(out_grad, 2, nd) 3678 ret_padded = GpuDnnPoolGrad(mode=mode)(inp_padded, 3679 out_grad_padded, 3680 out_grad_padded, 3681 ws, 3682 stride, 3683 pad) 3684 return unpad_dims(ret_padded, inp, 2, nd) 3685pool_db.register("local_gpua_avg_pool_dnn_grad_stride", 3686 op_lifter([AveragePoolGrad])(local_gpua_avg_pool_dnn_grad_stride), 3687 'gpuarray', 'fast_compile', 'fast_run', 'cudnn', 3688 position=0) 3689pool_db2.register("local_gpua_avg_pool_dnn_grad_stride", 3690 local_optimizer([AveragePoolGrad])(local_gpua_avg_pool_dnn_grad_stride), 3691 'gpuarray', 'fast_compile', 'fast_run', 'cudnn', 3692 position=0) 3693 3694 3695@register_opt('cudnn', 'fast_compile') 3696@local_optimizer([GpuSoftmax]) 3697def local_softmax_dnn(node): 3698 if isinstance(node.op, GpuSoftmax): 3699 if not dnn_available(node.outputs[0].type.context_name): 3700 return 3701 ins = node.inputs[0].dimshuffle(0, 1, 'x', 'x') 3702 ins = gpu_contiguous(ins) 3703 out = GpuDnnSoftmax('accurate', 'channel')(ins) 3704 out = as_gpuarray_variable(out.dimshuffle(0, 1), out.type.context_name) 3705 return [out] 3706 3707 3708@register_opt('cudnn', 'stabilize') 3709@local_optimizer([GpuElemwise]) 3710def local_log_softmax_dnn(node): 3711 # This looks for GpuDnnSoftmax so we know that we have cudnn. 3712 if (isinstance(node.op, GpuElemwise) and 3713 isinstance(node.op.scalar_op, Log) and 3714 node.inputs[0].owner and 3715 isinstance(node.inputs[0].owner.op, GpuDnnSoftmax) and 3716 len(node.inputs[0].clients) == 1): 3717 softmax_node = node.inputs[0].owner 3718 new_softmax = GpuDnnSoftmax('log', softmax_node.op.mode) 3719 return [new_softmax(softmax_node.inputs[0])] 3720 3721 3722@register_opt('cudnn', 'fast_compile') 3723@op_lifter([LogSoftmax]) 3724@register_opt2([LogSoftmax], 'fast_compile', 'cudnn') 3725def local_gpua_logsoftmax_to_dnn(op, ctx_name, inputs, outputs): 3726 # Transform the input in the format expected by GpuDnnSoftmax 3727 inp = inputs[0] 3728 if inp.ndim != 2: 3729 return 3730 if not dnn_available(ctx_name): 3731 return 3732 3733 inp = inp.dimshuffle(0, 1, 'x', 'x') 3734 inp.tag.context_name = ctx_name 3735 3736 # Apply GpuDnnSoftmax and return the result 3737 out = GpuDnnSoftmax('log', 'channel')(gpu_contiguous(inp)) 3738 return [out.dimshuffle(0, 1)] 3739 3740 3741@register_opt('cudnn', 'fast_compile') 3742@op_lifter([SoftmaxGrad]) 3743@register_opt2([SoftmaxGrad], 'cudnn', 'fast_compile') 3744def local_gpua_softmax_dnn_grad(op, ctx_name, inputs, outputs): 3745 if not dnn_available(ctx_name): 3746 return 3747 ins = [] 3748 for n in inputs: 3749 n = as_gpuarray_variable(n, ctx_name) 3750 if n.ndim != 2: 3751 return 3752 ins.append(n.dimshuffle(0, 'x', 1, 'x')) 3753 3754 out = GpuDnnSoftmaxGrad('accurate', 'instance')( 3755 gpu_contiguous(ins[0]), gpu_contiguous(ins[1])) 3756 return [out.dimshuffle(0, 2)] 3757 3758 3759@register_opt('cudnn') 3760@local_optimizer([GpuCAReduceCuda]) 3761def local_dnn_reduction(node): 3762 if not isinstance(node.op, GpuCAReduceCuda): 3763 return 3764 3765 if not dnn_available(node.inputs[0].type.context_name): 3766 return 3767 3768 if version(raises=False) < 6000: 3769 return 3770 3771 if node.inputs[0].ndim > 8: 3772 return 3773 3774 acc_dtype = node.op._acc_dtype(node.inputs[0].dtype) 3775 3776 if node.inputs[0].dtype != node.outputs[0].dtype: 3777 # We can mix float16 and float32, but not float64. 3778 if (node.inputs[0].dtype == 'float64' or 3779 node.outputs[0].dtype == 'float64'): 3780 return 3781 if acc_dtype != 'float32': 3782 return 3783 3784 if node.inputs[0].dtype not in ['float16', 'float32', 'float64']: 3785 return 3786 3787 if (node.inputs[0].dtype == 'float64' and acc_dtype != 'float64'): 3788 return 3789 3790 if (node.inputs[0].dtype == 'float32' and acc_dtype != 'float32'): 3791 return 3792 3793 if (node.inputs[0].dtype == 'float16' and acc_dtype == 'float64'): 3794 return 3795 3796 def _identity(a): 3797 return a 3798 3799 def _square(a): 3800 return GpuElemwise(theano.scalar.basic.sqr)(a) 3801 3802 scal = node.op.scalar_op.name 3803 post = _identity 3804 3805 if node.op.pre_scalar_op is not None: 3806 if isinstance(node.op.scalar_op, theano.scalar.basic.Add): 3807 if isinstance(node.op.pre_scalar_op, theano.scalar.basic.Sqr): 3808 scal = 'norm2' 3809 post = _square 3810 elif isinstance(node.op.pre_scalar_op, theano.scalar.basic.Abs): 3811 scal = 'norm1' 3812 else: 3813 return 3814 elif (isinstance(node.op.scalar_op, theano.scalar.basic.Maximum) and 3815 isinstance(node.op.pre_scalar_op, theano.scalar.basic.Abs)): 3816 scal = 'absmax' 3817 else: 3818 return 3819 3820 if not cudnn.cudnnReduceTensorOp_t.has_alias(scal): 3821 return 3822 3823 with inherit_stack_trace(node.outputs): 3824 ret = GpuDnnReduction(scal, 3825 node.op.axis, 3826 acc_dtype, 3827 node.op.dtype, 3828 False)(node.inputs[0]) 3829 return [post(ret)] 3830 3831 3832@register_opt('cudnn') 3833@local_optimizer([GpuMaxAndArgmax]) 3834def local_cudnn_maxandargmax(node): 3835 if not isinstance(node.op, GpuMaxAndArgmax): 3836 return 3837 3838 if not dnn_available(node.inputs[0].type.context_name): 3839 return 3840 3841 if version(raises=False) < 6000: 3842 return 3843 3844 if node.inputs[0].ndim > 8: 3845 return 3846 3847 if node.inputs[0].dtype != node.outputs[0].dtype: 3848 return 3849 3850 if node.inputs[0].dtype not in ['float16', 'float32', 'float64']: 3851 return 3852 3853 # order of the axes influences the output indices 3854 if (node.op.axis is not None and 3855 tuple(sorted(node.op.axis)) != node.op.axis): 3856 return 3857 3858 max, arg = GpuDnnReduction('maximum', node.op.axis, node.outputs[0].dtype, 3859 node.outputs[0].dtype, True)(node.inputs[0]) 3860 3861 # cudnn can only return int32 indices 3862 return (max, as_gpuarray_variable(arg.astype('int64'), 3863 node.outputs[1].type.context_name)) 3864 3865 3866@register_opt('cudnn', 'fast_compile') 3867@op_lifter([Argmax]) 3868@register_opt2([Argmax], 'fast_compile', 'cudnn') 3869def local_dnn_argmax(op, ctx_name, inputs, outputs): 3870 if not dnn_available(ctx_name): 3871 return 3872 3873 if version(raises=False) < 6000: 3874 return 3875 3876 if inputs[0].ndim > 8: 3877 return 3878 3879 if inputs[0].dtype not in ['float16', 'float32', 'float64']: 3880 return 3881 3882 # order of the axes influences the output indices 3883 if op.axis is not None and tuple(sorted(op.axis)) != op.axis: 3884 return 3885 3886 max, arg = GpuDnnReduction('maximum', op.axis, inputs[0].dtype, 3887 inputs[0].dtype, True)(*inputs) 3888 3889 return [as_gpuarray_variable(arg.astype('int64'), ctx_name)] 3890 3891 3892class NoCuDNNRaise(Optimizer): 3893 3894 def apply(self, fgraph): 3895 """ 3896 Raise a error if cudnn can't be used. 3897 3898 """ 3899 for c in list_contexts(): 3900 if not dnn_available(c): 3901 # Make an assert error as we want Theano to fail, not 3902 # just skip this optimization. 3903 raise AssertionError( 3904 "cuDNN optimization was enabled, but Theano was not able " 3905 "to use it for context " + str(c) + ". We got this error: \n" + 3906 dnn_available.msg) 3907 3908gpu_seqopt.register("NoCuDNNRaise", NoCuDNNRaise(), 0, 'cudnn') 3909 3910 3911def local_abstract_batch_norm_train_cudnn(op, ctx_name, inputs, outputs): 3912 x, scale, bias, epsilon, running_average_factor = inputs[:5] 3913 running_mean = inputs[5] if len(inputs) > 5 else None 3914 running_var = inputs[6] if len(inputs) > 6 else None 3915 3916 # convert axes to cuDNN mode 3917 axes = tuple(op.axes) 3918 if axes == (0,): 3919 mode = 'per-activation' 3920 elif axes == (0,) + tuple(range(2, x.ndim)): 3921 mode = 'spatial' 3922 else: 3923 return None 3924 3925 try: 3926 eps = theano.tensor.get_scalar_constant_value(epsilon) 3927 except theano.tensor.NotScalarConstantError: 3928 return None 3929 if eps < 1e-5: 3930 return None 3931 try: 3932 running_average_factor = theano.tensor.get_scalar_constant_value(running_average_factor) 3933 except theano.tensor.NotScalarConstantError: 3934 return None 3935 3936 ctx = infer_context_name(*inputs) 3937 if not dnn_available(ctx): 3938 return 3939 x = as_gpuarray_variable(x, context_name=ctx) 3940 scale = as_gpuarray_variable(scale, context_name=ctx) 3941 bias = as_gpuarray_variable(bias, context_name=ctx) 3942 3943 inputs = [x, scale, bias, mode, eps, running_average_factor] 3944 if running_mean is not None and running_var is not None: 3945 inputs.append(running_mean) 3946 inputs.append(running_var) 3947 3948 results = list(dnn_batch_normalization_train(*inputs)) 3949 3950 return results 3951 3952 3953@register_inplace() 3954@local_optimizer([GpuDnnBatchNorm], inplace=True) 3955def local_batch_norm_inplace_output(node): 3956 if isinstance(node.op, GpuDnnBatchNorm) and not node.op.inplace_output: 3957 return GpuDnnBatchNorm(mode=node.op.mode, 3958 running_averages=node.op.running_averages, 3959 inplace_running_mean=node.op.inplace_running_mean, 3960 inplace_running_var=node.op.inplace_running_var, 3961 inplace_output=True)(*node.inputs) 3962 3963 3964@register_inplace() 3965@local_optimizer([GpuDnnBatchNorm], inplace=True) 3966def local_batch_norm_inplace_running_mean(node): 3967 if isinstance(node.op, GpuDnnBatchNorm) and node.op.running_averages and not node.op.inplace_running_mean: 3968 return GpuDnnBatchNorm(mode=node.op.mode, 3969 running_averages=node.op.running_averages, 3970 inplace_running_mean=True, 3971 inplace_running_var=node.op.inplace_running_var, 3972 inplace_output=node.op.inplace_output)(*node.inputs) 3973 3974 3975@register_inplace() 3976@local_optimizer([GpuDnnBatchNorm], inplace=True) 3977def local_batch_norm_inplace_running_var(node): 3978 if isinstance(node.op, GpuDnnBatchNorm) and node.op.running_averages and not node.op.inplace_running_var: 3979 return GpuDnnBatchNorm(mode=node.op.mode, 3980 running_averages=node.op.running_averages, 3981 inplace_running_mean=node.op.inplace_running_mean, 3982 inplace_running_var=True, 3983 inplace_output=node.op.inplace_output)(*node.inputs) 3984 3985 3986@register_inplace() 3987@local_optimizer([GpuDnnBatchNormInference], inplace=True) 3988def local_batch_norm_inference_inplace(node): 3989 if isinstance(node.op, GpuDnnBatchNormInference) and not node.op.inplace: 3990 return [GpuDnnBatchNormInference(mode=node.op.mode, inplace=True)(*node.inputs)] 3991 3992 3993def local_abstract_batch_norm_train_grad_cudnn(op, ctx_name, inputs, outputs): 3994 x, dy, scale, x_mean, x_invstd, epsilon = inputs 3995 3996 # input on gpu? TODO what about the output? 3997 x_on_gpu = (isinstance(x.type, GpuArrayType) or 3998 (x.owner and isinstance(x.owner.op, HostFromGpu))) 3999 dy_on_gpu = (isinstance(dy.type, GpuArrayType) or 4000 (dy.owner and isinstance(dy.owner.op, HostFromGpu))) 4001 if not (x_on_gpu or dy_on_gpu): 4002 return None 4003 4004 # convert axes to cuDNN mode 4005 axes = tuple(op.axes) 4006 if axes == (0,): 4007 mode = 'per-activation' 4008 elif axes == (0,) + tuple(range(2, x.ndim)): 4009 mode = 'spatial' 4010 else: 4011 return None 4012 4013 ndim = x.ndim 4014 if ndim < 4: 4015 x = theano.tensor.shape_padright(x, 4 - ndim) 4016 dy = theano.tensor.shape_padright(dy, 4 - ndim) 4017 scale = theano.tensor.shape_padright(scale, 4 - ndim) 4018 x_mean = theano.tensor.shape_padright(x_mean, 4 - ndim) 4019 x_invstd = theano.tensor.shape_padright(x_invstd, 4 - ndim) 4020 elif ndim > 5: 4021 x_shape = x.shape 4022 params_shape = scale.shape 4023 x = theano.tensor.flatten(x, 5) 4024 dy = theano.tensor.flatten(dy, 5) 4025 scale = theano.tensor.flatten(scale, 5) 4026 x_mean = theano.tensor.flatten(x_mean, 5) 4027 x_invstd = theano.tensor.flatten(x_invstd, 5) 4028 4029 try: 4030 eps = theano.tensor.get_scalar_constant_value(epsilon) 4031 except theano.tensor.NotScalarConstantError: 4032 return None 4033 if eps < 1e-5: 4034 return None 4035 4036 ctx = infer_context_name(*inputs) 4037 if not dnn_available(ctx): 4038 return 4039 x = as_gpuarray_variable(x, context_name=ctx) 4040 dy = as_gpuarray_variable(dy, context_name=ctx) 4041 scale = as_gpuarray_variable(scale, context_name=ctx) 4042 x_mean = as_gpuarray_variable(x_mean, context_name=ctx) 4043 x_invstd = as_gpuarray_variable(x_invstd, context_name=ctx) 4044 4045 g_wrt_inputs, g_wrt_scale, g_wrt_bias = \ 4046 GpuDnnBatchNormGrad(mode)(x, dy, scale, x_mean, x_invstd, eps) 4047 4048 if ndim < 4: 4049 g_wrt_inputs = theano.tensor.flatten(g_wrt_inputs, ndim) 4050 g_wrt_scale = theano.tensor.flatten(g_wrt_scale, ndim) 4051 g_wrt_bias = theano.tensor.flatten(g_wrt_bias, ndim) 4052 elif ndim > 5: 4053 g_wrt_inputs = theano.tensor.reshape(g_wrt_inputs, x_shape) 4054 g_wrt_scale = theano.tensor.reshape(g_wrt_scale, params_shape) 4055 g_wrt_bias = theano.tensor.reshape(g_wrt_bias, params_shape) 4056 4057 return [g_wrt_inputs, g_wrt_scale, g_wrt_bias] 4058 4059 4060def local_abstract_batch_norm_inference_cudnn(op, ctx_name, inputs, outputs): 4061 x, scale, bias, estimated_mean, estimated_variance, epsilon = inputs 4062 4063 axes = tuple(op.axes) 4064 if axes == (0,): 4065 mode = 'per-activation' 4066 elif axes == (0,) + tuple(range(2, x.ndim)): 4067 mode = 'spatial' 4068 else: 4069 return None 4070 4071 try: 4072 eps = theano.tensor.get_scalar_constant_value(epsilon) 4073 except theano.tensor.NotScalarConstantError: 4074 return None 4075 if eps < 1e-5: 4076 return None 4077 4078 ctx = infer_context_name(*inputs) 4079 if not dnn_available(ctx): 4080 return 4081 x = as_gpuarray_variable(x, context_name=ctx) 4082 scale = as_gpuarray_variable(scale, context_name=ctx) 4083 bias = as_gpuarray_variable(bias, context_name=ctx) 4084 estimated_mean = as_gpuarray_variable(estimated_mean, context_name=ctx) 4085 estimated_variance = as_gpuarray_variable(estimated_variance, context_name=ctx) 4086 4087 out = dnn_batch_normalization_test(x, scale, bias, estimated_mean, estimated_variance, 4088 mode, eps) 4089 4090 return [out] 4091