1from __future__ import absolute_import, print_function, division
2import ctypes
3import os
4import sys
5import warnings
6
7import numpy as np
8from six import integer_types
9from six.moves import reduce
10
11import theano
12from theano import Op, Apply, tensor, config, Variable
13from theano.scalar import (as_scalar, constant, Log, get_scalar_type,
14                           int32 as int_t, bool as bool_t, uint32 as uint32_t)
15from theano.tensor import as_tensor_variable, Argmax
16from theano.tensor.extra_ops import cpu_contiguous
17from theano.gradient import DisconnectedType, grad_not_implemented
18from theano.gof import Optimizer, local_optimizer, COp, ParamsType, EnumList
19from theano.gof.cmodule import GCC_compiler
20from theano.gof.type import CDataType, Generic
21from theano.gof.opt import inherit_stack_trace
22from theano.tensor.opt import Assert
23from theano.compile import optdb
24from theano.compile.ops import shape_i, shape_i_op
25from theano.tensor.nnet import LogSoftmax, SoftmaxGrad
26from theano.tensor.nnet.abstract_conv import (AbstractConv2d,
27                                              AbstractConv2d_gradWeights,
28                                              AbstractConv2d_gradInputs,
29                                              AbstractConv3d,
30                                              AbstractConv3d_gradWeights,
31                                              AbstractConv3d_gradInputs,
32                                              get_conv_output_shape,
33                                              assert_conv_shape)
34from theano.tensor.signal.pool import (
35    Pool, MaxPoolGrad, AveragePoolGrad)
36from . import pygpu, cudnn_defs
37from .type import (get_context, gpu_context_type, list_contexts,
38                   GpuArraySharedVariable)
39from .basic_ops import (as_gpuarray_variable, infer_context_name, gpuarray_helper_inc_dir,
40                        gpu_contiguous, GpuAllocEmpty,
41                        empty_like, GpuArrayType, HostFromGpu)
42from .elemwise import GpuElemwise, GpuCAReduceCuda
43from .reduction import GpuMaxAndArgmax
44
45# These don't exist in gpuarray
46# GpuDownsampleFactorMax, GpuDownsampleFactorMaxGrad
47from .nnet import GpuSoftmax
48from .opt import (gpu_seqopt, register_opt, pool_db, pool_db2,
49                  op_lifter, register_opt2, register_inplace)
50
51from .opt_util import alpha_merge, output_merge, inplace_allocempty, pad_dims, unpad_dims
52
53from theano.configdefaults import SUPPORTED_DNN_CONV_ALGO_RUNTIME
54import theano.pathparse
55
56DNN_CONV_ALGO_CHOOSE_ONCE = ['guess_once', 'time_once']
57DNN_CONV_ALGO_CHOOSE_TIME = ['time_once', 'time_on_shape_change']
58
59try:
60    from pygpu import gpuarray
61except ImportError:
62    pass
63
64# Update these names when new versions of cudnn are supported.
65WIN32_CUDNN_NAMES = ['cudnn64_7.dll', 'cudnn64_6.dll', 'cudnn64_5.dll']
66
67if sys.platform == 'win32':
68    theano.pathparse.PathParser(theano.config.dnn.bin_path)
69
70
71def _load_lib(name):
72    try:
73        return ctypes.cdll.LoadLibrary(name)
74    except OSError:
75        return None
76
77
78def _dnn_lib():
79    if _dnn_lib.handle is None:
80        import ctypes.util
81
82        if config.dnn.bin_path != "":
83            if sys.platform == 'darwin':
84                dnn_handle = _load_lib(os.path.join(config.dnn.bin_path, 'libcudnn.dylib'))
85            elif sys.platform == 'win32':
86                for name in WIN32_CUDNN_NAMES:
87                    dnn_handle = _load_lib(os.path.join(config.dnn.bin_path, name))
88                    if dnn_handle is not None:
89                        break
90            else:
91                dnn_handle = _load_lib(os.path.join(config.dnn.bin_path, 'libcudnn.so'))
92        else:
93            lib_name = ctypes.util.find_library('cudnn')
94            if lib_name is None and sys.platform == 'win32':
95                for name in WIN32_CUDNN_NAMES:
96                    lib_name = ctypes.util.find_library(name)
97                    if lib_name:
98                        break
99            if lib_name is None:
100                raise RuntimeError(
101                    'Could not find cudnn library (looked for v5* to v7*).'
102                    ' Check your cudnn installation. Maybe using the Theano'
103                    ' flag dnn.base_path can help you. Current value "%s"' %
104                    config.dnn.base_path)
105            else:
106                dnn_handle = ctypes.cdll.LoadLibrary(lib_name)
107        if dnn_handle is None:
108            raise RuntimeError('Could not load cudnn library. Check your cudnn'
109                               ' installation. Maybe using the Theano'
110                               ' flag dnn.base_path can help you. Current value "%s"' %
111                               config.dnn.base_path)
112        _dnn_lib.handle = dnn_handle
113        cudnn = _dnn_lib.handle
114        cudnn.cudnnCreate.argtypes = [ctypes.POINTER(ctypes.c_void_p)]
115        cudnn.cudnnCreate.restype = ctypes.c_int
116        cudnn.cudnnDestroy.argtypes = [ctypes.c_void_p]
117        cudnn.cudnnDestroy.restype = ctypes.c_int
118    return _dnn_lib.handle
119
120_dnn_lib.handle = None
121
122
123def _make_handle(ctx):
124    cudnn = _dnn_lib()
125    handle = ctypes.c_void_p()
126    with ctx:
127        err = cudnn.cudnnCreate(ctypes.byref(handle))
128    if err != 0:
129        raise RuntimeError("Error creating cudnn handle. "
130                           "This can be a sign of a too old driver.", err)
131    return handle
132
133
134def _dnn_check_compile():
135    preambule = """
136#include <stdio.h>
137#include <cudnn.h>
138#include <cudnn_helper.h>
139"""
140
141    # No need for the context in here since we won't execute that code
142    body = """
143cudnnHandle_t _handle = NULL;
144cudnnStatus_t err;
145if ((err = cudnnCreate(&_handle)) != CUDNN_STATUS_SUCCESS) {
146  fprintf(stderr, "could not create cuDNN handle: %s",
147          cudnnGetErrorString(err));
148  return 1;
149}
150"""
151
152    path_wrapper = "\"" if os.name == 'nt' else ""
153    params = ["-l", "cudnn"]
154    params.extend(['-I%s%s%s' % (path_wrapper, gpuarray_helper_inc_dir(), path_wrapper)])
155    if config.dnn.include_path:
156        params.extend(['-I%s%s%s' % (path_wrapper, config.dnn.include_path, path_wrapper)])
157    if config.cuda.include_path:
158        params.extend(['-I%s%s%s' % (path_wrapper, config.cuda.include_path, path_wrapper)])
159    if config.dnn.library_path:
160        params.extend(['-L%s%s%s' % (path_wrapper, config.dnn.library_path, path_wrapper)])
161    # Do not run here the test program. It would run on the
162    # default gpu, not the one selected by the user. If mixed
163    # GPU are installed or if the GPUs are configured in
164    # exclusive mode, this cause bad detection.
165
166    # NB: GCC_compiler.try_flags() may return just a boolean instead of a tuple (avail, out, here).
167    compiler_res = GCC_compiler.try_flags(
168        params, preambule=preambule, body=body,
169        try_run=False, output=True)
170
171    avail, out, err = compiler_res if isinstance(compiler_res, tuple) else (compiler_res, None, None)
172
173    if not avail:
174        return False, ("cannot compile with cuDNN. "
175                       "We got this error:\n" + str(err))
176    return True, None
177
178
179def _dnn_check_version():
180    v = version()
181    if v < 5000:
182        return False, "cuDNN version is too old. Update to v5* or higher, was %d." % v
183    if v >= 7200:
184        warnings.warn("Your cuDNN version is more recent than "
185                      "Theano. If you encounter problems, try "
186                      "updating Theano or downgrading cuDNN to "
187                      "a version >= v5 and <= v7.")
188    return True, None
189
190
191def dnn_present():
192    if dnn_present.avail is not None:
193        return dnn_present.avail
194    if config.dnn.enabled == "False":
195        dnn_present.msg = "Disabled by dnn.enabled flag"
196        dnn_present.avail = False
197        return False
198
199    if pygpu is None:
200        dnn_present.msg = "PyGPU not available"
201        dnn_present.avail = False
202        return False
203
204    if config.dnn.enabled == "no_check":
205        dnn_present.avail, dnn_present.msg = True, "presence check disabled by dnn.enabled flag"
206    else:
207        dnn_present.avail, dnn_present.msg = _dnn_check_compile()
208    if dnn_present.avail:
209        dnn_present.avail, dnn_present.msg = _dnn_check_version()
210        if not dnn_present.avail:
211            return False
212
213    return dnn_present.avail
214
215dnn_present.avail = None
216dnn_present.msg = None
217
218
219def dnn_available(context_name):
220    if not dnn_present():
221        dnn_available.msg = dnn_present.msg
222        return False
223
224    ctx = get_context(context_name)
225
226    if not ctx.kind == b'cuda':
227        dnn_available.msg = "Not on a CUDA device."
228        return False
229
230    # This is a hack because bin_id is in the from of
231    # "<something>_<major><minor>" for cuda devices.
232    if int(ctx.bin_id[-2:]) < 30:
233        dnn_available.msg = "Device not supported"
234        return False
235
236    # On V100, cuDNN lower then 7002 don't raise error but
237    # takes hours to load or execute! So raise a good user error.
238    if version() < 7002:
239        if int(ctx.bin_id[-2:]) >= 70:
240            dnn_available.msg = "Use cuDNN 7.0.2 or higher for Volta."
241            return False
242    return True
243
244dnn_available.msg = None
245
246
247def CUDNNDataType(name, freefunc=None):
248    cargs = []
249    if config.dnn.bin_path and sys.platform != 'win32':
250        cargs.append('-Wl,-rpath,' + config.dnn.bin_path)
251
252    return CDataType(name, freefunc,
253                     headers=['cudnn.h'],
254                     header_dirs=[config.dnn.include_path,
255                                  config.cuda.include_path],
256                     libraries=['cudnn'],
257                     lib_dirs=[config.dnn.library_path],
258                     compile_args=cargs,
259                     version=version(raises=False))
260
261
262class DnnVersion(Op):
263    __props__ = ()
264
265    def c_headers(self):
266        return ['cudnn.h']
267
268    def c_header_dirs(self):
269        return [config.dnn.include_path, config.cuda.include_path]
270
271    def c_libraries(self):
272        return ['cudnn']
273
274    def c_lib_dirs(self):
275        return [config.dnn.library_path]
276
277    def c_compile_args(self):
278        if config.dnn.bin_path and sys.platform != 'win32':
279            return ['-Wl,-rpath,' + config.dnn.bin_path]
280        return []
281
282    def c_support_code(self):
283        return """
284#if PY_MAJOR_VERSION >= 3
285#define PyInt_FromLong PyLong_FromLong
286#endif
287"""
288
289    def make_node(self):
290        return Apply(self, [], [Generic()()])
291
292    def c_code(self, node, name, inputs, outputs, sub):
293        o = outputs[0]
294        return """
295        %(o)s = PyTuple_Pack(2, PyInt_FromLong(CUDNN_VERSION), PyInt_FromLong(cudnnGetVersion()));
296        """ % locals()
297
298    def do_constant_folding(self, node):
299        # Needed as we do not want to cache this information.
300        return False
301
302    def c_code_cache_version(self):
303        # Not needed, but make it clear that we do not want to cache this.
304        return None
305
306
307def version(raises=True):
308    """Return the current cuDNN version we link with.
309
310    This also does a check that the header version matches the runtime version.
311
312    :raises: If True, raise an exception if cuDNN is not present.
313        Otherwise, return -1.
314
315    It always raise an RuntimeError if the header and library version
316    are not the same.
317
318    """
319    if not dnn_present():
320        if raises:
321            raise RuntimeError(
322                "We can't determine the cudnn version as it is not available",
323                dnn_available.msg)
324        else:
325            return -1
326
327    if version.v is None:
328        f = theano.function([], DnnVersion()(),
329                            theano.Mode(optimizer=None),
330                            profile=False)
331        v = f()
332        if v[0] != v[1]:
333            raise RuntimeError("Mixed dnn version. The header is version %s "
334                               "while the library is version %s." % v)
335        version.v = v[1]
336    return version.v
337version.v = None
338
339handle_type = CUDNNDataType('cudnnHandle_t', 'cudnnDestroy')
340
341# Get cuDNN definitions to be used.
342cudnn = cudnn_defs.get_definitions(version(raises=False))
343
344
345def get_precision(precision, inputs, for_grad=False):
346    common_dtype = theano.scalar.upcast(*[i.dtype for i in inputs])
347    if not common_dtype.startswith('float'):
348        raise TypeError("cuDNN convolution only works on real numbers")
349
350    if precision is None:
351        precision = theano.config.dnn.conv.precision
352    if precision == 'as_input' or precision == 'as_input_f32':
353        if common_dtype == 'float16' and precision == 'as_input_f32':
354            precision = 'float32'
355        else:
356            precision = common_dtype
357    if for_grad and precision == 'float16':
358        raise TypeError("Float16 precision is disabled for cuDNN backward convolutions due to computation errors.")
359    return precision, common_dtype
360
361
362class DnnBase(COp):
363
364    """
365    Creates a handle for cudnn and pulls in the cudnn libraries and headers.
366
367    """
368    # dnn does not know about broadcasting, so we do not need to assert
369    # the input broadcasting pattern.
370    check_broadcast = False
371    params_type = handle_type
372
373    def dnn_context(self, node):
374        return node.outputs[0].type.context_name
375
376    def get_params(self, node):
377        ctx_name = self.dnn_context(node)
378        ctx = get_context(ctx_name)
379        if not hasattr(ctx, 'cudnn_handle_param'):
380            ptr = ctx.cudnn_handle.value
381            res = handle_type.make_value(ptr)
382            ctx.cudnn_handle_param = res
383        if isinstance(self.params_type, ParamsType):
384            if not self.params_type.has_type(handle_type):
385                raise TypeError('DnnBase: params_type must take into account the cuDNN handle type.')
386            handle_field = self.params_type.get_field(handle_type)
387            return self.params_type.get_params(self, **{handle_field: ctx.cudnn_handle_param})
388        return ctx.cudnn_handle_param
389
390    def __init__(self, files=None, c_func=None):
391        if files is None:
392            files = []
393        COp.__init__(self, ["c_code/dnn_base.c"] + files, c_func)
394
395    def c_headers(self):
396        return ['gpuarray/types.h', 'gpuarray/array.h', 'gpuarray/kernel.h',
397                'gpuarray/util.h', 'gpuarray/ext_cuda.h', 'gpuarray_api.h',
398                'numpy_compat.h', 'cudnn.h', 'cudnn_helper.h',
399                'gpuarray_helper.h']
400
401    def c_header_dirs(self):
402        return [gpuarray_helper_inc_dir(), pygpu.get_include(),
403                config.dnn.include_path, config.cuda.include_path]
404
405    def c_libraries(self):
406        return ['cudnn', 'gpuarray']
407
408    def c_lib_dirs(self):
409        return [config.dnn.library_path]
410
411    def c_compile_args(self):
412        if config.dnn.bin_path and sys.platform != 'win32':
413            return ['-Wl,-rpath,' + config.dnn.bin_path]
414        return []
415
416    def c_code_cache_version(self):
417        return (super(DnnBase, self).c_code_cache_version(), version(), 4)
418
419
420class GpuDnnConvDesc(COp):
421
422    """
423    This Op builds a convolution descriptor for use in the other convolution
424    operations.
425
426    See the doc of :func:`dnn_conv` for a description of the parameters
427
428    """
429
430    __props__ = ('border_mode', 'subsample', 'dilation', 'conv_mode',
431                 'precision', 'num_groups')
432    params_type = ParamsType(pad0=int_t, pad1=int_t, pad2=int_t,
433                             sub0=int_t, sub1=int_t, sub2=int_t,
434                             dil0=int_t, dil1=int_t, dil2=int_t,
435                             nb_dims=int_t,
436                             bmode=EnumList(('BORDER_MODE_FULL', 'full'),
437                                            ('BORDER_MODE_VALID', 'valid'),
438                                            ('BORDER_MODE_HALF', 'half')),
439                             conv_mode=cudnn.cudnnConvolutionMode_t,
440                             precision=cudnn.cudnnDataType_t,
441                             num_groups=int_t)
442
443    def c_headers(self):
444        return ['cudnn.h', 'cudnn_helper.h']
445
446    def c_header_dirs(self):
447        return [gpuarray_helper_inc_dir(), config.dnn.include_path,
448                config.cuda.include_path]
449
450    def c_libraries(self):
451        return ['cudnn']
452
453    def c_lib_dirs(self):
454        return [config.dnn.library_path]
455
456    def c_compile_args(self):
457        if config.dnn.bin_path and sys.platform != 'win32':
458            return ['-Wl,-rpath,' + config.dnn.bin_path]
459        return []
460
461    def do_constant_folding(self, node):
462        return False
463
464    def __init__(self, border_mode, subsample=(1, 1), dilation=(1, 1), conv_mode='conv',
465                 precision="float32", num_groups=1):
466        COp.__init__(self, ["c_code/conv_desc.c"], "APPLY_SPECIFIC(conv_desc)")
467
468        if version() < 6000 and any([d != 1 for d in dilation]):
469            raise RuntimeError("Dilation > 1 not supported for cuDNN version < 6.")
470
471        if isinstance(border_mode, integer_types):
472            border_mode = (border_mode,) * len(subsample)
473        if isinstance(border_mode, tuple):
474            assert len(border_mode) == len(subsample)
475            border_mode = tuple(map(int, border_mode))
476        if not ((isinstance(border_mode, tuple) and min(border_mode) >= 0) or
477                border_mode in ('valid', 'full', 'half')):
478            raise ValueError(
479                'invalid border_mode {}, which must be either '
480                '"valid", "full", "half", an integer or a pair of'
481                ' integers'.format(border_mode))
482        self.border_mode = border_mode
483        assert len(subsample) in (2, 3)
484        self.subsample = subsample
485        assert cudnn.cudnnConvolutionMode_t.has_alias(conv_mode)
486        self.conv_mode = conv_mode
487        self.num_groups = num_groups
488
489        assert len(dilation) == len(subsample)
490        self.dilation = dilation
491
492        assert cudnn.cudnnDataType_t.has_alias(precision)
493        self.precision = precision
494
495    def make_node(self, kern_shape):
496        kern_shape = as_tensor_variable(kern_shape)
497        if kern_shape.type.ndim != 1 or kern_shape.dtype not in theano.tensor.basic.int_dtypes:
498            raise TypeError('kern must be an int64 1D shape tensor')
499        kern_shape = theano.tensor.basic.cast(kern_shape, 'int64')
500
501        node = Apply(self, [kern_shape],
502                     [CUDNNDataType("cudnnConvolutionDescriptor_t",
503                                    freefunc="cudnnDestroyConvolutionDescriptor")()])
504        # DebugMode cannot compare the values of CDataType variables, so by
505        # default it returns False all the time. To prevent DebugMode from
506        # complaining because of the MergeOptimizer, we make this variable
507        # always compare to True.
508        out = node.outputs[0]
509        out.tag.values_eq_approx = tensor.type.values_eq_approx_always_true
510        return node
511
512    bmode = property(lambda self: 'valid' if isinstance(self.border_mode, tuple) else self.border_mode)
513    pad0 = property(lambda self: self.border_mode[0] if isinstance(self.border_mode, tuple) else 0)
514    pad1 = property(lambda self: self.border_mode[1] if isinstance(self.border_mode, tuple) else 0)
515    pad2 = property(lambda self: self.border_mode[2] if (isinstance(self.border_mode, tuple) and
516                                                         len(self.border_mode) > 2) else 0)
517    sub0 = property(lambda self: self.subsample[0])
518    sub1 = property(lambda self: self.subsample[1])
519    sub2 = property(lambda self: self.subsample[2] if len(self.subsample) > 2 else 0)
520    dil0 = property(lambda self: self.dilation[0])
521    dil1 = property(lambda self: self.dilation[1])
522    dil2 = property(lambda self: self.dilation[2] if len(self.dilation) > 2 else 0)
523    nb_dims = property(lambda self: len(self.subsample))
524
525    def c_code_cache_version(self):
526        return (super(GpuDnnConvDesc, self).c_code_cache_version(), version())
527
528    def __setstate__(self, d):
529        self.__dict__.update(d)
530        if not hasattr(self, "dilation"):
531            self.dilation = (1,) * len(self.subsample)
532        if not hasattr(self, "num_groups"):
533            self.num_groups = 1
534
535
536# scalar constants
537_zero = constant(np.asarray(0.0, dtype='float64'))
538_one = constant(np.asarray(1.0, dtype='float64'))
539
540
541def ensure_dt(val, default, name, dtype):
542    if dtype == 'float16':
543        dtype = 'float32'
544    if val is None:
545        val = default.clone()
546    if not isinstance(val, Variable):
547        val = constant(val)
548    if hasattr(val, 'ndim') and val.ndim == 0:
549        val = as_scalar(val)
550    if not isinstance(val.type, theano.scalar.Scalar):
551        raise TypeError("%s: expected a scalar value" % (name,))
552    if not val.type.dtype == dtype:
553        val = val.astype(dtype)
554    return val
555
556
557class GpuDnnConv(DnnBase):
558
559    """
560    The forward convolution.
561
562    Parameters
563    ----------
564    image
565    kernel
566    descr :
567        The convolution descriptor.
568    algo : {'small', 'none', 'large', 'fft', 'fft_tiling', 'winograd', 'guess_once',
569            'guess_on_shape_change', 'time_once', 'time_on_shape_change'}
570        Default is the value of :attr:`config.dnn.conv.algo_fwd`.
571    num_groups :
572        Divides the image, kernel and output tensors into num_groups
573        separate groups. Each which carry out convolutions separately
574
575    """
576    _f16_ok = True
577    __props__ = ('algo', 'inplace', 'num_groups')
578
579    check_input = False
580    params_type = ParamsType(conv_algo=cudnn.cudnnConvolutionFwdAlgo_t,
581                             choose_algo=bool_t, choose_once=bool_t, choose_time=bool_t,
582                             inplace=bool_t,
583                             handle=handle_type,
584                             num_groups=int_t)
585
586    def __init__(self, algo=None, inplace=False, num_groups=1):
587        DnnBase.__init__(self, ["c_code/dnn_conv_base.c", "c_code/dnn_fwd.c"],
588                         "APPLY_SPECIFIC(conv_fwd)")
589
590        if algo is None:
591            algo = config.dnn.conv.algo_fwd
592        self.algo = algo
593
594        self.inplace = bool(inplace)
595        if self.inplace:
596            self.destroy_map = {0: [2]}
597
598        assert cudnn.cudnnConvolutionFwdAlgo_t.has_alias(self.algo) or self.algo in SUPPORTED_DNN_CONV_ALGO_RUNTIME
599
600        self.conv_algo = cudnn.cudnnConvolutionFwdAlgo_t.CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM
601        if self.algo not in SUPPORTED_DNN_CONV_ALGO_RUNTIME:
602            self.conv_algo = self.algo
603        self.choose_algo = self.algo in SUPPORTED_DNN_CONV_ALGO_RUNTIME
604        self.choose_once = self.algo in DNN_CONV_ALGO_CHOOSE_ONCE
605        self.choose_time = self.algo in DNN_CONV_ALGO_CHOOSE_TIME
606        self.num_groups = num_groups
607
608    def __setstate__(self, d):
609        self.__dict__.update(d)
610        if not hasattr(self, 'algo'):
611            if hasattr(self, 'workmem'):
612                self.algo = self.workmem
613            else:
614                self.algo = config.dnn.conv.algo_fwd
615        if not hasattr(self, 'inplace'):
616            self.inplace = False
617        if not hasattr(self, 'num_groups'):
618            self.num_groups = 1
619
620    def make_node(self, img, kern, output, desc, alpha=None, beta=None):
621        ctx_name = infer_context_name(img, kern, output)
622        img = as_gpuarray_variable(img, ctx_name)
623        kern = as_gpuarray_variable(kern, ctx_name)
624        output = as_gpuarray_variable(output, ctx_name)
625
626        if img.type.ndim not in (4, 5):
627            raise TypeError('img must be 4D or 5D tensor')
628        if kern.type.ndim not in (4, 5):
629            raise TypeError('kern must be 4D or 5D tensor')
630        if output.type.ndim not in (4, 5):
631            raise TypeError('output must be a 4D or 5D tensor')
632
633        if (img.type.ndim != kern.type.ndim or
634                img.type.ndim != output.type.ndim):
635            raise TypeError("The number of dimensions of "
636                            "img, kern and output must match")
637
638        if img.type.ndim == 5 and self.algo not in (cudnn.conv3d_fwd_algorithms +
639                                                    SUPPORTED_DNN_CONV_ALGO_RUNTIME):
640            raise ValueError("convolution algo %s can't be used for "
641                             "3d convolutions", (self.algo,))
642
643        if (not isinstance(desc.type, CDataType) or
644                desc.type.ctype != 'cudnnConvolutionDescriptor_t'):
645            raise TypeError('desc must be cudnnConvolutionDescriptor_t')
646
647        alpha = ensure_dt(alpha, _one, 'alpha', img.dtype)
648        beta = ensure_dt(beta, _zero, 'beta', img.dtype)
649
650        return Apply(self, [img, kern, output, desc, alpha, beta],
651                     [output.type()])
652
653    def grad(self, inp, grads):
654        img, kerns, output, desc, alpha, beta = inp
655        top, = grads
656
657        top = gpu_contiguous(top)
658
659        d_img = GpuDnnConvGradI(num_groups=self.num_groups)(kerns, top, empty_like(img), desc)
660        d_kerns = GpuDnnConvGradW(num_groups=self.num_groups)(img, top, empty_like(kerns), desc)
661        d_alpha = grad_not_implemented(self, 4, alpha)
662        d_beta = grad_not_implemented(self, 5, beta)
663
664        return [d_img * alpha, d_kerns * alpha, top * beta,
665                DisconnectedType()(), d_alpha, d_beta]
666
667    def connection_pattern(self, node):
668        # not connected to desc
669        return [[1], [1], [1], [0], [1], [1]]
670
671    @staticmethod
672    def get_out_shape(ishape, kshape, border_mode, subsample, dilation):
673        """
674        This function computes the output shape for a convolution with
675        the specified parameters. `ishape` and `kshape` can be symbolic
676        or scalar.
677
678        """
679
680        # if ishape and/or kshape are not tuples or list, but rather symbolic
681        # vectors, turn them into lists of symbolic scalars.
682        if not isinstance(ishape, (list, tuple)):
683            ishape = [ishape[i] for i in range(len(subsample) + 2)]
684        if not isinstance(kshape, (list, tuple)):
685            kshape = [kshape[i] for i in range(len(subsample) + 2)]
686
687        return get_conv_output_shape(
688            ishape,
689            kshape,
690            border_mode,
691            subsample,
692            dilation)
693
694    def infer_shape(self, node, shape):
695        return [shape[2]]
696
697
698class GpuDnnConvGradW(DnnBase):
699
700    """
701    The convolution gradient with respect to the weights.
702
703    Parameters
704    ----------
705    image
706    kernel
707    descr :
708        The convolution descriptor.
709    algo : {'none', 'deterministic', 'fft', 'small', 'guess_once',
710            'guess_on_shape_change', 'time_once', 'time_on_shape_change'}
711        Default is the value of :attr:`config.dnn.conv.algo_bwd_filter`.
712    num_groups :
713        Divides the image, kernel and output tensors into num_groups
714        separate groups. Each which carry out convolutions separately
715
716    """
717    _f16_ok = True
718    __props__ = ('algo', 'inplace', 'num_groups')
719
720    check_input = False
721    params_type = ParamsType(conv_algo=cudnn.cudnnConvolutionBwdFilterAlgo_t,
722                             choose_algo=bool_t, choose_once=bool_t, choose_time=bool_t,
723                             inplace=bool_t,
724                             handle=handle_type,
725                             num_groups=int_t)
726
727    def __init__(self, inplace=False, algo=None, num_groups=1):
728        DnnBase.__init__(self, ["c_code/dnn_conv_base.c", "c_code/dnn_gw.c"],
729                         "APPLY_SPECIFIC(conv_gw)")
730        self.inplace = bool(inplace)
731        if self.inplace:
732            self.destroy_map = {0: [2]}
733        if algo is None:
734            algo = config.dnn.conv.algo_bwd_filter
735        self.algo = algo
736
737        assert cudnn.cudnnConvolutionBwdFilterAlgo_t.has_alias(self.algo) or self.algo in SUPPORTED_DNN_CONV_ALGO_RUNTIME
738
739        self.conv_algo = cudnn.cudnnConvolutionBwdFilterAlgo_t.CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0
740        if self.algo not in SUPPORTED_DNN_CONV_ALGO_RUNTIME:
741            self.conv_algo = self.algo
742        self.choose_algo = self.algo in SUPPORTED_DNN_CONV_ALGO_RUNTIME
743        self.choose_once = self.algo in DNN_CONV_ALGO_CHOOSE_ONCE
744        self.choose_time = self.algo in DNN_CONV_ALGO_CHOOSE_TIME
745        self.num_groups = num_groups
746
747    def __setstate__(self, d):
748        self.__dict__.update(d)
749        if not hasattr(self, 'inplace'):
750            self.inplace = False
751        if not hasattr(self, 'algo'):
752            self.algo = config.dnn.conv.algo_bwd_filter
753        if not hasattr(self, 'num_groups'):
754            self.num_groups = 1
755
756    def grad(self, inp, grads):
757        img, top, output, desc, alpha, beta = inp
758        kerns, = grads
759
760        kerns = gpu_contiguous(kerns)
761
762        d_img = GpuDnnConvGradI(num_groups=self.num_groups)(kerns, top, empty_like(img), desc)
763        d_top = GpuDnnConv(num_groups=self.num_groups)(img, kerns, empty_like(top), desc)
764        d_alpha = grad_not_implemented(self, 4, alpha)
765        d_beta = grad_not_implemented(self, 5, beta)
766
767        return (d_img * alpha, d_top * alpha, kerns * beta,
768                DisconnectedType()(), d_alpha, d_beta)
769
770    def connection_pattern(self, node):
771        # not connected to desc
772        return [[1], [1], [1], [0], [1], [1]]
773
774    def op_may_fail_with_subsample(self, img, desc):
775        return (version() < 6000 and
776                img.type.dtype == 'float32' and
777                img.type.ndim == 5 and
778                self.algo != 'none' and
779                desc.owner.op.subsample != (1, 1, 1))
780
781    def op_may_fail_with_beta(self, img, beta):
782        return (version() < 6000 and
783                img.type.dtype == 'float32' and
784                self.algo not in ('none', 'deterministic', 'fft', 'small') and
785                beta is not None and
786                theano.tensor.extract_constant(beta) != 1)
787
788    def make_node(self, img, topgrad, output, desc, alpha=None, beta=None):
789        if self.op_may_fail_with_subsample(img, desc):
790            warnings.warn('cuDNN backward filter operation for 3D convolutions may produce bad results '
791                          'with certain cuDNN algorithms depending on the compute capability of your GPU '
792                          'if subsample is not (1, 1, 1). If you encounter problems, consider '
793                          'setting the theano flag "dnn.conv.algo_bwd_filter" to "none".')
794        if self.op_may_fail_with_beta(img, beta):
795            warnings.warn('cuDNN backward filter operation for convolutions may produce bad results '
796                          'with certain cuDNN algorithms depending on the compute capability of your GPU '
797                          'if beta != 1. If you encounter problems, consider '
798                          'setting the theano flag "dnn.conv.algo_bwd_filter" to '
799                          '"none", "deterministic", "fft", or "small".')
800        ctx_name = infer_context_name(img, topgrad, output)
801        img = as_gpuarray_variable(img, ctx_name)
802        topgrad = as_gpuarray_variable(topgrad, ctx_name)
803        output = as_gpuarray_variable(output, ctx_name)
804        if img.type.ndim not in (4, 5):
805            raise TypeError('img must be 4D or 5D tensor')
806        if topgrad.type.ndim not in (4, 5):
807            raise TypeError('topgrad must be 4D or 5D tensor')
808        if output.type.ndim not in (4, 5):
809            raise TypeError('output must be 4D or 5D tensor')
810
811        if (img.type.ndim != topgrad.type.ndim or
812                img.type.ndim != output.type.ndim):
813            raise TypeError("The number of dimensions of "
814                            "img, topgrad and output must match")
815
816        if img.type.ndim == 5 and self.algo not in (cudnn.conv3d_bwd_filter_algorithms +
817                                                    SUPPORTED_DNN_CONV_ALGO_RUNTIME):
818            raise ValueError("convolution algo %s can't be used for "
819                             "3d convolutions", (self.algo,))
820
821        if (not isinstance(desc.type, CDataType) or
822                desc.type.ctype != 'cudnnConvolutionDescriptor_t'):
823            raise TypeError('desc must be cudnnConvolutionDescriptor_t')
824
825        alpha = ensure_dt(alpha, _one, 'alpha', img.dtype)
826        beta = ensure_dt(beta, _zero, 'beta', img.dtype)
827
828        return Apply(self, [img, topgrad, output, desc, alpha, beta],
829                     [output.type()])
830
831    def infer_shape(self, node, shape):
832        return [shape[2]]
833
834
835class GpuDnnConvGradI(DnnBase):
836    """
837    The convolution gradient with respect to the inputs.
838
839    Parameters
840    ----------
841    image
842    kernel
843    descr
844        The convolution descriptor.
845    algo : {'none', 'deterministic', 'fft', 'fft_tiling', 'winograd', 'guess_once',
846            'guess_on_shape_change', 'time_once', 'time_on_shape_change'}
847        Default is the value of :attr:`config.dnn.conv.algo_bwd_data`.
848    num_groups :
849        Divides the image, kernel and output tensors into num_groups
850        separate groups. Each which carry out convolutions separately
851
852    """
853    _f16_ok = True
854    __props__ = ('algo', 'inplace', 'num_groups')
855
856    check_input = False
857    params_type = ParamsType(conv_algo=cudnn.cudnnConvolutionBwdDataAlgo_t,
858                             choose_algo=bool_t, choose_once=bool_t, choose_time=bool_t,
859                             inplace=bool_t,
860                             handle=handle_type,
861                             num_groups=int_t)
862
863    def __init__(self, inplace=False, algo=None, num_groups=1):
864        DnnBase.__init__(self, ["c_code/dnn_conv_base.c", "c_code/dnn_gi.c"],
865                         "APPLY_SPECIFIC(conv_gi)")
866        self.inplace = bool(inplace)
867        if self.inplace:
868            self.destroy_map = {0: [2]}
869        if algo is None:
870            algo = config.dnn.conv.algo_bwd_data
871        self.algo = algo
872        assert cudnn.cudnnConvolutionBwdDataAlgo_t.has_alias(self.algo) or self.algo in SUPPORTED_DNN_CONV_ALGO_RUNTIME
873
874        self.conv_algo = cudnn.cudnnConvolutionBwdDataAlgo_t.CUDNN_CONVOLUTION_BWD_DATA_ALGO_0
875        if self.algo not in SUPPORTED_DNN_CONV_ALGO_RUNTIME:
876            self.conv_algo = self.algo
877        self.choose_algo = self.algo in SUPPORTED_DNN_CONV_ALGO_RUNTIME
878        self.choose_once = self.algo in DNN_CONV_ALGO_CHOOSE_ONCE
879        self.choose_time = self.algo in DNN_CONV_ALGO_CHOOSE_TIME
880        self.num_groups = num_groups
881
882    def __setstate__(self, d):
883        self.__dict__.update(d)
884        if not hasattr(self, 'algo'):
885            self.algo = config.dnn.conv.algo_bwd_data
886        if not hasattr(self, 'inplace'):
887            self.inplace = False
888        if not hasattr(self, 'num_groups'):
889            self.num_groups = 1
890
891    def grad(self, inp, grads):
892        kerns, top, output, desc, alpha, beta = inp
893        img, = grads
894
895        img = gpu_contiguous(img)
896
897        d_kerns = GpuDnnConvGradW(num_groups=self.num_groups)(img, top, empty_like(kerns), desc)
898        d_top = GpuDnnConv(num_groups=self.num_groups)(img, kerns, empty_like(top), desc)
899        d_alpha = grad_not_implemented(self, 4, alpha)
900        d_beta = grad_not_implemented(self, 5, beta)
901
902        return (d_kerns * alpha, d_top * alpha, img * beta,
903                DisconnectedType()(), d_alpha, d_beta)
904
905    def connection_pattern(self, node):
906        # not connected to desc
907        return [[1], [1], [1], [0], [1], [1]]
908
909    def make_node(self, kern, topgrad, output, desc, alpha=None, beta=None):
910        ctx_name = infer_context_name(kern, topgrad, output)
911        kern = as_gpuarray_variable(kern, ctx_name)
912        topgrad = as_gpuarray_variable(topgrad, ctx_name)
913        output = as_gpuarray_variable(output, ctx_name)
914        if kern.type.ndim not in (4, 5):
915            raise TypeError('kern must be 4D or 5D tensor')
916        if topgrad.type.ndim not in (4, 5):
917            raise TypeError('topgrad must be 4D or 5D tensor')
918        if output.type.ndim not in (4, 5):
919            raise TypeError('output must be 4D or 5D tensor')
920
921        if (kern.type.ndim != topgrad.type.ndim or
922                kern.type.ndim != output.type.ndim):
923            raise TypeError("The number of dimensions of "
924                            "kern, topgrad and output must match")
925
926        if kern.type.ndim == 5 and self.algo not in (cudnn.conv3d_bwd_data_algorithms +
927                                                     SUPPORTED_DNN_CONV_ALGO_RUNTIME):
928            raise ValueError("convolution algo %s can't be used for "
929                             "3d convolutions", (self.algo,))
930
931        if (not isinstance(desc.type, CDataType) or
932                desc.type.ctype != 'cudnnConvolutionDescriptor_t'):
933            raise TypeError('desc must be cudnnConvolutionDescriptor_t')
934
935        alpha = ensure_dt(alpha, _one, 'alpha', kern.dtype)
936        beta = ensure_dt(beta, _zero, 'beta', kern.dtype)
937
938        return Apply(self, [kern, topgrad, output, desc, alpha, beta],
939                     [output.type()])
940
941    def infer_shape(self, node, shape):
942        return [shape[2]]
943
944
945# These internal implementations for dnn_conv, dnn_gradweight and dnn_gradinput
946# support alpha, beta and out as parameters. Public interfaces follow without
947# underscore prefix.
948
949def _dnn_conv(img, kerns, alpha=1, beta=0, out=None, border_mode='valid', subsample=(1, 1), dilation=(1, 1),
950              conv_mode='conv', algo=None, precision=None, num_groups=1):
951    ctx_name = infer_context_name(img, kerns)
952
953    img = as_gpuarray_variable(img, ctx_name)
954    kerns = as_gpuarray_variable(kerns, ctx_name)
955
956    precision, dt = get_precision(precision, [img, kerns])
957
958    img = gpu_contiguous(img.astype(dt))
959    kerns = gpu_contiguous(kerns.astype(dt))
960
961    desc = GpuDnnConvDesc(border_mode=border_mode, subsample=subsample, dilation=dilation,
962                          conv_mode=conv_mode, precision=precision, num_groups=num_groups)(kerns.shape)
963    desc_op = desc.owner.op
964    # We can use Shape_i and bypass the infer_shape here as this is on
965    # the input of node and it will always be present.
966    ishape = [shape_i_op(i)(img) for i in range(img.ndim)]
967    kshape = [shape_i_op(i)(kerns) for i in range(kerns.ndim)]
968    out_shp = get_conv_output_shape(ishape, kshape, desc_op.border_mode, desc_op.subsample, filter_dilation=dilation)
969    out_shp = assert_conv_shape(out_shp)
970    if beta == 0:
971        real_out = GpuAllocEmpty(dtype=img.dtype, context_name=ctx_name)(*out_shp)
972    else:
973        assert out is not None
974        out = gpu_contiguous(as_gpuarray_variable(out, ctx_name))
975        check = Assert('GpuDnnConv: given output (for beta not null) does not have expected shape')
976        real_out = check(out, theano.tensor.all(theano.tensor.eq(out.shape, out_shp)))
977    return GpuDnnConv(algo=algo, num_groups=num_groups)(img, kerns, real_out, desc, alpha, beta)
978
979
980def _dnn_gradweight(img, topgrad, kerns_shp, alpha=1, beta=0, out=None, border_mode='valid', subsample=(1, 1),
981                    dilation=(1, 1), conv_mode='conv', algo=None, precision=None, num_groups=1):
982    ctx_name = infer_context_name(img, topgrad)
983
984    img = as_gpuarray_variable(img, ctx_name)
985    topgrad = as_gpuarray_variable(topgrad, ctx_name)
986    kerns_shp = theano.tensor.as_tensor_variable(kerns_shp)
987
988    precision, dt = get_precision(precision, [img, topgrad], for_grad=True)
989
990    img = gpu_contiguous(img.astype(dt))
991    topgrad = gpu_contiguous(topgrad.astype(dt))
992
993    desc = GpuDnnConvDesc(border_mode=border_mode, subsample=subsample, dilation=dilation,
994                          conv_mode=conv_mode, precision=precision, num_groups=num_groups)(kerns_shp)
995    if beta == 0:
996        real_out = GpuAllocEmpty(dtype=img.dtype, context_name=ctx_name)(*kerns_shp)
997    else:
998        assert out is not None
999        out = gpu_contiguous(as_gpuarray_variable(out, ctx_name))
1000        check = Assert('GpuDnnConvGradW: given output (for beta not null) does not have expected shape')
1001        real_out = check(out, theano.tensor.all(theano.tensor.eq(out.shape, kerns_shp)))
1002    return GpuDnnConvGradW(algo=algo, num_groups=num_groups)(img, topgrad, real_out, desc, alpha, beta)
1003
1004
1005def _dnn_gradinput(kerns, topgrad, img_shp, alpha=1, beta=0, out=None, border_mode='valid', subsample=(1, 1),
1006                   dilation=(1, 1), conv_mode='conv', algo=None, precision=None, num_groups=1):
1007    ctx_name = infer_context_name(kerns, topgrad)
1008
1009    kerns = as_gpuarray_variable(kerns, ctx_name)
1010    topgrad = as_gpuarray_variable(topgrad, ctx_name)
1011    img_shp = theano.tensor.as_tensor_variable(img_shp)
1012
1013    precision, dt = get_precision(precision, [kerns, topgrad], for_grad=True)
1014
1015    kerns = gpu_contiguous(kerns.astype(dt))
1016    topgrad = gpu_contiguous(topgrad.astype(dt))
1017
1018    desc = GpuDnnConvDesc(border_mode=border_mode, subsample=subsample, dilation=dilation,
1019                          conv_mode=conv_mode, precision=precision, num_groups=num_groups)(kerns.shape)
1020    if beta == 0:
1021        real_out = GpuAllocEmpty(dtype=kerns.dtype, context_name=ctx_name)(*img_shp)
1022    else:
1023        assert out is not None
1024        out = gpu_contiguous(as_gpuarray_variable(out, ctx_name))
1025        check = Assert('GpuDnnConvGradI: given output (for beta not null) does not have expected shape')
1026        real_out = check(out, theano.tensor.all(theano.tensor.eq(out.shape, img_shp)))
1027    return GpuDnnConvGradI(algo=algo, num_groups=num_groups)(kerns, topgrad, real_out, desc, alpha, beta)
1028
1029
1030def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1), dilation=(1, 1),
1031             conv_mode='conv', direction_hint=None, workmem=None,
1032             algo=None, precision=None, num_groups=1):
1033    """
1034    GPU convolution using cuDNN from NVIDIA.
1035
1036    The memory layout to use is 'bc01', that is 'batch', 'channel',
1037    'first dim', 'second dim' in that order.
1038
1039    Parameters
1040    ----------
1041    img
1042        Images to do the convolution over.
1043    kerns
1044        Convolution filters.
1045    border_mode
1046        One of 'valid', 'full', 'half'; additionally, the padding size
1047        could be directly specified by an integer or a pair of integers.
1048    subsample
1049        Perform subsampling of the output (default: (1, 1)).
1050    dilation
1051        Filter dilation factor. A dilation factor of d is equivalent to a
1052        convolution with d - 1 zeros inserted between neighboring filter
1053        values.
1054    conv_mode
1055        Perform convolution (kernels flipped) or cross-correlation.
1056        One of 'conv', 'cross' (default: 'conv').
1057    direction_hint
1058        Used by graph optimizers to change algorithm choice.
1059        By default, GpuDnnConv will be used to carry out the convolution.
1060        If border_mode is 'valid', subsample is (1, 1), dilation is (1, 1), and
1061        direction_hint is 'bprop weights', it will use GpuDnnConvGradW.
1062        If border_mode is 'full', subsample is (1, 1), dilation is (1, 1), and
1063        direction_hint is *not* 'forward!', it will use GpuDnnConvGradI.
1064        This parameter is used internally by graph optimizers and may be
1065        removed at any time without a deprecation period. You have been warned.
1066    algo : {'none', 'small', 'large', 'fft', 'guess_once', 'guess_on_shape_change', 'time_once', 'time_on_shape_change'}
1067        Convolution implementation to use. Some of its values may
1068        require certain versions of cuDNN to be installed. Default is
1069        the value of :attr:`config.dnn.conv.algo_fwd`.
1070    precision : {'as_input_f32', 'as_input', 'float16', 'float32', 'float64'}
1071        Description of the dtype in which the computation of the convolution
1072        should be done. Possible values are 'as_input', 'float16', 'float32'
1073        and 'float64'. Default is the value of
1074        :attr:`config.dnn.conv.precision`.
1075    num_groups :
1076        Divides the image, kernel and output tensors into num_groups
1077        separate groups. Each which carry out convolutions separately
1078
1079
1080    .. warning:: The cuDNN library only works with GPUs that have a compute
1081        capability of 3.0 or higher. This means that older GPUs will not
1082        work with this Op.
1083
1084    """
1085
1086    if workmem is not None:
1087        if algo is not None:
1088            raise ValueError("You can't use both algo and workmem")
1089        warnings.warn("workmem is deprecated, use algo instead", stacklevel=2)
1090        algo = workmem
1091    fgraph = getattr(img, 'fgraph', None) or getattr(kerns, 'fgraph', None)
1092    ctx_name = infer_context_name(img, kerns)
1093    if (border_mode == 'valid' and subsample == (1, 1) and dilation == (1, 1) and
1094            direction_hint == 'bprop weights' and num_groups == 1):
1095        # Special case: We are asked to use GpuDnnConvGradW. We need to set
1096        # up a suitable 'fake' convolution to compute the gradient for.
1097        img = gpu_contiguous(img.dimshuffle(1, 0, 2, 3))
1098        if conv_mode == 'conv':
1099            # We need to flip manually. These 'kerns' are not the kernels
1100            # that would be flipped by conv_mode='conv' in GpuDnnConvGradW.
1101            kerns = kerns[:, :, ::-1, ::-1]
1102        kerns = gpu_contiguous(kerns.dimshuffle(1, 0, 2, 3))
1103        out_shp = (shape_i(kerns, 1, fgraph),
1104                   shape_i(img, 1, fgraph),
1105                   shape_i(img, 2, fgraph) - shape_i(kerns, 2, fgraph) + 1,
1106                   shape_i(img, 3, fgraph) - shape_i(kerns, 3, fgraph) + 1)
1107        out_shp = assert_conv_shape(out_shp)
1108        out = GpuAllocEmpty(dtype=img.dtype, context_name=ctx_name)(*out_shp)
1109        precision, _ = get_precision(precision, [img, kerns], for_grad=True)
1110        desc = GpuDnnConvDesc(border_mode='valid', subsample=(1, 1), dilation=(1, 1),
1111                              num_groups=num_groups,
1112                              conv_mode='cross', precision=precision)(out.shape)
1113        conv = GpuDnnConvGradW(num_groups=num_groups)(img, kerns, out, desc)
1114        return as_gpuarray_variable(conv.dimshuffle(1, 0, 2, 3), ctx_name)
1115
1116    elif (border_mode == 'full' and subsample == (1, 1) and
1117          direction_hint != 'forward!' and num_groups == 1):
1118        # Special case: We can be faster by using GpuDnnConvGradI to compute
1119        # the full convolution as the backward pass of a valid convolution.
1120        # We just need to set up a suitable 'fake' valid convolution.
1121        img = gpu_contiguous(img)  # cudnn v2 rc3 need contiguous data
1122        kerns = gpu_contiguous(kerns.dimshuffle(1, 0, 2, 3))
1123        conv_mode = 'cross' if conv_mode == 'conv' else 'conv'
1124        out_shp = (shape_i(img, 0, fgraph),
1125                   shape_i(kerns, 1, fgraph),
1126                   shape_i(img, 2, fgraph) + (shape_i(kerns, 2, fgraph) - 1) * dilation[0],
1127                   shape_i(img, 3, fgraph) + (shape_i(kerns, 3, fgraph) - 1) * dilation[1])
1128        out_shp = assert_conv_shape(out_shp)
1129        out = GpuAllocEmpty(dtype=img.dtype, context_name=ctx_name)(*out_shp)
1130        precision, _ = get_precision(precision, [img, kerns], for_grad=True)
1131        desc = GpuDnnConvDesc(border_mode='valid', subsample=(1, 1), dilation=dilation,
1132                              num_groups=num_groups,
1133                              conv_mode=conv_mode, precision=precision)(kerns.shape)
1134        return GpuDnnConvGradI(num_groups=num_groups)(kerns, img, out, desc)
1135
1136    # Standard case: We use GpuDnnConv with suitable padding.
1137    return _dnn_conv(img, kerns, algo=algo, border_mode=border_mode, subsample=subsample, dilation=dilation,
1138                     conv_mode=conv_mode, precision=precision, num_groups=num_groups)
1139
1140
1141def dnn_conv3d(img, kerns, border_mode='valid', subsample=(1, 1, 1), dilation=(1, 1, 1),
1142               conv_mode='conv', direction_hint=None,
1143               algo=None, precision=None, num_groups=1):
1144    """
1145    GPU convolution using cuDNN from NVIDIA.
1146
1147    The memory layout to use is 'bc012', that is 'batch', 'channel',
1148    'first dim', 'second dim', 'third dim' in that order.
1149
1150    Parameters
1151    ----------
1152    img
1153        Images to do the convolution over.
1154    kerns
1155        Convolution filters.
1156    border_mode
1157        One of 'valid', 'full', 'half'; additionally, the padding size
1158        could be directly specified by an integer or a pair of integers.
1159    subsample
1160        Perform subsampling of the output (default: (1, 1, 1)).
1161    dilation
1162        Filter dilation factor. A dilation factor of d is equivalent to a
1163        convolution with d - 1 zeros inserted between neighboring filter
1164        values.
1165    conv_mode
1166        Perform convolution (kernels flipped) or cross-correlation.
1167        One of 'conv', 'cross' (default: 'conv').
1168    direction_hint
1169        Used by graph optimizers to change algorithm choice.
1170        By default, GpuDnnConv will be used to carry out the convolution.
1171        If border_mode is 'valid', subsample is (1, 1, 1), dilation is
1172        (1, 1, 1), and direction_hint is 'bprop weights', it will use
1173        GpuDnnConvGradW.
1174        If border_mode is 'full', subsample is (1, 1, 1), dilation is
1175        (1, 1, 1), and direction_hint is *not* 'forward!', it will use
1176        GpuDnnConvGradI.
1177        This parameter is used internally by graph optimizers and may be
1178        removed at any time without a deprecation period. You have been warned.
1179    algo : convolution implementation to use. Only 'none' is implemented
1180        for the conv3d. Default is the value of :attr:`config.dnn.conv.algo_fwd`.
1181    precision : {'as_input_f32', 'as_input', 'float16', 'float32', 'float64'}
1182        Description of the dtype in which the computation of the convolution
1183        should be done. Possible values are 'as_input', 'float16', 'float32'
1184        and 'float64'. Default is the value of
1185        :attr:`config.dnn.conv.precision`.
1186    num_groups :
1187        Divides the image, kernel and output tensors into num_groups
1188        separate groups. Each which carry out convolutions separately
1189
1190
1191    .. warning:: The cuDNN library only works with GPUs that have a compute
1192        capability of 3.0 or higher. This means that older GPUs will not
1193        work with this Op.
1194
1195    """
1196
1197    fgraph = getattr(img, 'fgraph', None) or getattr(kerns, 'fgraph', None)
1198    ctx_name = infer_context_name(img, kerns)
1199    if (border_mode == 'valid' and subsample == (1, 1, 1) and dilation == (1, 1, 1) and
1200            direction_hint == 'bprop weights' and num_groups == 1):
1201        # Special case: We are asked to use GpuDnnConvGradW. We need to set
1202        # up a suitable 'fake' convolution to compute the gradient for.
1203        img = gpu_contiguous(img.dimshuffle(1, 0, 2, 3, 4))
1204        if conv_mode == 'conv':
1205            # We need to flip manually. These 'kerns' are not the kernels
1206            # that would be flipped by conv_mode='conv' in GpuDnnConvGradW.
1207            kerns = kerns[:, :, ::-1, ::-1, ::-1]
1208        kerns = gpu_contiguous(kerns.dimshuffle(1, 0, 2, 3, 4))
1209        out_shp = (shape_i(kerns, 1, fgraph),
1210                   shape_i(img, 1, fgraph),
1211                   shape_i(img, 2, fgraph) - shape_i(kerns, 2, fgraph) + 1,
1212                   shape_i(img, 3, fgraph) - shape_i(kerns, 3, fgraph) + 1,
1213                   shape_i(img, 4, fgraph) - shape_i(kerns, 4, fgraph) + 1)
1214        out_shp = assert_conv_shape(out_shp)
1215        out = GpuAllocEmpty(dtype=img.dtype, context_name=ctx_name)(*out_shp)
1216        precision, _ = get_precision(precision, [img, kerns], for_grad=True)
1217        desc = GpuDnnConvDesc(border_mode='valid', subsample=(1, 1, 1), dilation=(1, 1, 1),
1218                              num_groups=num_groups,
1219                              conv_mode='cross', precision=precision)(out.shape)
1220        conv = GpuDnnConvGradW(num_groups=num_groups)(img, kerns, out, desc)
1221        return as_gpuarray_variable(conv.dimshuffle(1, 0, 2, 3, 4), ctx_name)
1222
1223    elif (border_mode == 'full' and subsample == (1, 1, 1) and
1224          direction_hint != 'forward!' and num_groups == 1):
1225        # Special case: We can be faster by using GpuDnnConvGradI to compute
1226        # the full convolution as the backward pass of a valid convolution.
1227        # We just need to set up a suitable 'fake' valid convolution.
1228        img = gpu_contiguous(img)  # cudnn v2 rc3 need contiguous data
1229        kerns = gpu_contiguous(kerns.dimshuffle(1, 0, 2, 3, 4))
1230        conv_mode = 'cross' if conv_mode == 'conv' else 'conv'
1231        out_shp = (shape_i(img, 0, fgraph),
1232                   shape_i(kerns, 1, fgraph),
1233                   shape_i(img, 2, fgraph) + (shape_i(kerns, 2, fgraph) - 1) * dilation[0],
1234                   shape_i(img, 3, fgraph) + (shape_i(kerns, 3, fgraph) - 1) * dilation[1],
1235                   shape_i(img, 4, fgraph) + (shape_i(kerns, 4, fgraph) - 1) * dilation[2])
1236        out_shp = assert_conv_shape(out_shp)
1237        out = GpuAllocEmpty(dtype=img.dtype, context_name=ctx_name)(*out_shp)
1238        precision, _ = get_precision(precision, [img, kerns], for_grad=True)
1239        desc = GpuDnnConvDesc(border_mode='valid', subsample=(1, 1, 1), dilation=dilation,
1240                              num_groups=num_groups,
1241                              conv_mode=conv_mode, precision=precision)(kerns.shape)
1242        return GpuDnnConvGradI(num_groups=num_groups)(kerns, img, out, desc)
1243
1244    # Standard case: We use GpuDnnConv with suitable padding.
1245    return _dnn_conv(img, kerns, algo=algo, border_mode=border_mode, subsample=subsample, dilation=dilation,
1246                     conv_mode=conv_mode, precision=precision, num_groups=num_groups)
1247
1248
1249def dnn_gradweight(img, topgrad, kerns_shp, border_mode='valid',
1250                   subsample=(1, 1), dilation=(1, 1), conv_mode='conv',
1251                   precision=None, algo=None, num_groups=1):
1252    """
1253    TODO: document this
1254    """
1255    return _dnn_gradweight(img, topgrad, kerns_shp, border_mode=border_mode, subsample=subsample, dilation=dilation,
1256                           conv_mode=conv_mode, algo=algo, precision=precision, num_groups=num_groups)
1257
1258
1259def dnn_gradweight3d(img, topgrad, kerns_shp, border_mode='valid',
1260                     subsample=(1, 1, 1), dilation=(1, 1, 1), conv_mode='conv',
1261                     precision=None, algo=None, num_groups=1):
1262    """
1263    3d version of dnn_gradweight
1264    """
1265    return dnn_gradweight(img, topgrad, kerns_shp, border_mode,
1266                          subsample, dilation, conv_mode, precision,
1267                          algo, num_groups)
1268
1269
1270def dnn_gradinput(kerns, topgrad, img_shp, border_mode='valid',
1271                  subsample=(1, 1), dilation=(1, 1), conv_mode='conv',
1272                  precision=None, algo=None, num_groups=1):
1273    """
1274    TODO: document this
1275    """
1276    return _dnn_gradinput(kerns, topgrad, img_shp, border_mode=border_mode, subsample=subsample, dilation=dilation,
1277                          conv_mode=conv_mode, algo=algo, precision=precision, num_groups=num_groups)
1278
1279
1280def dnn_gradinput3d(kerns, topgrad, img_shp, border_mode='valid',
1281                    subsample=(1, 1, 1), dilation=(1, 1, 1), conv_mode='conv',
1282                    precision=None, algo=None, num_groups=1):
1283    """
1284    3d version of `dnn_gradinput`.
1285    """
1286    return dnn_gradinput(kerns, topgrad, img_shp, border_mode, subsample,
1287                         dilation, conv_mode, precision, algo,
1288                         num_groups)
1289
1290
1291class GpuDnnPoolDesc(Op):
1292
1293    """
1294    This Op builds a pooling descriptor for use in the other
1295    pooling operations.
1296
1297    `ws`, `stride` and `pad` must have the same length.
1298
1299    Parameters
1300    ----------
1301    ws : tuple
1302        Window size.
1303    stride : tuple
1304        (dx, dy) or (dx, dy, dz).
1305    mode : {'max', 'average_inc_pad', 'average_exc_pad'}
1306        The old deprecated name 'average' corresponds to 'average_inc_pad'.
1307    pad : tuple
1308        (padX, padY) or (padX, padY, padZ)
1309
1310    Note
1311    ----
1312    Not used anymore. Only needed to reload old pickled files.
1313    """
1314
1315    __props__ = ('ws', 'stride', 'mode', 'pad')
1316
1317    def c_headers(self):
1318        return ['cudnn.h', 'cudnn_helper.h']
1319
1320    def c_header_dirs(self):
1321        return [gpuarray_helper_inc_dir(), config.dnn.include_path]
1322
1323    def c_libraries(self):
1324        return ['cudnn']
1325
1326    def c_lib_dirs(self):
1327        return [config.dnn.library_path]
1328
1329    def do_constant_folding(self, node):
1330        return False
1331
1332    def __init__(self, ws=(1, 1), stride=(1, 1), mode='max', pad=(0, 0)):
1333        if mode == 'average':
1334            mode = 'average_inc_pad'
1335        assert mode in ('max', 'average_inc_pad', 'average_exc_pad')
1336        self.mode = mode
1337
1338        assert len(ws) == len(stride) and len(stride) == len(pad)
1339        assert len(ws) in (2, 3)
1340        self.ws = ws
1341        self.stride = stride
1342        self.pad = pad
1343
1344    def get_ndim(self):
1345        return len(self.ws)
1346
1347    def __setstate__(self, d):
1348        self.__dict__.update(d)
1349        if not hasattr(self, 'pad'):
1350            self.pad = (0, 0)
1351
1352    def make_node(self):
1353        node = Apply(self, [],
1354                     [CUDNNDataType("cudnnPoolingDescriptor_t",
1355                                    freefunc="cudnnDestroyPoolingDescriptor")()])
1356        # DebugMode cannot compare the values of CDataType variables, so by
1357        # default it returns False all the time. To prevent DebugMode from
1358        # complaining because of the MergeOptimizer, we make this variable
1359        # always compare to True.
1360        out = node.outputs[0]
1361        out.tag.values_eq_approx = tensor.type.values_eq_approx_always_true
1362        return node
1363
1364    def c_code(self, node, name, inputs, outputs, sub):
1365        desc, = outputs
1366
1367        if self.mode == 'max':
1368            mode_flag = 'CUDNN_POOLING_MAX'
1369        elif self.mode == "average_inc_pad":
1370            mode_flag = 'CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING'
1371        elif self.mode == "average_exc_pad":
1372            mode_flag = 'CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING'
1373        else:
1374            raise NotImplementedError("Unsupported pooling model.")
1375
1376        return """
1377{
1378  cudnnStatus_t err;
1379
1380  if ((err = cudnnCreatePoolingDescriptor(&%(desc)s)) != CUDNN_STATUS_SUCCESS) {
1381    PyErr_Format(PyExc_MemoryError, "could not allocate pooling "
1382                 "descriptor: %%s", cudnnGetErrorString(err));
1383    %(fail)s
1384  }
1385
1386  static const int win[%(nd)d] = {%(win)s};
1387  static const int pad[%(nd)d] = {%(pad)s};
1388  static const int str[%(nd)d] = {%(str)s};
1389
1390    err = cudnnSetPoolingNdDescriptor(%(desc)s, %(mode_flag)s, CUDNN_PROPAGATE_NAN, %(nd)d, win, pad, str);
1391
1392  if (err != CUDNN_STATUS_SUCCESS) {
1393    PyErr_Format(PyExc_RuntimeError, "could not set op descriptor: %%s",
1394                 cudnnGetErrorString(err));
1395    %(fail)s
1396  }
1397}
1398""" % dict(name=name, desc=desc, mode_flag=mode_flag, fail=sub['fail'],
1399           nd=self.get_ndim(), win=', '.join(map(str, self.ws)),
1400           pad=', '.join(map(str, self.pad)),
1401           str=', '.join(map(str, self.stride)))
1402
1403    def c_code_cache_version(self):
1404        return (4, version())
1405
1406
1407class GpuDnnPoolBase(DnnBase):
1408
1409    """
1410    Abstract base class for GpuDnnPool and GpuDnnPoolGrad.
1411
1412    """
1413
1414    # c_file and c_function must be defined in sub-classes.
1415    c_file = None
1416    c_function = None
1417
1418    _f16_ok = True
1419    __props__ = ('mode',)
1420    check_input = False
1421    params_type = ParamsType(mode=cudnn.cudnnPoolingMode_t,
1422                             handle=handle_type)
1423
1424    def __init__(self, mode='max'):
1425        DnnBase.__init__(self, [self.c_file], self.c_function)
1426        if mode == 'average':
1427            mode = 'average_inc_pad'
1428        # Supported modes depend on runtime cuDNN version.
1429        assert cudnn.cudnnPoolingMode_t.has_alias(mode)
1430        self.mode = mode
1431
1432
1433class GpuDnnPool(GpuDnnPoolBase):
1434
1435    """
1436    Parameters
1437    ----------
1438    img : tensor
1439        The image 4d or 5d tensor.
1440    ws : tensor
1441        Window size.
1442    stride : tensor
1443        (dx, dy) or (dx, dy, dz).
1444    mode : {'max', 'average_inc_pad', 'average_exc_pad'}
1445        The old deprecated name 'average' corresponds to 'average_inc_pad'.
1446    pad : tensor
1447        (padX, padY) or (padX, padY, padZ)
1448
1449    """
1450    c_file = "c_code/dnn_pool.c"
1451    c_function = "APPLY_SPECIFIC(dnn_pool)"
1452
1453    def make_node(self, img, ws, stride, pad):
1454        ctx_name = infer_context_name(img)
1455        img = as_gpuarray_variable(img, ctx_name)
1456
1457        ws = tensor.as_tensor_variable(ws)
1458        stride = tensor.as_tensor_variable(stride)
1459        pad = tensor.as_tensor_variable(pad)
1460        assert ws.type.ndim == stride.type.ndim and ws.type.ndim == pad.type.ndim
1461        assert ws.type.ndim == 1
1462
1463        return Apply(self, [img, ws, stride, pad], [img.type()])
1464
1465    def infer_shape(self, node, shape):
1466        w = node.inputs[1]
1467        s = node.inputs[2]
1468        p = node.inputs[3]
1469
1470        res = [shape[0][0], shape[0][1],
1471               (shape[0][2] + 2 * p[0] - w[0]) // s[0] + 1,
1472               (shape[0][3] + 2 * p[1] - w[1]) // s[1] + 1
1473               ]
1474        if node.inputs[0].ndim == 5:
1475            res.append((shape[0][4] + 2 * p[2] - w[2]) // s[2] + 1)
1476        return [res]
1477
1478    def L_op(self, inp, outputs, grads):
1479        img, ws, stride, pad = inp
1480        grad, = grads
1481
1482        grad = gpu_contiguous(grad)
1483
1484        out, = outputs
1485
1486        g_out = GpuDnnPoolGrad(mode=self.mode)(img, out, grad, ws, stride, pad)
1487
1488        return g_out, theano.gradient.DisconnectedType()(), theano.gradient.DisconnectedType()(), theano.gradient.DisconnectedType()()
1489
1490    def connection_pattern(self, node):
1491        # not connected to parameters
1492        return [[1], [0], [0], [0]]
1493
1494
1495class GpuDnnPoolGrad(GpuDnnPoolBase):
1496
1497    """
1498    The pooling gradient.
1499
1500    Parameters
1501    ----------
1502    inp
1503        The input of the pooling.
1504    out
1505        The output of the pooling in the forward.
1506    out_grad
1507        Same size as out, but is the corresponding gradient information.
1508    ws : tensor variable
1509        Window size.
1510    stride : tensor variable
1511        (dx, dy) or (dx, dy, dz).
1512    mode : {'max', 'average_inc_pad', 'average_exc_pad'}
1513        The old deprecated name 'average' corresponds to 'average_inc_pad'.
1514    pad : tensor
1515        (padX, padY) or (padX, padY, padZ)
1516
1517    """
1518    c_file = "c_code/dnn_pool_grad.c"
1519    c_function = "APPLY_SPECIFIC(dnn_pool_grad)"
1520
1521    def make_node(self, inp, out, out_grad, ws, stride, pad):
1522        ctx_name = infer_context_name(inp, out, out_grad)
1523        inp = as_gpuarray_variable(inp, ctx_name)
1524        assert (inp.ndim in [4, 5])
1525        out_grad = as_gpuarray_variable(out_grad, ctx_name)
1526        assert (out_grad.ndim in [4, 5])
1527        out = as_gpuarray_variable(out, ctx_name)
1528        assert(out.ndim in [4, 5])
1529
1530        assert (out_grad.ndim == inp.ndim)
1531        assert (inp.ndim == out.ndim)
1532
1533        ws = tensor.as_tensor_variable(ws)
1534        stride = tensor.as_tensor_variable(stride)
1535        pad = tensor.as_tensor_variable(pad)
1536        assert ws.type.ndim == stride.type.ndim and ws.type.ndim == pad.type.ndim
1537        assert ws.type.ndim == 1
1538
1539        return Apply(self, [inp, out, out_grad, ws, stride, pad], [inp.type()])
1540
1541    def infer_shape(self, node, shape):
1542        return [shape[0]]
1543
1544
1545def dnn_pool(img, ws, stride=None, mode='max', pad=None):
1546    """
1547    GPU pooling using cuDNN from NVIDIA.
1548
1549    The memory layout to use is 'bc01', that is 'batch', 'channel',
1550    'first dim', 'second dim' in that order.
1551
1552    `ws`, `stride` and `pad` must have the same length.
1553
1554    Parameters
1555    ----------
1556    img
1557        Images to do the pooling over.
1558    ws : tuple
1559        Subsampling window size.  Should have 2 or 3 elements.
1560    stride : tuple
1561        Subsampling stride (default: (1, 1) or (1, 1, 1)).
1562    mode : {'max', 'average_inc_pad', 'average_exc_pad', 'sum', 'max_deterministic'}
1563        **NB**: 'max_deterministic' is supported since cuDNN v6.
1564    pad : tuple
1565        (padX, padY) or (padX, padY, padZ)
1566        default: (0, 0) or (0, 0, 0)
1567
1568
1569    .. warning:: The cuDNN library only works with GPU that have a compute
1570        capability of 3.0 or higher.  This means that older GPU will not
1571        work with this Op.
1572
1573    Notes
1574    -----
1575    This Op implements the ignore_border=True of max_pool_2d.
1576
1577    """
1578    img = gpu_contiguous(img)
1579    if stride is None:
1580        stride = (1,) * len(ws)
1581    if pad is None:
1582        pad = (0,) * len(ws)
1583    if mode == "sum":
1584        ret = GpuDnnPool(mode="average_inc_pad")(img, ws, stride, pad)
1585        context_name = ret.type.context_name
1586        window_elem = theano.tensor.prod(ws).astype(ret.dtype)
1587        return as_gpuarray_variable(ret * window_elem, context_name)
1588    return GpuDnnPool(mode=mode)(img, ws, stride, pad)
1589
1590
1591class GpuDnnSoftmaxBase(DnnBase):
1592
1593    """
1594    Op for the cuDNN Softmax.
1595
1596    Parameters
1597    ----------
1598    algo : {'fast', 'accurate', 'log'}
1599        Indicating whether, respectively, computations should be optimized for
1600        speed, for accuracy, or if cuDNN should rather compute the log-softmax instead.
1601    mode : {'instance', 'channel'}
1602        Indicating whether the softmax should be computed per image across 'c01'
1603        or per spatial location '01' per image across 'c'.
1604
1605    """
1606
1607    __props__ = ('mode', 'algo')
1608    # Neither inputs nor output types properties are used
1609    # neither in dnn_base.c nor in dnn_softmax*.c,
1610    # so we can disable input checking.
1611    check_input = False
1612    params_type = ParamsType(algo=cudnn.cudnnSoftmaxAlgorithm_t,
1613                             mode=cudnn.cudnnSoftmaxMode_t,
1614                             handle=handle_type)
1615
1616    def __init__(self, algo, mode):
1617        DnnBase.__init__(self, [self.file], self.c_func)
1618
1619        assert cudnn.cudnnSoftmaxAlgorithm_t.has_alias(algo)
1620        self.algo = algo
1621
1622        assert cudnn.cudnnSoftmaxMode_t.has_alias(mode)
1623        self.mode = mode
1624
1625    def infer_shape(self, node, shape):
1626        if self.direction == 'forward':
1627            return [shape[0]]
1628        else:
1629            return [shape[1]]
1630
1631
1632class GpuDnnSoftmax(GpuDnnSoftmaxBase):
1633
1634    """
1635    Op for the cuDNN Softmax.
1636
1637    algo : {'fast', 'accurate', 'log'}
1638        Indicating whether, respectively, computations should be optimized for
1639        speed, for accuracy, or if cuDNN should rather compute the log-softmax instead.
1640    mode : {'instance', 'channel'}
1641        Indicating whether the softmax should be computed per image across 'c01'
1642        or per spatial location '01' per image across 'c'.
1643
1644    """
1645    _f16_ok = True
1646    direction = "forward"
1647    file = "c_code/dnn_softmax.c"
1648    c_func = "APPLY_SPECIFIC(softmax)"
1649
1650    def make_node(self, x):
1651        x = as_gpuarray_variable(x, infer_context_name(x))
1652        assert x.ndim == 4
1653        return Apply(self, [x], [x.type()])
1654
1655    def L_op(self, inp, outputs, grads):
1656        x, = inp
1657        g_sm, = grads
1658        sm, = outputs
1659        return [GpuDnnSoftmaxGrad(
1660                self.algo,
1661                self.mode
1662                )(g_sm, sm)]
1663
1664
1665class GpuDnnSoftmaxGrad(GpuDnnSoftmaxBase):
1666
1667    """
1668    Op for the cuDNN SoftmaxGrad.
1669
1670    Parameters
1671    ----------
1672    algo
1673        'fast', 'accurate' or 'log' indicating whether, respectively,
1674        computations should be optimized for speed, for accuracy, or if cuDNN
1675        should rather compute the gradient of the log-softmax instead.
1676    mode
1677        'instance' or 'channel' indicating whether the softmax should
1678        be computed per image across 'c01' or per spatial location '01' per
1679        image across 'c'.
1680
1681    """
1682    _f16_ok = True
1683    direction = 'backward'
1684    file = "c_code/dnn_softmax_grad.c"
1685    c_func = "APPLY_SPECIFIC(softmax_grad)"
1686
1687    def make_node(self, dy, sm):
1688        ctx_name = infer_context_name(dy, sm)
1689        dy = as_gpuarray_variable(dy, ctx_name)
1690        sm = as_gpuarray_variable(sm, ctx_name)
1691        assert dy.ndim == 4
1692        assert sm.ndim == 4
1693        return Apply(self, [dy, sm], [sm.type()])
1694
1695
1696class GpuDnnReduction(DnnBase):
1697    check_input = False
1698    _f16_ok = True
1699    _cop_num_outputs = 2
1700
1701    __props__ = ('red_op', 'axis', 'acc_dtype', 'dtype', 'return_indices')
1702
1703    params_type = ParamsType(red_op=cudnn.cudnnReduceTensorOp_t,
1704                             acc_dtype=cudnn.cudnnDataType_t,
1705                             c_axis=uint32_t,
1706                             handle=handle_type)
1707
1708    def __init__(self, red_op, axis, acc_dtype, dtype, return_indices):
1709        DnnBase.__init__(self, ['c_code/dnn_redux.c'], 'APPLY_SPECIFIC(dnn_redux)')
1710        assert cudnn.cudnnReduceTensorOp_t.has_alias(red_op)
1711        self.red_op = red_op
1712        assert acc_dtype in ['float16', 'float32', 'float64']
1713        self.acc_dtype = acc_dtype
1714        assert dtype in ['float16', 'float32', 'float64']
1715        self.dtype = dtype
1716        # 8 is the current limit for cudnn
1717        if axis is not None:
1718            if len(axis) > 8:
1719                raise ValueError('Too many axes to reduce on')
1720            if any(a >= 8 for a in axis):
1721                raise ValueError('Axes larger than 8 not supported')
1722            axis = tuple(axis)
1723        # c_axis is a bitfield (1 to reduce)
1724        self.c_axis = self._convert_axis(axis)
1725        # axis is a list of axes to reduce on
1726        self.axis = axis
1727        if return_indices and (red_op != 'maximum' and red_op != 'minimum'):
1728            raise ValueError("Can't request indices for something other than"
1729                             " minimum or maximum")
1730        self.return_indices = return_indices
1731
1732    def _convert_axis(self, axis):
1733        if axis is None:
1734            return np.uint32(-1)
1735        else:
1736            return reduce(lambda a, b: a | b, map(lambda a: 1 << a, axis), 0)
1737
1738    def make_node(self, inp):
1739        ctx_name = infer_context_name(inp)
1740        inp = as_gpuarray_variable(inp, ctx_name)
1741        inp = gpu_contiguous(inp)
1742        if inp.ndim > 8:
1743            raise ValueError("cuDNN reduction doesn't support nd > 8")
1744        assert inp.dtype in ['float16', 'float32', 'float64']
1745
1746        # These restrictions where guessed from vague clues since
1747        # there is no actual documentation on this
1748        if inp.dtype == 'float64':
1749            assert self.acc_dtype == 'float64'
1750        if inp.dtype == 'float32':
1751            assert self.acc_dtype == 'float32'
1752        if inp.dtype == 'float16':
1753            assert self.acc_dtype != 'float64'
1754
1755        bcast = []
1756        for i in range(inp.ndim):
1757            if not (self.c_axis & (1 << i)):
1758                bcast.append(inp.broadcastable[i])
1759        outs = [inp.type.clone(dtype=self.dtype, broadcastable=bcast)()]
1760        if self.return_indices:
1761            outs.append(GpuArrayType(dtype='uint32', broadcastable=bcast,
1762                                     context_name=ctx_name)())
1763
1764        return Apply(self, [inp], outs)
1765
1766
1767class GpuDnnBatchNorm(DnnBase):
1768    """
1769    Base Op for cuDNN Batch Normalization.
1770
1771    Parameters
1772    ----------
1773    mode : {'per-activation', 'spatial'}
1774        Whether to normalize per activation (in this mode, bias and scale
1775        tensor dimensions are 1xCxHxW) or share normalization factors across
1776        spatial dimensions (in this mode, bias and scale tensor dimensions
1777        are 1xCx1x1).
1778    epsilon
1779        Epsilon value used in the batch normalization formula. Minimum allowed
1780        value is 1e-5 (imposed by cuDNN).
1781    running_average_factor : float
1782        Factor for updating the values or `running_mean` and `running_var`.
1783        If the factor is close to one, the running averages will update quickly,
1784        if the factor is close to zero it will update slowly.
1785    running_mean : tensor or None
1786        Previous value of the running mean. If this is given, the new value
1787        ``running_mean * (1 - r_a_factor) + batch mean * r_a_factor``
1788        will be returned as one of the outputs of this function.
1789        `running_mean` and `running_var` should either both be given or
1790        both be None.
1791    running_var : tensor or None
1792        Previous value of the running variance. If this is given, the new value
1793        ``running_var * (1 - r_a_factor) + (m / (m - 1)) * batch var * r_a_factor``
1794        will be returned as one of the outputs of this function,
1795        where `m` is the product of lengths of the averaged-over dimensions.
1796        `running_mean` and `running_var` should either both be given or
1797        both be None.
1798    """
1799
1800    __props__ = ('mode', 'running_averages', 'inplace_running_mean',
1801                 'inplace_running_var', 'inplace_output')
1802    _cop_num_inputs = 7
1803    _cop_num_outputs = 5
1804    check_input = False
1805    params_type = ParamsType(mode=cudnn.cudnnBatchNormMode_t,
1806                             inplace_output=bool_t,
1807                             inplace_running_mean=bool_t,
1808                             inplace_running_var=bool_t,
1809                             handle=handle_type)
1810
1811    def __init__(self, mode='per-activation', running_averages=False,
1812                 inplace_running_mean=False, inplace_running_var=False,
1813                 inplace_output=False):
1814        DnnBase.__init__(self, ['c_code/dnn_batchnorm_base.c', 'c_code/dnn_batchnorm.c'],
1815                         'dnn_batchnorm_op')
1816
1817        assert cudnn.cudnnBatchNormMode_t.has_alias(mode)
1818        self.mode = mode
1819        self.running_averages = running_averages
1820        self.inplace_output = inplace_output
1821        self.inplace_running_mean = inplace_running_mean
1822        self.inplace_running_var = inplace_running_var
1823        self.destroy_map = {}
1824        if self.inplace_output:
1825            self.destroy_map[0] = [0]
1826        if self.running_averages and self.inplace_running_mean:
1827            self.destroy_map[3] = [5]
1828        if self.running_averages and self.inplace_running_var:
1829            self.destroy_map[4] = [6]
1830
1831    def __setstate__(self, d):
1832        self.__dict__.update(d)
1833        if not hasattr(self, 'running_average_factor'):
1834            self.running_average_factor = 0
1835        if not hasattr(self, 'running_averages'):
1836            self.running_averages = False
1837        if not (hasattr(self, 'inplace_running_mean') and
1838                hasattr(self, 'inplace_running_var') and
1839                hasattr(self, 'inplace_output')):
1840            self.inplace_running_mean = False
1841            self.inplace_running_var = False
1842            self.inplace_output = False
1843            self.destroy_map = {}
1844
1845    def infer_shape(self, node, shape):
1846        return [shape[0]] + [shape[1]] * (len(node.outputs) - 1)
1847
1848    def make_node(self, x, scale, bias, epsilon=1e-4,
1849                  running_average_factor=0.1,
1850                  running_mean=None, running_var=None):
1851        assert x.ndim == scale.ndim == bias.ndim
1852        assert x.ndim in (4, 5)
1853        assert self.running_averages == (running_mean is not None) == (running_var is not None)
1854        assert (running_mean is None or running_mean.ndim == x.ndim)
1855        assert (running_var is None or running_var.ndim == x.ndim)
1856        ctx_name = infer_context_name(x, scale, bias)
1857        x = as_gpuarray_variable(x, ctx_name)
1858        scale = as_gpuarray_variable(scale, ctx_name)
1859        bias = as_gpuarray_variable(bias, ctx_name)
1860        epsilon = as_scalar(epsilon).astype('float64')
1861        running_average_factor = as_scalar(running_average_factor).astype('float64')
1862        inputs = [x, scale, bias, epsilon, running_average_factor]
1863        output_types = [x.type(), scale.type(), scale.type()]
1864        if running_mean is not None and running_var is not None:
1865            inputs.append(as_gpuarray_variable(running_mean, ctx_name))
1866            inputs.append(as_gpuarray_variable(running_var, ctx_name))
1867            output_types.append(scale.type())
1868            output_types.append(scale.type())
1869        return Apply(self, inputs, output_types)
1870
1871    def L_op(self, inputs, outputs, grads):
1872        x, scale, bias, epsilon, running_average_factor = inputs[:5]
1873        dy = grads[0]
1874        _, x_mean, x_invstd = outputs[:3]
1875        disconnected_outputs = [
1876            DisconnectedType()(),  # epsilon
1877            DisconnectedType()()]  # running_average_factor
1878        # Optional running_mean and running_var.
1879        for i in range(5, len(inputs)):
1880            disconnected_outputs.append(DisconnectedType()())
1881        return GpuDnnBatchNormGrad(self.mode)(
1882            x, dy, scale, x_mean, x_invstd, epsilon) + disconnected_outputs
1883
1884    def connection_pattern(self, node):
1885        # Specificy that epsilon and running_average_factor are not connected to outputs.
1886        patterns = [[True, True, True],     # x
1887                    [True, True, True],     # scale
1888                    [True, True, True],     # bias
1889                    [False, False, False],  # epsilon
1890                    [False, False, False]]  # running_average_factor
1891        # Optional running_mean and running_var are only
1892        # connected to their new values.
1893        for i in range(5, len(node.inputs)):
1894            patterns[0].append(True)
1895            for pattern in patterns[1:]:
1896                pattern.append(False)
1897            patterns.append([False] * (3 + i - 5) + [True])
1898        return patterns
1899
1900
1901class GpuDnnBatchNormInference(DnnBase):
1902    """
1903    Base Op for cuDNN Batch Normalization.
1904
1905    Parameters
1906    ----------
1907    mode : {'per-activation', 'spatial'}
1908        Whether to normalize per activation (in this mode, bias and scale
1909        tensor dimensions are 1xCxHxW) or share normalization factors across
1910        spatial dimensions (in this mode, bias and scale tensor dimensions
1911        are 1xCx1x1).
1912    epsilon
1913        Epsilon value used in the batch normalization formula. Minimum allowed
1914        value is 1e-5 (imposed by cuDNN).
1915    """
1916
1917    __props__ = ('mode', 'inplace')
1918
1919    check_input = False
1920    params_type = ParamsType(mode=cudnn.cudnnBatchNormMode_t,
1921                             inplace=bool_t,
1922                             handle=handle_type)
1923
1924    def __init__(self, mode='per-activation', inplace=False):
1925        DnnBase.__init__(self, ['c_code/dnn_batchnorm_base.c', 'c_code/dnn_batchnorm_inf.c'],
1926                         'dnn_batchnorm_op')
1927
1928        assert cudnn.cudnnBatchNormMode_t.has_alias(mode)
1929        self.mode = mode
1930        self.inplace = bool(inplace)
1931        if self.inplace:
1932            self.destroy_map = {0: [0]}
1933
1934    def __setstate__(self, d):
1935        self.__dict__.update(d)
1936        if not hasattr(self, 'inplace'):
1937            self.inplace = False
1938
1939    def infer_shape(self, node, shape):
1940        return [shape[0]]
1941
1942    def make_node(self, x, scale, bias, estimated_mean, estimated_variance, epsilon=1e-4):
1943        ctx_name = infer_context_name(x, scale, bias, estimated_mean,
1944                                      estimated_variance)
1945        x = as_gpuarray_variable(x, ctx_name)
1946        scale = as_gpuarray_variable(scale, ctx_name)
1947        bias = as_gpuarray_variable(bias, ctx_name)
1948        estimated_mean = as_gpuarray_variable(estimated_mean, ctx_name)
1949        estimated_variance = as_gpuarray_variable(estimated_variance, ctx_name)
1950        epsilon = as_scalar(epsilon).astype('float64')
1951        assert x.ndim == scale.ndim == bias.ndim == estimated_mean.ndim == estimated_variance.ndim
1952        assert x.ndim in (4, 5)
1953        return Apply(self, [x, scale, bias, estimated_mean, estimated_variance, epsilon], [x.type()])
1954
1955    def grad(self, inputs, grads):
1956        x, scale, bias, est_mean, est_var, epsilon = inputs
1957        dy = grads[0]
1958
1959        if self.mode == "per-activation":
1960            axes = (0,)
1961        elif self.mode == "spatial":
1962            axes = (0,) + tuple(range(2, x.ndim))
1963        scale, bias, est_mean, est_var = (theano.tensor.addbroadcast(t, *axes)
1964                                          for t in (scale, bias, est_mean, est_var))
1965
1966        # define helper expressions
1967        est_var_eps = est_var + epsilon
1968        est_std = theano.tensor.sqrt(est_var_eps)
1969        two = theano.tensor.constant(2.)
1970
1971        # define and return gradients
1972        dx = dy * (scale / est_std)
1973        dscale = (dy * (x - est_mean)).sum(axes, keepdims=True) / est_std
1974        dbias = dy.sum(axes, keepdims=True)
1975        dmean = -dy.sum(axes, keepdims=True) * (scale / est_std)
1976        dvar = -(dy * (x - est_mean)).sum(axes, keepdims=True) * (scale / (two * est_var_eps * est_std))
1977        return [dx, dscale, dbias, dmean, dvar, DisconnectedType()()]
1978
1979    def connection_pattern(self, node):
1980        # Specificy that epsilon is not connected to outputs.
1981        return [[True], [True], [True], [True], [True], [False]]
1982
1983
1984class GpuDnnBatchNormGrad(DnnBase):
1985    __props__ = ('mode',)
1986
1987    check_input = False
1988    params_type = ParamsType(mode=cudnn.cudnnBatchNormMode_t,
1989                             handle=handle_type)
1990
1991    def __init__(self, mode='per-activation'):
1992        DnnBase.__init__(self, ['c_code/dnn_batchnorm_base.c', 'c_code/dnn_batchnorm_grad.c'],
1993                         'dnn_batchnorm_grad')
1994
1995        assert cudnn.cudnnBatchNormMode_t.has_alias(mode)
1996        self.mode = mode
1997
1998    def make_node(self, x, dy, scale, x_mean, x_invstd, epsilon=1e-4):
1999        ctx_name = infer_context_name(x, dy, scale, x_mean, x_invstd)
2000        x = as_gpuarray_variable(x, ctx_name)
2001        dy = as_gpuarray_variable(dy, ctx_name)
2002        scale = as_gpuarray_variable(scale, ctx_name)
2003        x_mean = as_gpuarray_variable(x_mean, ctx_name)
2004        x_invstd = as_gpuarray_variable(x_invstd, ctx_name)
2005        epsilon = as_scalar(epsilon).astype('float64')
2006        assert x.ndim == dy.ndim == scale.ndim == x_mean.ndim == x_invstd.ndim
2007        assert x.ndim in (4, 5)
2008        return Apply(self, [x, dy, scale, x_mean, x_invstd, epsilon], [x.type(), scale.type(), scale.type()])
2009
2010    def infer_shape(self, node, shape):
2011        return [shape[0], shape[2], shape[2]]
2012
2013gpudata_type = CDataType('gpudata *', 'gpudata_release')
2014dropoutdesc_type = CUDNNDataType('cudnnDropoutDescriptor_t',
2015                                 'cudnnDestroyDropoutDescriptor')
2016
2017
2018class GpuDnnDropoutOp(DnnBase):
2019    __props__ = ('inplace',)
2020
2021    def __init__(self, inplace=False):
2022        DnnBase.__init__(self, ["c_code/dnn_dropout_fwd.c"], "dnn_dropout_fwd")
2023        self.inplace = inplace
2024        if self.inplace:
2025            self.destroy_map = {1: [2]}
2026
2027    def make_node(self, inp, descriptor, state):
2028        ctx_name = infer_context_name(inp)
2029        inp = as_gpuarray_variable(inp, ctx_name)
2030        return Apply(self, [inp, descriptor, state],
2031                     [inp.type(), state.type(), gpudata_type()])
2032
2033    def prepare_node(self, node, storage_map, compute_map, impl):
2034        assert self.inplace, "GpuDnnDropoutOp not inplace"
2035
2036
2037class _DropoutDescriptor(DnnBase):
2038    __props__ = ('context_name',)
2039
2040    def __init__(self, context_name):
2041        DnnBase.__init__(self, ["c_code/dnn_dropout_desc.c"], "dnn_dropout_desc")
2042        self.context_name = context_name
2043
2044    def dnn_context(self, node):
2045        return self.context_name
2046
2047    def do_constant_folding(self, node):
2048        return False
2049
2050    def make_node(self, dropout, seed, context_name):
2051        dropout = as_scalar(dropout).astype('float32')
2052        seed = as_scalar(seed).astype('uint64')
2053
2054        assert context_name == self.context_name
2055        # This is a dirty hack to pass the context because params is
2056        # occupied by the cudnn handle
2057        context = gpu_context_type.make_constant(get_context(context_name))
2058
2059        return Apply(self, [dropout, seed, context],
2060                     [dropoutdesc_type(),
2061                      GpuArrayType('uint8', (False,),
2062                                   context_name=context_name)()])
2063
2064    def c_code_cache_version_apply(self, node):
2065        # disable the cache since we can't pickle contexts
2066        return None
2067
2068
2069def _make_dropout_desc(dropout, seed, context_name):
2070    desc, states = theano.function(
2071        [],
2072        _DropoutDescriptor(context_name)(dropout, seed, context_name),
2073        theano.Mode(optimizer=None),
2074        profile=False)()
2075    return desc, states
2076
2077
2078def dropout(x, dropout=0.0, seed=4242):
2079    desc, states = _make_dropout_desc(dropout, seed, x.type.context_name)
2080    y, odesc = GpuDnnDropoutOp()(x, desc)
2081    return y, desc, odesc, states
2082
2083rnndesc_type = CUDNNDataType('cudnnRNNDescriptor_t',
2084                             'cudnnDestroyRNNDescriptor')
2085
2086
2087def as_i32(v):
2088    return as_scalar(v).astype('int32')
2089
2090
2091class _RNNDescriptor(DnnBase):
2092    __props__ = ('context_name',)
2093
2094    def __init__(self, context_name):
2095        if version() < 5005:
2096            raise RuntimeError("cudnn RNN require cudnn v5 final or higher.")
2097        DnnBase.__init__(self, ["c_code/dnn_rnn_desc.c"], "dnn_rnn_desc")
2098        self.context_name = context_name
2099
2100    def dnn_context(self, node):
2101        return self.context_name
2102
2103    def do_constant_folding(self, node):
2104        return False
2105
2106    def make_node(self, hidden_size, num_layers, ddesc, input_mode,
2107                  direction_mode, rnn_mode, dtype):
2108
2109        hidden_size = as_i32(hidden_size)
2110        num_layers = as_i32(num_layers)
2111
2112        if version() < 5005:
2113            raise RuntimeError("cudnn RNN require cudnn v5 final or higher.")
2114
2115        if input_mode == 'linear':
2116            input_mode = as_i32(0)
2117        elif input_mode == 'skip':
2118            input_mode = as_i32(1)
2119        else:
2120            raise ValueError("input_mode")
2121
2122        if direction_mode == 'unidirectional':
2123            direction_mode = as_i32(0)
2124        elif direction_mode == 'bidirectional':
2125            direction_mode = as_i32(1)
2126        else:
2127            raise ValueError("direction_mode")
2128
2129        if rnn_mode == 'rnn_relu':
2130            rnn_mode = as_i32(0)
2131        elif rnn_mode == 'rnn_tanh':
2132            rnn_mode = as_i32(1)
2133        elif rnn_mode == 'lstm':
2134            rnn_mode = as_i32(2)
2135        elif rnn_mode == 'gru':
2136            rnn_mode = as_i32(3)
2137        else:
2138            raise ValueError("rnn_mode")
2139
2140        dtype = as_i32(gpuarray.dtype_to_typecode(dtype))
2141
2142        return Apply(self, [hidden_size, num_layers,
2143                            dropoutdesc_type.make_constant(ddesc),
2144                            input_mode, direction_mode, rnn_mode, dtype],
2145                     [rnndesc_type()])
2146
2147
2148def _make_rnn_desc(hidden_size, num_layers, ddesc, rnn_mode,
2149                   input_mode, direction_mode, dtype, context_name):
2150    desc = theano.function(
2151        [],
2152        _RNNDescriptor(context_name)(hidden_size, num_layers, ddesc,
2153                                     input_mode, direction_mode,
2154                                     rnn_mode, dtype),
2155        theano.Mode(optimizer=None),
2156        profile=False)()
2157    return desc
2158
2159
2160class _RNNParamSize(DnnBase):
2161    __props__ = ('context_name',)
2162
2163    def __init__(self, context_name):
2164        DnnBase.__init__(self, ["c_code/dnn_rnn_paramsize.c"],
2165                         "dnn_rnn_paramsize")
2166        self.context_name = context_name
2167
2168    def dnn_context(self, node):
2169        return self.context_name
2170
2171    def do_constant_folding(self, node):
2172        return False
2173
2174    def make_node(self, desc, input_size, typecode):
2175        input_size = as_tensor_variable(input_size).astype('uint64')
2176        typecode = as_i32(typecode)
2177        return Apply(self, [rnndesc_type.make_constant(desc), input_size,
2178                            typecode],
2179                     [get_scalar_type('uint64')()])
2180
2181
2182def _get_param_size(desc, input_size, dtype, context_name):
2183    typecode = gpuarray.dtype_to_typecode(dtype)
2184    return theano.function(
2185        [],
2186        _RNNParamSize(context_name)(desc, input_size, typecode),
2187        theano.Mode(optimizer=None),
2188        profile=False)()
2189
2190
2191class _RNNSplitParams(DnnBase):
2192    __props__ = ('rnn_mode',)
2193
2194    def __init__(self, rnn_mode):
2195        DnnBase.__init__(self)
2196        self.rnn_mode = rnn_mode
2197
2198    def make_node(self, w, desc, layer, isize, typecode):
2199        w = as_gpuarray_variable(w, infer_context_name(w))
2200        assert w.ndim == 1
2201        layer = as_scalar(layer).astype('int32')
2202        isize = as_tensor_variable(isize).astype('uint64')
2203        assert isize.ndim == 1
2204        typecode = as_scalar(typecode).astype('int32')
2205        _1d = GpuArrayType(w.type.dtype, [False],
2206                           context_name=w.type.context_name)
2207        _2d = GpuArrayType(w.type.dtype, [False, False],
2208                           context_name=w.type.context_name)
2209        outputs = []
2210        if self.rnn_mode == 'rnn_relu' or self.rnn_mode == 'rnn_tanh':
2211            outputs.extend([_2d(), _1d()])  # input
2212            outputs.extend([_2d(), _1d()])  # recurrent
2213        elif self.rnn_mode == 'lstm':
2214            outputs.extend([_2d(), _1d()])  # input input
2215            outputs.extend([_2d(), _1d()])  # input forget
2216            outputs.extend([_2d(), _1d()])  # input newmem
2217            outputs.extend([_2d(), _1d()])  # input output
2218            outputs.extend([_2d(), _1d()])  # recur input
2219            outputs.extend([_2d(), _1d()])  # recur forget
2220            outputs.extend([_2d(), _1d()])  # recur newmem
2221            outputs.extend([_2d(), _1d()])  # recur output
2222        elif self.rnn_mode == 'gru':
2223            outputs.extend([_2d(), _1d()])  # input reset
2224            outputs.extend([_2d(), _1d()])  # input update
2225            outputs.extend([_2d(), _1d()])  # input newmem
2226            outputs.extend([_2d(), _1d()])  # recur reset
2227            outputs.extend([_2d(), _1d()])  # recur update
2228            outputs.extend([_2d(), _1d()])  # recur newmem
2229
2230        return Apply(self, [w, layer, rnndesc_type.make_constant(desc),
2231                            isize, typecode], outputs)
2232
2233    def c_code(self, node, name, inputs, outputs, sub):
2234        kw = dict(fail=sub['fail'], w=inputs[0], layer=inputs[1],
2235                  desc=inputs[2], isize=inputs[3], typecode=inputs[4],
2236                  handle=sub['params'])
2237        code = """
2238  cudnnTensorDescriptor_t xdesc;
2239  cudnnFilterDescriptor_t wdesc;
2240  cudnnFilterDescriptor_t odesc;
2241  size_t nshp[2];
2242  void *w;
2243  void *o;
2244  ptrdiff_t off;
2245#if CUDNN_VERSION < 7100
2246  size_t bshp;
2247#endif
2248  cudnnStatus_t err;
2249  cudnnDataType_t dt;
2250  cudnnTensorFormat_t tf;
2251  int nd;
2252  int dims[3];
2253  int strs[3];
2254
2255  if (PyArray_DIM(%(isize)s, 0) != 2) {
2256    PyErr_SetString(PyExc_ValueError, "input_size should be of length two");
2257    %(fail)s;
2258  }
2259
2260  switch (%(typecode)s) {
2261  case GA_FLOAT:
2262    dt = CUDNN_DATA_FLOAT;
2263    break;
2264  case GA_DOUBLE:
2265    dt = CUDNN_DATA_DOUBLE;
2266    break;
2267  case GA_HALF:
2268    dt = CUDNN_DATA_HALF;
2269    break;
2270  default:
2271    PyErr_SetString(PyExc_ValueError, "Unsupported data type");
2272    %(fail)s;
2273  }
2274
2275  err = cudnnCreateTensorDescriptor(&xdesc);
2276  if (err != CUDNN_STATUS_SUCCESS) {
2277    PyErr_SetString(PyExc_RuntimeError, "Could not create xdesc");
2278    %(fail)s;
2279  }
2280
2281  dims[0] = *(npy_uint64 *)PyArray_GETPTR1(%(isize)s, 0);
2282  dims[1] = *(npy_uint64 *)PyArray_GETPTR1(%(isize)s, 1);
2283  dims[2] = 1;
2284  strs[0] = dims[2] * dims[1];
2285  strs[1] = dims[2];
2286  strs[2] = 1;
2287
2288  err = cudnnSetTensorNdDescriptor(xdesc, dt, 3, dims, strs);
2289  if (err != CUDNN_STATUS_SUCCESS) {
2290    cudnnDestroyTensorDescriptor(xdesc);
2291    PyErr_Format(PyExc_RuntimeError, "Could not set xdesc: %%s",
2292                 cudnnGetErrorString(err));
2293    %(fail)s;
2294  }
2295
2296  if (c_make_filter(%(w)s, &wdesc)) {
2297    cudnnDestroyTensorDescriptor(xdesc);
2298    %(fail)s
2299  }
2300
2301  err = cudnnCreateFilterDescriptor(&odesc);
2302  if (err != CUDNN_STATUS_SUCCESS) {
2303    PyErr_SetString(PyExc_RuntimeError, "could not create odesc");
2304    cudnnDestroyTensorDescriptor(xdesc);
2305    cudnnDestroyFilterDescriptor(wdesc);
2306    %(fail)s
2307  }
2308
2309  w = PyGpuArray_DEV_DATA(%(w)s);
2310  nshp[0] = PyGpuArray_DIM(%(w)s, 0);
2311  nshp[1] = 1;
2312        """ % kw
2313
2314        def get_params(id, m, b):
2315            kw2 = kw.copy()
2316            kw2['id'] = id
2317            kw2['m'] = m
2318            kw2['b'] = b
2319            return """
2320  err = cudnnGetRNNLinLayerBiasParams(%(handle)s, %(desc)s, %(layer)s, xdesc, wdesc, w, %(id)s, odesc, &o);
2321  if (err != CUDNN_STATUS_SUCCESS) {
2322    cudnnDestroyTensorDescriptor(xdesc);
2323    cudnnDestroyFilterDescriptor(wdesc);
2324    cudnnDestroyFilterDescriptor(odesc);
2325    PyErr_SetString(PyExc_RuntimeError, "can't fetch bias for id %(id)s");
2326    %(fail)s
2327  }
2328  off = (intptr_t)o - (intptr_t)w;
2329  assert(off >= 0 && "bias");
2330
2331  err = cudnnGetFilterNdDescriptor(odesc, 3, &dt, &tf, &nd, dims);
2332  if (err != CUDNN_STATUS_SUCCESS) {
2333    cudnnDestroyTensorDescriptor(xdesc);
2334    cudnnDestroyFilterDescriptor(wdesc);
2335    cudnnDestroyFilterDescriptor(odesc);
2336    PyErr_SetString(PyExc_RuntimeError, "could not get bias shape for id %(id)s");
2337    %(fail)s;
2338  }
2339  // We assume that the typecode matches
2340#if CUDNN_VERSION < 7100
2341  assert(dims[2] == 1 && "bias");
2342  assert(dims[1] == 1 && "bias");
2343  %(b)s = pygpu_view(%(w)s, Py_None);
2344  %(b)s->ga.offset += off;
2345  %(b)s->ga.dimensions[0] = dims[0];
2346  bshp = dims[0];
2347#else
2348  assert(dims[0] == 1 && "bias");
2349  assert(dims[2] == 1 && "bias");
2350  %(b)s = pygpu_view(%(w)s, Py_None);
2351  %(b)s->ga.offset += off;
2352  %(b)s->ga.dimensions[0] = dims[1];
2353#endif
2354  GpuArray_fix_flags(&%(b)s->ga);
2355
2356  err = cudnnGetRNNLinLayerMatrixParams(%(handle)s, %(desc)s, %(layer)s, xdesc, wdesc, w, %(id)s, odesc, &o);
2357  if (err != CUDNN_STATUS_SUCCESS) {
2358    cudnnDestroyTensorDescriptor(xdesc);
2359    cudnnDestroyFilterDescriptor(wdesc);
2360    cudnnDestroyFilterDescriptor(odesc);
2361    PyErr_SetString(PyExc_RuntimeError, "can't fetch matrix for id %(id)s");
2362    %(fail)s
2363  }
2364  off = (intptr_t)o - (intptr_t)w;
2365  assert(off >= 0 && "matrix");
2366
2367  // This is 3d because of cudnn limitations.
2368  err = cudnnGetFilterNdDescriptor(odesc, 3, &dt, &tf, &nd, dims);
2369  if (err != CUDNN_STATUS_SUCCESS) {
2370    cudnnDestroyTensorDescriptor(xdesc);
2371    cudnnDestroyFilterDescriptor(wdesc);
2372    cudnnDestroyFilterDescriptor(odesc);
2373    PyErr_SetString(PyExc_RuntimeError, "could not get matrix shape for id %(id)s");
2374    %(fail)s;
2375  }
2376
2377#if CUDNN_VERSION < 7100
2378  assert(dims[1] == 1 && "matrix");
2379  assert(dims[2] == 1 && "matrix");
2380  // We assume that the typecode matches
2381  %(m)s = pygpu_reshape(%(w)s, 2, nshp, GA_F_ORDER, 1, -1);
2382  %(m)s->ga.offset += off;
2383  assert(dims[0] %% bshp == 0);
2384  %(m)s->ga.dimensions[0] = dims[0] / bshp;
2385  %(m)s->ga.dimensions[1] = bshp;
2386#else
2387  assert(dims[0] == 1 && "matrix");
2388  // We assume that the typecode matches
2389  %(m)s = pygpu_reshape(%(w)s, 2, nshp, GA_F_ORDER, 1, -1);
2390  %(m)s->ga.offset += off;
2391  %(m)s->ga.dimensions[1] = dims[1];
2392  %(m)s->ga.dimensions[0] = dims[2];
2393#endif
2394  %(m)s->ga.strides[1] = %(m)s->ga.dimensions[0] * gpuarray_get_elsize(%(m)s->ga.typecode);
2395  GpuArray_fix_flags(&%(m)s->ga);
2396            """ % kw2
2397
2398        for i in range(len(outputs) // 2):
2399            code += get_params(i, outputs[2 * i], outputs[(2 * i) + 1])
2400
2401        code += """
2402  cudnnDestroyTensorDescriptor(xdesc);
2403  cudnnDestroyFilterDescriptor(wdesc);
2404  cudnnDestroyFilterDescriptor(odesc);
2405        """
2406        return code
2407
2408    def c_code_cache_version(self):
2409        return (5, version())
2410
2411
2412def _split_rnn_params(w, desc, layer, input_size, dtype, rnn_mode):
2413    typecode = gpuarray.dtype_to_typecode(dtype)
2414    outs = _RNNSplitParams(rnn_mode)(w, desc, layer, input_size, typecode)
2415    outs = [theano.Out(o, borrow=True) for o in outs]
2416    return theano.function(
2417        [], outs,
2418        theano.Mode(optimizer=None),
2419        profile=False)()
2420
2421
2422class GpuDnnRNNOp(DnnBase):
2423    __props__ = ()
2424    _cop_num_inputs = 6
2425    _cop_num_outputs = 4
2426
2427    def __init__(self, rnn_mode, direction_mode):
2428        DnnBase.__init__(self, ["c_code/dnn_rnn_fwd.c"], 'dnn_rnn_fwd')
2429        self.rnn_mode = rnn_mode
2430        if direction_mode == 'bidirectional':
2431            self.num_dirs = 2
2432        elif direction_mode == 'unidirectional':
2433            self.num_dirs = 1
2434        else:
2435            raise ValueError('direction_mode is invalid (got %s)' % (direction_mode,))
2436
2437    def dnn_context(self, node):
2438        return node.outputs[1].type.context_name
2439
2440    def make_node(self, desc, w, x, hx, cx=None):
2441        if cx is None:
2442            context_name = infer_context_name(w, x, hx)
2443        else:
2444            context_name = infer_context_name(w, x, hx, cx)
2445
2446        w = as_gpuarray_variable(w, context_name)
2447        x = as_gpuarray_variable(x, context_name)
2448        hx = as_gpuarray_variable(hx, context_name)
2449        inputs = [desc, as_i32(self.num_dirs), w, x, hx]
2450        assert w.ndim == 1
2451        assert x.ndim == 3  # seqLength, minibatch, inputSize
2452        assert hx.ndim == 3  # numLayers, minibatch, hiddenSize * bidi
2453
2454        if self.rnn_mode == 'lstm':
2455            cx = as_gpuarray_variable(cx, context_name)
2456            assert cx.ndim == 3  # numLayers, minibatch, hiddenSize * bidi
2457            inputs.append(cx)
2458
2459        _3d = GpuArrayType(dtype=x.dtype, broadcastable=(False, False, False),
2460                           context_name=context_name)
2461        reserve = gpudata_type()
2462        y = _3d()  # seqLength, minibatch, hiddenSize * bidi
2463        hy = _3d()  # numLayers, miniBatch, hiddenSize * bidi
2464        outputs = [reserve, y, hy]
2465
2466        if self.rnn_mode == 'lstm':
2467            cy = _3d()  # numLayers, miniBatch, hiddenSize * bidi
2468            outputs.append(cy)
2469
2470        return Apply(self, inputs, outputs)
2471
2472    def L_op(self, inputs, outputs, output_grads):
2473        desc, numDirs, w, x, hx = inputs[:5]
2474        cx = inputs[5] if len(inputs) == 6 else None
2475        reserve, y, hy = outputs[:3]
2476        _, dy, dhy = output_grads[:3]
2477        dcy = output_grads[3] if len(output_grads) == 4 else None
2478        # Since the op return two outputs which contain essentially
2479        # the same information, the user will most likely only use one
2480        # of them. This leads to the situation that the other is
2481        # considered "disconnected" by theano in the gradient.
2482        # However we know that this isn't really the case so we fix it
2483        # here.
2484
2485        # If all the ys are disconnected, then you get a boring
2486        # gradient instead of an error.  But in that case you
2487        # shouldn't call this method anyway.
2488        if isinstance(dy.type, DisconnectedType):
2489            dy = as_gpuarray_variable(y.zeros_like(),
2490                                      context_name=y.type.context_name)
2491        if isinstance(dhy.type, DisconnectedType):
2492            dhy = None
2493        if dcy and isinstance(dcy.type, DisconnectedType):
2494            dcy = None
2495        dinputs = GpuDnnRNNGradInputs(rnn_mode=self.rnn_mode,
2496                                      grad_h=(dhy is not None),
2497                                      grad_c=(dcy is not None))(
2498            desc, x, y, dy, dhy, dcy, w, hx, cx, reserve, return_list=True)
2499        reserve2, dx, dhx = dinputs[:3]
2500        dw = GpuDnnRNNGradWeights()(
2501            desc, x, hx, y, reserve2, w)
2502        res = [DisconnectedType()(), DisconnectedType()(), dw, dx, dhx]
2503        if cx is not None:
2504            res.append(dinputs[3])  # dcx
2505        return res
2506
2507    def connection_pattern(self, node):
2508        deconn = [[False] * len(node.outputs)] * 2
2509        conn = [[True] * len(node.outputs)] * (len(node.inputs) - 2)
2510        return deconn + conn
2511
2512
2513class GpuDnnRNNGradInputs(DnnBase):
2514    __props__ = ('rnn_mode', 'grad_c', 'grad_h')
2515    _cop_num_inputs = 10
2516    _cop_num_outputs = 4
2517
2518    def __init__(self, rnn_mode, grad_h, grad_c):
2519        DnnBase.__init__(self, ['c_code/dnn_rnn_gi.c'], 'dnn_rnn_gi')
2520        self.rnn_mode = rnn_mode
2521        self.grad_h = grad_h
2522        self.grad_c = grad_c
2523        if self.grad_c:
2524            assert self.rnn_mode == 'lstm'
2525
2526    def dnn_context(self, node):
2527        return node.outputs[1].type.context_name
2528
2529    def make_node(self, desc, x, y, dy, dhy, dcy, w, hx, cx, reserve):
2530        # We trust the callers here
2531        xshp = as_scalar(x.shape[2]).astype('uint64')
2532        inputs = [desc, xshp, y, dy, w, hx, reserve]
2533        outputs = [reserve.type(), x.type(), hx.type()]
2534        if self.rnn_mode == 'lstm':
2535            inputs.append(cx)
2536            outputs.append(cx.type())
2537        if self.grad_h:
2538            inputs.append(dhy)
2539        if self.grad_c:
2540            inputs.append(dcy)
2541
2542        return Apply(self, inputs, outputs)
2543
2544    # We have special requirements so this is hooking into COp
2545    def format_c_function_args(self, inp, out):
2546        rinp = inp[:7]
2547        others = inp[7:]
2548        if self.rnn_mode == 'lstm':
2549            rinp.append(others.pop(0))
2550        else:
2551            rinp.append('NULL')
2552        if self.grad_h:
2553            rinp.append(others.pop(0))
2554        else:
2555            rinp.append('NULL')
2556        if self.grad_c:
2557            rinp.append(others.pop(0))
2558        else:
2559            rinp.append('NULL')
2560        assert len(others) == 0
2561        return COp.format_c_function_args(self, rinp, out)
2562
2563
2564class GpuDnnRNNGradWeights(DnnBase):
2565    __props__ = ()
2566
2567    def __init__(self):
2568        DnnBase.__init__(self, ['c_code/dnn_rnn_gw.c'], 'dnn_rnn_gw')
2569
2570    def make_node(self, desc, x, hx, y, reserve, w):
2571        # We trust the callers here
2572        wsize = as_scalar(w.shape[0]).astype('uint64')
2573        inputs = [desc, wsize, x, hx, y, reserve]
2574        outputs = [w.type()]
2575        return Apply(self, inputs, outputs)
2576
2577
2578class RNNBlock(object):
2579    """
2580    An object that allow us to use CuDNN RNN implementation.
2581    TODO: make an example how to use. You can check Theano tests
2582    test_dnn_rnn_gru() and test_dnn_rnn_lstm() in the file
2583    theano/gpuarray/tests/test_dnn.py for now.
2584
2585
2586    Parameters
2587    ----------
2588    dtype : data type of computation
2589    hidden_size : int
2590        hidden layer dimension.
2591    num_layers : int
2592        number of the recurrent layer you want to set.
2593    rnn_mode : {'rnn_relu', 'rnn_tanh', 'lstm', 'gru'}
2594        rnn_relu: A single-gate recurrent neural network with a ReLU activation function.
2595
2596        .. math::
2597
2598        h_t=ReLU(W_ix_t+U_ih_{t-1}+b_{wi}+b_{Ri})
2599        rnn_tanh: A single-gate recurrent neural network with a tanh activation function.
2600
2601        .. math::
2602
2603        h_t=tanh(W_ix_t+U_ih_{t-1}+b_{wi}+b_{Ri})
2604
2605        lstm: A four-gate Long Short-Term Memory network with no peephole connections.
2606        gru: A three-gate network consisting of Gated Recurrent Units.
2607    input_mode : {'linear', 'skip'}
2608        linear: input will be multiplied by a biased matrix
2609        skip: No operation is performed on the input.  The size must match the hidden size.
2610    direction_mode : {'unidirectional', 'bidirectional'}
2611        unidirectional: The network operates recurrently from the first input to the last.
2612        bidirectional: The network operates from first to last then from last to first and concatenates the results at each layer.
2613
2614    """
2615
2616    def __init__(self, dtype, hidden_size, num_layers, rnn_mode,
2617                 input_mode='linear', direction_mode='unidirectional',
2618                 context_name=None):
2619        # This is not supported for any value other than 0, so don't change it
2620        ddesc, states = _make_dropout_desc(0, 4242, context_name)
2621        self.ddesc = ddesc
2622        self.dstates = states
2623        self.desc = _make_rnn_desc(hidden_size, num_layers,
2624                                   ddesc, rnn_mode, input_mode,
2625                                   direction_mode, dtype, context_name)
2626        self.rnn_mode = rnn_mode
2627        self.direction_mode = direction_mode
2628        self.context_name = context_name
2629        self.dtype = dtype
2630
2631    def get_param_size(self, input_size):
2632        """
2633        Get the size of the shared variable for the parameters of the RNN.
2634
2635        This will return a size (in items) necessary to store all the
2636        parameters for the RNN.  You should allocate a variable of
2637        that size to store those parameters.  The order and layout of
2638        the parameters is opaque.
2639
2640        Parameters
2641        ----------
2642        input_size: (int, int)
2643            Size of the input blocks
2644
2645        """
2646        bytesize = _get_param_size(self.desc, input_size, self.dtype,
2647                                   self.context_name)
2648        bytesize = int(bytesize)
2649        assert bytesize % np.dtype(self.dtype).itemsize == 0
2650        return bytesize // np.dtype(self.dtype).itemsize
2651
2652    def split_params(self, w, layer, input_size):
2653        """
2654        Split the opaque parameter block into components.
2655
2656        Parameters
2657        ----------
2658        w: GpuArraySharedVariable
2659            opaque parameter block
2660        layer: int
2661            ID of the layer
2662        input_size: (int, int)
2663            Size of the input blocks
2664
2665        """
2666        if not isinstance(w, GpuArraySharedVariable):
2667            raise TypeError("split_params only works on gpuarray shared variables")
2668        return _split_rnn_params(w, self.desc, layer, input_size, self.dtype, self.rnn_mode)
2669
2670    def apply(self, w, x, hx, cx=None):
2671        """
2672        Apply the RNN to some data
2673
2674        Parameters
2675        ----------
2676        w:
2677            opaque parameter block
2678        x:
2679            input
2680        hx:
2681            initial hidden state
2682        cx:
2683            initial cell state (for LSTM)
2684        """
2685        # Don't return the reserve as an output
2686        return GpuDnnRNNOp(self.rnn_mode, self.direction_mode)(
2687            rnndesc_type.make_constant(self.desc),
2688            w, x, hx, cx, return_list=True)[1:]
2689
2690
2691def dnn_batch_normalization_train(inputs, gamma, beta, mode='per-activation',
2692                                  epsilon=1e-4, running_average_factor=0.1,
2693                                  running_mean=None, running_var=None):
2694    """
2695    Performs batch normalization of the given inputs, using the mean and
2696    variance of the inputs.
2697
2698    Parameters
2699    ----------
2700    mode : {'per-activation', 'spatial'}
2701        Whether to normalize per activation or share normalization factors
2702        across spatial dimensions (i.e., all dimensions past the second).
2703    gamma : tensor
2704        Learnable scale factors. Must match the dimensionality of `inputs`,
2705        but have sizes of `1` for all axes normalized over (i.e., in the first
2706        dimension for ``mode='per-activation'`, and additionally in all
2707        dimensions past the second for ``mode='spatial'``).
2708    beta : tensor
2709        Learnable biases. Must match the tensor layout of `gamma`.
2710    epsilon : float
2711        Epsilon value used in the batch normalization formula. Minimum allowed
2712        value is 1e-5 (imposed by cuDNN).
2713    running_average_factor : float
2714        Factor for updating the values or `running_mean` and `running_var`.
2715        If the factor is close to one, the running averages will update quickly,
2716        if the factor is close to zero it will update slowly.
2717    running_mean : tensor or None
2718        Previous value of the running mean. If this is given, the new value
2719        ``running_mean * (1 - r_a_factor) + batch mean * r_a_factor``
2720        will be returned as one of the outputs of this function.
2721        `running_mean` and `running_var` should either both be given or
2722        both be None.
2723    running_var : tensor or None
2724        Previous value of the running variance. If this is given, the new value
2725        ``running_var * (1 - r_a_factor) + (m / (m - 1)) * batch var * r_a_factor``
2726        will be returned as one of the outputs of this function,
2727        where `m` is the product of lengths of the averaged-over dimensions.
2728        `running_mean` and `running_var` should either both be given or
2729        both be None.
2730
2731    Returns
2732    -------
2733    out : tensor
2734        Batch-normalized inputs.
2735    mean : tensor
2736        Means of `inputs` across the normalization axes.
2737    invstd : tensor
2738        Inverse standard deviations of `inputs` across the normalization axes.
2739    new_running_mean : tensor
2740        New value of the running mean (only if both `running_mean` and
2741        `running_var` were given).
2742    new_running_var : tensor
2743        New value of the running variance (only if both `running_var` and
2744        `running_mean` were given).
2745
2746    Notes
2747    -----
2748    Requires cuDNN 5 and Theano 0.9dev2 or more recent.
2749
2750    For 4d tensors, returned values are equivalent to:
2751
2752    .. code-block:: python
2753
2754        axes = 0 if mode == 'per-activation' else (0, 2, 3)
2755        mean = inputs.mean(axes, keepdims=True)
2756        var = inputs.var(axes, keepdims=True)
2757        invstd = T.inv(T.sqrt(var + epsilon))
2758        out = (inputs - mean) * gamma * invstd + beta
2759
2760        m = T.cast(T.prod(inputs.shape) / T.prod(mean.shape), 'float32')
2761        running_mean = running_mean * (1 - running_average_factor) + \\
2762                       mean * running_average_factor
2763        running_var = running_var * (1 - running_average_factor) + \\
2764                      (m / (m - 1)) * var * running_average_factor
2765
2766    For 5d tensors, the axes are (0, 2, 3, 4).
2767    """
2768    ndim = inputs.ndim
2769    if gamma.ndim != ndim or beta.ndim != ndim:
2770        raise ValueError("gamma and beta must be of the same dimensionality "
2771                         "as inputs; got %d and %d instead of %d" %
2772                         (gamma.ndim, beta.ndim, ndim))
2773    if (running_mean is None) != (running_var is None):
2774        raise ValueError("running_mean and running_var must either both be "
2775                         "given or both be None")
2776    if running_mean is not None and running_mean.ndim != ndim:
2777        raise ValueError("running_mean must be of the same dimensionality "
2778                         "as inputs; got %d instead of %d" %
2779                         (running_mean.ndim, ndim))
2780    if running_var is not None and running_var.ndim != ndim:
2781        raise ValueError("running_var must be of the same dimensionality "
2782                         "as inputs; got %d instead of %d" %
2783                         (running_var.ndim, ndim))
2784    if epsilon < 1e-5:
2785        raise ValueError("epsilon must be at least 1e-5, got %f" % epsilon)
2786
2787    running_averages = (running_mean is not None and running_var is not None)
2788
2789    if ndim < 4:
2790        inputs = theano.tensor.shape_padright(inputs, 4 - ndim)
2791        gamma = theano.tensor.shape_padright(gamma, 4 - ndim)
2792        beta = theano.tensor.shape_padright(beta, 4 - ndim)
2793        if running_averages:
2794            running_mean = theano.tensor.shape_padright(running_mean, 4 - ndim)
2795            running_var = theano.tensor.shape_padright(running_var, 4 - ndim)
2796    elif ndim > 5:
2797        inputs_shape = inputs.shape
2798        params_shape = gamma.shape
2799        inputs = theano.tensor.flatten(inputs, 5)
2800        gamma = theano.tensor.flatten(gamma, 5)
2801        beta = theano.tensor.flatten(beta, 5)
2802        if running_averages:
2803            running_mean = theano.tensor.flatten(running_mean, 5)
2804            running_var = theano.tensor.flatten(running_var, 5)
2805
2806    batchnorm_op = GpuDnnBatchNorm(mode=mode, running_averages=running_averages)
2807    if running_averages:
2808        out, mean, invstd, new_running_mean, new_running_var = batchnorm_op(
2809            gpu_contiguous(inputs), gpu_contiguous(gamma),
2810            gpu_contiguous(beta), epsilon=epsilon,
2811            running_average_factor=running_average_factor,
2812            running_mean=gpu_contiguous(running_mean),
2813            running_var=gpu_contiguous(running_var))
2814        if new_running_mean.broadcastable != running_mean.broadcastable:
2815            new_running_mean = tensor.patternbroadcast(new_running_mean, running_mean.broadcastable)
2816        if new_running_var.broadcastable != running_var.broadcastable:
2817            new_running_var = tensor.patternbroadcast(new_running_var, running_var.broadcastable)
2818        result = (out, mean, invstd, new_running_mean, new_running_var)
2819    else:
2820        result = batchnorm_op(gpu_contiguous(inputs), gpu_contiguous(gamma),
2821                              gpu_contiguous(beta), epsilon=epsilon)
2822    if ndim < 4:
2823        result = tuple(theano.tensor.flatten(r, ndim) for r in result)
2824    elif ndim > 5:
2825        result = (theano.tensor.reshape(result[0], inputs_shape),) + tuple(
2826            theano.tensor.reshape(r, params_shape) for r in result[1:])
2827    return result
2828
2829
2830def dnn_batch_normalization_test(inputs, gamma, beta, mean, var,
2831                                 mode='per-activation', epsilon=1e-4):
2832    """
2833    Performs batch normalization of the given inputs, using the given mean and
2834    variance.
2835
2836    Parameters
2837    ----------
2838    mode : {'per-activation', 'spatial'}
2839        Whether to normalize per activation or share normalization factors
2840        across spatial dimensions (i.e., all dimensions past the second).
2841    gamma : tensor
2842        Scale factors. Must match the dimensionality of `inputs`, but have
2843        sizes of `1` for all axes normalized over (i.e., in the first dimension
2844        for ``mode='per-activation'`, and additionally in all dimensions past
2845        the second for ``mode='spatial'``).
2846    beta : tensor
2847        Biases. Must match the tensor layout of `gamma`.
2848    mean : tensor
2849        Means. Usually these are running averages computed during training.
2850        Must match the tensor layout of `gamma`.
2851    var : tensor
2852        Variances. Usually these are running averages computed during training.
2853        Must match the tensor layout of `gamma`.
2854    epsilon : float
2855        Epsilon value used in the batch normalization formula. Minimum allowed
2856        value is 1e-5 (imposed by cuDNN).
2857
2858    Returns
2859    -------
2860    out : tensor
2861        Batch-normalized inputs.
2862
2863    Notes
2864    -----
2865    Requires cuDNN 5 and Theano 0.9dev2 or more recent.
2866
2867    For 4d tensors, the returned value is equivalent to:
2868
2869    .. code-block:: python
2870
2871        axes = (0,) if mode == 'per-activation' else (0, 2, 3)
2872        gamma, beta, mean, var = (T.addbroadcast(t, *axes)
2873                                  for t in (gamma, beta, mean, var))
2874        out = (inputs - mean) * gamma / T.sqrt(var + epsilon) + beta
2875
2876    For 5d tensors, the axes would be (0, 2, 3, 4).
2877    """
2878    ndim = inputs.ndim
2879    if gamma.ndim != ndim or beta.ndim != ndim:
2880        raise ValueError("gamma and beta must be of the same dimensionality "
2881                         "as inputs; got %d and %d instead of %d" %
2882                         (gamma.ndim, beta.ndim, ndim))
2883    if mean.ndim != ndim or var.ndim != ndim:
2884        raise ValueError("mean and var must be of the same dimensionality "
2885                         "as inputs; got %d and %d instead of %d" %
2886                         (mean.ndim, var.ndim, ndim))
2887    if epsilon < 1e-5:
2888        raise ValueError("epsilon must be at least 1e-5, got %f" % epsilon)
2889
2890    if ndim < 4:
2891        inputs = theano.tensor.shape_padright(inputs, 4 - ndim)
2892        gamma = theano.tensor.shape_padright(gamma, 4 - ndim)
2893        beta = theano.tensor.shape_padright(beta, 4 - ndim)
2894        mean = theano.tensor.shape_padright(mean, 4 - ndim)
2895        var = theano.tensor.shape_padright(var, 4 - ndim)
2896    elif ndim > 5:
2897        inputs_shape = inputs.shape
2898        inputs = theano.tensor.flatten(inputs, 5)
2899        gamma = theano.tensor.flatten(gamma, 5)
2900        beta = theano.tensor.flatten(beta, 5)
2901        mean = theano.tensor.flatten(mean, 5)
2902        var = theano.tensor.flatten(var, 5)
2903    batchnorm_op = GpuDnnBatchNormInference(mode=mode)
2904    result = batchnorm_op(gpu_contiguous(inputs), gpu_contiguous(gamma),
2905                          gpu_contiguous(beta), gpu_contiguous(mean),
2906                          gpu_contiguous(var), epsilon=epsilon)
2907    if ndim < 4:
2908        result = theano.tensor.flatten(result, ndim)
2909    elif ndim > 5:
2910        result = theano.tensor.reshape(result, inputs_shape)
2911    return result
2912
2913
2914class GpuDnnTransformerGrid(DnnBase):
2915    """
2916    Grid generator Op for cuDNN Spatial Transformer.
2917    """
2918    __props__ = ()
2919    _cop_num_inputs = 2
2920    _cop_num_outputs = 1
2921    _f16_ok = True
2922    check_input = False
2923
2924    def __init__(self):
2925        DnnBase.__init__(self, ["c_code/dnn_sptf_grid.c"], "APPLY_SPECIFIC(dnn_sptf_grid)")
2926
2927    def make_node(self, theta, out_dims):
2928        """
2929        Create a grid generator node for a cuDNN Spatial Transformer
2930
2931        Parameters
2932        ----------
2933        theta : tensor
2934            Affine transformation tensor containing one affine transformation
2935            matrix per image. ``theta`` is usually generated by the localization
2936            network.
2937
2938        out_dims : tuple
2939            Dimensions of the transformed inputs, containing four elements, and is given
2940            by (N, C, H, W), where N is the number of inputs, C the number of channels,
2941            H and W are the height and width of each input.
2942        """
2943        context_name = infer_context_name(theta)
2944
2945        theta = gpu_contiguous(as_gpuarray_variable(theta, context_name))
2946        assert theta.dtype in ('float16', 'float32', 'float64')
2947        assert theta.ndim == 3
2948
2949        out_dims = cpu_contiguous(as_tensor_variable(out_dims))
2950        assert out_dims.dtype in theano.tensor.basic.integer_dtypes
2951        assert out_dims.ndim == 1
2952        # Ensure 64-bit ints are passed to the C code
2953        out_dims = theano.tensor.basic.cast(out_dims, 'int64')
2954        grid = GpuArrayType(dtype=theta.dtype,
2955                            broadcastable=(theta.type.ndim + 1) * (False,),
2956                            context_name=context_name)()
2957
2958        inputs = [theta, out_dims]
2959        outputs = [grid]
2960        return Apply(self, inputs, outputs)
2961
2962    def grad(self, inputs, grads):
2963        theta, out_dims = inputs
2964        dgrid = grads[0]
2965
2966        dtheta = GpuDnnTransformerGradT()(dgrid)
2967        return [dtheta, grad_not_implemented(self, 1, out_dims)]
2968
2969
2970class GpuDnnTransformerSampler(DnnBase):
2971    """
2972    Grid sampler Op for cuDNN Spatial Transformer.
2973    """
2974    __props__ = ()
2975    _cop_num_inputs = 2
2976    _cop_num_outputs = 1
2977    _f16_ok = True
2978    check_input = False
2979
2980    def __init__(self):
2981        DnnBase.__init__(self, ["c_code/dnn_sptf_sampler.c"], "APPLY_SPECIFIC(dnn_sptf_sampler)")
2982
2983    def make_node(self, img, grid):
2984        """
2985        Create a grid sampler node for a cuDNN Spatial Transformer
2986
2987        Parameters
2988        ----------
2989        img : tensor
2990            Images from which the pixels will be sampled. The implementation
2991            assumes the tensor is in NCHW format, where N is the number of images,
2992            C is the number of color channels, H is the height of the inputs, and
2993            W is width of the inputs.
2994
2995        grid : GpuDnnTransformerGrid
2996            Grid that contains the coordinates of the pixels to be sampled from
2997            the inputs images.
2998        """
2999        context_name = infer_context_name(img, grid)
3000
3001        img = gpu_contiguous(as_gpuarray_variable(img, context_name))
3002        if img.type.ndim != 4:
3003            raise TypeError('img must be a 4D tensor')
3004        elif img.dtype not in ('float16', 'float32', 'float64'):
3005            raise TypeError('img type must be floating-point')
3006
3007        grid = gpu_contiguous(as_gpuarray_variable(grid, context_name))
3008        if grid.type.ndim != 4:
3009            raise TypeError('grid must be a 4D tensor')
3010        elif grid.dtype not in ('float16', 'float32', 'float64'):
3011            raise TypeError('grid type must be floating-point')
3012
3013        out = GpuArrayType(dtype=img.dtype,
3014                           broadcastable=img.type.ndim * (False,),
3015                           context_name=context_name)()
3016
3017        inputs = [img, grid]
3018        outputs = [out]
3019        return Apply(self, inputs, outputs)
3020
3021    def grad(self, inputs, grads):
3022        img, grid = inputs
3023        dy = grads[0]
3024
3025        dimg, dgrid = GpuDnnTransformerGradI()(img, grid, dy)
3026        return [dimg, dgrid]
3027
3028
3029class GpuDnnTransformerGradI(DnnBase):
3030    """
3031    Gradient of inputs Op for cuDNN Spatial Transformer.
3032    """
3033    __props__ = ()
3034    _cop_num_inputs = 3
3035    _cop_num_outputs = 2
3036    _f16_ok = True
3037    check_input = False
3038
3039    def __init__(self):
3040        DnnBase.__init__(self, ["c_code/dnn_sptf_gi.c"], "APPLY_SPECIFIC(dnn_sptf_gi)")
3041
3042    def make_node(self, img, grid, dy):
3043        context_name = infer_context_name(img, grid, dy)
3044
3045        img = as_gpuarray_variable(gpu_contiguous(img), context_name)
3046        if img.ndim != 4:
3047            raise TypeError('img must have 4 dimensions.')
3048
3049        grid = as_gpuarray_variable(gpu_contiguous(grid), context_name)
3050        if img.ndim != grid.ndim:
3051            raise TypeError('grid should have the same number of dimensions as img')
3052
3053        dy = as_gpuarray_variable(dy, context_name)
3054        if dy.ndim != 4:
3055            raise TypeError('dy must have 4 dimensions.')
3056
3057        dimg = img.type()
3058        dgrid = grid.type()
3059
3060        inputs = [img, grid, dy]
3061        outputs = [dimg, dgrid]
3062
3063        return Apply(self, inputs, outputs)
3064
3065
3066class GpuDnnTransformerGradT(DnnBase):
3067    """
3068    Gradient of affine transformations Op for cuDNN Spatial Transformer.
3069    """
3070    __props__ = ()
3071    _cop_num_inputs = 1
3072    _cop_num_outputs = 1
3073    _f16_ok = True
3074    check_input = False
3075
3076    def __init__(self):
3077        DnnBase.__init__(self, ["c_code/dnn_sptf_gt.c"], "APPLY_SPECIFIC(dnn_sptf_gt)")
3078
3079    def make_node(self, dgrid):
3080        context_name = infer_context_name(dgrid)
3081
3082        dgrid = as_gpuarray_variable(dgrid, context_name)
3083        assert dgrid.dtype in ('float16', 'float32', 'float64')
3084        assert dgrid.ndim == 4
3085
3086        dtheta = GpuArrayType(dtype=dgrid.dtype,
3087                              broadcastable=(dgrid.type.ndim - 1) * (False,),
3088                              context_name=context_name)()
3089        inputs = [dgrid]
3090        outputs = [dtheta]
3091
3092        return Apply(self, inputs, outputs)
3093
3094
3095def dnn_spatialtf(img, theta, scale_width=1, scale_height=1):
3096    """
3097    GPU spatial transformer using cuDNN from NVIDIA.
3098
3099    Parameters
3100    ----------
3101    img : tensor
3102        Images to which the transformations will be applied. The implementation
3103        assumes the tensor is in NCHW format, where N is the number of images,
3104        C is the number of color channels, H is the height of the inputs, and
3105        W is width of the inputs.
3106    theta : tensor
3107        Affine transformation tensor containing one affine transformation
3108        matrix per image. ``theta`` is usually generated by the localization
3109        network.
3110    scale_height: float
3111        A float specifying the scaling factor for the height of the output
3112        image. A value of 1 will keep the original height of the input. Values
3113        larger than 1 will upsample the input. Values below 1 will downsample
3114        the input.
3115    scale_width: float
3116        A float specifying the scaling factor for the width of the output
3117        image. A value of 1 will keep the original width of the input. Values
3118        larger than 1 will upsample the input. Values below 1 will downsample
3119        the input.
3120
3121    Returns
3122    -------
3123    out : tensor
3124        Transformed images with width and height properly scaled.
3125
3126    Notes
3127    -----
3128    Currently, cuDNN only supports 2D transformations with 2x3 affine
3129    transformation matrices.
3130
3131    Bilinear interpolation is the only grid sampler method available.
3132    """
3133    out_dims = (img.shape[0], img.shape[1],
3134                theano.tensor.ceil(img.shape[2] * scale_height),
3135                theano.tensor.ceil(img.shape[3] * scale_width))
3136    out_dims = tuple([as_scalar(v).astype('int64') for v in out_dims])
3137    # Setup spatial transformer
3138    grid = GpuDnnTransformerGrid()(theta, out_dims)
3139    sampler = GpuDnnTransformerSampler()(img, grid)
3140    return sampler
3141
3142
3143def local_abstractconv_cudnn_graph(op, context_name, inputs, outputs):
3144    if (not isinstance(op, (AbstractConv2d,
3145                            AbstractConv2d_gradWeights,
3146                            AbstractConv2d_gradInputs))):
3147        return
3148
3149    if version(raises=False) < 6000 and op.filter_dilation != (1, 1):
3150        return None
3151
3152    if op.unshared:
3153        return None
3154
3155    if isinstance(op.border_mode, tuple) and any(isinstance(p, tuple) for p in op.border_mode):
3156        # Asymmetric padding not yet supported
3157        return None
3158
3159    inp1 = inputs[0]
3160    inp2 = inputs[1]
3161
3162    if not dnn_available(inp1.type.context_name):
3163        return
3164
3165    if op.filter_flip:
3166        conv_mode = 'conv'
3167    else:
3168        conv_mode = 'cross'
3169
3170    if isinstance(op, AbstractConv2d):
3171        rval = dnn_conv(inp1, inp2,
3172                        border_mode=op.border_mode,
3173                        subsample=op.subsample,
3174                        dilation=op.filter_dilation,
3175                        direction_hint='forward!',
3176                        conv_mode=conv_mode,
3177                        num_groups=op.num_groups)
3178    elif isinstance(op, AbstractConv2d_gradWeights):
3179        shape = (inp2.shape[1], inp1.shape[1] // op.num_groups,
3180                 inputs[2][0], inputs[2][1])
3181        rval = dnn_gradweight(inp1, inp2, shape,
3182                              border_mode=op.border_mode,
3183                              subsample=op.subsample,
3184                              dilation=op.filter_dilation,
3185                              conv_mode=conv_mode,
3186                              num_groups=op.num_groups)
3187    elif isinstance(op, AbstractConv2d_gradInputs):
3188        shape = (inp2.shape[0], inp1.shape[1] * op.num_groups,
3189                 inputs[2][0], inputs[2][1])
3190        rval = dnn_gradinput(inp1, inp2, shape,
3191                             border_mode=op.border_mode,
3192                             subsample=op.subsample,
3193                             dilation=op.filter_dilation,
3194                             conv_mode=conv_mode,
3195                             num_groups=op.num_groups)
3196    return [rval]
3197
3198
3199def local_abstractconv3d_cudnn_graph(op, context_name, inputs, outputs):
3200    if (not isinstance(op, (AbstractConv3d,
3201                            AbstractConv3d_gradWeights,
3202                            AbstractConv3d_gradInputs))):
3203        return
3204
3205    if version(raises=False) < 6000 and op.filter_dilation != (1, 1, 1):
3206        return None
3207
3208    inp1 = inputs[0]
3209    inp2 = inputs[1]
3210
3211    if not dnn_available(inp1.type.context_name):
3212        return
3213
3214    if op.filter_flip:
3215        conv_mode = 'conv'
3216    else:
3217        conv_mode = 'cross'
3218
3219    if isinstance(op, AbstractConv3d):
3220        rval = dnn_conv3d(inp1, inp2,
3221                          border_mode=op.border_mode,
3222                          subsample=op.subsample,
3223                          dilation=op.filter_dilation,
3224                          direction_hint='forward!',
3225                          conv_mode=conv_mode,
3226                          num_groups=op.num_groups)
3227    elif isinstance(op, AbstractConv3d_gradWeights):
3228        shape = (inp2.shape[1], inp1.shape[1] // op.num_groups,
3229                 inputs[2][0], inputs[2][1], inputs[2][2])
3230        rval = dnn_gradweight3d(inp1, inp2, shape,
3231                                border_mode=op.border_mode,
3232                                subsample=op.subsample,
3233                                dilation=op.filter_dilation,
3234                                conv_mode=conv_mode,
3235                                num_groups=op.num_groups)
3236    elif isinstance(op, AbstractConv3d_gradInputs):
3237        shape = (inp2.shape[0], inp1.shape[1] * op.num_groups,
3238                 inputs[2][0], inputs[2][1], inputs[2][2])
3239        rval = dnn_gradinput3d(inp1, inp2, shape,
3240                               border_mode=op.border_mode,
3241                               subsample=op.subsample,
3242                               dilation=op.filter_dilation,
3243                               conv_mode=conv_mode,
3244                               num_groups=op.num_groups)
3245    return [rval]
3246
3247
3248@local_optimizer([AbstractConv2d, AbstractConv3d])
3249def local_abstractconv_cudnn(node):
3250    ctx = infer_context_name(*node.inputs)
3251    if not isinstance(node.inputs[0].type, GpuArrayType):
3252        return
3253    if node.op.unshared:
3254        return None
3255    if isinstance(node.op.border_mode, tuple) and any(isinstance(p, tuple) for p in node.op.border_mode):
3256        # Asymmetric padding not yet supported
3257        return None
3258    if isinstance(node.op, AbstractConv2d):
3259        with inherit_stack_trace(node.outputs):
3260            return local_abstractconv_cudnn_graph(node.op, ctx, node.inputs, node.outputs)
3261    elif isinstance(node.op, AbstractConv3d):
3262        with inherit_stack_trace(node.outputs):
3263            return local_abstractconv3d_cudnn_graph(node.op, ctx, node.inputs, node.outputs)
3264
3265
3266@local_optimizer([AbstractConv2d, AbstractConv2d_gradWeights, AbstractConv2d_gradInputs])
3267def local_abstractconv_cudnn_alt(node):
3268    if(not isinstance(node.op, (AbstractConv2d, AbstractConv2d_gradWeights,
3269       AbstractConv2d_gradInputs))):
3270        return
3271
3272    if version(raises=False) < 6000 and node.op.filter_dilation != (1, 1):
3273        return None
3274    if node.op.unshared:
3275        return None
3276    if isinstance(node.op.border_mode, tuple) and any(isinstance(p, tuple) for p in node.op.border_mode):
3277        # Asymmetric padding not yet supported
3278        return None
3279    inp1 = node.inputs[0]
3280    inp2 = node.inputs[1]
3281
3282    if not dnn_available(inp1.type.context_name):
3283        return
3284
3285    op = node.op
3286    border_mode = node.op.border_mode
3287    subsample = node.op.subsample
3288    filter_dilation = node.op.filter_dilation
3289    num_groups = node.op.num_groups
3290    precision, _ = get_precision(None, [inp1, inp2])
3291
3292    if node.op.filter_flip:
3293        conv_mode = 'conv'
3294    else:
3295        conv_mode = 'cross'
3296
3297    if isinstance(op, AbstractConv2d):
3298        if border_mode == 'half' or subsample != (1, 1) or num_groups != 1:
3299            return None
3300        if border_mode == 'full':
3301            direction_hint = 'bprop inputs'
3302        elif border_mode == 'valid' and filter_dilation == (1, 1):
3303            direction_hint = 'bprop weights'
3304        else:
3305            return None
3306
3307        rval = dnn_conv(inp1, inp2,
3308                        border_mode=border_mode,
3309                        subsample=subsample,
3310                        dilation=filter_dilation,
3311                        direction_hint=direction_hint,
3312                        conv_mode=conv_mode,
3313                        num_groups=num_groups)
3314
3315    elif isinstance(op, AbstractConv2d_gradWeights):
3316        if(border_mode == 'valid' and subsample == (1, 1) and
3317           filter_dilation == (1, 1) and num_groups == 1):
3318            img = gpu_contiguous(inp1)
3319            topgrad = gpu_contiguous(inp2)
3320            ctx_name = infer_context_name(img, topgrad)
3321            img = gpu_contiguous(img.dimshuffle(1, 0, 2, 3))
3322            topgrad = gpu_contiguous(topgrad.dimshuffle(1, 0, 2, 3))
3323            ishape = [shape_i_op(i)(img) for i in range(img.ndim)]
3324            tshape = [shape_i_op(i)(topgrad) for i in range(topgrad.ndim)]
3325            out_shp = get_conv_output_shape(ishape,
3326                                            tshape,
3327                                            border_mode=border_mode,
3328                                            subsample=subsample,
3329                                            filter_dilation=filter_dilation)
3330
3331            out_shp = assert_conv_shape(out_shp)
3332            out = GpuAllocEmpty(dtype=img.dtype, context_name=ctx_name)(*out_shp)
3333            desc = GpuDnnConvDesc(border_mode=border_mode,
3334                                  subsample=subsample,
3335                                  dilation=filter_dilation,
3336                                  conv_mode='cross',
3337                                  precision=precision)(out.shape)
3338
3339            conv = GpuDnnConv(algo=None, num_groups=num_groups)(img, topgrad, out, desc)
3340            if conv_mode == 'conv':
3341                conv = conv[:, :, ::-1, ::-1]
3342
3343            rval = as_gpuarray_variable(conv.dimshuffle(1, 0, 2, 3), ctx_name)
3344        else:
3345            return None
3346
3347    elif isinstance(op, AbstractConv2d_gradInputs):
3348        if border_mode == 'valid' and subsample == (1, 1) and num_groups == 1:
3349            kerns = gpu_contiguous(inp1.dimshuffle(1, 0, 2, 3))
3350            topgrad = gpu_contiguous(inp2)
3351            ctx_name = infer_context_name(kerns, topgrad)
3352            conv_mode = 'cross' if conv_mode == 'conv' else 'conv'
3353            desc = GpuDnnConvDesc(border_mode='full',
3354                                  subsample=subsample,
3355                                  dilation=filter_dilation,
3356                                  conv_mode=conv_mode,
3357                                  precision=precision)(kerns.shape)
3358
3359            tshape = [shape_i_op(i)(topgrad) for i in range(topgrad.ndim)]
3360            kshape = [shape_i_op(i)(kerns) for i in range(kerns.ndim)]
3361            shape = get_conv_output_shape(tshape,
3362                                          kshape,
3363                                          border_mode='full',
3364                                          subsample=subsample,
3365                                          filter_dilation=filter_dilation)
3366
3367            shape = assert_conv_shape(shape)
3368            out = GpuAllocEmpty(dtype=topgrad.dtype, context_name=ctx_name)(*shape)
3369            rval = GpuDnnConv(algo=None, num_groups=num_groups)(topgrad, kerns, out, desc)
3370        else:
3371            return None
3372
3373    return [rval]
3374
3375
3376@local_optimizer([AbstractConv3d, AbstractConv3d_gradWeights, AbstractConv3d_gradInputs])
3377def local_abstractconv3d_cudnn_alt(node):
3378    if(not isinstance(node.op, (AbstractConv3d,
3379                                AbstractConv3d_gradWeights,
3380                                AbstractConv3d_gradInputs))):
3381        return
3382
3383    if version(raises=False) < 6000 and node.op.filter_dilation != (1, 1, 1):
3384        return None
3385    inp1 = node.inputs[0]
3386    inp2 = node.inputs[1]
3387
3388    if not dnn_available(inp1.type.context_name):
3389        return
3390
3391    op = node.op
3392    border_mode = node.op.border_mode
3393    subsample = node.op.subsample
3394    filter_dilation = node.op.filter_dilation
3395    num_groups = node.op.num_groups
3396    precision, _ = get_precision(None, [inp1, inp2])
3397
3398    if node.op.filter_flip:
3399        conv_mode = 'conv'
3400    else:
3401        conv_mode = 'cross'
3402
3403    if isinstance(op, AbstractConv3d):
3404        if border_mode == 'half' or subsample != (1, 1, 1) or num_groups > 1:
3405            return None
3406        if border_mode == 'full':
3407            direction_hint = 'bprop inputs'
3408        elif border_mode == 'valid' and filter_dilation == (1, 1, 1):
3409            direction_hint = 'bprop weights'
3410        else:
3411            return None
3412
3413        rval = dnn_conv3d(inp1, inp2,
3414                          border_mode=border_mode,
3415                          subsample=subsample,
3416                          dilation=filter_dilation,
3417                          direction_hint=direction_hint,
3418                          conv_mode=conv_mode)
3419
3420    elif isinstance(op, AbstractConv3d_gradWeights):
3421        if(border_mode == 'valid' and subsample == (1, 1, 1) and
3422           filter_dilation == (1, 1, 1) and num_groups == 1):
3423            img = gpu_contiguous(inp1)
3424            topgrad = gpu_contiguous(inp2)
3425            ctx_name = infer_context_name(img, topgrad)
3426            img = gpu_contiguous(img.dimshuffle(1, 0, 2, 3, 4))
3427            topgrad = gpu_contiguous(topgrad.dimshuffle(1, 0, 2, 3, 4))
3428            ishape = [shape_i_op(i)(img) for i in range(img.ndim)]
3429            tshape = [shape_i_op(i)(topgrad) for i in range(topgrad.ndim)]
3430            out_shp = get_conv_output_shape(ishape,
3431                                            tshape,
3432                                            border_mode=border_mode,
3433                                            subsample=subsample,
3434                                            filter_dilation=filter_dilation)
3435
3436            out_shp = assert_conv_shape(out_shp)
3437            out = GpuAllocEmpty(dtype=img.dtype, context_name=ctx_name)(*out_shp)
3438            desc = GpuDnnConvDesc(border_mode=border_mode,
3439                                  subsample=subsample,
3440                                  dilation=filter_dilation,
3441                                  conv_mode='cross',
3442                                  num_groups=num_groups,
3443                                  precision=precision)(out.shape)
3444
3445            conv = GpuDnnConv(algo=None, num_groups=num_groups)(
3446                img, topgrad, out, desc)
3447            if conv_mode == 'conv':
3448                conv = conv[:, :, ::-1, ::-1, ::-1]
3449
3450            rval = as_gpuarray_variable(conv.dimshuffle(1, 0, 2, 3, 4), ctx_name)
3451        else:
3452            return None
3453
3454    elif isinstance(op, AbstractConv3d_gradInputs):
3455        if border_mode == 'valid' and subsample == (1, 1, 1) and num_groups == 1:
3456            kerns = gpu_contiguous(inp1.dimshuffle(1, 0, 2, 3, 4))
3457            topgrad = gpu_contiguous(inp2)
3458            ctx_name = infer_context_name(kerns, topgrad)
3459            conv_mode = 'cross' if conv_mode == 'conv' else 'conv'
3460            desc = GpuDnnConvDesc(border_mode='full',
3461                                  subsample=subsample,
3462                                  dilation=filter_dilation,
3463                                  conv_mode=conv_mode,
3464                                  num_groups=num_groups,
3465                                  precision=precision)(kerns.shape)
3466
3467            tshape = [shape_i_op(i)(topgrad) for i in range(topgrad.ndim)]
3468            kshape = [shape_i_op(i)(kerns) for i in range(kerns.ndim)]
3469            shape = get_conv_output_shape(tshape,
3470                                          kshape,
3471                                          border_mode='full',
3472                                          subsample=subsample,
3473                                          filter_dilation=filter_dilation)
3474
3475            shape = assert_conv_shape(shape)
3476            out = GpuAllocEmpty(dtype=topgrad.dtype, context_name=ctx_name)(*shape)
3477            rval = GpuDnnConv(algo=None, num_groups=num_groups)(
3478                topgrad, kerns, out, desc)
3479        else:
3480            return None
3481
3482    return [rval]
3483
3484
3485@local_optimizer([AbstractConv2d_gradWeights, AbstractConv3d_gradWeights])
3486def local_abstractconv_gw_cudnn(node):
3487    ctx = infer_context_name(*node.inputs)
3488    if not isinstance(node.inputs[0].type, GpuArrayType):
3489        return
3490    if node.op.unshared:
3491        return None
3492    if isinstance(node.op.border_mode, tuple) and any(isinstance(p, tuple) for p in node.op.border_mode):
3493        # Asymmetric padding not yet supported
3494        return None
3495    if isinstance(node.op, AbstractConv2d_gradWeights):
3496        with inherit_stack_trace(node.outputs):
3497            return local_abstractconv_cudnn_graph(node.op, ctx, node.inputs, node.outputs)
3498    elif isinstance(node.op, AbstractConv3d_gradWeights):
3499        with inherit_stack_trace(node.outputs):
3500            return local_abstractconv3d_cudnn_graph(node.op, ctx, node.inputs, node.outputs)
3501
3502
3503@local_optimizer([AbstractConv2d_gradInputs, AbstractConv3d_gradInputs])
3504def local_abstractconv_gi_cudnn(node):
3505    ctx = infer_context_name(*node.inputs)
3506    if not isinstance(node.inputs[0].type, GpuArrayType):
3507        return
3508    if node.op.unshared:
3509        return None
3510    if isinstance(node.op.border_mode, tuple) and any(isinstance(p, tuple) for p in node.op.border_mode):
3511        # Asymmetric padding not yet supported
3512        return None
3513    if isinstance(node.op, AbstractConv2d_gradInputs):
3514        with inherit_stack_trace(node.outputs):
3515            return local_abstractconv_cudnn_graph(node.op, ctx, node.inputs, node.outputs)
3516    elif isinstance(node.op, AbstractConv3d_gradInputs):
3517        with inherit_stack_trace(node.outputs):
3518            return local_abstractconv3d_cudnn_graph(node.op, ctx, node.inputs, node.outputs)
3519
3520
3521@inplace_allocempty(GpuDnnConv, 2)
3522def local_dnn_conv_inplace(node, inputs):
3523    return [GpuDnnConv(algo=node.op.algo, inplace=True, num_groups=node.op.num_groups)(*inputs)]
3524
3525
3526@inplace_allocempty(GpuDnnConvGradW, 2)
3527def local_dnn_convgw_inplace(node, inputs):
3528    return [GpuDnnConvGradW(algo=node.op.algo, inplace=True, num_groups=node.op.num_groups)(*inputs)]
3529
3530
3531@inplace_allocempty(GpuDnnConvGradI, 2)
3532def local_dnn_convgi_inplace(node, inputs):
3533    return [GpuDnnConvGradI(algo=node.op.algo, inplace=True, num_groups=node.op.num_groups)(*inputs)]
3534
3535optdb.register('local_dnna_conv_inplace',
3536               tensor.opt.in2out(local_dnn_conv_inplace,
3537                                 local_dnn_convgw_inplace,
3538                                 local_dnn_convgi_inplace,
3539                                 name="local_dnna_conv_inplace"),
3540               70.0, 'fast_run', 'inplace', 'gpuarray', 'cudnn')
3541
3542
3543@register_opt('cudnn')
3544@alpha_merge(GpuDnnConv, alpha_in=4, beta_in=5)
3545def local_dnn_conv_alpha_merge(node, *inputs):
3546    return [GpuDnnConv(algo=node.op.algo, num_groups=node.op.num_groups)(*inputs)]
3547
3548
3549@register_opt('cudnn')
3550@alpha_merge(GpuDnnConvGradW, alpha_in=4, beta_in=5)
3551def local_dnn_convw_alpha_merge(node, *inputs):
3552    return [GpuDnnConvGradW(algo=node.op.algo, num_groups=node.op.num_groups)(*inputs)]
3553
3554
3555@register_opt('cudnn')
3556@alpha_merge(GpuDnnConvGradI, alpha_in=4, beta_in=5)
3557def local_dnn_convi_alpha_merge(node, *inputs):
3558    return [GpuDnnConvGradI(algo=node.op.algo, num_groups=node.op.num_groups)(*inputs)]
3559
3560
3561@register_opt('cudnn')
3562@output_merge(GpuDnnConv, alpha_in=4, beta_in=5, out_in=2)
3563def local_dnn_conv_output_merge(node, *inputs):
3564    inputs = inputs[0:2] + (gpu_contiguous(inputs[2]),) + inputs[3:]
3565    return [GpuDnnConv(algo=node.op.algo, num_groups=node.op.num_groups)(*inputs)]
3566
3567
3568@register_opt('cudnn')
3569@output_merge(GpuDnnConvGradW, alpha_in=4, beta_in=5, out_in=2)
3570def local_dnn_convw_output_merge(node, *inputs):
3571    inputs = inputs[0:2] + (gpu_contiguous(inputs[2]),) + inputs[3:]
3572    return [GpuDnnConvGradW(algo=node.op.algo, num_groups=node.op.num_groups)(*inputs)]
3573
3574
3575@register_opt('cudnn')
3576@output_merge(GpuDnnConvGradI, alpha_in=4, beta_in=5, out_in=2)
3577def local_dnn_convi_output_merge(node, *inputs):
3578    inputs = inputs[0:2] + (gpu_contiguous(inputs[2]),) + inputs[3:]
3579    return [GpuDnnConvGradI(algo=node.op.algo, num_groups=node.op.num_groups)(*inputs)]
3580
3581
3582def local_gpua_pool_dnn_alternative(op, ctx_name, inputs, outputs):
3583    if not dnn_available(ctx_name):
3584        return
3585    if not op.ignore_border:
3586        return
3587    img, ws, stride, pad = inputs
3588    nd = op.ndim
3589    if nd not in (2, 3):
3590        return
3591    img = gpu_contiguous(as_gpuarray_variable(img, ctx_name))
3592    mode = op.mode
3593    # dnn_pool expects exactly 2 non-pooling dimensions
3594    if img.ndim == nd + 2:
3595        return dnn_pool(img, ws, stride=stride, pad=pad, mode=mode)
3596    else:
3597        # reshape to 4D or 5D with 2 non-pooling dimensions
3598        img_padded = pad_dims(img, 2, nd)
3599        ret_padded = dnn_pool(img_padded, ws, stride=stride, pad=pad, mode=mode)
3600        return unpad_dims(ret_padded, img, 2, nd)
3601pool_db.register("local_gpua_pool_dnn_alternative",
3602                 op_lifter([Pool])(local_gpua_pool_dnn_alternative),
3603                 'gpuarray', 'fast_compile', 'fast_run', 'cudnn',
3604                 position=0)
3605pool_db2.register("local_gpua_pool_dnn_alternative",
3606                  local_optimizer([Pool])(local_gpua_pool_dnn_alternative),
3607                  'gpuarray', 'fast_compile', 'fast_run', 'cudnn',
3608                  position=0)
3609
3610
3611def local_gpua_pool_dnn_grad_stride(op, ctx_name, inputs, outputs):
3612    if not dnn_available(ctx_name):
3613        return
3614    if not op.ignore_border:
3615        return
3616    inp, out, out_grad, ws, stride, pad = inputs
3617    nd = op.ndim
3618    if nd not in (2, 3):
3619        return
3620    inp = gpu_contiguous(as_gpuarray_variable(inp, ctx_name))
3621    out = gpu_contiguous(as_gpuarray_variable(out, ctx_name))
3622    out_grad = gpu_contiguous(as_gpuarray_variable(out_grad, ctx_name))
3623    mode = op.mode
3624
3625    # the GPU ops expect exactly 2 non-pooling dimensions
3626    if inp.ndim == nd + 2:
3627        return GpuDnnPoolGrad(mode=mode)(inp,
3628                                         out,
3629                                         out_grad,
3630                                         ws,
3631                                         stride,
3632                                         pad)
3633    else:
3634        # reshape to 4D or 5D with 2 non-pooling dimensions
3635        inp_padded = pad_dims(inp, 2, nd)
3636        out_padded = pad_dims(out, 2, nd)
3637        out_grad_padded = pad_dims(out_grad, 2, nd)
3638        ret_padded = GpuDnnPoolGrad(mode=mode)(inp_padded,
3639                                               out_padded,
3640                                               out_grad_padded,
3641                                               ws,
3642                                               stride,
3643                                               pad)
3644        return unpad_dims(ret_padded, inp, 2, nd)
3645pool_db.register("local_gpua_pool_dnn_grad_stride",
3646                 op_lifter([MaxPoolGrad])(local_gpua_pool_dnn_grad_stride),
3647                 'gpuarray', 'fast_compile', 'fast_run', 'cudnn',
3648                 position=0)
3649pool_db2.register("local_gpua_pool_dnn_grad_stride",
3650                  local_optimizer([MaxPoolGrad])(local_gpua_pool_dnn_grad_stride),
3651                  'gpuarray', 'fast_compile', 'fast_run', 'cudnn',
3652                  position=0)
3653
3654
3655def local_gpua_avg_pool_dnn_grad_stride(op, ctx_name, inputs, outputs):
3656    if not dnn_available(ctx_name):
3657        return
3658    if not op.ignore_border:
3659        return
3660    inp, out_grad, ws, stride, pad = inputs
3661    nd = op.ndim
3662    if nd not in (2, 3):
3663        return
3664    inp = gpu_contiguous(as_gpuarray_variable(inp, ctx_name))
3665    out_grad = gpu_contiguous(as_gpuarray_variable(out_grad, ctx_name))
3666    mode = op.mode
3667
3668    # the GPU ops expect exactly 2 non-pooling dimensions
3669    if inp.ndim == nd + 2:
3670        # We reuse out_grad because cuDNN does not use the value of the `out`
3671        # argument but still checks its shape for average pooling. This
3672        # has been observed in v2 and v3 as far as I know.
3673        return GpuDnnPoolGrad(mode=mode)(inp, out_grad, out_grad, ws, stride, pad)
3674    else:
3675        # reshape to 4D or 5D with 2 non-pooling dimensions
3676        inp_padded = pad_dims(inp, 2, nd)
3677        out_grad_padded = pad_dims(out_grad, 2, nd)
3678        ret_padded = GpuDnnPoolGrad(mode=mode)(inp_padded,
3679                                               out_grad_padded,
3680                                               out_grad_padded,
3681                                               ws,
3682                                               stride,
3683                                               pad)
3684        return unpad_dims(ret_padded, inp, 2, nd)
3685pool_db.register("local_gpua_avg_pool_dnn_grad_stride",
3686                 op_lifter([AveragePoolGrad])(local_gpua_avg_pool_dnn_grad_stride),
3687                 'gpuarray', 'fast_compile', 'fast_run', 'cudnn',
3688                 position=0)
3689pool_db2.register("local_gpua_avg_pool_dnn_grad_stride",
3690                  local_optimizer([AveragePoolGrad])(local_gpua_avg_pool_dnn_grad_stride),
3691                  'gpuarray', 'fast_compile', 'fast_run', 'cudnn',
3692                  position=0)
3693
3694
3695@register_opt('cudnn', 'fast_compile')
3696@local_optimizer([GpuSoftmax])
3697def local_softmax_dnn(node):
3698    if isinstance(node.op, GpuSoftmax):
3699        if not dnn_available(node.outputs[0].type.context_name):
3700            return
3701        ins = node.inputs[0].dimshuffle(0, 1, 'x', 'x')
3702        ins = gpu_contiguous(ins)
3703        out = GpuDnnSoftmax('accurate', 'channel')(ins)
3704        out = as_gpuarray_variable(out.dimshuffle(0, 1), out.type.context_name)
3705        return [out]
3706
3707
3708@register_opt('cudnn', 'stabilize')
3709@local_optimizer([GpuElemwise])
3710def local_log_softmax_dnn(node):
3711    # This looks for GpuDnnSoftmax so we know that we have cudnn.
3712    if (isinstance(node.op, GpuElemwise) and
3713            isinstance(node.op.scalar_op, Log) and
3714            node.inputs[0].owner and
3715            isinstance(node.inputs[0].owner.op, GpuDnnSoftmax) and
3716            len(node.inputs[0].clients) == 1):
3717        softmax_node = node.inputs[0].owner
3718        new_softmax = GpuDnnSoftmax('log', softmax_node.op.mode)
3719        return [new_softmax(softmax_node.inputs[0])]
3720
3721
3722@register_opt('cudnn', 'fast_compile')
3723@op_lifter([LogSoftmax])
3724@register_opt2([LogSoftmax], 'fast_compile', 'cudnn')
3725def local_gpua_logsoftmax_to_dnn(op, ctx_name, inputs, outputs):
3726    # Transform the input in the format expected by GpuDnnSoftmax
3727    inp = inputs[0]
3728    if inp.ndim != 2:
3729        return
3730    if not dnn_available(ctx_name):
3731        return
3732
3733    inp = inp.dimshuffle(0, 1, 'x', 'x')
3734    inp.tag.context_name = ctx_name
3735
3736    # Apply GpuDnnSoftmax and return the result
3737    out = GpuDnnSoftmax('log', 'channel')(gpu_contiguous(inp))
3738    return [out.dimshuffle(0, 1)]
3739
3740
3741@register_opt('cudnn', 'fast_compile')
3742@op_lifter([SoftmaxGrad])
3743@register_opt2([SoftmaxGrad], 'cudnn', 'fast_compile')
3744def local_gpua_softmax_dnn_grad(op, ctx_name, inputs, outputs):
3745    if not dnn_available(ctx_name):
3746        return
3747    ins = []
3748    for n in inputs:
3749        n = as_gpuarray_variable(n, ctx_name)
3750        if n.ndim != 2:
3751            return
3752        ins.append(n.dimshuffle(0, 'x', 1, 'x'))
3753
3754    out = GpuDnnSoftmaxGrad('accurate', 'instance')(
3755        gpu_contiguous(ins[0]), gpu_contiguous(ins[1]))
3756    return [out.dimshuffle(0, 2)]
3757
3758
3759@register_opt('cudnn')
3760@local_optimizer([GpuCAReduceCuda])
3761def local_dnn_reduction(node):
3762    if not isinstance(node.op, GpuCAReduceCuda):
3763        return
3764
3765    if not dnn_available(node.inputs[0].type.context_name):
3766        return
3767
3768    if version(raises=False) < 6000:
3769        return
3770
3771    if node.inputs[0].ndim > 8:
3772        return
3773
3774    acc_dtype = node.op._acc_dtype(node.inputs[0].dtype)
3775
3776    if node.inputs[0].dtype != node.outputs[0].dtype:
3777        # We can mix float16 and float32, but not float64.
3778        if (node.inputs[0].dtype == 'float64' or
3779                node.outputs[0].dtype == 'float64'):
3780            return
3781        if acc_dtype != 'float32':
3782            return
3783
3784    if node.inputs[0].dtype not in ['float16', 'float32', 'float64']:
3785        return
3786
3787    if (node.inputs[0].dtype == 'float64' and acc_dtype != 'float64'):
3788        return
3789
3790    if (node.inputs[0].dtype == 'float32' and acc_dtype != 'float32'):
3791        return
3792
3793    if (node.inputs[0].dtype == 'float16' and acc_dtype == 'float64'):
3794        return
3795
3796    def _identity(a):
3797        return a
3798
3799    def _square(a):
3800        return GpuElemwise(theano.scalar.basic.sqr)(a)
3801
3802    scal = node.op.scalar_op.name
3803    post = _identity
3804
3805    if node.op.pre_scalar_op is not None:
3806        if isinstance(node.op.scalar_op, theano.scalar.basic.Add):
3807            if isinstance(node.op.pre_scalar_op, theano.scalar.basic.Sqr):
3808                scal = 'norm2'
3809                post = _square
3810            elif isinstance(node.op.pre_scalar_op, theano.scalar.basic.Abs):
3811                scal = 'norm1'
3812            else:
3813                return
3814        elif (isinstance(node.op.scalar_op, theano.scalar.basic.Maximum) and
3815                isinstance(node.op.pre_scalar_op, theano.scalar.basic.Abs)):
3816            scal = 'absmax'
3817        else:
3818            return
3819
3820    if not cudnn.cudnnReduceTensorOp_t.has_alias(scal):
3821        return
3822
3823    with inherit_stack_trace(node.outputs):
3824        ret = GpuDnnReduction(scal,
3825                              node.op.axis,
3826                              acc_dtype,
3827                              node.op.dtype,
3828                              False)(node.inputs[0])
3829        return [post(ret)]
3830
3831
3832@register_opt('cudnn')
3833@local_optimizer([GpuMaxAndArgmax])
3834def local_cudnn_maxandargmax(node):
3835    if not isinstance(node.op, GpuMaxAndArgmax):
3836        return
3837
3838    if not dnn_available(node.inputs[0].type.context_name):
3839        return
3840
3841    if version(raises=False) < 6000:
3842        return
3843
3844    if node.inputs[0].ndim > 8:
3845        return
3846
3847    if node.inputs[0].dtype != node.outputs[0].dtype:
3848        return
3849
3850    if node.inputs[0].dtype not in ['float16', 'float32', 'float64']:
3851        return
3852
3853    # order of the axes influences the output indices
3854    if (node.op.axis is not None and
3855            tuple(sorted(node.op.axis)) != node.op.axis):
3856        return
3857
3858    max, arg = GpuDnnReduction('maximum', node.op.axis, node.outputs[0].dtype,
3859                               node.outputs[0].dtype, True)(node.inputs[0])
3860
3861    # cudnn can only return int32 indices
3862    return (max, as_gpuarray_variable(arg.astype('int64'),
3863                                      node.outputs[1].type.context_name))
3864
3865
3866@register_opt('cudnn', 'fast_compile')
3867@op_lifter([Argmax])
3868@register_opt2([Argmax], 'fast_compile', 'cudnn')
3869def local_dnn_argmax(op, ctx_name, inputs, outputs):
3870    if not dnn_available(ctx_name):
3871        return
3872
3873    if version(raises=False) < 6000:
3874        return
3875
3876    if inputs[0].ndim > 8:
3877        return
3878
3879    if inputs[0].dtype not in ['float16', 'float32', 'float64']:
3880        return
3881
3882    # order of the axes influences the output indices
3883    if op.axis is not None and tuple(sorted(op.axis)) != op.axis:
3884        return
3885
3886    max, arg = GpuDnnReduction('maximum', op.axis, inputs[0].dtype,
3887                               inputs[0].dtype, True)(*inputs)
3888
3889    return [as_gpuarray_variable(arg.astype('int64'), ctx_name)]
3890
3891
3892class NoCuDNNRaise(Optimizer):
3893
3894    def apply(self, fgraph):
3895        """
3896        Raise a error if cudnn can't be used.
3897
3898        """
3899        for c in list_contexts():
3900            if not dnn_available(c):
3901                # Make an assert error as we want Theano to fail, not
3902                # just skip this optimization.
3903                raise AssertionError(
3904                    "cuDNN optimization was enabled, but Theano was not able "
3905                    "to use it for context " + str(c) + ". We got this error: \n" +
3906                    dnn_available.msg)
3907
3908gpu_seqopt.register("NoCuDNNRaise", NoCuDNNRaise(), 0, 'cudnn')
3909
3910
3911def local_abstract_batch_norm_train_cudnn(op, ctx_name, inputs, outputs):
3912    x, scale, bias, epsilon, running_average_factor = inputs[:5]
3913    running_mean = inputs[5] if len(inputs) > 5 else None
3914    running_var = inputs[6] if len(inputs) > 6 else None
3915
3916    # convert axes to cuDNN mode
3917    axes = tuple(op.axes)
3918    if axes == (0,):
3919        mode = 'per-activation'
3920    elif axes == (0,) + tuple(range(2, x.ndim)):
3921        mode = 'spatial'
3922    else:
3923        return None
3924
3925    try:
3926        eps = theano.tensor.get_scalar_constant_value(epsilon)
3927    except theano.tensor.NotScalarConstantError:
3928        return None
3929    if eps < 1e-5:
3930        return None
3931    try:
3932        running_average_factor = theano.tensor.get_scalar_constant_value(running_average_factor)
3933    except theano.tensor.NotScalarConstantError:
3934        return None
3935
3936    ctx = infer_context_name(*inputs)
3937    if not dnn_available(ctx):
3938        return
3939    x = as_gpuarray_variable(x, context_name=ctx)
3940    scale = as_gpuarray_variable(scale, context_name=ctx)
3941    bias = as_gpuarray_variable(bias, context_name=ctx)
3942
3943    inputs = [x, scale, bias, mode, eps, running_average_factor]
3944    if running_mean is not None and running_var is not None:
3945        inputs.append(running_mean)
3946        inputs.append(running_var)
3947
3948    results = list(dnn_batch_normalization_train(*inputs))
3949
3950    return results
3951
3952
3953@register_inplace()
3954@local_optimizer([GpuDnnBatchNorm], inplace=True)
3955def local_batch_norm_inplace_output(node):
3956    if isinstance(node.op, GpuDnnBatchNorm) and not node.op.inplace_output:
3957        return GpuDnnBatchNorm(mode=node.op.mode,
3958                               running_averages=node.op.running_averages,
3959                               inplace_running_mean=node.op.inplace_running_mean,
3960                               inplace_running_var=node.op.inplace_running_var,
3961                               inplace_output=True)(*node.inputs)
3962
3963
3964@register_inplace()
3965@local_optimizer([GpuDnnBatchNorm], inplace=True)
3966def local_batch_norm_inplace_running_mean(node):
3967    if isinstance(node.op, GpuDnnBatchNorm) and node.op.running_averages and not node.op.inplace_running_mean:
3968        return GpuDnnBatchNorm(mode=node.op.mode,
3969                               running_averages=node.op.running_averages,
3970                               inplace_running_mean=True,
3971                               inplace_running_var=node.op.inplace_running_var,
3972                               inplace_output=node.op.inplace_output)(*node.inputs)
3973
3974
3975@register_inplace()
3976@local_optimizer([GpuDnnBatchNorm], inplace=True)
3977def local_batch_norm_inplace_running_var(node):
3978    if isinstance(node.op, GpuDnnBatchNorm) and node.op.running_averages and not node.op.inplace_running_var:
3979        return GpuDnnBatchNorm(mode=node.op.mode,
3980                               running_averages=node.op.running_averages,
3981                               inplace_running_mean=node.op.inplace_running_mean,
3982                               inplace_running_var=True,
3983                               inplace_output=node.op.inplace_output)(*node.inputs)
3984
3985
3986@register_inplace()
3987@local_optimizer([GpuDnnBatchNormInference], inplace=True)
3988def local_batch_norm_inference_inplace(node):
3989    if isinstance(node.op, GpuDnnBatchNormInference) and not node.op.inplace:
3990        return [GpuDnnBatchNormInference(mode=node.op.mode, inplace=True)(*node.inputs)]
3991
3992
3993def local_abstract_batch_norm_train_grad_cudnn(op, ctx_name, inputs, outputs):
3994    x, dy, scale, x_mean, x_invstd, epsilon = inputs
3995
3996    # input on gpu?  TODO what about the output?
3997    x_on_gpu = (isinstance(x.type, GpuArrayType) or
3998                (x.owner and isinstance(x.owner.op, HostFromGpu)))
3999    dy_on_gpu = (isinstance(dy.type, GpuArrayType) or
4000                 (dy.owner and isinstance(dy.owner.op, HostFromGpu)))
4001    if not (x_on_gpu or dy_on_gpu):
4002        return None
4003
4004    # convert axes to cuDNN mode
4005    axes = tuple(op.axes)
4006    if axes == (0,):
4007        mode = 'per-activation'
4008    elif axes == (0,) + tuple(range(2, x.ndim)):
4009        mode = 'spatial'
4010    else:
4011        return None
4012
4013    ndim = x.ndim
4014    if ndim < 4:
4015        x = theano.tensor.shape_padright(x, 4 - ndim)
4016        dy = theano.tensor.shape_padright(dy, 4 - ndim)
4017        scale = theano.tensor.shape_padright(scale, 4 - ndim)
4018        x_mean = theano.tensor.shape_padright(x_mean, 4 - ndim)
4019        x_invstd = theano.tensor.shape_padright(x_invstd, 4 - ndim)
4020    elif ndim > 5:
4021        x_shape = x.shape
4022        params_shape = scale.shape
4023        x = theano.tensor.flatten(x, 5)
4024        dy = theano.tensor.flatten(dy, 5)
4025        scale = theano.tensor.flatten(scale, 5)
4026        x_mean = theano.tensor.flatten(x_mean, 5)
4027        x_invstd = theano.tensor.flatten(x_invstd, 5)
4028
4029    try:
4030        eps = theano.tensor.get_scalar_constant_value(epsilon)
4031    except theano.tensor.NotScalarConstantError:
4032        return None
4033    if eps < 1e-5:
4034        return None
4035
4036    ctx = infer_context_name(*inputs)
4037    if not dnn_available(ctx):
4038        return
4039    x = as_gpuarray_variable(x, context_name=ctx)
4040    dy = as_gpuarray_variable(dy, context_name=ctx)
4041    scale = as_gpuarray_variable(scale, context_name=ctx)
4042    x_mean = as_gpuarray_variable(x_mean, context_name=ctx)
4043    x_invstd = as_gpuarray_variable(x_invstd, context_name=ctx)
4044
4045    g_wrt_inputs, g_wrt_scale, g_wrt_bias = \
4046        GpuDnnBatchNormGrad(mode)(x, dy, scale, x_mean, x_invstd, eps)
4047
4048    if ndim < 4:
4049        g_wrt_inputs = theano.tensor.flatten(g_wrt_inputs, ndim)
4050        g_wrt_scale = theano.tensor.flatten(g_wrt_scale, ndim)
4051        g_wrt_bias = theano.tensor.flatten(g_wrt_bias, ndim)
4052    elif ndim > 5:
4053        g_wrt_inputs = theano.tensor.reshape(g_wrt_inputs, x_shape)
4054        g_wrt_scale = theano.tensor.reshape(g_wrt_scale, params_shape)
4055        g_wrt_bias = theano.tensor.reshape(g_wrt_bias, params_shape)
4056
4057    return [g_wrt_inputs, g_wrt_scale, g_wrt_bias]
4058
4059
4060def local_abstract_batch_norm_inference_cudnn(op, ctx_name, inputs, outputs):
4061    x, scale, bias, estimated_mean, estimated_variance, epsilon = inputs
4062
4063    axes = tuple(op.axes)
4064    if axes == (0,):
4065        mode = 'per-activation'
4066    elif axes == (0,) + tuple(range(2, x.ndim)):
4067        mode = 'spatial'
4068    else:
4069        return None
4070
4071    try:
4072        eps = theano.tensor.get_scalar_constant_value(epsilon)
4073    except theano.tensor.NotScalarConstantError:
4074        return None
4075    if eps < 1e-5:
4076        return None
4077
4078    ctx = infer_context_name(*inputs)
4079    if not dnn_available(ctx):
4080        return
4081    x = as_gpuarray_variable(x, context_name=ctx)
4082    scale = as_gpuarray_variable(scale, context_name=ctx)
4083    bias = as_gpuarray_variable(bias, context_name=ctx)
4084    estimated_mean = as_gpuarray_variable(estimated_mean, context_name=ctx)
4085    estimated_variance = as_gpuarray_variable(estimated_variance, context_name=ctx)
4086
4087    out = dnn_batch_normalization_test(x, scale, bias, estimated_mean, estimated_variance,
4088                                       mode, eps)
4089
4090    return [out]
4091